More use of tags - drop add_header action -> allow + headers+tags

This commit is contained in:
Dan Milne
2025-11-20 11:55:04 +11:00
parent 3f274c842c
commit de2eb43e2b
17 changed files with 526 additions and 49 deletions

View File

@@ -105,6 +105,11 @@ class Event < ApplicationRecord
joins(:network_range).where("network_ranges.network = ?", cidr)
}
# Bot filtering scopes, all keyed on the boolean is_bot column.
scope :bots, -> { where(is_bot: true) }
scope :humans, -> { where(is_bot: false) }
# Reads better in "everything except bots" call sites; applies the same filter as :humans.
scope :exclude_bots, -> { humans }

# Optional link to the NetworkRange referenced by network_range_id.
belongs_to :network_range, optional: true
@@ -191,6 +196,9 @@ class Event < ApplicationRecord
# Populate network intelligence from IP address
before_save :populate_network_intelligence, if: :should_populate_network_intelligence?
# Detect bot traffic using user agent and network intelligence.
# NOTE(review): declared after :populate_network_intelligence, presumably so that
# network_range_id is already assigned when should_detect_bot? / detect_bot_traffic
# run — confirm before reordering these callbacks.
before_save :detect_bot_traffic, if: :should_detect_bot?
# Backfill network intelligence for all events
def self.backfill_network_intelligence!(batch_size: 10_000)
total = where(country: nil).count
@@ -693,10 +701,88 @@ class Event < ApplicationRecord
self.server_name = payload["server_name"]
self.environment = payload["environment"]
# Extract agent info
agent_data = payload.dig("agent") || {}
self.agent_version = agent_data["version"]
self.agent_name = agent_data["name"]
end
# Guard for the detect_bot_traffic callback: bot detection only runs when there
# is something to inspect, i.e. a user agent or a linked network range id.
def should_detect_bot?
  return true if user_agent.present?

  network_range_id.present?
end
# before_save callback body: stores the result of bot_detected? in is_bot.
# Any StandardError raised during detection is logged and the event is
# treated as non-bot, so a detection failure never surfaces to the caller.
def detect_bot_traffic
  begin
    self.is_bot = bot_detected?
  rescue StandardError => error
    message = "Failed to detect bot for event #{id}: #{error.message}"
    Rails.logger.error(message)
    self.is_bot = false # Fall back to "not a bot" when detection fails
  end
end
# Substrings of a lower-cased company name that mark it as a known bot operator.
KNOWN_BOT_COMPANIES = ['googlebot', 'google bot', 'amazon', 'aws', 'openai',
                       'anthropic', 'cloudflare', 'microsoft', 'facebook',
                       'meta', 'apple', 'duckduckgo'].freeze

# Substrings of a lower-cased user agent treated as automation markers
# when the traffic also originates from a datacenter.
DATACENTER_BOT_KEYWORDS = ['bot', 'crawler', 'spider', 'scraper', 'curl', 'wget',
                           'python', 'go-http-client'].freeze

# Multi-signal bot detection with tagging. Signals are tried in order and the
# first one that matches wins (later signals are not evaluated):
#   1. User agent classified as a bot by DeviceDetector  -> tag "bot:<name>"
#   2. Linked network range / company of a bot operator  -> tags "bot:<...>", "network:<...>"
#   3. Datacenter traffic with an automation-like agent   -> tags "bot:datacenter", "datacenter:true"
#
# Side effect: the matching signal adds its tags to the event via add_tag.
#
# @return [Boolean] true when any signal matched, false otherwise
def bot_detected?
  detect_bot_by_user_agent || detect_bot_by_network_range || detect_bot_by_datacenter
end

# Signal 1: DeviceDetector says the user agent is a bot.
# A DeviceDetector failure is logged at debug level and counts as "no match"
# so the remaining signals still get a chance.
def detect_bot_by_user_agent
  return false unless user_agent.present?

  begin
    detector = DeviceDetector.new(user_agent)
    return false unless detector.bot?

    add_tag("bot:#{bot_tag_slug(detector.bot_name) || 'unknown'}")
    true
  rescue StandardError => e
    Rails.logger.debug "DeviceDetector failed for user agent: #{e.message}"
    false
  end
end

# Signal 2: the event's network range was created by a bot import
# (source "bot_import_<type>"), or the event's company name contains the name
# of a known bot operator.
def detect_bot_by_network_range
  return false unless network_range_id.present?

  # Use the belongs_to association rather than a fresh NetworkRange.find_by so
  # an already-loaded range is reused instead of queried again on every save.
  range = network_range
  return false unless range

  if range.source&.start_with?('bot_import_')
    # e.g. 'bot_import_googlebot' -> 'googlebot'
    add_tag("bot:#{range.source.delete_prefix('bot_import_')}")
    add_tag("network:#{bot_tag_slug(range.company)}") if range.company.present?
    return true
  end

  company_lower = company&.downcase
  return false unless company_lower
  return false unless KNOWN_BOT_COMPANIES.any? { |name| company_lower.include?(name) }

  slug = bot_tag_slug(company_lower)
  add_tag("bot:#{slug}")
  add_tag("network:#{slug}")
  true
end

# Signal 3 (weakest): datacenter origin alone is not enough; the user agent
# must also contain one of the generic automation keywords.
def detect_bot_by_datacenter
  return false unless is_datacenter && user_agent.present?

  ua_lower = user_agent.downcase
  return false unless DATACENTER_BOT_KEYWORDS.any? { |keyword| ua_lower.include?(keyword) }

  add_tag("bot:datacenter")
  add_tag("datacenter:true")
  true
end

# Lower-cases a name and collapses whitespace runs to "_" for use inside a tag.
# Returns nil when given nil.
def bot_tag_slug(value)
  value&.downcase&.gsub(/\s+/, '_')
end
end

View File

@@ -34,7 +34,8 @@ class EventDdb
SQL
# Convert to hash like ActiveRecord .group.count returns
result.to_a.to_h { |row| [row["waf_action"], row["count"]] }
# DuckDB returns arrays: [waf_action, count]
result.to_a.to_h { |row| [row[0], row[1]] }
end
rescue StandardError => e
Rails.logger.error "[EventDdb] Error in breakdown_by_action: #{e.message}"
@@ -54,7 +55,8 @@ class EventDdb
SQL
# Return array of [country, count] tuples like ActiveRecord
result.to_a.map { |row| [row["country"], row["count"]] }
# DuckDB returns arrays: [country, count]
result.to_a.map { |row| [row[0], row[1]] }
end
rescue StandardError => e
Rails.logger.error "[EventDdb] Error in top_countries: #{e.message}"
@@ -73,7 +75,8 @@ class EventDdb
LIMIT ?
SQL
result.to_a.map { |row| [row["ip_address"], row["count"]] }
# DuckDB returns arrays: [ip_address, count]
result.to_a.map { |row| [row[0], row[1]] }
end
rescue StandardError => e
Rails.logger.error "[EventDdb] Error in top_blocked_ips: #{e.message}"
@@ -94,7 +97,8 @@ class EventDdb
SQL
# Convert to hash with Time keys like ActiveRecord
result.to_a.to_h { |row| [row["hour"], row["count"]] }
# DuckDB returns arrays: [hour, count]
result.to_a.to_h { |row| [row[0], row[1]] }
end
rescue StandardError => e
Rails.logger.error "[EventDdb] Error in hourly_timeline: #{e.message}"
@@ -495,5 +499,128 @@ class EventDdb
Rails.logger.error "[EventDdb] Error in suspicious_patterns: #{e.message}"
nil
end
# Bot traffic analysis - breakdown of bot vs human traffic.
#
# Rows whose is_bot is NULL are counted as human. The grouping is done on
# COALESCE(is_bot, false) so FALSE and NULL rows land in one SQL group;
# grouping on the raw column would yield two groups that both map to the
# "human" key, and the later one would silently overwrite the earlier one.
#
# @param start_time lower bound (inclusive) applied to events.timestamp
# @return [Hash, nil] { "bot" | "human" => { "event_count" => n, "unique_ips" => n } };
#   only groups that have events are present. nil when the query raises.
def bot_traffic_breakdown(start_time)
  service.with_connection do |conn|
    result = conn.query(<<~SQL, start_time)
      SELECT
        COALESCE(is_bot, false) as bot_flag,
        COUNT(*) as event_count,
        COUNT(DISTINCT ip_address) as unique_ips
      FROM events
      WHERE timestamp >= ?
      GROUP BY COALESCE(is_bot, false)
    SQL

    # DuckDB returns arrays: [bot_flag, event_count, unique_ips]
    result.to_a.to_h do |(bot_flag, event_count, unique_ips)|
      [
        bot_flag ? "bot" : "human",
        { "event_count" => event_count, "unique_ips" => unique_ips }
      ]
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in bot_traffic_breakdown: #{e.message}"
  nil
end
# Number of events at or after start_time whose is_bot flag is false.
#
# @param start_time lower bound (inclusive) applied to events.timestamp
# @return [Integer, nil] the count (0 when the result has no row); nil when the query raises
def human_traffic_count(start_time)
  sql = <<~SQL
    SELECT COUNT(*) as count
    FROM events
    WHERE timestamp >= ? AND is_bot = false
  SQL

  service.with_connection do |conn|
    first_row = conn.query(sql, start_time).first
    first_row&.first || 0
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in human_traffic_count: #{e.message}"
  nil
end
# Number of events at or after start_time whose is_bot flag is true.
#
# @param start_time lower bound (inclusive) applied to events.timestamp
# @return [Integer, nil] the count (0 when the result has no row); nil when the query raises
def bot_traffic_count(start_time)
  sql = <<~SQL
    SELECT COUNT(*) as count
    FROM events
    WHERE timestamp >= ? AND is_bot = true
  SQL

  service.with_connection do |conn|
    first_row = conn.query(sql, start_time).first
    first_row&.first || 0
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in bot_traffic_count: #{e.message}"
  nil
end
# Most active bot user agents at or after start_time, busiest first.
# Only events flagged is_bot with a non-NULL user agent are considered.
#
# @param start_time lower bound (inclusive) applied to events.timestamp
# @param limit [Integer] maximum number of user agents returned
# @return [Array<Hash>, nil] one { user_agent:, event_count:, unique_ips: } hash per
#   user agent; nil when the query raises
def top_bot_user_agents(start_time, limit = 20)
  sql = <<~SQL
    SELECT
    user_agent,
    COUNT(*) as event_count,
    COUNT(DISTINCT ip_address) as unique_ips
    FROM events
    WHERE timestamp >= ? AND is_bot = true AND user_agent IS NOT NULL
    GROUP BY user_agent
    ORDER BY event_count DESC
    LIMIT ?
  SQL

  service.with_connection do |conn|
    rows = conn.query(sql, start_time, limit).to_a
    # Each row arrives as an array: [user_agent, event_count, unique_ips]
    rows.map do |(agent, events, ips)|
      { user_agent: agent, event_count: events, unique_ips: ips }
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in top_bot_user_agents: #{e.message}"
  nil
end
# Hourly bot vs human event counts for start_time <= timestamp < end_time.
#
# @param start_time lower bound (inclusive) applied to events.timestamp
# @param end_time upper bound (exclusive) applied to events.timestamp
# @return [Hash, nil] hour bucket => { "bot_count", "human_count", "total" }, where
#   "total" is bot_count + human_count; nil when the query raises
def bot_traffic_timeline(start_time, end_time)
  sql = <<~SQL
    SELECT
    DATE_TRUNC('hour', timestamp) as hour,
    SUM(CASE WHEN is_bot = true THEN 1 ELSE 0 END) as bot_count,
    SUM(CASE WHEN is_bot = false THEN 1 ELSE 0 END) as human_count
    FROM events
    WHERE timestamp >= ? AND timestamp < ?
    GROUP BY hour
    ORDER BY hour
  SQL

  service.with_connection do |conn|
    rows = conn.query(sql, start_time, end_time).to_a
    # Each row arrives as an array: [hour, bot_count, human_count]
    rows.each_with_object({}) do |(hour, bots, humans), timeline|
      timeline[hour] = {
        "bot_count" => bots,
        "human_count" => humans,
        "total" => bots + humans
      }
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in bot_traffic_timeline: #{e.message}"
  nil
end
end
end

View File

@@ -7,7 +7,11 @@
# and classification flags (datacenter, proxy, VPN).
class NetworkRange < ApplicationRecord
# Sources for network range creation
SOURCES = %w[api_imported user_created manual auto_generated inherited geolite_asn geolite_country].freeze
SOURCES = %w[api_imported user_created manual auto_generated inherited geolite_asn geolite_country
bot_import_amazon_aws bot_import_google bot_import_microsoft_bing bot_import_anthropic
bot_import_openai_searchbot bot_import_openai_chatgpt_user bot_import_openai_gptbot
bot_import_cloudflare bot_import_facebook bot_import_applebot bot_import_duckduckgo
production_import].freeze
# Associations
has_many :rules, dependent: :destroy
@@ -116,19 +120,19 @@ class NetworkRange < ApplicationRecord
# Parent/child relationships
def parent_ranges
NetworkRange.where("?::inet << network AND masklen(network) < ?", network.to_s, prefix_length)
.order("masklen(network) DESC")
# Find networks that contain this network (less specific / shorter prefix)
# The << operator implicitly means the containing network has a shorter prefix
# IMPORTANT: Use cidr (not network.to_s) to preserve the network mask
NetworkRange.where("?::inet << network", cidr)
.order("masklen(network) DESC") # Most specific parent first
end
def child_ranges
NetworkRange.where("network >> ?::inet AND masklen(network) > ?", network.to_s, prefix_length)
.order("masklen(network) ASC")
end
def sibling_ranges
NetworkRange.where("masklen(network) = ?", prefix_length)
.where("network && ?::inet", network.to_s)
.where.not(id: id)
# Find networks that are contained by this network (more specific / longer prefix)
# The >> operator implicitly means the contained network has a longer prefix
# IMPORTANT: Use cidr (not network.to_s) to preserve the network mask
NetworkRange.where("?::inet >> network", cidr)
.order("masklen(network) ASC") # Least specific child first
end
# Find nearest parent with intelligence data

View File

@@ -149,6 +149,42 @@ class Rule < ApplicationRecord
metadata&.dig('header_value')
end
# Tag-related methods

# Tags stored under the 'tags' key of the rule metadata.
# Returns an empty array when nothing is stored there.
def tags
  stored = metadata_hash['tags']
  stored || []
end
# Replaces the stored tag list, leaving every other metadata key untouched.
#
# Accepts nil, a single tag or a list of tags. Every tag is stored as a String,
# matching add_tag / remove_tag / has_tag?, which all compare on tag.to_s;
# without this a Symbol assigned here would not be found by has_tag?.
#
# @param new_tags [Array, String, Symbol, nil] replacement tags
def tags=(new_tags)
  self.metadata = metadata_hash.merge('tags' => Array(new_tags).map(&:to_s))
end
# Appends a tag (compared and stored as a String) to the metadata tag list.
# Does nothing and returns nil when the tag is already present.
def add_tag(tag)
  label = tag.to_s
  existing = tags
  return if existing.include?(label)

  self.metadata = metadata_hash.merge('tags' => [*existing, label])
end
# Removes a tag (compared as a String) from the metadata tag list.
# Does nothing and returns nil when the tag is not present.
def remove_tag(tag)
  label = tag.to_s
  existing = tags
  return unless existing.include?(label)

  self.metadata = metadata_hash.merge('tags' => existing.reject { |stored| stored == label })
end
# True when the metadata tag list contains the tag (compared as a String).
def has_tag?(tag)
  wanted = tag.to_s
  tags.include?(wanted)
end
# Headers for add_header action or metadata-based header injection.
# Read from the 'headers' key of the rule metadata; an empty hash when nothing is stored.
def headers
  stored = metadata_hash['headers']
  stored || {}
end
# Replaces the stored headers, leaving every other metadata key untouched.
# The argument is converted with #to_h before it is stored.
def headers=(new_headers)
  merged = metadata_hash.merge('headers' => new_headers.to_h)
  self.metadata = merged
end
def related_surgical_rules
if surgical_block?
# Find the corresponding exception rule