More use of tags - drop add_header action -> allow + headers+tags

2025-11-20 11:55:04 +11:00
parent 3f274c842c
commit de2eb43e2b
17 changed files with 526 additions and 49 deletions
--- a/app/models/event.rb
+++ b/app/models/event.rb
@@ -105,6 +105,11 @@ class Event < ApplicationRecord
    joins(:network_range).where("network_ranges.network = ?", cidr)
  }

+  # Bot filtering scopes
+  scope :bots, -> { where(is_bot: true) }
+  scope :humans, -> { where(is_bot: false) }
+  scope :exclude_bots, -> { where(is_bot: false) }
+
  # Add association for the optional network_range_id
  belongs_to :network_range, optional: true

@@ -191,6 +196,9 @@ class Event < ApplicationRecord
  # Populate network intelligence from IP address
  before_save :populate_network_intelligence, if: :should_populate_network_intelligence?

+  # Flag bot traffic on save (only when should_detect_bot?); declared after populate_network_intelligence so that callback runs first
+  before_save :detect_bot_traffic, if: :should_detect_bot?
+
  # Backfill network intelligence for all events
  def self.backfill_network_intelligence!(batch_size: 10_000)
    total = where(country: nil).count
@@ -693,10 +701,88 @@ class Event < ApplicationRecord
    self.server_name = payload["server_name"]
    self.environment = payload["environment"]

-    
+
    # Extract agent info
    agent_data = payload.dig("agent") || {}
    self.agent_version = agent_data["version"]
    self.agent_name = agent_data["name"]
  end
+
+  def should_detect_bot?
+    # Detect bots if user agent is present or if we have network intelligence
+    user_agent.present? || network_range_id.present?
+  end
+
+  def detect_bot_traffic
+    self.is_bot = bot_detected?
+  rescue => e
+    Rails.logger.error "Failed to detect bot for event #{id}: #{e.message}"
+    self.is_bot = false # Default to non-bot on error
+  end
+
+  def bot_detected?
+    # Multi-signal bot detection approach with tagging:
+    # 1. User agent detection (DeviceDetector gem) - adds bot:name tag
+    # 2. Network range source matching (bot_import_* sources) - adds network tags
+    # 3. Fallback to datacenter classification for infrastructure-based detection
+
+    # Signal 1: User agent bot detection (uses DeviceDetector's built-in cache)
+    if user_agent.present?
+      begin
+        detector = DeviceDetector.new(user_agent)
+        if detector.bot?
+          # Add bot tag with specific bot name
+          bot_name = detector.bot_name&.downcase&.gsub(/\s+/, '_') || 'unknown'
+          add_tag("bot:#{bot_name}")
+          return true
+        end
+      rescue => e
+        Rails.logger.debug "DeviceDetector failed for user agent: #{e.message}"
+      end
+    end
+
+    # Signal 2: Network range from known bot sources
+    if network_range_id.present?
+      range = NetworkRange.find_by(id: network_range_id)
+      if range
+        # Check if the network range source indicates a bot import
+        if range.source&.start_with?('bot_import_')
+          # Extract bot type from source (e.g., 'bot_import_googlebot' -> 'googlebot')
+          bot_type = range.source.sub('bot_import_', '')
+          add_tag("bot:#{bot_type}")
+          add_tag("network:#{range.company&.downcase&.gsub(/\s+/, '_')}") if range.company.present?
+          return true
+        end
+
+        # Check if the company is a known bot provider (from bot imports)
+        # Common bot companies: Google, Amazon, OpenAI, Cloudflare, Microsoft, etc.
+        known_bot_companies = ['googlebot', 'google bot', 'amazon', 'aws', 'openai',
+                               'anthropic', 'cloudflare', 'microsoft', 'facebook',
+                               'meta', 'apple', 'duckduckgo']
+        company_lower = company&.downcase
+        if company_lower && known_bot_companies.any? { |bot| company_lower.include?(bot) }
+          add_tag("bot:#{company_lower.gsub(/\s+/, '_')}")
+          add_tag("network:#{company_lower.gsub(/\s+/, '_')}")
+          return true
+        end
+      end
+    end
+
+    # Signal 3: Datacenter traffic is often bot traffic
+    # However, this is less precise so we use it as a weaker signal
+    # Only mark as bot if datacenter AND has other suspicious characteristics
+    if is_datacenter && user_agent.present?
+      # Generic/common bot user agents in datacenter networks
+      ua_lower = user_agent.downcase
+      bot_keywords = ['bot', 'crawler', 'spider', 'scraper', 'curl', 'wget', 'python', 'go-http-client']
+      if bot_keywords.any? { |keyword| ua_lower.include?(keyword) }
+        add_tag("bot:datacenter")
+        add_tag("datacenter:true")
+        return true
+      end
+    end
+
+    # Default: not a bot
+    false
+  end
 end