# baffle-hub/app/models/event.rb

# frozen_string_literal: true

class Event < ApplicationRecord
  # Normalized association for hosts (most valuable compression).
  # Declared optional, so an event can be saved without a linked request host.
  belongs_to :request_host, optional: true

  # WAF rule associations.
  # The rule link is optional; the WAF policy is reached through the rule.
  belongs_to :rule, optional: true
  has_one :waf_policy, through: :rule

  # Enums for fixed value sets
  # Canonical WAF action order - aligned with Rule and Agent models
  #
  # IMPORTANT: These values were swapped to match baffle-agent convention:
  # - deny: 0 (blocked traffic)
  # - allow: 1 (allowed traffic)
  #
  # When using raw integer values in queries:
  # - waf_action = 0  -> denied/blocked requests
  # - waf_action = 1  -> allowed requests
  # - waf_action = 2  -> redirect requests
  # - waf_action = 3  -> challenge requests
  # - waf_action = 4  -> log-only requests
  #
  # New records default to :allow. `scopes: false` turns off the query scopes
  # the enum would otherwise generate; this class declares its own
  # blocked/allowed/logged scopes by hand.
  enum :waf_action, {
    deny: 0,        # deny/block
    allow: 1,       # allow/pass
    redirect: 2,    # redirect
    challenge: 3,   # challenge (CAPTCHA, JS challenge, etc.)
    log: 4          # log only, no action (monitoring mode)
  }, default: :allow, scopes: false

  # HTTP request method stored as a small integer.
  # New records default to :get; enum-generated scopes are disabled here too.
  enum :request_method, {
    get: 0,     # GET
    post: 1,    # POST
    put: 2,     # PUT
    patch: 3,   # PATCH
    delete: 4,  # DELETE
    head: 5,    # HEAD
    options: 6  # OPTIONS
  }, default: :get, scopes: false

  # Serialize segment IDs as an array for easy manipulation in Rails.
  # The JSON coder stores the array as JSON text (e.g. "[1,2,3]").
  serialize :request_segment_ids, type: Array, coder: JSON

  # Tags are declared as a JSON attribute that defaults to a fresh empty array.
  # The original note states the column is PostgreSQL jsonb.
  # NOTE(review): the with_tag / with_any_tags / with_all_tags scopes in this
  # class compare the column against ARRAY[...] values, which are PostgreSQL
  # array operators rather than jsonb ones - confirm the real column type.
  attribute :tags, :json, default: -> { [] }

  # Every event needs a unique request_id and a timestamp.
  validates :request_id, presence: true, uniqueness: true
  validates :timestamp, presence: true

  # Newest events first.
  scope :recent, -> { order(timestamp: :desc) }
  # Exact-match filters on denormalized request columns.
  scope :by_ip, ->(ip) { where(ip_address: ip) }
  scope :by_user_agent, ->(user_agent) { where(user_agent: user_agent) }
  scope :by_waf_action, ->(waf_action) { where(waf_action: waf_action) }
  # Hand-written action scopes (the waf_action enum is declared with scopes: false).
  scope :blocked, -> { where(waf_action: :deny) }
  scope :allowed, -> { where(waf_action: :allow) }
  scope :logged, -> { where(waf_action: :log) }

  # Tag-based filtering scopes using PostgreSQL array operators
  # (@> = "contains", && = "overlaps").
  # NOTE(review): the tags attribute is declared as :json and described as a
  # jsonb column; these operators take SQL arrays on the right-hand side, so
  # confirm they are valid for the actual column type.

  # Events carrying the given tag (the tag is coerced to a string).
  scope :with_tag, ->(tag) { where("tags @> ARRAY[?]", tag.to_s) }

  # Events carrying at least one of the given tags.
  # Returns an empty relation when no tags are supplied.
  scope :with_any_tags, ->(tags) {
    return none if tags.blank?
    tag_array = Array(tags).map(&:to_s)
    where("tags && ARRAY[?]", tag_array)
  }

  # Events carrying every one of the given tags.
  # Returns an empty relation when no tags are supplied.
  scope :with_all_tags, ->(tags) {
    return none if tags.blank?
    tag_array = Array(tags).map(&:to_s)
    where("tags @> ARRAY[?]", tag_array)
  }

  # Network-based filtering scopes - now using denormalized columns

  # Case-insensitive substring match on the denormalized company column.
  # LIKE metacharacters in the caller's input (%, _ and the escape character)
  # are escaped first, so they match literally instead of widening the search.
  scope :by_company, ->(company) {
    where("company ILIKE ?", "%#{sanitize_sql_like(company.to_s)}%")
  }

  # Exact match on the denormalized country column.
  scope :by_country, ->(country) { where(country: country) }

  # Filter by coarse network classification.
  # Accepts "datacenter", "vpn", "proxy" or "standard" (case-insensitive, any
  # object responding to to_s); any other value yields an empty relation.
  scope :by_network_type, ->(type) {
    conditions = {
      "datacenter" => { is_datacenter: true },
      "vpn" => { is_vpn: true },
      "proxy" => { is_proxy: true },
      "standard" => { is_datacenter: false, is_vpn: false, is_proxy: false }
    }[type.to_s.downcase]

    conditions ? where(conditions) : none
  }

  # Exact match on the ASN column; the argument is coerced with to_i
  # (so nil and non-numeric strings become 0).
  scope :by_asn, ->(asn) {
    where(asn: asn.to_i)
  }

  # Events whose linked network range has exactly the given network value.
  scope :by_network_cidr, ->(cidr) {
    # This still requires a join since we need to match CIDR
    joins(:network_range).where("network_ranges.network = ?", cidr)
  }

  # Bot filtering scopes
  # `humans` and `exclude_bots` apply the same condition (is_bot = false).
  # NOTE(review): rows where is_bot is NULL match neither `bots` nor `humans` -
  # confirm whether the column can be NULL.
  scope :bots, -> { where(is_bot: true) }
  scope :humans, -> { where(is_bot: false) }
  scope :exclude_bots, -> { where(is_bot: false) }

  # Add association for the optional network_range_id
  # (used by the by_network_cidr scope above).
  belongs_to :network_range, optional: true

  # Path prefix matching using range queries (uses B-tree index efficiently)
  #
  # Matches events whose serialized request_segment_ids either equal the given
  # prefix or start with it and continue with further segments.
  # Example: [1,2] matches "[1,2]", "[1,2,3]", "[1,2,3,4]" but neither "[1,20]"
  # nor "[1,3]".
  #
  # The column holds compact JSON text such as "[1,2,3]". Every longer path that
  # starts with the prefix begins with "[1,2,", so the half-open range
  #   "[1,2," <= value < "[1,2-"
  # covers exactly those strings ('-' is the character right after ',' in
  # ASCII). Building the upper bound by incrementing the last ID is not safe:
  # it lets "[1,20]" match prefix [1,2] and produces an empty range once the
  # incremented ID gains a digit (9 -> 10).
  # NOTE(review): the range comparison assumes byte-wise ordering of the column
  # (e.g. "C" collation) - confirm for the deployed database.
  #
  # prefix_segment_ids - Array of integer segment IDs (integer-like values are
  #                      coerced with Integer()); blank input returns `none`.
  scope :with_path_prefix, ->(prefix_segment_ids) {
    return none if prefix_segment_ids.blank?

    exact = Array(prefix_segment_ids).map { |id| Integer(id) }.to_json  # "[1,2]"
    stem = exact[0..-2]                                                 # "[1,2"
    lower_bound = "#{stem},"                                            # "[1,2,"
    upper_bound = "#{stem}-"                                            # "[1,2-"

    # Use raw SQL to bypass serializer (it expects Array but we're comparing strings)
    where("request_segment_ids = ? OR (request_segment_ids >= ? AND request_segment_ids < ?)",
          exact, lower_bound, upper_bound)
  }

  # Path depth queries
  # Both scopes count the elements of the stored segment-ID array in SQL.
  # NOTE(review): request_segment_ids is serialized through the JSON coder;
  # confirm json_array_length accepts the column's actual type without a cast.

  # Events whose path has exactly `depth` segments.
  scope :path_depth, ->(depth) {
    where("json_array_length(request_segment_ids) = ?", depth)
  }

  # Events whose path has more than `depth` segments.
  scope :path_depth_greater_than, ->(depth) {
    where("json_array_length(request_segment_ids) > ?", depth)
  }

  # Analytics: response-time percentiles (p50 / p95 / p99) and event counts for
  # several look-back windows.
  #
  # windows - label => duration pairs; each duration is turned into a cutoff
  #           with `duration.ago` and events at or after the cutoff are included.
  #
  # Returns a Hash keyed by window label whose values look like
  # { p50:, p95:, p99:, count: }. Percentiles are rounded to two decimals and
  # stay nil when the database returns NULL for them.
  def self.response_time_percentiles(windows: { hour: 1.hour, day: 1.day, week: 1.week })
    percentile_columns = Arel.sql(<<~SQL.squish)
      percentile_cont(0.5) WITHIN GROUP (ORDER BY response_time_ms) as p50,
      percentile_cont(0.95) WITHIN GROUP (ORDER BY response_time_ms) as p95,
      percentile_cont(0.99) WITHIN GROUP (ORDER BY response_time_ms) as p99,
      COUNT(*) as count
    SQL

    windows.each_with_object({}) do |(label, duration), summary|
      row = where('timestamp >= ?', duration.ago).pick(percentile_columns)

      summary[label] =
        if row
          { p50: row[0]&.round(2), p95: row[1]&.round(2), p99: row[2]&.round(2), count: row[3] }
        else
          { p50: nil, p95: nil, p99: nil, count: 0 }
        end
    end
  end

  # Helper methods

  # Number of normalized path segments stored for this event (0 when the
  # segment-ID list is nil).
  def path_depth
    segment_ids = request_segment_ids
    segment_ids.nil? ? 0 : segment_ids.length
  end

  # Rebuilds the request path from the stored segment IDs.
  # Falls back to the raw request_path when no segment IDs are present.
  # IDs without a matching PathSegment row are skipped.
  def reconstructed_path
    segment_ids = request_segment_ids
    return request_path if segment_ids.blank?

    # One query for all segments, then look them up in path order.
    segments_by_id = PathSegment.where(id: segment_ids).index_by(&:id)
    names = segment_ids.map { |segment_id| segments_by_id[segment_id]&.segment }.compact

    "/#{names.join('/')}"
  end

  # Extract key fields from payload before saving
  # (runs before every validation pass).
  before_validation :extract_fields_from_payload

  # Normalize event fields after extraction
  # (only while the host link or the segment IDs are still missing).
  after_validation :normalize_event_fields, if: :should_normalize?

  # Populate network intelligence from IP address
  # (skipped unless an IP is present and the country is blank or the IP changed).
  before_save :populate_network_intelligence, if: :should_populate_network_intelligence?

  # Detect bot traffic using user agent and network intelligence
  # (declared after the network callback, so it runs after it).
  before_save :detect_bot_traffic, if: :should_detect_bot?

  # Backfill network intelligence for every event that has no country yet.
  #
  # Events are loaded in batches and re-saved so the before_save callbacks run.
  # Progress is printed to stdout after each batch.
  #
  # batch_size - number of events loaded per batch (default 10_000).
  #
  # Returns the number of events processed (0 when nothing needed backfilling).
  def self.backfill_network_intelligence!(batch_size: 10_000)
    pending = where(country: nil)
    total = pending.count
    return 0 if total.zero?

    puts "Backfilling network intelligence for #{total} events..."

    done = 0
    pending.find_in_batches(batch_size: batch_size) do |events|
      events.each(&:save) # Triggers before_save callback
      done += events.size
      percent = (done.to_f / total * 100).round(1)
      puts "  Processed #{done}/#{total} (#{percent}%)"
    end

    done
  end

  # Backfill network intelligence for a specific event.
  # Populates the network fields unconditionally, then persists with save!,
  # which raises if the record cannot be saved.
  def backfill_network_intelligence!
    populate_network_intelligence
    save!
  end

  # Creates and persists an event from a raw WAF agent payload.
  #
  # request_id - unique identifier of the request.
  # payload    - Hash as sent by the agent; header names are lower-cased on a
  #              copy before anything is read from it.
  #
  # Returns the created event after EventTagger has applied rule tags.
  # Raises whatever create! raises when the record is invalid.
  def self.create_from_waf_payload!(request_id, payload)
    # Normalize headers in payload during import phase
    data = normalize_payload_headers(payload)

    attributes = {
      request_id: request_id,
      timestamp: parse_timestamp(data["timestamp"]),
      payload: data,

      # WAF-specific fields
      # (request_method and request_protocol are left to
      # extract_fields_from_payload + normalize_event_fields)
      ip_address: data.dig("request", "ip"),
      user_agent: data.dig("request", "headers", "user-agent") || data.dig("request", "headers", "User-Agent"),
      request_path: data.dig("request", "path"),
      request_url: data.dig("request", "url"),
      response_status: data.dig("response", "status_code"),
      response_time_ms: data.dig("response", "duration_ms"),
      waf_action: normalize_action(data["waf_action"]), # Normalize incoming action values
      # Support both new (rule_id) and old (rule_matched) field names during cutover
      rule_id: data["rule_id"] || data["rule_matched"],
      blocked_reason: data["blocked_reason"],

      # Server/Environment info
      server_name: data["server_name"],
      environment: data["environment"],

      # Tags: start with agent-provided tags only
      tags: data["tags"] || [],

      # WAF agent info
      agent_version: data.dig("agent", "version"),
      agent_name: data.dig("agent", "name")
    }

    # Apply rule tags using EventTagger service, then hand back the event
    create!(attributes).tap { |event| EventTagger.tag_event(event) }
  end

  # Returns a deep copy of the payload with request and response header names
  # lower-cased. Non-Hash payloads are returned untouched; the original payload
  # is never modified.
  def self.normalize_payload_headers(payload)
    return payload unless payload.is_a?(Hash)

    # Marshal round-trip gives a deep copy, so nested hashes are not shared
    copy = Marshal.load(Marshal.dump(payload))

    %w[request response].each do |section|
      section_headers = copy.dig(section, "headers")
      next unless section_headers&.is_a?(Hash)

      copy[section]["headers"] = section_headers.transform_keys(&:downcase)
    end

    copy
  end

  # Maps an incoming action value onto one of the waf_action enum names.
  #
  # action - String/Symbol (or nil) as reported by the agent; compared
  #          case-insensitively with surrounding whitespace ignored.
  #
  # Returns "allow", "deny", "challenge", "redirect" or "log".
  # nil/empty input returns "allow"; unrecognized values also return "allow"
  # after logging a warning. "log" is handled explicitly because it is a valid
  # enum value - without that branch log-only events were stored as "allow".
  def self.normalize_action(action)
    normalized = action.to_s.strip.downcase
    return "allow" if normalized.empty?

    case normalized
    when "allow", "pass", "allowed"
      "allow"
    when "deny", "block", "blocked", "deny_access"
      "deny"
    when "challenge"
      "challenge"
    when "redirect"
      "redirect"
    when "log", "logged", "log_only"
      "log"
    else
      Rails.logger.warn "Unknown action '#{action}', defaulting to 'allow'"
      "allow"
    end
  end

  # Converts an incoming timestamp into a Time.
  #
  # timestamp - a Time (returned as is), a String (parsed with Time.parse) or a
  #             Numeric epoch value in seconds, fractions allowed.
  #
  # Any other type, and any value that fails to parse, yields Time.current;
  # parse failures are logged as errors.
  def self.parse_timestamp(timestamp)
    case timestamp
    when Time    then timestamp
    when String  then Time.parse(timestamp)
    when Numeric then Time.at(timestamp)
    else Time.current
    end
  rescue => e
    Rails.logger.error "Failed to parse timestamp #{timestamp}: #{e.message}"
    Time.current
  end

  # Summary of the request section of the payload.
  # Returns {} when there is no payload; otherwise a Hash with the keys
  # ip, method, path, url, protocol, headers, query and body_size, where
  # headers and query default to {}.
  def request_details
    return {} if payload.blank?

    request_data = payload.dig("request") || {}

    details = %w[ip method path url protocol].to_h { |field| [field.to_sym, request_data[field]] }
    details[:headers] = request_data["headers"] || {}
    details[:query] = request_data["query"] || {}
    details[:body_size] = request_data["body_size"]
    details
  end

  # Summary of the response section of the payload.
  # Returns {} when there is no payload; otherwise a Hash with the keys
  # status_code, duration_ms and size.
  def response_details
    return {} if payload.blank?

    response_data = payload.dig("response") || {}
    %w[status_code duration_ms size].to_h { |field| [field.to_sym, response_data[field]] }
  end

  # The "geo" section of the payload, or {} when the payload or the section is
  # missing.
  def geo_details
    geo = payload.dig("geo") if payload.present?
    geo || {}
  end

  # Tags for this event, always as an Array.
  # Prefers the dedicated tags column; while that is blank, falls back to the
  # "tags" entry of the payload (transition period). Anything that is not an
  # Array - nil or malformed data such as {} - is replaced by [].
  def tags
    stored = super.presence || payload&.dig("tags")
    stored.is_a?(Array) ? stored : []
  end

  # Request headers from the payload with lower-cased names ({} when absent).
  def headers
    normalize_headers(payload&.dig("request", "headers") || {})
  end

  # Query parameters from the payload ({} when absent).
  def query_params
    params = payload&.dig("request", "query")
    params || {}
  end

  # Action predicates: compare the waf_action attribute against an enum name.

  # True when the request was denied.
  def blocked?
    waf_action == 'deny'  # deny = 0
  end

  # True when the request was allowed.
  def allowed?
    waf_action == 'allow'  # allow = 1
  end

  # True when the request was only logged (monitoring mode).
  def logged?
    waf_action == 'log'
  end

  # True when the request was challenged.
  def challenged?
    waf_action == 'challenge'
  end

  # True when a rule ID is recorded on the event.
  def rule_matched?
    rule_id.present?
  end

  # New path methods for normalization

  # The request path split on "/" with blank pieces removed
  # ([] when there is no path).
  def path_segments
    request_path.present? ? request_path.split('/').reject(&:blank?) : []
  end

  # Memoized variant of path_segments; the cached value is kept for the
  # lifetime of the instance and is not refreshed if request_path changes.
  def path_segments_array
    return @path_segments_array if @path_segments_array

    @path_segments_array = path_segments
  end

  # Hostname part of request_url, or nil when the URL is missing, has no host,
  # or cannot be parsed as a URI.
  # Only URI errors are rescued, so unrelated failures are no longer hidden
  # behind a blanket inline rescue.
  def request_hostname
    return nil if request_url.blank?

    URI.parse(request_url.to_s).hostname
  rescue URI::Error
    nil
  end

  # Tag helper methods

  # Adds a tag (coerced to String) unless it is already present.
  # The resulting list is de-duplicated. Returns nil when nothing changed.
  def add_tag(tag)
    label = tag.to_s
    return if tags.include?(label)

    self.tags = (tags + [label]).uniq
  end

  # Removes every occurrence of a tag (coerced to String).
  # Returns nil when the tag was not present.
  def remove_tag(tag)
    label = tag.to_s
    return unless tags.include?(label)

    self.tags = tags - [label]
  end

  # True when the tag (coerced to String) is present.
  def has_tag?(tag)
    label = tag.to_s
    tags.include?(label)
  end

  # Tags joined into a single comma-separated string.
  def tag_list
    tags.join(", ")
  end

  # Returns a copy of the headers Hash with lower-cased keys.
  # Anything that is not a Hash yields {}.
  def normalize_headers(headers)
    headers.is_a?(Hash) ? headers.transform_keys { |name| name.downcase } : {}
  end

  # Network range resolution methods

  # All network ranges containing this event's IP, most specific first.
  # Each entry is a Hash with :range, :cidr, :prefix_length, :specificity
  # (same value as :prefix_length) and :intelligence.
  # Returns [] when the event has no IP address.
  def matching_network_ranges
    return [] if ip_address.blank?

    entries = NetworkRange.contains_ip(ip_address.to_s).map do |network_range|
      prefix = network_range.prefix_length
      {
        range: network_range,
        cidr: network_range.cidr,
        prefix_length: prefix,
        specificity: prefix,
        intelligence: network_range.inherited_intelligence
      }
    end

    # Longest prefix (most specific) first
    entries.sort_by { |entry| -entry[:specificity] }
  end

  # The most specific network range for this event.
  # Uses the cached network_range_id when present (single lookup by id);
  # otherwise falls back to the expensive containing-range search.
  def most_specific_range
    if network_range_id.present?
      NetworkRange.find_by(id: network_range_id)
    else
      matching_network_ranges.first&.dig(:range)
    end
  end

  # The least specific network range containing this event's IP, or nil.
  def broadest_range
    widest = matching_network_ranges.last
    widest&.dig(:range)
  end

  # Network intelligence for this event, read straight from the denormalized
  # columns (no range lookup). Keys: country, company, asn, asn_org,
  # is_datacenter, is_vpn, is_proxy.
  def network_intelligence
    %i[country company asn asn_org is_datacenter is_vpn is_proxy].to_h do |field|
      [field, public_send(field)]
    end
  end

  # Denormalized attribute accessors - these now use the columns directly
  # No need to override - Rails provides these automatically:
  # - country (column)
  # - company (column)
  # - asn (column)
  # - asn_org (column)
  # - is_datacenter (column)
  # - is_vpn (column)
  # - is_proxy (column)

  # IP validation
  #
  # ip_address may be a String or an IPAddr. It is converted with to_s before
  # parsing because IPAddr.new raises AddressFamilyError when handed an IPAddr
  # without an explicit family. All IPAddr errors count as "not valid".

  # True when ip_address is a parseable IPv4 address.
  def valid_ipv4?
    return false if ip_address.blank?

    IPAddr.new(ip_address.to_s).ipv4?
  rescue IPAddr::Error
    false
  end

  # True when ip_address is a parseable IPv6 address.
  def valid_ipv6?
    return false if ip_address.blank?

    IPAddr.new(ip_address.to_s).ipv6?
  rescue IPAddr::Error
    false
  end

  # True when ip_address is a parseable IPv4 or IPv6 address.
  def valid_ip?
    valid_ipv4? || valid_ipv6?
  end

  # Rules affecting this IP
  #
  # Returns a Rule relation limited to enabled network rules attached to any
  # network range containing this event's IP; an empty relation when the event
  # has no IP address.
  # NOTE(review): the ORDER BY references network_ranges while the association
  # is only passed to `includes`. Unless Rule.network_rules (not visible here)
  # already joins that table, this may need `.references(:network_range)` or
  # `joins(:network_range)` - confirm against the generated SQL.
  def matching_rules
    return Rule.none unless ip_address.present?

    # Get all network ranges that contain this IP
    range_ids = matching_network_ranges.map { |r| r[:range].id }

    # Find rules for those ranges, ordered by priority (most specific first)
    Rule.network_rules
        .where(network_range_id: range_ids)
        .enabled
        .includes(:network_range)
        .order('masklen(network_ranges.network) DESC')
  end

  # Matching rules whose action is deny.
  def active_blocking_rules
    matching_rules.where(waf_action: :deny)
  end

  # True when at least one matching deny rule exists.
  def has_blocking_rules?
    active_blocking_rules.exists?
  end

  # Geo location summary built from the denormalized network intelligence.
  # Returns a Hash with :country_code, :ip_address, :has_data (true when a
  # country is present) and the full :network_intelligence Hash.
  def geo_location
    intel = network_intelligence
    country_code = intel[:country]

    {
      country_code: country_code,
      ip_address: ip_address,
      has_data: country_code.present?,
      network_intelligence: intel
    }
  end

  # True when the event's network intelligence includes a country.
  def has_geo_data?
    country_code = network_intelligence[:country]
    country_code.present?
  end

  # Country code for this event's IP taken from the network intelligence.
  # Returns nil when the event has no IP or when the lookup raises (the error
  # is logged).
  def lookup_country
    ip_address.blank? ? nil : network_intelligence[:country]
  rescue => e
    Rails.logger.error "Network lookup failed for #{ip_address}: #{e.message}"
    nil
  end

  private

  # Guard for the after_validation normalization callback: true while the host
  # link is missing or no segment IDs are stored.
  def should_normalize?
    request_host_id.nil? || request_segment_ids.blank?
  end

  # Hands the event to EventNormalizer. Any error is logged and swallowed so
  # that normalization problems do not propagate out of the callback.
  def normalize_event_fields
    EventNormalizer.normalize_event!(self)
  rescue => e
    Rails.logger.error "Failed to normalize event #{id}: #{e.message}"
  end

  # Guard for the before_save network intelligence callback.
  def should_populate_network_intelligence?
    # Only populate if IP is present and country is not yet set
    # Also repopulate if IP address changed (rare case)
    ip_address.present? && (country.blank? || ip_address_changed?)
  end

  # Copies network intelligence (country, company, ASN, datacenter/VPN/proxy
  # flags) from the best matching network range onto this event and links the
  # event to its tracking network. Does nothing when the event has no IP.
  # On any error the three boolean flags are reset to false and the error is
  # logged.
  def populate_network_intelligence
    return if ip_address.blank?

    # Convert IPAddr to string for PostgreSQL query
    ip_string = ip_address.to_s

    # CRITICAL: Always find_or_create /24 tracking network for public IPs
    # This /24 serves as:
    # 1. The tracking unit for IPAPI deduplication (stores ipapi_queried_at)
    # 2. The reference point for preventing duplicate API calls
    # 3. The fallback network if no more specific GeoIP data exists
    tracking_network = find_or_create_tracking_network(ip_string)

    # Most specific containing range that actually carries GeoIP data; it may
    # be narrower (e.g. /25) or broader (e.g. /22) than the tracking /24.
    most_specific_with_data = NetworkRange.where("network >>= ?", ip_string)
                                          .where.not(country: nil)
                                          .order(Arel.sql("masklen(network) DESC"))
                                          .first

    source_range = most_specific_with_data || tracking_network

    if source_range.nil?
      # No range at all (shouldn't happen, but defensive)
      self.is_datacenter = false
      self.is_vpn = false
      self.is_proxy = false
    else
      # Populate all network intelligence fields from the range
      self.country = source_range.country
      self.company = source_range.company
      self.asn = source_range.asn
      self.asn_org = source_range.asn_org
      self.is_datacenter = source_range.is_datacenter || false
      self.is_vpn = source_range.is_vpn || false
      self.is_proxy = source_range.is_proxy || false
    end

    # ALWAYS point network_range_id at the tracking /24: FetchIpapiDataJob uses
    # it to check ipapi_queried_at and prevent duplicate API calls.
    self.network_range_id = tracking_network&.id
  rescue => e
    Rails.logger.error "Failed to populate network intelligence for event #{id}: #{e.message}"
    # Set defaults on error to prevent null values
    self.is_datacenter = false
    self.is_vpn = false
    self.is_proxy = false
  end

  # Find or create the tracking network for this IP: the enclosing /24 for
  # IPv4, the enclosing /64 for IPv6. This is the fundamental unit for IPAPI
  # deduplication.
  #
  # ip_string - textual IP address.
  #
  # Returns the NetworkRange record, or nil when the IP is private/reserved or
  # when anything fails (the failure is logged).
  def find_or_create_tracking_network(ip_string)
    return nil if private_or_reserved_ip?(ip_string)

    ip_addr = IPAddr.new(ip_string)

    # IPAddr#mask zeroes the host bits, giving the network base address
    prefix_length = ip_addr.ipv4? ? 24 : 64
    network_cidr = "#{ip_addr.mask(prefix_length)}/#{prefix_length}" # e.g., "1.2.3.0/24", "2001:db8::/64"

    # Find or create the tracking network
    NetworkRange.find_or_create_by!(network: network_cidr) do |nr|
      nr.source = 'auto_generated'
      nr.creation_reason = 'tracking unit for IPAPI deduplication'
      # Datacenter classification is best effort: a failing lookup means false
      nr.is_datacenter = begin
        NetworkRangeGenerator.datacenter_ip?(ip_addr)
      rescue StandardError
        false
      end
      nr.is_vpn = false
      nr.is_proxy = false
    end
  rescue => e
    Rails.logger.error "Failed to create tracking network for IP #{ip_string}: #{e.message}"
    nil
  end

  # Check if IP is private or reserved (should not create network ranges).
  #
  # ip_string - textual IP to test; defaults to this event's ip_address.
  #
  # Returns true for addresses inside any of the private, loopback, link-local,
  # multicast or reserved blocks listed below, and for unparseable input.
  def private_or_reserved_ip?(ip_string = nil)
    candidate = IPAddr.new(ip_string || ip_address.to_s)

    # Private and reserved ranges (IPv4 first, then IPv6)
    %w[
      10.0.0.0/8
      172.16.0.0/12
      192.168.0.0/16
      127.0.0.0/8
      169.254.0.0/16
      224.0.0.0/4
      240.0.0.0/4
      ::1/128
      fc00::/7
      fe80::/10
      ff00::/8
    ].any? { |cidr| IPAddr.new(cidr).include?(candidate) }
  rescue IPAddr::InvalidAddressError
    true # Treat invalid IPs as "reserved"
  end

  # Copies frequently queried values out of the JSON payload into dedicated
  # attributes so they can be filtered on directly. Does nothing when the
  # payload is blank.
  #
  # The raw request method, protocol and WAF action are remembered in instance
  # variables only the first time they are seen, so repeated callback runs do
  # not overwrite them.
  def extract_fields_from_payload
    return if payload.blank?

    request_data = payload.dig("request") || {}
    response_data = payload.dig("response") || {}
    agent_data = payload.dig("agent") || {}

    # Request fields
    self.ip_address = request_data["ip"]
    self.request_path = request_data["path"]
    self.request_url = request_data["url"]

    # User agent, looked up on lower-cased header names
    lowered_headers = normalize_headers(request_data["headers"] || {})
    self.user_agent = lowered_headers["user-agent"] || lowered_headers["User-Agent"]

    # Response fields
    self.response_status = response_data["status_code"]
    self.response_time_ms = response_data["duration_ms"]

    # Rule fields - support both new (rule_id) and old (rule_matched) field
    # names during cutover
    self.rule_id = payload["rule_id"] || payload["rule_matched"]
    self.blocked_reason = payload["blocked_reason"]

    # Raw values kept for normalization (first seen value wins)
    @raw_request_method ||= request_data["method"]
    @raw_request_protocol ||= request_data["protocol"]
    @raw_action ||= payload["waf_action"]

    # Server/environment info
    self.server_name = payload["server_name"]
    self.environment = payload["environment"]

    # Agent info
    self.agent_version = agent_data["version"]
    self.agent_name = agent_data["name"]
  end

  # Guard for the before_save bot detection callback.
  def should_detect_bot?
    # Detect bots if user agent is present or if we have network intelligence
    user_agent.present? || network_range_id.present?
  end

  # Stores the bot verdict in is_bot. If detection raises, the error is logged
  # and the event is treated as non-bot.
  def detect_bot_traffic
    self.is_bot = bot_detected?
  rescue => e
    Rails.logger.error "Failed to detect bot for event #{id}: #{e.message}"
    self.is_bot = false # Default to non-bot on error
  end

  # Multi-signal bot detection with tagging. Signals are tried in order and the
  # first one that fires wins (later signals are then skipped):
  # 1. User agent detection (DeviceDetector gem) - adds a bot:<name> tag
  # 2. Network range source / company matching - adds bot and network tags
  # 3. Datacenter traffic with a tool-like user agent - weaker fallback signal
  #
  # Returns true when any signal classifies the event as a bot, false otherwise.
  def bot_detected?
    bot_by_user_agent? || bot_by_network_range? || bot_by_datacenter_user_agent?
  end

  # Signal 1: user agent bot detection (uses DeviceDetector's built-in cache).
  # Detector failures are logged at debug level and count as "no match".
  def bot_by_user_agent?
    return false if user_agent.blank?

    begin
      detector = DeviceDetector.new(user_agent)
      return false unless detector.bot?

      # Tag with the specific bot name, whitespace collapsed to underscores
      bot_name = detector.bot_name&.downcase&.gsub(/\s+/, '_') || 'unknown'
      add_tag("bot:#{bot_name}")
      true
    rescue => e
      Rails.logger.debug "DeviceDetector failed for user agent: #{e.message}"
      false
    end
  end

  # Signal 2: the linked network range comes from a bot import, or the event's
  # company name contains a known bot provider.
  def bot_by_network_range?
    return false if network_range_id.blank?

    range = NetworkRange.find_by(id: network_range_id)
    return false unless range

    # Range imported from a bot list, e.g. 'bot_import_googlebot' -> 'googlebot'
    if range.source&.start_with?('bot_import_')
      add_tag("bot:#{range.source.sub('bot_import_', '')}")
      add_tag("network:#{range.company&.downcase&.gsub(/\s+/, '_')}") if range.company.present?
      return true
    end

    # Company is a known bot provider (from bot imports)
    # Common bot companies: Google, Amazon, OpenAI, Cloudflare, Microsoft, etc.
    company_lower = company&.downcase
    return false unless company_lower

    known_bot_companies = ['googlebot', 'google bot', 'amazon', 'aws', 'openai',
                           'anthropic', 'cloudflare', 'microsoft', 'facebook',
                           'meta', 'apple', 'duckduckgo']
    return false unless known_bot_companies.any? { |bot| company_lower.include?(bot) }

    company_slug = company_lower.gsub(/\s+/, '_')
    add_tag("bot:#{company_slug}")
    add_tag("network:#{company_slug}")
    true
  end

  # Signal 3: datacenter traffic is often bot traffic, but that alone is too
  # imprecise, so it only counts together with a tool-like user agent.
  def bot_by_datacenter_user_agent?
    return false unless is_datacenter && user_agent.present?

    ua_lower = user_agent.downcase
    bot_keywords = ['bot', 'crawler', 'spider', 'scraper', 'curl', 'wget', 'python', 'go-http-client']
    return false unless bot_keywords.any? { |keyword| ua_lower.include?(keyword) }

    add_tag("bot:datacenter")
    add_tag("datacenter:true")
    true
  end
end