# frozen_string_literal: true

class Event < ApplicationRecord
  # Normalized association for hosts (most valuable compression)
  belongs_to :request_host, optional: true

  # WAF rule associations
  belongs_to :rule, optional: true
  has_one :waf_policy, through: :rule

  # Enums for fixed value sets
  # Canonical WAF action order - aligned with Rule and Agent models
  #
  # IMPORTANT: These values were swapped to match baffle-agent convention:
  # - deny: 0 (blocked traffic)
  # - allow: 1 (allowed traffic)
  #
  # When using raw integer values in queries:
  # - waf_action = 0 -> denied/blocked requests
  # - waf_action = 1 -> allowed requests
  # - waf_action = 2 -> redirect requests
  # - waf_action = 3 -> challenge requests
  # - waf_action = 4 -> log-only requests
  enum :waf_action, {
    deny: 0,      # deny/block
    allow: 1,     # allow/pass
    redirect: 2,  # redirect
    challenge: 3, # challenge (CAPTCHA, JS challenge, etc.)
    log: 4        # log only, no action (monitoring mode)
  }, default: :allow, scopes: false
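
  # Illustrative queries against the mapping above (a sketch; the data is
  # hypothetical):
  #   Event.where(waf_action: :deny).count # blocked requests
  #   Event.where("waf_action = 0").count  # same query via the raw integer
  #   Event.waf_actions[:challenge]        # => 3
  #   event.waf_action                     # => "deny"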

  enum :request_method, {
    get: 0,     # GET
    post: 1,    # POST
    put: 2,     # PUT
    patch: 3,   # PATCH
    delete: 4,  # DELETE
    head: 5,    # HEAD
    options: 6  # OPTIONS
  }, default: :get, scopes: false

  # Serialize segment IDs as an array for easy manipulation in Rails
  serialize :request_segment_ids, type: Array, coder: JSON

  # Tags are stored as JSON arrays with the PostgreSQL jsonb type.
  # This provides direct array access and efficient indexing.
  attribute :tags, :json, default: -> { [] }

  validates :request_id, presence: true, uniqueness: true
  validates :timestamp, presence: true

  scope :recent, -> { order(timestamp: :desc) }
  scope :by_ip, ->(ip) { where(ip_address: ip) }
  scope :by_user_agent, ->(user_agent) { where(user_agent: user_agent) }
  scope :by_waf_action, ->(waf_action) { where(waf_action: waf_action) }
  scope :blocked, -> { where(waf_action: :deny) }
  scope :allowed, -> { where(waf_action: :allow) }
  scope :logged, -> { where(waf_action: :log) }

  # Tag-based filtering scopes. The tags column is jsonb (see the attribute
  # declaration above), so we use jsonb containment/existence operators; the
  # native-array forms (@> ARRAY[...], && ARRAY[...]) would raise an operator
  # type error against jsonb.
  scope :with_tag, ->(tag) { where("tags @> ?::jsonb", [tag.to_s].to_json) }

  scope :with_any_tags, ->(tags) {
    return none if tags.blank?
    tag_array = Array(tags).map(&:to_s)
    # jsonb_exists_any is the function behind the ?| operator; the function
    # form avoids the clash between "?" and Rails bind placeholders
    where("jsonb_exists_any(tags, ARRAY[?])", tag_array)
  }

  scope :with_all_tags, ->(tags) {
    return none if tags.blank?
    tag_array = Array(tags).map(&:to_s)
    where("tags @> ?::jsonb", tag_array.to_json)
  }
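
  # Sketch of combining the tag scopes (tag names are hypothetical):
  #   Event.with_tag("bot:googlebot")
  #   Event.with_any_tags(%w[bot:curl bot:wget]).blocked.recent
  #   Event.with_all_tags(["bot:googlebot", "network:google"])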

  # Network-based filtering scopes - now using denormalized columns
  scope :by_company, ->(company) {
    # Escape LIKE/ILIKE wildcards in user input before interpolating
    where("company ILIKE ?", "%#{sanitize_sql_like(company)}%")
  }

  scope :by_country, ->(country) {
    where(country: country)
  }

  scope :by_network_type, ->(type) {
    case type.to_s.downcase
    when "datacenter"
      where(is_datacenter: true)
    when "vpn"
      where(is_vpn: true)
    when "proxy"
      where(is_proxy: true)
    when "standard"
      where(is_datacenter: false, is_vpn: false, is_proxy: false)
    else
      none
    end
  }
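
  # Sketch: VPN-originated denials from one country (values are illustrative):
  #   Event.by_network_type(:vpn).by_country("US").blocked.recent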

  scope :by_asn, ->(asn) {
    where(asn: asn.to_i)
  }

  scope :by_network_cidr, ->(cidr) {
    # This still requires a join since we need to match CIDR
    joins(:network_range).where("network_ranges.network = ?", cidr)
  }

  # Bot filtering scopes
  scope :bots, -> { where(is_bot: true) }
  scope :humans, -> { where(is_bot: false) }
  scope :exclude_bots, -> { where(is_bot: false) }

  # Add association for the optional network_range_id
  belongs_to :network_range, optional: true

  # Path prefix matching using range queries (uses B-tree index efficiently)
  scope :with_path_prefix, ->(prefix_segment_ids) {
    return none if prefix_segment_ids.blank?

    # Use range queries instead of LIKE for index usage.
    # Example: [1,2] prefix matches [1,2], [1,2,3], [1,2,3,4], etc.
    prefix_str = prefix_segment_ids.to_json # "[1,2]"

    # For exact match: request_segment_ids = "[1,2]"
    # For prefix match: "[1,2," <= request_segment_ids < "[1,2-"
    # ("-" is the next character after "," in ASCII, so the upper bound is the
    # first string that no longer starts with the prefix). Incrementing the
    # last segment ID instead breaks when the increment adds a digit:
    # [1,9] -> [1,10], yet "[1,9,..." sorts *after* "[1,10]".
    # Assumes a byte-order (C) collation on the column.
    lower_prefix_str = "#{prefix_str[0..-2]}," # "[1,2," - matches longer paths
    upper_str = "#{prefix_str[0..-2]}-"        # "[1,2-" - just past the prefix

    # Use raw SQL to bypass the serializer (it expects an Array, but we are
    # comparing serialized JSON strings)
    where("request_segment_ids = ? OR (request_segment_ids >= ? AND request_segment_ids < ?)",
          prefix_str, lower_prefix_str, upper_str)
  }
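
  # Sketch of prefix matching (segment IDs are hypothetical; see
  # reconstructed_path below for the reverse mapping):
  #   api_id = PathSegment.find_by(segment: "api")&.id
  #   v1_id  = PathSegment.find_by(segment: "v1")&.id
  #   Event.with_path_prefix([api_id, v1_id]) if api_id && v1_id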

  # Path depth queries
  scope :path_depth, ->(depth) {
    where("json_array_length(request_segment_ids) = ?", depth)
  }

  scope :path_depth_greater_than, ->(depth) {
    where("json_array_length(request_segment_ids) > ?", depth)
  }

  # Analytics: Get response time percentiles over different time windows
  def self.response_time_percentiles(windows: { hour: 1.hour, day: 1.day, week: 1.week })
    results = {}

    windows.each do |label, duration|
      scope = where('timestamp >= ?', duration.ago)

      stats = scope.pick(
        Arel.sql(<<~SQL.squish)
          percentile_cont(0.5) WITHIN GROUP (ORDER BY response_time_ms) as p50,
          percentile_cont(0.95) WITHIN GROUP (ORDER BY response_time_ms) as p95,
          percentile_cont(0.99) WITHIN GROUP (ORDER BY response_time_ms) as p99,
          COUNT(*) as count
        SQL
      )

      results[label] = if stats
        {
          p50: stats[0]&.round(2),
          p95: stats[1]&.round(2),
          p99: stats[2]&.round(2),
          count: stats[3]
        }
      else
        { p50: nil, p95: nil, p99: nil, count: 0 }
      end
    end

    results
  end
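
  # Sketch of the return shape (numbers are illustrative):
  #   Event.response_time_percentiles
  #   # => { hour: { p50: 12.5, p95: 180.0, p99: 450.2, count: 1042 },
  #   #      day:  {...}, week: {...} }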

  # Helper methods
  def path_depth
    request_segment_ids&.length || 0
  end

  def reconstructed_path
    return request_path if request_segment_ids.blank?

    segments = PathSegment.where(id: request_segment_ids).index_by(&:id)
    '/' + request_segment_ids.map { |id| segments[id]&.segment }.compact.join('/')
  end
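
  # Sketch: round-tripping a normalized path (IDs are hypothetical):
  #   event.request_segment_ids # => [3, 17]
  #   event.reconstructed_path  # => "/api/v1" (if segments 3 and 17 are "api" and "v1")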

  # Extract key fields from payload before saving
  before_validation :extract_fields_from_payload

  # Normalize event fields after extraction
  after_validation :normalize_event_fields, if: :should_normalize?

  # Populate network intelligence from IP address
  before_save :populate_network_intelligence, if: :should_populate_network_intelligence?

  # Detect bot traffic using user agent and network intelligence
  before_save :detect_bot_traffic, if: :should_detect_bot?

  # Backfill network intelligence for all events
  def self.backfill_network_intelligence!(batch_size: 10_000)
    total = where(country: nil).count
    return 0 if total.zero?

    puts "Backfilling network intelligence for #{total} events..."
    processed = 0

    where(country: nil).find_in_batches(batch_size: batch_size) do |batch|
      batch.each(&:save) # Triggers before_save callback
      processed += batch.size
      puts "  Processed #{processed}/#{total} (#{(processed.to_f / total * 100).round(1)}%)"
    end

    processed
  end
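
  # Typically invoked from a console or a one-off task, e.g.:
  #   Event.backfill_network_intelligence!(batch_size: 5_000)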

  # Backfill network intelligence for a specific event
  def backfill_network_intelligence!
    populate_network_intelligence
    save!
  end

  def self.create_from_waf_payload!(request_id, payload)
    # Normalize headers in payload during import phase
    normalized_payload = normalize_payload_headers(payload)

    # Create the WAF request event with agent-provided tags
    event = create!(
      request_id: request_id,
      timestamp: parse_timestamp(normalized_payload["timestamp"]),
      payload: normalized_payload,

      # WAF-specific fields
      ip_address: normalized_payload.dig("request", "ip"),
      # Headers were just lowercased, so only the downcased key can be present
      user_agent: normalized_payload.dig("request", "headers", "user-agent"),
      # request_method will be set by extract_fields_from_payload + normalize_event_fields
      request_path: normalized_payload.dig("request", "path"),
      request_url: normalized_payload.dig("request", "url"),
      # request_protocol will be set by extract_fields_from_payload + normalize_event_fields
      response_status: normalized_payload.dig("response", "status_code"),
      response_time_ms: normalized_payload.dig("response", "duration_ms"),
      waf_action: normalize_action(normalized_payload["waf_action"]), # Normalize incoming action values
      # Support both new (rule_id) and old (rule_matched) field names during cutover
      rule_id: normalized_payload["rule_id"] || normalized_payload["rule_matched"],
      blocked_reason: normalized_payload["blocked_reason"],

      # Server/Environment info
      server_name: normalized_payload["server_name"],
      environment: normalized_payload["environment"],

      # Tags: start with agent-provided tags only
      tags: normalized_payload["tags"] || [],

      # WAF agent info
      agent_version: normalized_payload.dig("agent", "version"),
      agent_name: normalized_payload.dig("agent", "name")
    )

    # Apply rule tags using EventTagger service
    EventTagger.tag_event(event)

    event
  end
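
  # Sketch of a minimal agent payload accepted above (all values are
  # hypothetical; header casing is normalized on import):
  #   Event.create_from_waf_payload!(SecureRandom.uuid, {
  #     "timestamp"  => Time.current.iso8601,
  #     "waf_action" => "block",
  #     "request"    => { "ip" => "203.0.113.7", "path" => "/login",
  #                       "headers" => { "User-Agent" => "curl/8.0" } },
  #     "response"   => { "status_code" => 403, "duration_ms" => 3 }
  #   })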

  # Normalize headers in payload to lower case during import phase
  def self.normalize_payload_headers(payload)
    return payload unless payload.is_a?(Hash)

    # Create a deep copy to avoid modifying the original
    normalized = Marshal.load(Marshal.dump(payload))

    # Normalize request headers
    if normalized.dig("request", "headers").is_a?(Hash)
      normalized["request"]["headers"] = normalized["request"]["headers"].transform_keys(&:downcase)
    end

    # Normalize response headers if they exist
    if normalized.dig("response", "headers").is_a?(Hash)
      normalized["response"]["headers"] = normalized["response"]["headers"].transform_keys(&:downcase)
    end

    normalized
  end

  def self.normalize_action(action)
    return "allow" if action.blank?

    case action.to_s.downcase
    when "allow", "pass", "allowed"
      "allow"
    when "deny", "block", "blocked", "deny_access"
      "deny"
    when "challenge"
      "challenge"
    when "redirect"
      "redirect"
    when "log"
      "log" # monitoring mode (log-only) is a valid enum value, not an unknown action
    else
      Rails.logger.warn "Unknown action '#{action}', defaulting to 'allow'"
      "allow"
    end
  end
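
  # Sketch of the normalization (left side shows agent variants):
  #   normalize_action("BLOCK") # => "deny"
  #   normalize_action("pass")  # => "allow"
  #   normalize_action(nil)     # => "allow"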

  def self.parse_timestamp(timestamp)
    case timestamp
    when String
      Time.parse(timestamp)
    when Numeric
      # Sentry timestamps can be in seconds with decimals
      Time.at(timestamp)
    when Time
      timestamp
    else
      Time.current
    end
  rescue => e
    Rails.logger.error "Failed to parse timestamp #{timestamp}: #{e.message}"
    Time.current
  end

  def request_details
    return {} unless payload.present?

    request_data = payload.dig("request") || {}
    {
      ip: request_data["ip"],
      method: request_data["method"],
      path: request_data["path"],
      url: request_data["url"],
      protocol: request_data["protocol"],
      headers: request_data["headers"] || {},
      query: request_data["query"] || {},
      body_size: request_data["body_size"]
    }
  end

  def response_details
    return {} unless payload.present?

    response_data = payload.dig("response") || {}
    {
      status_code: response_data["status_code"],
      duration_ms: response_data["duration_ms"],
      size: response_data["size"]
    }
  end

  def geo_details
    return {} unless payload.present?

    payload.dig("geo") || {}
  end

  def tags
    # Use the dedicated tags column (array), fallback to payload during transition.
    # Ensure we always return an Array, even if payload has malformed data (e.g., {} instead of [])
    result = super.presence || payload&.dig("tags")
    return [] if result.nil?
    result.is_a?(Array) ? result : []
  end

  def headers
    raw_headers = payload&.dig("request", "headers") || {}
    normalize_headers(raw_headers)
  end

  def query_params
    payload&.dig("request", "query") || {}
  end

  def blocked?
    waf_action == 'deny' # deny = 0
  end

  def allowed?
    waf_action == 'allow' # allow = 1
  end

  def logged?
    waf_action == 'log'
  end

  def challenged?
    waf_action == 'challenge'
  end

  def rule_matched?
    rule_id.present?
  end

  # New path methods for normalization
  def path_segments
    return [] unless request_path.present?
    request_path.split('/').reject(&:blank?)
  end

  def path_segments_array
    @path_segments_array ||= path_segments
  end

  def request_hostname
    return nil unless request_url.present?
    URI.parse(request_url).hostname rescue nil
  end

  # Tag helper methods
  def add_tag(tag)
    tag_str = tag.to_s
    self.tags = (tags + [tag_str]).uniq unless tags.include?(tag_str)
  end

  def remove_tag(tag)
    tag_str = tag.to_s
    self.tags = tags - [tag_str] if tags.include?(tag_str)
  end

  def has_tag?(tag)
    tags.include?(tag.to_s)
  end

  def tag_list
    tags.join(', ')
  end
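
  # Sketch of the tag helpers (tag names are hypothetical):
  #   event.add_tag("bot:curl")
  #   event.has_tag?("bot:curl") # => true
  #   event.remove_tag("bot:curl")
  #   event.tag_list             # => "network:google, bot:googlebot"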

  # Normalize headers to lower case keys during import phase
  def normalize_headers(headers)
    return {} unless headers.is_a?(Hash)

    headers.transform_keys(&:downcase)
  end

  # Network range resolution methods
  def matching_network_ranges
    return [] unless ip_address.present?

    NetworkRange.contains_ip(ip_address.to_s).map do |range|
      {
        range: range,
        cidr: range.cidr,
        prefix_length: range.prefix_length,
        specificity: range.prefix_length,
        intelligence: range.inherited_intelligence
      }
    end.sort_by { |r| -r[:specificity] } # Most specific first
  end

  def most_specific_range
    # Use the cached network_range_id if available (much faster)
    return NetworkRange.find_by(id: network_range_id) if network_range_id.present?

    # Fallback to expensive lookup
    matching_network_ranges.first&.dig(:range)
  end

  def broadest_range
    matching_network_ranges.last&.dig(:range)
  end

  def network_intelligence
    # Use denormalized fields instead of expensive lookup
    {
      country: country,
      company: company,
      asn: asn,
      asn_org: asn_org,
      is_datacenter: is_datacenter,
      is_vpn: is_vpn,
      is_proxy: is_proxy
    }
  end

  # Denormalized attribute accessors - these now use the columns directly.
  # No need to override - Rails provides these automatically:
  # - country (column)
  # - company (column)
  # - asn (column)
  # - asn_org (column)
  # - is_datacenter (column)
  # - is_vpn (column)
  # - is_proxy (column)

  # IP validation
  def valid_ipv4?
    return false unless ip_address.present?

    # ip_address may already be an IPAddr (inet column), so stringify first
    IPAddr.new(ip_address.to_s).ipv4?
  rescue IPAddr::InvalidAddressError
    false
  end

  def valid_ipv6?
    return false unless ip_address.present?

    IPAddr.new(ip_address.to_s).ipv6?
  rescue IPAddr::InvalidAddressError
    false
  end

  def valid_ip?
    valid_ipv4? || valid_ipv6?
  end

  # Rules affecting this IP
  def matching_rules
    return Rule.none unless ip_address.present?

    # Get all network ranges that contain this IP
    range_ids = matching_network_ranges.map { |r| r[:range].id }

    # Find rules for those ranges, ordered by priority (most specific first).
    # references forces the JOIN that the raw ORDER BY relies on, and
    # Arel.sql marks the ordering as intentionally raw SQL.
    Rule.network_rules
        .where(network_range_id: range_ids)
        .enabled
        .includes(:network_range)
        .references(:network_ranges)
        .order(Arel.sql('masklen(network_ranges.network) DESC'))
  end

  def active_blocking_rules
    matching_rules.where(waf_action: :deny)
  end

  def has_blocking_rules?
    active_blocking_rules.exists?
  end

  # Get full geo location details from network range
  def geo_location
    network_info = network_intelligence

    {
      country_code: network_info[:country],
      ip_address: ip_address,
      has_data: network_info[:country].present?,
      network_intelligence: network_info
    }
  end

  # Check if event has valid geo location data via network range
  def has_geo_data?
    network_intelligence[:country].present?
  end

  # Lookup country code for this event's IP via network range
  def lookup_country
    return nil if ip_address.blank?

    network_intelligence[:country]
  rescue => e
    Rails.logger.error "Network lookup failed for #{ip_address}: #{e.message}"
    nil
  end
  private

  def should_normalize?
    request_host_id.nil? || request_segment_ids.blank?
  end

  def normalize_event_fields
    EventNormalizer.normalize_event!(self)
  rescue => e
    Rails.logger.error "Failed to normalize event #{id}: #{e.message}"
  end

  def should_populate_network_intelligence?
    # Only populate if IP is present and country is not yet set.
    # Also repopulate if IP address changed (rare case)
    ip_address.present? && (country.blank? || ip_address_changed?)
  end

  def populate_network_intelligence
    return unless ip_address.present?

    # Convert IPAddr to string for PostgreSQL query
    ip_string = ip_address.to_s

    # CRITICAL: Always find_or_create the /24 tracking network for public IPs.
    # This /24 serves as:
    # 1. The tracking unit for IPAPI deduplication (stores ipapi_queried_at)
    # 2. The reference point for preventing duplicate API calls
    # 3. The fallback network if no more specific GeoIP data exists
    tracking_network = find_or_create_tracking_network(ip_string)

    # Find the most specific network range with actual GeoIP data.
    # This might be more specific (e.g., /25) or broader (e.g., /22) than the /24
    data_range = NetworkRange.where("network >>= ?", ip_string)
                             .where.not(country: nil) # Must have actual data
                             .order(Arel.sql("masklen(network) DESC"))
                             .first

    # Use the most specific range with data, or fall back to tracking network
    range = data_range || tracking_network

    if range
      # Populate all network intelligence fields from the range
      self.country = range.country
      self.company = range.company
      self.asn = range.asn
      self.asn_org = range.asn_org
      self.is_datacenter = range.is_datacenter || false
      self.is_vpn = range.is_vpn || false
      self.is_proxy = range.is_proxy || false
    else
      # No range at all (shouldn't happen, but defensive)
      self.is_datacenter = false
      self.is_vpn = false
      self.is_proxy = false
    end

    # ALWAYS set network_range_id to the tracking /24.
    # This is what FetchIpapiDataJob uses to check ipapi_queried_at
    # and prevent duplicate API calls
    self.network_range_id = tracking_network&.id
  rescue => e
    Rails.logger.error "Failed to populate network intelligence for event #{id}: #{e.message}"
    # Set defaults on error to prevent null values
    self.is_datacenter = false
    self.is_vpn = false
    self.is_proxy = false
  end

  # Find or create the /24 tracking network for this IP.
  # This is the fundamental unit for IPAPI deduplication
  def find_or_create_tracking_network(ip_string)
    return nil if private_or_reserved_ip?(ip_string)

    ip_addr = IPAddr.new(ip_string)

    # Calculate /24 for IPv4, /64 for IPv6
    if ip_addr.ipv4?
      prefix_length = 24
      mask = (2**32 - 1) ^ ((2**(32 - prefix_length)) - 1)
      network_int = ip_addr.to_i & mask
      network_base = IPAddr.new(network_int, Socket::AF_INET)
      network_cidr = "#{network_base}/#{prefix_length}" # e.g., "1.2.3.0/24"
    else
      prefix_length = 64
      mask = (2**128 - 1) ^ ((2**(128 - prefix_length)) - 1)
      network_int = ip_addr.to_i & mask
      network_base = IPAddr.new(network_int, Socket::AF_INET6)
      network_cidr = "#{network_base}/#{prefix_length}" # e.g., "2001:db8::/64"
    end

    # Find or create the tracking network
    NetworkRange.find_or_create_by!(network: network_cidr) do |nr|
      nr.source = 'auto_generated'
      nr.creation_reason = 'tracking unit for IPAPI deduplication'
      nr.is_datacenter = NetworkRangeGenerator.datacenter_ip?(ip_addr) rescue false
      nr.is_vpn = false
      nr.is_proxy = false
    end
  rescue => e
    Rails.logger.error "Failed to create tracking network for IP #{ip_string}: #{e.message}"
    nil
  end
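
  # Worked example of the masking above (address is illustrative):
  #   "203.0.113.77" & 255.255.255.0 -> "203.0.113.0/24"
  # IPAddr can also derive this directly: IPAddr.new("203.0.113.77").mask(24)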

  # Check if IP is private or reserved (should not create network ranges)
  def private_or_reserved_ip?(ip_string = nil)
    ip_str = ip_string || ip_address.to_s
    ip = IPAddr.new(ip_str)

    # Private and reserved ranges
    [
      IPAddr.new('10.0.0.0/8'),
      IPAddr.new('172.16.0.0/12'),
      IPAddr.new('192.168.0.0/16'),
      IPAddr.new('127.0.0.0/8'),
      IPAddr.new('169.254.0.0/16'),
      IPAddr.new('224.0.0.0/4'),
      IPAddr.new('240.0.0.0/4'),
      IPAddr.new('::1/128'),
      IPAddr.new('fc00::/7'),
      IPAddr.new('fe80::/10'),
      IPAddr.new('ff00::/8')
    ].any? { |range| range.include?(ip) }
  rescue IPAddr::InvalidAddressError
    true # Treat invalid IPs as "reserved"
  end

  def extract_fields_from_payload
    return unless payload.present?

    # Extract WAF-specific fields for direct querying
    request_data = payload.dig("request") || {}
    response_data = payload.dig("response") || {}

    self.ip_address = request_data["ip"]

    # Extract user agent with header name standardization
    # (after normalize_headers, only the downcased key can be present)
    headers = request_data["headers"] || {}
    normalized_headers = normalize_headers(headers)
    self.user_agent = normalized_headers["user-agent"]

    self.request_path = request_data["path"]
    self.request_url = request_data["url"]
    self.response_status = response_data["status_code"]
    self.response_time_ms = response_data["duration_ms"]
    # Support both new (rule_id) and old (rule_matched) field names during cutover
    self.rule_id = payload["rule_id"] || payload["rule_matched"]
    self.blocked_reason = payload["blocked_reason"]

    # Store original values for normalization only if they don't exist yet.
    # This prevents overwriting during multiple callback runs
    @raw_request_method ||= request_data["method"]
    @raw_request_protocol ||= request_data["protocol"]
    @raw_action ||= payload["waf_action"]

    # Extract server/environment info
    self.server_name = payload["server_name"]
    self.environment = payload["environment"]

    # Extract agent info
    agent_data = payload.dig("agent") || {}
    self.agent_version = agent_data["version"]
    self.agent_name = agent_data["name"]
  end

  def should_detect_bot?
    # Detect bots if user agent is present or if we have network intelligence
    user_agent.present? || network_range_id.present?
  end

  def detect_bot_traffic
    self.is_bot = bot_detected?
  rescue => e
    Rails.logger.error "Failed to detect bot for event #{id}: #{e.message}"
    self.is_bot = false # Default to non-bot on error
  end

  def bot_detected?
    # Multi-signal bot detection approach with tagging:
    # 1. User agent detection (DeviceDetector gem) - adds bot:name tag
    # 2. Network range source matching (bot_import_* sources) - adds network tags
    # 3. Fallback to datacenter classification for infrastructure-based detection

    # Signal 1: User agent bot detection (uses DeviceDetector's built-in cache)
    if user_agent.present?
      begin
        detector = DeviceDetector.new(user_agent)
        if detector.bot?
          # Add bot tag with specific bot name
          bot_name = detector.bot_name&.downcase&.gsub(/\s+/, '_') || 'unknown'
          add_tag("bot:#{bot_name}")
          return true
        end
      rescue => e
        Rails.logger.debug "DeviceDetector failed for user agent: #{e.message}"
      end
    end

    # Signal 2: Network range from known bot sources
    if network_range_id.present?
      range = NetworkRange.find_by(id: network_range_id)
      if range
        # Check if the network range source indicates a bot import
        if range.source&.start_with?('bot_import_')
          # Extract bot type from source (e.g., 'bot_import_googlebot' -> 'googlebot')
          bot_type = range.source.sub('bot_import_', '')
          add_tag("bot:#{bot_type}")
          add_tag("network:#{range.company&.downcase&.gsub(/\s+/, '_')}") if range.company.present?
          return true
        end

        # Check if the company is a known bot provider (from bot imports).
        # Common bot companies: Google, Amazon, OpenAI, Cloudflare, Microsoft, etc.
        known_bot_companies = ['googlebot', 'google bot', 'amazon', 'aws', 'openai',
                               'anthropic', 'cloudflare', 'microsoft', 'facebook',
                               'meta', 'apple', 'duckduckgo']
        company_lower = company&.downcase
        if company_lower && known_bot_companies.any? { |bot| company_lower.include?(bot) }
          add_tag("bot:#{company_lower.gsub(/\s+/, '_')}")
          add_tag("network:#{company_lower.gsub(/\s+/, '_')}")
          return true
        end
      end
    end

    # Signal 3: Datacenter traffic is often bot traffic.
    # However, this is less precise, so we use it as a weaker signal:
    # only mark as bot if datacenter AND has other suspicious characteristics
    if is_datacenter && user_agent.present?
      # Generic/common bot user agents in datacenter networks
      ua_lower = user_agent.downcase
      bot_keywords = ['bot', 'crawler', 'spider', 'scraper', 'curl', 'wget', 'python', 'go-http-client']
      if bot_keywords.any? { |keyword| ua_lower.include?(keyword) }
        add_tag("bot:datacenter")
        add_tag("datacenter:true")
        return true
      end
    end

    # Default: not a bot
    false
  end
end