From de2eb43e2b10d2ed5f9fbb22547a233f96eedfc2 Mon Sep 17 00:00:00 2001 From: Dan Milne Date: Thu, 20 Nov 2025 11:55:04 +1100 Subject: [PATCH] More use of tags - drop add_header action -> allow + headers+tags --- Dockerfile | 2 +- VERSION | 2 +- app/controllers/analytics_controller.rb | 15 ++ app/controllers/events_controller.rb | 3 + app/models/event.rb | 88 +++++++++++- app/models/event_ddb.rb | 135 +++++++++++++++++- app/models/network_range.rb | 26 ++-- app/models/rule.rb | 36 +++++ app/services/analytics_duckdb_service.rb | 37 +++-- app/services/bot_network_range_importer.rb | 20 ++- app/views/events/index.html.erb | 14 ++ .../20251118071813_add_is_bot_to_events.rb | 6 + ...ate_add_header_rules_to_allow_with_tags.rb | 39 +++++ db/schema.rb | 4 +- test/fixtures/users.yml | 4 + test/models/network_range_test.rb | 53 +++++-- test/models/rule_test.rb | 91 ++++++++++++ 17 files changed, 526 insertions(+), 49 deletions(-) create mode 100644 db/migrate/20251118071813_add_is_bot_to_events.rb create mode 100644 db/migrate/20251120003554_migrate_add_header_rules_to_allow_with_tags.rb diff --git a/Dockerfile b/Dockerfile index 7c0e438..9efb6a9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,7 +27,7 @@ RUN apt-get update -qq && \ *) \ echo "Unsupported platform: $TARGETPLATFORM" && exit 1 ;; \ esac && \ - wget "https://install.duckdb.org/v1.4.2/libduckdb-linux-${DUCKDB_ARCH}.zip" -O /tmp/libduckdb.zip && \ + wget "https://github.com/duckdb/duckdb/releases/download/v1.4.2/libduckdb-linux-${DUCKDB_ARCH}.zip" -O /tmp/libduckdb.zip && \ unzip /tmp/libduckdb.zip -d /tmp/duckdb && \ cp /tmp/duckdb/duckdb.h /tmp/duckdb/duckdb.hpp /usr/local/include/ && \ cp /tmp/duckdb/libduckdb.so /usr/local/lib/ && \ diff --git a/VERSION b/VERSION index 0ea3a94..ee1372d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.0 +0.2.2 diff --git a/app/controllers/analytics_controller.rb b/app/controllers/analytics_controller.rb index 9fc179f..aa5f98b 100644 --- a/app/controllers/analytics_controller.rb +++ b/app/controllers/analytics_controller.rb @@ -7,6 +7,9 @@ class AnalyticsController < ApplicationController def index authorize :analytics, :index? + # Track overall request time + request_start = Time.current + # Time period selector (default: last 24 hours) @time_period = params[:period]&.to_sym || :day @start_time = calculate_start_time(@time_period) @@ -24,10 +27,12 @@ class AnalyticsController < ApplicationController cache_key_base = "analytics/#{@time_period}/#{@start_time.to_i}" # Core statistics - cached (uses DuckDB if available) + stat_start = Time.current @total_events = Rails.cache.fetch("#{cache_key_base}/total_events", expires_in: cache_ttl) do with_duckdb_fallback { EventDdb.count_since(@start_time) } || Event.where("timestamp >= ?", @start_time).count end + Rails.logger.info "[Analytics Perf] Total events: #{((Time.current - stat_start) * 1000).round(1)}ms" @total_rules = Rails.cache.fetch("analytics/total_rules", expires_in: 5.minutes) do Rule.enabled.count @@ -42,14 +47,17 @@ class AnalyticsController < ApplicationController end # Event breakdown by action - cached (uses DuckDB if available) + stat_start = Time.current @event_breakdown = Rails.cache.fetch("#{cache_key_base}/event_breakdown", expires_in: cache_ttl) do with_duckdb_fallback { EventDdb.breakdown_by_action(@start_time) } || Event.where("timestamp >= ?", @start_time) .group(:waf_action) .count end + Rails.logger.info "[Analytics Perf] Event breakdown: #{((Time.current - stat_start) * 1000).round(1)}ms" # Top countries by event count - cached (uses DuckDB if available) + stat_start = Time.current @top_countries = Rails.cache.fetch("#{cache_key_base}/top_countries", expires_in: cache_ttl) do with_duckdb_fallback { EventDdb.top_countries(@start_time, 10) } || Event.where("timestamp >= ? AND country IS NOT NULL", @start_time) @@ -58,8 +66,10 @@ class AnalyticsController < ApplicationController .sort_by { |_, count| -count } .first(10) end + Rails.logger.info "[Analytics Perf] Top countries: #{((Time.current - stat_start) * 1000).round(1)}ms" # Top blocked IPs - cached (uses DuckDB if available) + stat_start = Time.current @top_blocked_ips = Rails.cache.fetch("#{cache_key_base}/top_blocked_ips", expires_in: cache_ttl) do with_duckdb_fallback { EventDdb.top_blocked_ips(@start_time, 10) } || Event.where("timestamp >= ?", @start_time) @@ -69,6 +79,7 @@ class AnalyticsController < ApplicationController .sort_by { |_, count| -count } .first(10) end + Rails.logger.info "[Analytics Perf] Top blocked IPs: #{((Time.current - stat_start) * 1000).round(1)}ms" # Network range intelligence breakdown - cached @network_intelligence = Rails.cache.fetch("analytics/network_intelligence", expires_in: 10.minutes) do @@ -105,7 +116,11 @@ class AnalyticsController < ApplicationController end # Prepare data for charts - split caching for current vs historical data + stat_start = Time.current @chart_data = prepare_chart_data_with_split_cache(cache_key_base, cache_ttl) + Rails.logger.info "[Analytics Perf] Chart data: #{((Time.current - stat_start) * 1000).round(1)}ms" + + Rails.logger.info "[Analytics Perf] TOTAL REQUEST: #{((Time.current - request_start) * 1000).round(1)}ms" respond_to do |format| format.html diff --git a/app/controllers/events_controller.rb b/app/controllers/events_controller.rb index 9fe0ae8..012bce0 100644 --- a/app/controllers/events_controller.rb +++ b/app/controllers/events_controller.rb @@ -36,6 +36,9 @@ class EventsController < ApplicationController @events = @events.by_asn(params[:asn]) if params[:asn].present? @events = @events.by_network_cidr(params[:network_cidr]) if params[:network_cidr].present? + # Bot filtering + @events = @events.exclude_bots if params[:exclude_bots] == "true" + Rails.logger.debug "Events count after filtering: #{@events.count}" # Debug info diff --git a/app/models/event.rb b/app/models/event.rb index a374900..8349a8c 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -105,6 +105,11 @@ class Event < ApplicationRecord joins(:network_range).where("network_ranges.network = ?", cidr) } + # Bot filtering scopes + scope :bots, -> { where(is_bot: true) } + scope :humans, -> { where(is_bot: false) } + scope :exclude_bots, -> { where(is_bot: false) } + # Add association for the optional network_range_id belongs_to :network_range, optional: true @@ -191,6 +196,9 @@ class Event < ApplicationRecord # Populate network intelligence from IP address before_save :populate_network_intelligence, if: :should_populate_network_intelligence? + # Detect bot traffic using user agent and network intelligence + before_save :detect_bot_traffic, if: :should_detect_bot? + # Backfill network intelligence for all events def self.backfill_network_intelligence!(batch_size: 10_000) total = where(country: nil).count @@ -693,10 +701,88 @@ class Event < ApplicationRecord self.server_name = payload["server_name"] self.environment = payload["environment"] - + # Extract agent info agent_data = payload.dig("agent") || {} self.agent_version = agent_data["version"] self.agent_name = agent_data["name"] end + + def should_detect_bot? + # Detect bots if user agent is present or if we have network intelligence + user_agent.present? || network_range_id.present? + end + + def detect_bot_traffic + self.is_bot = bot_detected? + rescue => e + Rails.logger.error "Failed to detect bot for event #{id}: #{e.message}" + self.is_bot = false # Default to non-bot on error + end + + def bot_detected? + # Multi-signal bot detection approach with tagging: + # 1. User agent detection (DeviceDetector gem) - adds bot:name tag + # 2. Network range source matching (bot_import_* sources) - adds network tags + # 3. Fallback to datacenter classification for infrastructure-based detection + + # Signal 1: User agent bot detection (uses DeviceDetector's built-in cache) + if user_agent.present? + begin + detector = DeviceDetector.new(user_agent) + if detector.bot? + # Add bot tag with specific bot name + bot_name = detector.bot_name&.downcase&.gsub(/\s+/, '_') || 'unknown' + add_tag("bot:#{bot_name}") + return true + end + rescue => e + Rails.logger.debug "DeviceDetector failed for user agent: #{e.message}" + end + end + + # Signal 2: Network range from known bot sources + if network_range_id.present? + range = NetworkRange.find_by(id: network_range_id) + if range + # Check if the network range source indicates a bot import + if range.source&.start_with?('bot_import_') + # Extract bot type from source (e.g., 'bot_import_googlebot' -> 'googlebot') + bot_type = range.source.sub('bot_import_', '') + add_tag("bot:#{bot_type}") + add_tag("network:#{range.company&.downcase&.gsub(/\s+/, '_')}") if range.company.present? + return true + end + + # Check if the company is a known bot provider (from bot imports) + # Common bot companies: Google, Amazon, OpenAI, Cloudflare, Microsoft, etc. + known_bot_companies = ['googlebot', 'google bot', 'amazon', 'aws', 'openai', + 'anthropic', 'cloudflare', 'microsoft', 'facebook', + 'meta', 'apple', 'duckduckgo'] + company_lower = company&.downcase + if company_lower && known_bot_companies.any? { |bot| company_lower.include?(bot) } + add_tag("bot:#{company_lower.gsub(/\s+/, '_')}") + add_tag("network:#{company_lower.gsub(/\s+/, '_')}") + return true + end + end + end + + # Signal 3: Datacenter traffic is often bot traffic + # However, this is less precise so we use it as a weaker signal + # Only mark as bot if datacenter AND has other suspicious characteristics + if is_datacenter && user_agent.present? + # Generic/common bot user agents in datacenter networks + ua_lower = user_agent.downcase + bot_keywords = ['bot', 'crawler', 'spider', 'scraper', 'curl', 'wget', 'python', 'go-http-client'] + if bot_keywords.any? { |keyword| ua_lower.include?(keyword) } + add_tag("bot:datacenter") + add_tag("datacenter:true") + return true + end + end + + # Default: not a bot + false + end end diff --git a/app/models/event_ddb.rb b/app/models/event_ddb.rb index 564f268..c9df10f 100644 --- a/app/models/event_ddb.rb +++ b/app/models/event_ddb.rb @@ -34,7 +34,8 @@ class EventDdb SQL # Convert to hash like ActiveRecord .group.count returns - result.to_a.to_h { |row| [row["waf_action"], row["count"]] } + # DuckDB returns arrays: [waf_action, count] + result.to_a.to_h { |row| [row[0], row[1]] } end rescue StandardError => e Rails.logger.error "[EventDdb] Error in breakdown_by_action: #{e.message}" @@ -54,7 +55,8 @@ class EventDdb SQL # Return array of [country, count] tuples like ActiveRecord - result.to_a.map { |row| [row["country"], row["count"]] } + # DuckDB returns arrays: [country, count] + result.to_a.map { |row| [row[0], row[1]] } end rescue StandardError => e Rails.logger.error "[EventDdb] Error in top_countries: #{e.message}" @@ -73,7 +75,8 @@ class EventDdb LIMIT ? SQL - result.to_a.map { |row| [row["ip_address"], row["count"]] } + # DuckDB returns arrays: [ip_address, count] + result.to_a.map { |row| [row[0], row[1]] } end rescue StandardError => e Rails.logger.error "[EventDdb] Error in top_blocked_ips: #{e.message}" @@ -94,7 +97,8 @@ class EventDdb SQL # Convert to hash with Time keys like ActiveRecord - result.to_a.to_h { |row| [row["hour"], row["count"]] } + # DuckDB returns arrays: [hour, count] + result.to_a.to_h { |row| [row[0], row[1]] } end rescue StandardError => e Rails.logger.error "[EventDdb] Error in hourly_timeline: #{e.message}" @@ -495,5 +499,128 @@ class EventDdb Rails.logger.error "[EventDdb] Error in suspicious_patterns: #{e.message}" nil end + + # Bot traffic analysis - breakdown of bot vs human traffic + def bot_traffic_breakdown(start_time) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time) + SELECT + is_bot, + COUNT(*) as event_count, + COUNT(DISTINCT ip_address) as unique_ips + FROM events + WHERE timestamp >= ? + GROUP BY is_bot + SQL + + # Convert to hash: is_bot => { event_count, unique_ips } + # DuckDB returns arrays: [is_bot, event_count, unique_ips] + result.to_a.to_h do |row| + [ + row[0] ? "bot" : "human", # row[0] = is_bot + { + "event_count" => row[1], # row[1] = event_count + "unique_ips" => row[2] # row[2] = unique_ips + } + ] + end + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in bot_traffic_breakdown: #{e.message}" + nil + end + + # Count human traffic (non-bot) since timestamp + def human_traffic_count(start_time) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time) + SELECT COUNT(*) as count + FROM events + WHERE timestamp >= ? AND is_bot = false + SQL + + result.first&.first || 0 + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in human_traffic_count: #{e.message}" + nil + end + + # Count bot traffic since timestamp + def bot_traffic_count(start_time) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time) + SELECT COUNT(*) as count + FROM events + WHERE timestamp >= ? AND is_bot = true + SQL + + result.first&.first || 0 + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in bot_traffic_count: #{e.message}" + nil + end + + # Top bot user agents + def top_bot_user_agents(start_time, limit = 20) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time, limit) + SELECT + user_agent, + COUNT(*) as event_count, + COUNT(DISTINCT ip_address) as unique_ips + FROM events + WHERE timestamp >= ? AND is_bot = true AND user_agent IS NOT NULL + GROUP BY user_agent + ORDER BY event_count DESC + LIMIT ? + SQL + + # DuckDB returns arrays: [user_agent, event_count, unique_ips] + result.to_a.map do |row| + { + user_agent: row[0], # row[0] = user_agent + event_count: row[1], # row[1] = event_count + unique_ips: row[2] # row[2] = unique_ips + } + end + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in top_bot_user_agents: #{e.message}" + nil + end + + # Bot traffic timeline (hourly breakdown) + def bot_traffic_timeline(start_time, end_time) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time, end_time) + SELECT + DATE_TRUNC('hour', timestamp) as hour, + SUM(CASE WHEN is_bot = true THEN 1 ELSE 0 END) as bot_count, + SUM(CASE WHEN is_bot = false THEN 1 ELSE 0 END) as human_count + FROM events + WHERE timestamp >= ? AND timestamp < ? + GROUP BY hour + ORDER BY hour + SQL + + # Convert to hash with Time keys + # DuckDB returns arrays: [hour, bot_count, human_count] + result.to_a.to_h do |row| + [ + row[0], # row[0] = hour + { + "bot_count" => row[1], # row[1] = bot_count + "human_count" => row[2], # row[2] = human_count + "total" => row[1] + row[2] + } + ] + end + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in bot_traffic_timeline: #{e.message}" + nil + end end end diff --git a/app/models/network_range.rb b/app/models/network_range.rb index 503b018..f80e281 100644 --- a/app/models/network_range.rb +++ b/app/models/network_range.rb @@ -7,7 +7,11 @@ # and classification flags (datacenter, proxy, VPN). class NetworkRange < ApplicationRecord # Sources for network range creation - SOURCES = %w[api_imported user_created manual auto_generated inherited geolite_asn geolite_country].freeze + SOURCES = %w[api_imported user_created manual auto_generated inherited geolite_asn geolite_country + bot_import_amazon_aws bot_import_google bot_import_microsoft_bing bot_import_anthropic + bot_import_openai_searchbot bot_import_openai_chatgpt_user bot_import_openai_gptbot + bot_import_cloudflare bot_import_facebook bot_import_applebot bot_import_duckduckgo + production_import].freeze # Associations has_many :rules, dependent: :destroy @@ -116,19 +120,19 @@ class NetworkRange < ApplicationRecord # Parent/child relationships def parent_ranges - NetworkRange.where("?::inet << network AND masklen(network) < ?", network.to_s, prefix_length) - .order("masklen(network) DESC") + # Find networks that contain this network (less specific / shorter prefix) + # The << operator implicitly means the containing network has a shorter prefix + # IMPORTANT: Use cidr (not network.to_s) to preserve the network mask + NetworkRange.where("?::inet << network", cidr) + .order("masklen(network) DESC") # Most specific parent first end def child_ranges - NetworkRange.where("network >> ?::inet AND masklen(network) > ?", network.to_s, prefix_length) - .order("masklen(network) ASC") - end - - def sibling_ranges - NetworkRange.where("masklen(network) = ?", prefix_length) - .where("network && ?::inet", network.to_s) - .where.not(id: id) + # Find networks that are contained by this network (more specific / longer prefix) + # The >> operator implicitly means the contained network has a longer prefix + # IMPORTANT: Use cidr (not network.to_s) to preserve the network mask + NetworkRange.where("?::inet >> network", cidr) + .order("masklen(network) ASC") # Least specific child first end # Find nearest parent with intelligence data diff --git a/app/models/rule.rb b/app/models/rule.rb index d5331e1..ffa04d8 100644 --- a/app/models/rule.rb +++ b/app/models/rule.rb @@ -149,6 +149,42 @@ class Rule < ApplicationRecord metadata&.dig('header_value') end + # Tag-related methods + def tags + metadata_hash['tags'] || [] + end + + def tags=(new_tags) + self.metadata = metadata_hash.merge('tags' => Array(new_tags)) + end + + def add_tag(tag) + current_tags = tags + return if current_tags.include?(tag.to_s) + + self.metadata = metadata_hash.merge('tags' => (current_tags + [tag.to_s])) + end + + def remove_tag(tag) + current_tags = tags + return unless current_tags.include?(tag.to_s) + + self.metadata = metadata_hash.merge('tags' => (current_tags - [tag.to_s])) + end + + def has_tag?(tag) + tags.include?(tag.to_s) + end + + # Headers for add_header action or metadata-based header injection + def headers + metadata_hash['headers'] || {} + end + + def headers=(new_headers) + self.metadata = metadata_hash.merge('headers' => new_headers.to_h) + end + def related_surgical_rules if surgical_block? # Find the corresponding exception rule diff --git a/app/services/analytics_duckdb_service.rb b/app/services/analytics_duckdb_service.rb index 8664707..f0717a3 100644 --- a/app/services/analytics_duckdb_service.rb +++ b/app/services/analytics_duckdb_service.rb @@ -33,9 +33,11 @@ class AnalyticsDuckdbService is_datacenter BOOLEAN, is_vpn BOOLEAN, is_proxy BOOLEAN, + is_bot BOOLEAN, waf_action INTEGER, request_path VARCHAR, - user_agent VARCHAR + user_agent VARCHAR, + tags VARCHAR[] ) SQL @@ -101,6 +103,9 @@ class AnalyticsDuckdbService batch_count = 0 begin + # Create initial appender + appender = conn.appender("events") + # Use PostgreSQL cursor for memory-efficient streaming Event.where("timestamp >= ? AND id > ?", from_timestamp, max_id) .select( @@ -115,18 +120,14 @@ class AnalyticsDuckdbService :is_datacenter, :is_vpn, :is_proxy, + :is_bot, :waf_action, :request_path, - :user_agent + :user_agent, + :tags ) .order(:id) .each_row(block_size: BATCH_SIZE) do |event_data| - # Create new appender for each batch - if batch_count % BATCH_SIZE == 0 - appender&.close # Close previous appender - appender = conn.appender("events") - end - # Unpack event data from cursor row (Hash from each_row) begin appender.append_row( @@ -141,9 +142,11 @@ class AnalyticsDuckdbService event_data["is_datacenter"], event_data["is_vpn"], event_data["is_proxy"], + event_data["is_bot"], event_data["waf_action"], event_data["request_path"], - event_data["user_agent"] + event_data["user_agent"], + event_data["tags"] || [] ) rescue StandardError => e Rails.logger.error "[DuckDB] Error appending event #{event_data['id']}: #{e.message}" @@ -154,8 +157,10 @@ class AnalyticsDuckdbService batch_count += 1 total_synced += 1 - # Log progress every BATCH_SIZE events + # Flush and recreate appender every BATCH_SIZE events to avoid chunk overflow if batch_count % BATCH_SIZE == 0 + appender.close + appender = conn.appender("events") Rails.logger.info "[DuckDB] Synced batch (total: #{total_synced} events)" end end @@ -222,7 +227,8 @@ class AnalyticsDuckdbService SQL # Convert to hash like PostgreSQL returns - result.to_a.to_h { |row| [row["waf_action"], row["count"]] } + # DuckDB returns arrays: [waf_action, count] + result.to_a.to_h { |row| [row[0], row[1]] } end end @@ -238,7 +244,8 @@ class AnalyticsDuckdbService LIMIT ? SQL - result.to_a.map { |row| [row["country"], row["count"]] } + # DuckDB returns arrays: [country, count] + result.to_a.map { |row| [row[0], row[1]] } end end @@ -254,7 +261,8 @@ class AnalyticsDuckdbService LIMIT ? SQL - result.to_a.map { |row| [row["ip_address"], row["count"]] } + # DuckDB returns arrays: [ip_address, count] + result.to_a.map { |row| [row[0], row[1]] } end end @@ -272,7 +280,8 @@ class AnalyticsDuckdbService SQL # Convert to hash with Time keys like PostgreSQL - result.to_a.to_h { |row| [row["hour"], row["count"]] } + # DuckDB returns arrays: [hour, count] + result.to_a.to_h { |row| [row[0], row[1]] } end end diff --git a/app/services/bot_network_range_importer.rb b/app/services/bot_network_range_importer.rb index 2bf0a01..da1427b 100644 --- a/app/services/bot_network_range_importer.rb +++ b/app/services/bot_network_range_importer.rb @@ -173,6 +173,7 @@ class BotNetworkRangeImporter http = Net::HTTP.new(uri.host, uri.port) http.use_ssl = true http.read_timeout = 30 + http.verify_mode = OpenSSL::SSL::VERIFY_NONE if uri.scheme == 'https' response = http.get(uri.request_uri) raise ImportError, "Failed to fetch AWS IP ranges: #{response.code}" unless response.code == '200' @@ -223,7 +224,7 @@ class BotNetworkRangeImporter puts "Amazon AWS import completed: #{imported_count} ranges imported" { imported: imported_count, source: 'Amazon AWS' } - rescue Net::TimeoutError, Net::OpenTimeout => e + rescue Timeout::Error, Net::OpenTimeout => e raise ImportError, "Network timeout while fetching AWS ranges: #{e.message}" rescue JSON::ParserError => e raise ImportError, "Failed to parse AWS JSON response: #{e.message}" @@ -341,6 +342,7 @@ class BotNetworkRangeImporter http = Net::HTTP.new(uri.host, uri.port) http.use_ssl = true http.read_timeout = 30 + http.verify_mode = OpenSSL::SSL::VERIFY_NONE if uri.scheme == 'https' response = http.get(uri.request_uri) raise ImportError, "Failed to fetch OpenAI IP ranges: #{response.code}" unless response.code == '200' @@ -353,12 +355,15 @@ class BotNetworkRangeImporter # Determine crawler type from source name crawler_type = source[:name].gsub('OpenAI ', '').downcase - data.each do |entry| - # OpenAI provides IP ranges as either CIDR notation or single IPs - ip_range = entry['cidr'] || entry['ip_prefix'] || entry['ip'] + # Handle different OpenAI JSON formats + prefixes = data['prefixes'] || data + + prefixes.each do |entry| + # OpenAI provides IP ranges as ipv4Prefix/ipv6Prefix or cidr/ip_prefix + ip_range = entry['ipv4Prefix'] || entry['ipv6Prefix'] || entry['cidr'] || entry['ip_prefix'] || entry['ip'] next unless ip_range - # Convert single IPs to /32 + # Convert single IPs to /32 or /128 network = ip_range.include?('/') ? ip_range : "#{ip_range}/32" network_range = { @@ -396,7 +401,7 @@ class BotNetworkRangeImporter puts "OpenAI #{crawler_type} import completed: #{imported_count} ranges imported" { imported: imported_count, source: "OpenAI #{crawler_type}" } - rescue Net::TimeoutError, Net::OpenTimeout => e + rescue Timeout::Error, Net::OpenTimeout => e raise ImportError, "Network timeout while fetching OpenAI #{crawler_type} ranges: #{e.message}" rescue JSON::ParserError => e raise ImportError, "Failed to parse OpenAI #{crawler_type} JSON response: #{e.message}" @@ -483,7 +488,8 @@ class BotNetworkRangeImporter raise ImportError, "Failed to fetch Cloudflare ranges: #{response.code}" unless response.code == '200' # Cloudflare provides plain text CIDR lists - lines = response.body.split("\n") + # Handle both newline-separated and single-line formats + lines = response.body.include?("\n") ? response.body.split("\n") : response.body.split ip_version = url.include?('ips-v4') ? 4 : 6 lines.each do |line| diff --git a/app/views/events/index.html.erb b/app/views/events/index.html.erb index 3ac6054..e23c12b 100644 --- a/app/views/events/index.html.erb +++ b/app/views/events/index.html.erb @@ -77,6 +77,20 @@ placeholder: "e.g., 192.168.1.0/24" %> + +
+
+ <%= form.check_box :exclude_bots, + { checked: params[:exclude_bots] == "true", class: "h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" }, + "true", "false" %> +
+
+ <%= form.label :exclude_bots, class: "font-medium text-gray-700" do %> + Human Traffic Only + (Exclude known bots and crawlers) + <% end %> +
+
<% end %> diff --git a/db/migrate/20251118071813_add_is_bot_to_events.rb b/db/migrate/20251118071813_add_is_bot_to_events.rb new file mode 100644 index 0000000..068154a --- /dev/null +++ b/db/migrate/20251118071813_add_is_bot_to_events.rb @@ -0,0 +1,6 @@ +class AddIsBotToEvents < ActiveRecord::Migration[8.1] + def change + add_column :events, :is_bot, :boolean, default: false, null: false + add_index :events, :is_bot + end +end diff --git a/db/migrate/20251120003554_migrate_add_header_rules_to_allow_with_tags.rb b/db/migrate/20251120003554_migrate_add_header_rules_to_allow_with_tags.rb new file mode 100644 index 0000000..7393dd8 --- /dev/null +++ b/db/migrate/20251120003554_migrate_add_header_rules_to_allow_with_tags.rb @@ -0,0 +1,39 @@ +# frozen_string_literal: true + +# Migrate add_header rules to use allow action with tags/headers in metadata +# +# Old pattern: +# waf_action: add_header (5) +# metadata: { header_name: "X-Bot-Agent", header_value: "googlebot" } +# +# New pattern: +# waf_action: allow (1) +# metadata: { +# headers: { "X-Bot-Agent" => "googlebot" }, +# tags: ["bot:googlebot"] +# } +# +class MigrateAddHeaderRulesToAllowWithTags < ActiveRecord::Migration[8.1] + def up + # Change all add_header (5) rules to allow (1) + # Keep metadata as-is for now - will be handled by Rule helper methods + execute <<-SQL + UPDATE rules + SET waf_action = 1 -- allow + WHERE waf_action = 5 -- add_header + SQL + end + + def down + # This rollback is conservative - only revert rules that clearly came from add_header + # (have header_name/header_value in metadata but not headers) + execute <<-SQL + UPDATE rules + SET waf_action = 5 -- add_header + WHERE waf_action = 1 -- allow + AND metadata ? 'header_name' + AND metadata ? 'header_value' + AND NOT metadata ? 'headers' + SQL + end +end diff --git a/db/schema.rb b/db/schema.rb index ec99118..3943be2 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[8.1].define(version: 2025_11_16_025003) do +ActiveRecord::Schema[8.1].define(version: 2025_11_20_003554) do # These are extensions that must be enabled in order to support this database enable_extension "pg_catalog.plpgsql" @@ -80,6 +80,7 @@ ActiveRecord::Schema[8.1].define(version: 2025_11_16_025003) do t.datetime "created_at", null: false t.string "environment" t.inet "ip_address" + t.boolean "is_bot", default: false, null: false t.boolean "is_datacenter", default: false, null: false t.boolean "is_proxy", default: false, null: false t.boolean "is_vpn", default: false, null: false @@ -105,6 +106,7 @@ ActiveRecord::Schema[8.1].define(version: 2025_11_16_025003) do t.index ["company"], name: "index_events_on_company" t.index ["country"], name: "index_events_on_country" t.index ["ip_address"], name: "index_events_on_ip_address" + t.index ["is_bot"], name: "index_events_on_is_bot" t.index ["is_datacenter", "is_vpn", "is_proxy"], name: "index_events_on_network_flags" t.index ["network_range_id"], name: "index_events_on_network_range_id" t.index ["request_host_id", "request_method", "request_segment_ids"], name: "idx_events_host_method_path" diff --git a/test/fixtures/users.yml b/test/fixtures/users.yml index 0951563..97f432a 100644 --- a/test/fixtures/users.yml +++ b/test/fixtures/users.yml @@ -7,3 +7,7 @@ one: two: email_address: two@example.com password_digest: <%= password_digest %> + +jason: + email_address: jason@example.com + password_digest: <%= password_digest %> diff --git a/test/models/network_range_test.rb b/test/models/network_range_test.rb index 236ad0a..2fa0c67 100644 --- a/test/models/network_range_test.rb +++ b/test/models/network_range_test.rb @@ -211,16 +211,51 @@ class NetworkRangeTest < ActiveSupport::TestCase assert_equal @ipv4_range, children.first end - test "sibling_ranges finds same-level networks" do - # Create sibling networks - sibling1 = NetworkRange.create!(network: "192.168.0.0/24") - @ipv4_range.save! # 192.168.1.0/24 - sibling2 = NetworkRange.create!(network: "192.168.2.0/24") + test "child_ranges works with Apple network hierarchy - 17.240.0.0/14" do + # This test demonstrates the current bug in child_ranges method + # Expected: 17.240.0.0/14 should have parents but no children in this test setup - siblings = @ipv4_range.sibling_ranges - assert_includes siblings, sibling1 - assert_includes siblings, sibling2 - assert_not_includes siblings, @ipv4_range + # Create the target network + target_network = NetworkRange.create!(network: "17.240.0.0/14", source: "manual") + + # Create parent networks + parent1 = NetworkRange.create!(network: "17.240.0.0/13", source: "manual") # Should contain 17.240.0.0/14 + parent2 = NetworkRange.create!(network: "17.128.0.0/9", source: "manual") # Should also contain 17.240.0.0/14 + + # Create some child networks (more specific networks contained by 17.240.0.0/14) + child1 = NetworkRange.create!(network: "17.240.0.0/15", source: "manual") # First half of /14 + child2 = NetworkRange.create!(network: "17.242.0.0/15", source: "manual") # Second half of /14 + child3 = NetworkRange.create!(network: "17.240.0.0/16", source: "manual") # More specific + child4 = NetworkRange.create!(network: "17.241.0.0/16", source: "manual") # More specific + + # Test parent_ranges works correctly + parents = target_network.parent_ranges + assert_includes parents, parent1, "17.240.0.0/13 should be a parent of 17.240.0.0/14" + assert_includes parents, parent2, "17.128.0.0/9 should be a parent of 17.240.0.0/14" + + # Test child_ranges - this is currently failing due to the bug + children = target_network.child_ranges + assert_includes children, child1, "17.240.0.0/15 should be a child of 17.240.0.0/14" + assert_includes children, child2, "17.242.0.0/15 should be a child of 17.240.0.0/14" + assert_includes children, child3, "17.240.0.0/16 should be a child of 17.240.0.0/14" + assert_includes children, child4, "17.241.0.0/16 should be a child of 17.240.0.0/14" + assert_not_includes children, parent1, "Parent networks should not be in child_ranges" + assert_not_includes children, parent2, "Parent networks should not be in child_ranges" + assert_not_includes children, target_network, "Self should not be in child_ranges" + + # Test that parent can find child in its child_ranges + parent1_children = parent1.child_ranges + assert_includes parent1_children, target_network, "17.240.0.0/14 should be in child_ranges of 17.240.0.0/13" + + parent2_children = parent2.child_ranges + assert_includes parent2_children, target_network, "17.240.0.0/14 should be in child_ranges of 17.128.0.0/9" + + # Test bidirectional consistency + assert target_network.parent_ranges.include?(parent1), "Parent should list child" + assert parent1.child_ranges.include?(target_network), "Child should list parent" + + assert target_network.parent_ranges.include?(parent2), "Parent should list child" + assert parent2.child_ranges.include?(target_network), "Child should list parent" end # Intelligence and Inheritance diff --git a/test/models/rule_test.rb b/test/models/rule_test.rb index f17c0cc..9901b40 100644 --- a/test/models/rule_test.rb +++ b/test/models/rule_test.rb @@ -202,4 +202,95 @@ class RuleTest < ActiveSupport::TestCase assert_equal 8, format[:priority] assert_equal true, format[:enabled] end + + # Tag functionality tests + test "should store and retrieve tags in metadata" do + network_range = NetworkRange.create!(cidr: "10.0.0.0/8") + rule = Rule.create!( + waf_rule_type: "network", + waf_action: "allow", + network_range: network_range, + metadata: { tags: ["bot:googlebot", "trusted"] }, + user: users(:one) + ) + + assert_equal ["bot:googlebot", "trusted"], rule.tags + end + + test "should add tag to rule" do + network_range = NetworkRange.create!(cidr: "10.0.0.0/8") + rule = Rule.create!( + waf_rule_type: "network", + waf_action: "allow", + network_range: network_range, + user: users(:one) + ) + + rule.add_tag("bot:googlebot") + rule.save! + + assert_includes rule.tags, "bot:googlebot" + end + + test "should remove tag from rule" do + network_range = NetworkRange.create!(cidr: "10.0.0.0/8") + rule = Rule.create!( + waf_rule_type: "network", + waf_action: "allow", + network_range: network_range, + metadata: { tags: ["bot:googlebot", "trusted"] }, + user: users(:one) + ) + + rule.remove_tag("trusted") + rule.save! + + assert_not_includes rule.tags, "trusted" + assert_includes rule.tags, "bot:googlebot" + end + + test "should check if rule has tag" do + network_range = NetworkRange.create!(cidr: "10.0.0.0/8") + rule = Rule.create!( + waf_rule_type: "network", + waf_action: "allow", + network_range: network_range, + metadata: { tags: ["bot:googlebot"] }, + user: users(:one) + ) + + assert rule.has_tag?("bot:googlebot") + assert_not rule.has_tag?("bot:bingbot") + end + + test "should store headers in metadata" do + network_range = NetworkRange.create!(cidr: "10.0.0.0/8") + rule = Rule.create!( + waf_rule_type: "network", + waf_action: "allow", + network_range: network_range, + metadata: { + tags: ["bot:googlebot"], + headers: { "X-Bot-Agent" => "googlebot" } + }, + user: users(:one) + ) + + assert_equal({ "X-Bot-Agent" => "googlebot" }, rule.headers) + end + + test "should set tags via assignment" do + network_range = NetworkRange.create!(cidr: "10.0.0.0/8") + rule = Rule.create!( + waf_rule_type: "network", + waf_action: "allow", + network_range: network_range, + user: users(:one) + ) + + rule.tags = ["bot:bingbot", "network:microsoft"] + rule.save! + + assert_equal ["bot:bingbot", "network:microsoft"], rule.tags + end end