From 3f274c842c4e59f665128b6f1be0d9aa4c35d8f9 Mon Sep 17 00:00:00 2001
From: Dan Milne
Date: Tue, 18 Nov 2025 16:40:05 +1100
Subject: [PATCH] Fix some blocked/allow laggards after migrating. Add DuckDB
 for outstanding analytics performance. Start adding an import for all bot
 networks

---
 app/controllers/analytics_controller.rb       | 190 ++++--
 .../bot_network_ranges_controller.rb          | 126 ++++
 app/controllers/network_ranges_controller.rb  |  79 ++-
 app/controllers/rules_controller.rb           |  28 +-
 .../controllers/rule_form_controller.js       |  36 ++
 app/jobs/cleanup_old_events_job.rb            |  52 ++
 app/jobs/fetch_ipapi_data_job.rb              |  29 +-
 app/jobs/import_all_bot_network_ranges_job.rb |  26 +
 app/jobs/import_bot_network_ranges_job.rb     |  47 ++
 app/jobs/process_waf_event_job.rb             |  11 +-
 app/jobs/sync_events_to_duckdb_job.rb         |  89 +++
 app/models/event.rb                           |  15 +-
 app/models/event_ddb.rb                       | 499 +++++++++++++++
 app/models/network_range.rb                   |  56 +-
 app/models/rule.rb                            |  27 +-
 app/models/waf_policy.rb                      |  40 +-
 app/services/analytics_duckdb_service.rb      | 284 +++++++++
 app/services/bot_network_range_importer.rb    | 573 ++++++++++++++++++
 app/services/ipapi.rb                         | 103 ++++
 app/views/analytics/networks.html.erb         |  17 +-
 app/views/bot_network_ranges/index.html.erb   | 171 ++++++
 app/views/bot_network_ranges/show.html.erb    | 175 ++++++
 app/views/events/index.html.erb               |   5 +-
 app/views/layouts/application.html.erb        |   4 +
 app/views/rules/index.html.erb                |  12 +-
 app/views/rules/new.html.erb                  |  42 +-
 app/views/rules/show.html.erb                 |   6 +-
 config/recurring.yml                          |   6 +
 config/routes.rb                              |  12 +
 docs/rule-architecture.md                     |   2 +-
 test/controllers/rules_controller_test.rb     |  65 ++
 test/fixtures/files/ipapi_91_84_96_0.json     |  66 ++
 test/jobs/cleanup_old_events_job_test.rb      | 195 ++++++
 test/models/event_test.rb                     |   2 +-
 test/models/rule_path_pattern_test.rb         | 233 +++++++
 test/services/ipapi_test.rb                   | 134 ++++
 test/services/path_rule_matcher_test.rb       | 216 +++++++
 37 files changed, 3522 insertions(+), 151 deletions(-)
 create mode 100644 app/controllers/bot_network_ranges_controller.rb
 create mode 100644 app/javascript/controllers/rule_form_controller.js
 create mode 100644 app/jobs/cleanup_old_events_job.rb
 create mode 100644 app/jobs/import_all_bot_network_ranges_job.rb
 create mode 100644 app/jobs/import_bot_network_ranges_job.rb
 create mode 100644 app/jobs/sync_events_to_duckdb_job.rb
 create mode 100644 app/models/event_ddb.rb
 create mode 100644 app/services/analytics_duckdb_service.rb
 create mode 100644 app/services/bot_network_range_importer.rb
 create mode 100644 app/views/bot_network_ranges/index.html.erb
 create mode 100644 app/views/bot_network_ranges/show.html.erb
 create mode 100644 test/controllers/rules_controller_test.rb
 create mode 100644 test/fixtures/files/ipapi_91_84_96_0.json
 create mode 100644 test/jobs/cleanup_old_events_job_test.rb
 create mode 100644 test/models/rule_path_pattern_test.rb
 create mode 100644 test/services/ipapi_test.rb
 create mode 100644 test/services/path_rule_matcher_test.rb

diff --git a/app/controllers/analytics_controller.rb b/app/controllers/analytics_controller.rb
index 8c31a13..9fc179f 100644
--- a/app/controllers/analytics_controller.rb
+++ b/app/controllers/analytics_controller.rb
@@ -23,9 +23,10 @@ class AnalyticsController < ApplicationController
     # Cache key includes period and start_time (hour-aligned for consistency)
     cache_key_base = "analytics/#{@time_period}/#{@start_time.to_i}"
 
-    # Core statistics - cached
+    # Core statistics - cached (uses DuckDB if available)
     @total_events = Rails.cache.fetch("#{cache_key_base}/total_events",
expires_in: cache_ttl) do - Event.where("timestamp >= ?", @start_time).count + with_duckdb_fallback { EventDdb.count_since(@start_time) } || + Event.where("timestamp >= ?", @start_time).count end @total_rules = Rails.cache.fetch("analytics/total_rules", expires_in: 5.minutes) do @@ -40,31 +41,33 @@ class AnalyticsController < ApplicationController NetworkRange.count end - # Event breakdown by action - cached + # Event breakdown by action - cached (uses DuckDB if available) @event_breakdown = Rails.cache.fetch("#{cache_key_base}/event_breakdown", expires_in: cache_ttl) do - Event.where("timestamp >= ?", @start_time) - .group(:waf_action) - .count - # Keys are already strings ("allow", "deny", etc.) from the enum + with_duckdb_fallback { EventDdb.breakdown_by_action(@start_time) } || + Event.where("timestamp >= ?", @start_time) + .group(:waf_action) + .count end - # Top countries by event count - cached (now uses denormalized country column) + # Top countries by event count - cached (uses DuckDB if available) @top_countries = Rails.cache.fetch("#{cache_key_base}/top_countries", expires_in: cache_ttl) do - Event.where("timestamp >= ? AND country IS NOT NULL", @start_time) - .group(:country) - .count - .sort_by { |_, count| -count } - .first(10) + with_duckdb_fallback { EventDdb.top_countries(@start_time, 10) } || + Event.where("timestamp >= ? AND country IS NOT NULL", @start_time) + .group(:country) + .count + .sort_by { |_, count| -count } + .first(10) end - # Top blocked IPs - cached + # Top blocked IPs - cached (uses DuckDB if available) @top_blocked_ips = Rails.cache.fetch("#{cache_key_base}/top_blocked_ips", expires_in: cache_ttl) do - Event.where("timestamp >= ?", @start_time) - .where(waf_action: 1) # deny action in enum - .group(:ip_address) - .count - .sort_by { |_, count| -count } - .first(10) + with_duckdb_fallback { EventDdb.top_blocked_ips(@start_time, 10) } || + Event.where("timestamp >= ?", @start_time) + .where(waf_action: 0) # deny action in enum + .group(:ip_address) + .count + .sort_by { |_, count| -count } + .first(10) end # Network range intelligence breakdown - cached @@ -92,7 +95,7 @@ class AnalyticsController < ApplicationController total_users: User.count, active_rules: Rule.enabled.count, disabled_rules: Rule.where(enabled: false).count, - recent_errors: Event.where("timestamp >= ? AND waf_action = ?", @start_time, 1).count # 1 = deny + recent_errors: Event.where("timestamp >= ? 
AND waf_action = ?", @start_time, 0).count # 0 = deny } end @@ -117,38 +120,90 @@ class AnalyticsController < ApplicationController @time_period = params[:period]&.to_sym || :day @start_time = calculate_start_time(@time_period) - # Top networks by request volume (using denormalized network_range_id) - # Use a subquery approach to avoid PostgreSQL GROUP BY issues with network_ranges.* - event_stats = Event.where("timestamp >= ?", @start_time) - .where.not(network_range_id: nil) - .group(:network_range_id) - .select("network_range_id, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips") + # Top networks by request volume - use DuckDB if available + network_stats = with_duckdb_fallback { EventDdb.top_networks(@start_time, 50) } - # Join the stats back to NetworkRange to get full network details - @top_networks = NetworkRange.joins("INNER JOIN (#{event_stats.to_sql}) stats ON stats.network_range_id = network_ranges.id") - .select("network_ranges.*, stats.event_count, stats.unique_ips") - .order("stats.event_count DESC") - .limit(50) + if network_stats + # DuckDB path: array format [network_range_id, event_count, unique_ips] + network_ids = network_stats.map { |row| row[0] } + stats_by_id = network_stats.to_h { |row| [row[0], { event_count: row[1], unique_ips: row[2] }] } + + @top_networks = NetworkRange.where(id: network_ids) + .to_a + .map do |network| + stats = stats_by_id[network.id] + network.define_singleton_method(:event_count) { stats[:event_count] } + network.define_singleton_method(:unique_ips) { stats[:unique_ips] } + + # Add inherited intelligence support + intelligence = network.inherited_intelligence + if intelligence[:inherited] + network.define_singleton_method(:display_company) { intelligence[:company] } + network.define_singleton_method(:display_country) { intelligence[:country] } + network.define_singleton_method(:inherited_from) { intelligence[:parent_cidr] } + network.define_singleton_method(:has_inherited_data?) { true } + else + network.define_singleton_method(:display_company) { network.company } + network.define_singleton_method(:display_country) { network.country } + network.define_singleton_method(:inherited_from) { nil } + network.define_singleton_method(:has_inherited_data?) { false } + end + + network + end + .sort_by { |n| -n.event_count } + else + # PostgreSQL fallback + event_stats = Event.where("timestamp >= ?", @start_time) + .where.not(network_range_id: nil) + .group(:network_range_id) + .select("network_range_id, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips") + + @top_networks = NetworkRange.joins("INNER JOIN (#{event_stats.to_sql}) stats ON stats.network_range_id = network_ranges.id") + .select("network_ranges.*, stats.event_count, stats.unique_ips") + .order("stats.event_count DESC") + .limit(50) + + # Add inherited intelligence support for PostgreSQL fallback + @top_networks = @top_networks.to_a.map do |network| + intelligence = network.inherited_intelligence + if intelligence[:inherited] + network.define_singleton_method(:display_company) { intelligence[:company] } + network.define_singleton_method(:display_country) { intelligence[:country] } + network.define_singleton_method(:inherited_from) { intelligence[:parent_cidr] } + network.define_singleton_method(:has_inherited_data?) 
{ true } + else + network.define_singleton_method(:display_company) { network.company } + network.define_singleton_method(:display_country) { network.country } + network.define_singleton_method(:inherited_from) { nil } + network.define_singleton_method(:has_inherited_data?) { false } + end + network + end + end # Network type breakdown with traffic stats @network_breakdown = calculate_network_type_stats(@start_time) - # Company breakdown for top traffic sources (using denormalized company column) - @top_companies = Event.where("timestamp >= ? AND company IS NOT NULL", @start_time) + # Company breakdown for top traffic sources - use DuckDB if available + @top_companies = with_duckdb_fallback { EventDdb.top_companies(@start_time, 20) } || + Event.where("timestamp >= ? AND company IS NOT NULL", @start_time) .group(:company) .select("company, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips, COUNT(DISTINCT network_range_id) as network_count") .order("event_count DESC") .limit(20) - # ASN breakdown (using denormalized asn columns) - @top_asns = Event.where("timestamp >= ? AND asn IS NOT NULL", @start_time) + # ASN breakdown - use DuckDB if available + @top_asns = with_duckdb_fallback { EventDdb.top_asns(@start_time, 15) } || + Event.where("timestamp >= ? AND asn IS NOT NULL", @start_time) .group(:asn, :asn_org) .select("asn, asn_org, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips, COUNT(DISTINCT network_range_id) as network_count") .order("event_count DESC") .limit(15) - # Geographic breakdown (using denormalized country column) - @top_countries = Event.where("timestamp >= ? AND country IS NOT NULL", @start_time) + # Geographic breakdown - use DuckDB if available + @top_countries = with_duckdb_fallback { EventDdb.top_countries_with_stats(@start_time, 15) } || + Event.where("timestamp >= ? AND country IS NOT NULL", @start_time) .group(:country) .select("country, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips") .order("event_count DESC") @@ -191,12 +246,15 @@ class AnalyticsController < ApplicationController # Historical hours are cached for full TTL, current hour cached briefly for freshness # Cache historical hours (1-23 hours ago) - these are complete and won't change - # No expiration - will stick around until evicted by cache store + # No expiration - will stick around until evicted by cache store (uses DuckDB if available) historical_timeline = Rails.cache.fetch("#{cache_key_base}/chart_historical") do historical_start = 23.hours.ago.beginning_of_hour - events_by_hour = Event.where("timestamp >= ? AND timestamp < ?", historical_start, Time.current.beginning_of_hour) - .group("DATE_TRUNC('hour', timestamp)") - .count + current_hour_start = Time.current.beginning_of_hour + + events_by_hour = with_duckdb_fallback { EventDdb.hourly_timeline(historical_start, current_hour_start) } || + Event.where("timestamp >= ? 
AND timestamp < ?", historical_start, current_hour_start) + .group("DATE_TRUNC('hour', timestamp)") + .count (1..23).map do |hour_ago| hour_time = hour_ago.hours.ago.beginning_of_hour @@ -209,6 +267,7 @@ class AnalyticsController < ApplicationController end # Current hour (0 hours ago) - cache very briefly since it's actively accumulating + # ALWAYS use PostgreSQL for current hour to get real-time data (DuckDB syncs every minute) current_hour_data = Rails.cache.fetch("#{cache_key_base}/chart_current_hour", expires_in: 1.minute) do hour_time = Time.current.beginning_of_hour count = Event.where("timestamp >= ?", hour_time).count @@ -290,6 +349,12 @@ class AnalyticsController < ApplicationController end def calculate_network_type_stats(start_time) + # Try DuckDB first, fallback to PostgreSQL + duckdb_stats = with_duckdb_fallback { EventDdb.network_type_stats(start_time) } + + return duckdb_stats if duckdb_stats + + # PostgreSQL fallback # Get all network types with their traffic statistics using denormalized columns network_types = [ { type: 'datacenter', label: 'Datacenter', column: :is_datacenter }, @@ -333,6 +398,12 @@ class AnalyticsController < ApplicationController end def calculate_suspicious_patterns(start_time) + # Try DuckDB first, fallback to PostgreSQL + duckdb_patterns = with_duckdb_fallback { EventDdb.suspicious_patterns(start_time) } + + return duckdb_patterns if duckdb_patterns + + # PostgreSQL fallback patterns = {} # High volume networks (top 1% by request count) - using denormalized network_range_id @@ -358,9 +429,9 @@ class AnalyticsController < ApplicationController high_deny_networks = Event.where("timestamp >= ? AND network_range_id IS NOT NULL", start_time) .group(:network_range_id) .select("network_range_id, - COUNT(CASE WHEN waf_action = 1 THEN 1 END) as denied_count, + COUNT(CASE WHEN waf_action = 0 THEN 1 END) as denied_count, COUNT(*) as total_count") - .having("COUNT(CASE WHEN waf_action = 1 THEN 1 END)::float / COUNT(*) > 0.5") + .having("COUNT(CASE WHEN waf_action = 0 THEN 1 END)::float / COUNT(*) > 0.5") .having("COUNT(*) >= 10") # minimum threshold patterns[:high_deny_rate] = { @@ -392,12 +463,14 @@ class AnalyticsController < ApplicationController { id: network.id, cidr: network.cidr, - company: network.company, + company: network.display_company, asn: network.asn, - country: network.country, + country: network.display_country, network_type: network.network_type, event_count: network.event_count, - unique_ips: network.unique_ips + unique_ips: network.unique_ips, + has_inherited_data: network.has_inherited_data?, + inherited_from: network.inherited_from } }, network_breakdown: @network_breakdown, @@ -449,4 +522,27 @@ class AnalyticsController < ApplicationController } end end + + # Helper method to try DuckDB first, fall back to PostgreSQL + def with_duckdb_fallback(&block) + result = yield + result.nil? ? nil : result # Return result or nil to trigger fallback + rescue StandardError => e + Rails.logger.warn "[Analytics] DuckDB query failed, falling back to PostgreSQL: #{e.message}" + nil # Return nil to trigger fallback + end + + # Check if DuckDB has recent data (within last 2 minutes) + # Returns true if DuckDB is up-to-date, false if potentially stale + def duckdb_is_fresh? + newest = AnalyticsDuckdbService.instance.newest_event_timestamp + return false if newest.nil? 
+ + # Consider fresh if newest event is within 2 minutes + # (sync job runs every 1 minute, so 2 minutes allows for some lag) + newest >= 2.minutes.ago + rescue StandardError => e + Rails.logger.warn "[Analytics] Error checking DuckDB freshness: #{e.message}" + false + end end \ No newline at end of file diff --git a/app/controllers/bot_network_ranges_controller.rb b/app/controllers/bot_network_ranges_controller.rb new file mode 100644 index 0000000..d248eea --- /dev/null +++ b/app/controllers/bot_network_ranges_controller.rb @@ -0,0 +1,126 @@ +# frozen_string_literal: true + +class BotNetworkRangesController < ApplicationController + before_action :authenticate_user! + before_action :require_admin + + def index + @bot_sources = BotNetworkRangeImporter::BOT_SOURCES + @recent_imports = DataImport.where(import_type: 'bot_network_ranges').order(created_at: :desc).limit(10) + @bot_network_ranges = NetworkRange.where("source LIKE 'bot_import_%'").order(created_at: :desc).limit(50) + end + + def import + source_key = params[:source] + options = import_options + + if source_key.present? + # Perform import synchronously for immediate feedback + begin + result = BotNetworkRangeImporter.import_from_source(source_key, options) + + # Create a data import record + DataImport.create!( + import_type: 'bot_network_ranges', + source: source_key.to_s, + status: 'completed', + records_processed: result[:imported], + notes: "Imported from #{result[:source]}: #{result[:note] || 'Success'}" + ) + + flash[:notice] = "Successfully imported #{result[:imported]} ranges from #{result[:source]}" + rescue => e + flash[:alert] = "Failed to import from #{source_key}: #{e.message}" + end + else + flash[:alert] = "Please select a source to import from" + end + + redirect_to bot_network_ranges_path + end + + def import_async + source_key = params[:source] + options = import_options + + if source_key.present? + # Create a data import record for tracking + data_import = DataImport.create!( + import_type: 'bot_network_ranges', + source: source_key.to_s, + status: 'pending', + records_processed: 0, + notes: "Import job queued for #{source_key}" + ) + + # Queue the background job + ImportBotNetworkRangesJob.perform_later(source_key, options.merge(data_import_id: data_import.id)) + + flash[:notice] = "Import job queued for #{source_key}. You'll be notified when it's complete." + else + flash[:alert] = "Please select a source to import from" + end + + redirect_to bot_network_ranges_path + end + + def import_all + options = import_options + + # Create a data import record for batch import + data_import = DataImport.create!( + import_type: 'bot_network_ranges', + source: 'all_sources', + status: 'pending', + records_processed: 0, + notes: "Batch import job queued for all available sources" + ) + + # Queue the batch import job + ImportAllBotNetworkRangesJob.perform_later(options.merge(data_import_id: data_import.id)) + + flash[:notice] = "Batch import job queued for all sources. This may take several minutes." 
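+    # ImportAllBotNetworkRangesJob broadcasts a "batch_summary" payload on the
+    # "bot_imports" ActionCable channel once BotNetworkRangeImporter finishes.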
+    redirect_to bot_network_ranges_path
+  end
+
+  def show
+    # Parameterize the LIKE pattern; interpolating params[:source] directly
+    # into the SQL string would be an injection vector.
+    source_pattern = "bot_import_#{NetworkRange.sanitize_sql_like(params[:source].to_s)}%"
+    @network_ranges = NetworkRange.where("source LIKE ?", source_pattern)
+                                  .order(created_at: :desc)
+                                  .page(params[:page])
+                                  .per(50)
+
+    @source_name = BotNetworkRangeImporter::BOT_SOURCES[params[:source].to_sym]&.dig(:name) || params[:source]
+    @import_stats = NetworkRange.where("source LIKE ?", source_pattern)
+                                .group(:source)
+                                .count
+  end
+
+  def destroy
+    source = params[:source]
+    source_pattern = "bot_import_#{NetworkRange.sanitize_sql_like(source.to_s)}%"
+    deleted_count = NetworkRange.where("source LIKE ?", source_pattern).delete_all
+
+    flash[:notice] = "Deleted #{deleted_count} network ranges from #{source}"
+    redirect_to bot_network_ranges_path
+  end
+
+  private
+
+  def require_admin
+    redirect_to root_path, alert: 'Admin access required' unless current_user&.admin?
+  end
+
+  def import_options
+    options = {}
+
+    # AWS-specific options
+    if params[:aws_services].present?
+      options[:aws_services] = params[:aws_services].split(',').map(&:strip)
+    end
+
+    # Batch size control
+    options[:batch_size] = params[:batch_size].to_i if params[:batch_size].present?
+    options[:batch_size] = 1000 if options[:batch_size].zero?
+
+    options
+  end
+end
\ No newline at end of file
diff --git a/app/controllers/network_ranges_controller.rb b/app/controllers/network_ranges_controller.rb
index 552e839..5dc13a1 100644
--- a/app/controllers/network_ranges_controller.rb
+++ b/app/controllers/network_ranges_controller.rb
@@ -46,8 +46,10 @@ class NetworkRangesController < ApplicationController
     authorize @network_range
 
     if @network_range.persisted?
-      # Real network - use direct IP containment for consistency with stats
-      events_scope = Event.where("ip_address <<= ?", @network_range.cidr).recent
+      # Real network - use indexed network_range_id for much better performance
+      # Include child network ranges to capture all traffic within this network block
+      network_ids = [@network_range.id] + @network_range.child_ranges.pluck(:id)
+      events_scope = Event.where(network_range_id: network_ids).recent
     else
       # Virtual network - find events by IP range containment
       events_scope = Event.where("ip_address <<= ?::inet", @network_range.to_s).recent
@@ -58,22 +60,24 @@
     @child_ranges = @network_range.child_ranges.limit(20)
     @parent_ranges = @network_range.parent_ranges.limit(10)
 
-    @associated_rules = @network_range.persisted? ? @network_range.rules.includes(:user).order(created_at: :desc) : []
+    @associated_rules = @network_range.persisted? ? @network_range.rules.includes(:user, :network_range, :waf_policy).order(created_at: :desc) : []
 
     # Load rules from supernets and subnets
-    @supernet_rules = @network_range.persisted? ? @network_range.supernet_rules.includes(:network_range, :user).limit(10) : []
-    @subnet_rules = @network_range.persisted? ? @network_range.child_rules.includes(:network_range, :user).limit(20) : []
+    @supernet_rules = @network_range.persisted? ? @network_range.supernet_rules.includes(:network_range, :user, :waf_policy).limit(10) : []
+    @subnet_rules = @network_range.persisted? ?
@network_range.child_rules.includes(:network_range, :user, :waf_policy).limit(20) : [] # Traffic analytics (if we have events) @traffic_stats = calculate_traffic_stats(@network_range) - # Check if we have IPAPI data (or if parent has it) + # Check if we have IPAPI data (or if parent has it) - cache expensive parent lookup @has_ipapi_data = @network_range.has_network_data_from?(:ipapi) @parent_with_ipapi = nil unless @has_ipapi_data - # Check if parent has IPAPI data - parent = @network_range.parent_with_intelligence + # Cache expensive parent intelligence lookup + parent = Rails.cache.fetch("network_parent_intel:#{@network_range.cache_key}", expires_in: 1.hour) do + @network_range.parent_with_intelligence + end if parent&.has_network_data_from?(:ipapi) @parent_with_ipapi = parent @has_ipapi_data = true @@ -194,6 +198,15 @@ class NetworkRangesController < ApplicationController private + # Helper method to try DuckDB first, fall back to PostgreSQL + def with_duckdb_fallback(&block) + result = yield + result.nil? ? nil : result # Return result or nil to trigger fallback + rescue StandardError => e + Rails.logger.warn "[NetworkRanges] DuckDB query failed, falling back to PostgreSQL: #{e.message}" + nil # Return nil to trigger fallback + end + def set_network_range # Handle CIDR slugs (e.g., "40.77.167.100_32" -> "40.77.167.100/32") cidr = params[:id].gsub('_', '/') @@ -248,27 +261,37 @@ class NetworkRangesController < ApplicationController # Use indexed network_range_id for much better performance instead of expensive CIDR operator # Include child network ranges to capture all traffic within this network block network_ids = [network_range.id] + network_range.child_ranges.pluck(:id) - base_query = Event.where(network_range_id: network_ids) - # Use separate queries: one for grouping (without ordering), one for recent activity (with ordering) - events_for_grouping = base_query.limit(1000) - events_for_activity = base_query.recent.limit(20) + # Try DuckDB first for stats (much faster) + duckdb_stats = with_duckdb_fallback { EventDdb.network_traffic_stats(network_ids) } + duckdb_top_paths = with_duckdb_fallback { EventDdb.network_top_paths(network_ids, 10) } + duckdb_top_agents = with_duckdb_fallback { EventDdb.network_top_user_agents(network_ids, 5) } - # Calculate counts properly - use consistent base_query for all counts - total_requests = base_query.count - unique_ips = base_query.except(:order).distinct.count(:ip_address) - blocked_requests = base_query.blocked.count - allowed_requests = base_query.allowed.count + if duckdb_stats + # DuckDB success - use fast aggregated stats + stats = duckdb_stats.merge( + top_paths: duckdb_top_paths&.to_h || {}, + top_user_agents: duckdb_top_agents&.to_h || {}, + recent_activity: Event.where(network_range_id: network_ids).recent.limit(20) + ) + else + # PostgreSQL fallback + base_query = Event.where(network_range_id: network_ids) + events_for_grouping = base_query.limit(1000) + events_for_activity = base_query.recent.limit(20) - { - total_requests: total_requests, - unique_ips: unique_ips, - blocked_requests: blocked_requests, - allowed_requests: allowed_requests, - top_paths: events_for_grouping.group(:request_path).count.sort_by { |_, count| -count }.first(10), - top_user_agents: events_for_grouping.group(:user_agent).count.sort_by { |_, count| -count }.first(5), - recent_activity: events_for_activity - } + stats = { + total_requests: base_query.count, + unique_ips: base_query.except(:order).distinct.count(:ip_address), + blocked_requests: 
base_query.blocked.count, + allowed_requests: base_query.allowed.count, + top_paths: events_for_grouping.group(:request_path).count.sort_by { |_, count| -count }.first(10).to_h, + top_user_agents: events_for_grouping.group(:user_agent).count.sort_by { |_, count| -count }.first(5).to_h, + recent_activity: events_for_activity + } + end + + stats else # No events - return empty stats { @@ -296,8 +319,8 @@ class NetworkRangesController < ApplicationController unique_ips: base_query.except(:order).distinct.count(:ip_address), blocked_requests: base_query.blocked.count, allowed_requests: base_query.allowed.count, - top_paths: events_for_grouping.group(:request_path).count.sort_by { |_, count| -count }.first(10), - top_user_agents: events_for_grouping.group(:user_agent).count.sort_by { |_, count| -count }.first(5), + top_paths: events_for_grouping.group(:request_path).count.sort_by { |_, count| -count }.first(10).to_h, + top_user_agents: events_for_grouping.group(:user_agent).count.sort_by { |_, count| -count }.first(5).to_h, recent_activity: events_for_activity } else diff --git a/app/controllers/rules_controller.rb b/app/controllers/rules_controller.rb index 4014b45..277fa21 100644 --- a/app/controllers/rules_controller.rb +++ b/app/controllers/rules_controller.rb @@ -46,12 +46,9 @@ class RulesController < ApplicationController process_quick_create_parameters # Handle network range creation if CIDR is provided - if params[:cidr].present? && @rule.network_rule? - network_range = NetworkRange.find_or_create_by(cidr: params[:cidr]) do |range| - range.user = Current.user - range.source = 'manual' - range.creation_reason = "Created for rule ##{@rule.id}" - end + cidr_param = params[:new_cidr].presence || params[:cidr].presence + if cidr_param.present? && @rule.network_rule? + network_range = NetworkRange.find_or_create_by_cidr(cidr_param, user: Current.user, source: 'manual') @rule.network_range = network_range end @@ -132,7 +129,9 @@ class RulesController < ApplicationController :expires_at, :enabled, :source, - :network_range_id + :network_range_id, + :header_name, + :header_value ] # Only include conditions for non-network rules @@ -250,15 +249,24 @@ def process_quick_create_parameters }) end - # Parse metadata if it's a string that looks like JSON - if @rule.metadata.is_a?(String) && @rule.metadata.starts_with?('{') + # Parse metadata textarea first if it's JSON + if @rule.metadata.is_a?(String) && @rule.metadata.present? && @rule.metadata.starts_with?('{') begin @rule.metadata = JSON.parse(@rule.metadata) rescue JSON::ParserError - # Keep as string if not valid JSON + # Keep as string if not valid JSON - will be caught by validation end end + # Ensure metadata is a hash + @rule.metadata = {} unless @rule.metadata.is_a?(Hash) + + # Handle add_header fields - use provided params or existing metadata values + if @rule.add_header_action? && (params[:header_name].present? || params[:header_value].present?) + @rule.metadata['header_name'] = params[:header_name].presence || @rule.metadata['header_name'] || 'X-Bot-Agent' + @rule.metadata['header_value'] = params[:header_value].presence || @rule.metadata['header_value'] || 'Unknown' + end + # Handle expires_at parsing for text input if params.dig(:rule, :expires_at).present? 
expires_at_str = params[:rule][:expires_at].strip diff --git a/app/javascript/controllers/rule_form_controller.js b/app/javascript/controllers/rule_form_controller.js new file mode 100644 index 0000000..0f4e7be --- /dev/null +++ b/app/javascript/controllers/rule_form_controller.js @@ -0,0 +1,36 @@ +import { Controller } from "@hotwired/stimulus" + +export default class RuleFormController extends Controller { + static targets = ["actionSelect", "addHeaderSection", "expirationCheckbox", "expirationField"] + + connect() { + this.updateActionSections() + } + + updateActionSections() { + const selectedAction = this.actionSelectTarget.value + + // Hide all action-specific sections + this.addHeaderSectionTarget.classList.add('hidden') + + // Show relevant section based on action + switch(selectedAction) { + case 'add_header': + this.addHeaderSectionTarget.classList.remove('hidden') + break + } + } + + toggleExpiration() { + if (this.expirationCheckboxTarget.checked) { + this.expirationFieldTarget.classList.remove('hidden') + } else { + this.expirationFieldTarget.classList.add('hidden') + // Clear the datetime field when unchecked + const datetimeInput = this.expirationFieldTarget.querySelector('input[type="datetime-local"]') + if (datetimeInput) { + datetimeInput.value = '' + } + } + } +} diff --git a/app/jobs/cleanup_old_events_job.rb b/app/jobs/cleanup_old_events_job.rb new file mode 100644 index 0000000..51f0362 --- /dev/null +++ b/app/jobs/cleanup_old_events_job.rb @@ -0,0 +1,52 @@ +# frozen_string_literal: true + +# CleanupOldEventsJob - Removes events older than the configured retention period +# +# This job runs periodically (hourly) to clean up old events based on the +# event_retention_days setting. This helps keep the database size manageable +# and improves query performance. +# +# The retention period is configurable via the 'event_retention_days' setting +# (default: 90 days). This allows administrators to balance between historical +# data retention and database performance. +# +# Schedule: Every hour (configured in config/recurring.yml) +class CleanupOldEventsJob < ApplicationJob + queue_as :background + + def perform + retention_days = Setting.event_retention_days + + # Don't delete if retention is set to 0 or negative (disabled) + if retention_days <= 0 + Rails.logger.info "CleanupOldEventsJob: Event retention disabled (retention_days: #{retention_days})" + return 0 + end + + cutoff_date = retention_days.days.ago + + # Count events to be deleted + old_events = Event.where('timestamp < ?', cutoff_date) + count = old_events.count + + if count.zero? 
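+      # Early return keeps perform's return value an Integer (0 deleted),
+      # matching the deleted_count returned on the normal path below.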
+ Rails.logger.info "CleanupOldEventsJob: No events older than #{retention_days} days found" + return 0 + end + + Rails.logger.info "CleanupOldEventsJob: Deleting #{count} events older than #{retention_days} days (before #{cutoff_date})" + + # Delete in batches to avoid long-running transactions + deleted_count = 0 + batch_size = 10_000 + + old_events.in_batches(of: batch_size) do |batch| + batch_count = batch.delete_all + deleted_count += batch_count + Rails.logger.info "CleanupOldEventsJob: Deleted batch of #{batch_count} events (total: #{deleted_count}/#{count})" + end + + Rails.logger.info "CleanupOldEventsJob: Successfully deleted #{deleted_count} events" + deleted_count + end +end diff --git a/app/jobs/fetch_ipapi_data_job.rb b/app/jobs/fetch_ipapi_data_job.rb index 8adeeef..477c4e8 100644 --- a/app/jobs/fetch_ipapi_data_job.rb +++ b/app/jobs/fetch_ipapi_data_job.rb @@ -15,31 +15,14 @@ class FetchIpapiDataJob < ApplicationJob ipapi_data = Ipapi.lookup(sample_ip) if ipapi_data.present? && !ipapi_data.key?('error') - # Check if IPAPI returned a different route than our tracking network - ipapi_route = ipapi_data.dig('asn', 'route') - target_network = tracking_network + # Process IPAPI data and create network ranges + result = Ipapi.process_ipapi_data(ipapi_data, tracking_network) - if ipapi_route.present? && ipapi_route != tracking_network.cidr - # IPAPI returned a different CIDR - find or create that network range - Rails.logger.info "IPAPI returned different route: #{ipapi_route} (requested: #{tracking_network.cidr})" + # Mark the tracking network as having been queried + # Use the broadest CIDR returned for deduplication + tracking_network.mark_ipapi_queried!(result[:broadest_cidr]) - target_network = NetworkRange.find_or_create_by(network: ipapi_route) do |nr| - nr.source = 'api_imported' - nr.creation_reason = "Created from IPAPI lookup for #{tracking_network.cidr}" - end - - Rails.logger.info "Storing IPAPI data on correct network: #{target_network.cidr}" - end - - # Store data on the target network (wherever IPAPI said it belongs) - target_network.set_network_data(:ipapi, ipapi_data) - target_network.last_api_fetch = Time.current - target_network.save! - - # Mark the tracking network as having been queried, with the CIDR that was returned - tracking_network.mark_ipapi_queried!(target_network.cidr) - - Rails.logger.info "Successfully fetched IPAPI data for #{tracking_network.cidr} (stored on #{target_network.cidr})" + Rails.logger.info "Successfully fetched IPAPI data for #{tracking_network.cidr} (created #{result[:networks].length} networks)" # Broadcast to the tracking network broadcast_ipapi_update(tracking_network, ipapi_data) diff --git a/app/jobs/import_all_bot_network_ranges_job.rb b/app/jobs/import_all_bot_network_ranges_job.rb new file mode 100644 index 0000000..3e84425 --- /dev/null +++ b/app/jobs/import_all_bot_network_ranges_job.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +# ImportAllBotNetworkRangesJob - Background job for importing from all bot sources +class ImportAllBotNetworkRangesJob < ApplicationJob + queue_as :default + + def perform(options = {}) + Rails.logger.info "Starting batch import of all bot network ranges" + + results = BotNetworkRangeImporter.import_all_sources(options) + + # Send completion summary + Rails.logger.info "Batch import completed. 
Summary: #{results}" + + # Broadcast summary to clients + ActionCable.server.broadcast( + "bot_imports", + { + type: 'batch_summary', + status: 'completed', + results: results, + message: "Batch import completed for all sources" + } + ) + end +end \ No newline at end of file diff --git a/app/jobs/import_bot_network_ranges_job.rb b/app/jobs/import_bot_network_ranges_job.rb new file mode 100644 index 0000000..217051e --- /dev/null +++ b/app/jobs/import_bot_network_ranges_job.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true + +# ImportBotNetworkRangesJob - Background job for importing bot network ranges +# +# Imports network ranges from official bot provider sources. +# Runs asynchronously to avoid blocking the web interface. +class ImportBotNetworkRangesJob < ApplicationJob + queue_as :default + + def perform(source_key, options = {}) + Rails.logger.info "Starting bot network range import for source: #{source_key}" + + begin + result = BotNetworkRangeImporter.import_from_source(source_key, options) + + # Send notification or log completion + Rails.logger.info "Successfully imported #{result[:imported]} ranges from #{result[:source]}" + + # Optionally broadcast via Turbo Streams for real-time updates + ActionCable.server.broadcast( + "bot_imports", + { + source: source_key, + status: 'completed', + imported: result[:imported], + message: "Successfully imported #{result[:imported]} ranges from #{result[:source]}" + } + ) + + rescue => e + Rails.logger.error "Bot network range import failed for #{source_key}: #{e.message}" + + # Broadcast error notification + ActionCable.server.broadcast( + "bot_imports", + { + source: source_key, + status: 'error', + error: e.message, + message: "Failed to import from #{source_key}: #{e.message}" + } + ) + + raise e + end + end +end \ No newline at end of file diff --git a/app/jobs/process_waf_event_job.rb b/app/jobs/process_waf_event_job.rb index dac56a3..d502449 100644 --- a/app/jobs/process_waf_event_job.rb +++ b/app/jobs/process_waf_event_job.rb @@ -53,16 +53,15 @@ class ProcessWafEventJob < ApplicationJob # Queue IPAPI enrichment based on /24 tracking # The tracking network is the /24 that stores ipapi_queried_at if NetworkRange.should_fetch_ipapi_for_ip?(event.ip_address) - # Use tracking network for fetch status to avoid race conditions - if tracking_network.is_fetching_api_data?(:ipapi) - Rails.logger.info "Skipping IPAPI fetch for #{tracking_network.cidr} - already being fetched" - else - tracking_network.mark_as_fetching_api_data!(:ipapi) + # Atomically mark as fetching - this prevents duplicate jobs via database lock + if tracking_network.mark_as_fetching_api_data!(:ipapi) Rails.logger.info "Queueing IPAPI fetch for IP #{event.ip_address} (tracking network: #{tracking_network.cidr})" FetchIpapiDataJob.perform_later(network_range_id: tracking_network.id) + else + Rails.logger.info "Skipping IPAPI fetch for #{tracking_network.cidr} - another job already started" end else - Rails.logger.debug "Skipping IPAPI fetch for IP #{event.ip_address} - already queried recently" + Rails.logger.debug "Skipping IPAPI fetch for IP #{event.ip_address} - already queried or being fetched" end # Evaluate WAF policies inline if needed (lazy evaluation) diff --git a/app/jobs/sync_events_to_duckdb_job.rb b/app/jobs/sync_events_to_duckdb_job.rb new file mode 100644 index 0000000..5741a18 --- /dev/null +++ b/app/jobs/sync_events_to_duckdb_job.rb @@ -0,0 +1,89 @@ +# frozen_string_literal: true + +# Background job to sync events from PostgreSQL to DuckDB +# Runs every 5 
minutes to keep analytics database up-to-date +# Uses watermark tracking to only sync new events +class SyncEventsToDuckdbJob < ApplicationJob + queue_as :default + + # Key for storing last sync timestamp in Rails cache + WATERMARK_CACHE_KEY = "duckdb_last_sync_time" + WATERMARK_TTL = 1.week + + # Overlap window to catch late-arriving events + SYNC_OVERLAP = 1.minute + + def perform + service = AnalyticsDuckdbService.instance + + # Determine where to start syncing + from_timestamp = determine_sync_start_time(service) + + Rails.logger.info "[DuckDB Sync] Starting sync from #{from_timestamp}" + + # Sync new events using PostgreSQL cursor + DuckDB Appender + # (setup_schema is called internally within sync_new_events) + count = service.sync_new_events(from_timestamp) + + # Update watermark if we synced any events + if count > 0 + update_last_sync_time + Rails.logger.info "[DuckDB Sync] Successfully synced #{count} events" + else + Rails.logger.info "[DuckDB Sync] No new events to sync" + end + rescue StandardError => e + Rails.logger.error "[DuckDB Sync] Job failed: #{e.message}" + Rails.logger.error e.backtrace.join("\n") + raise # Re-raise to mark job as failed in Solid Queue + end + + private + + # Determine timestamp to start syncing from + # Strategy: + # 1. First run (DuckDB empty): sync from oldest PostgreSQL event + # 2. Subsequent runs: sync from last watermark with overlap + def determine_sync_start_time(service) + oldest_duckdb = service.oldest_event_timestamp + + if oldest_duckdb.nil? + # DuckDB is empty - this is the first sync + # Start from oldest PostgreSQL event (or reasonable cutoff) + oldest_pg = Event.minimum(:timestamp) + + if oldest_pg.nil? + # No events in PostgreSQL at all + Rails.logger.warn "[DuckDB Sync] No events found in PostgreSQL" + 1.day.ago # Default to recent window + else + Rails.logger.info "[DuckDB Sync] First sync - starting from oldest event: #{oldest_pg}" + oldest_pg + end + else + # DuckDB has data - sync from last watermark with overlap + last_sync = Rails.cache.read(WATERMARK_CACHE_KEY) + + if last_sync.nil? + # Watermark not in cache (maybe cache expired or restarted) + # Fall back to newest event in DuckDB + newest_duckdb = service.newest_event_timestamp + start_time = newest_duckdb ? 
newest_duckdb - SYNC_OVERLAP : oldest_duckdb
+        Rails.logger.info "[DuckDB Sync] Watermark not found, using newest DuckDB event: #{start_time}"
+        start_time
+      else
+        # Normal case: use watermark with overlap to catch late arrivals
+        start_time = last_sync - SYNC_OVERLAP
+        Rails.logger.debug "[DuckDB Sync] Using watermark: #{last_sync} (with #{SYNC_OVERLAP}s overlap)"
+        start_time
+      end
+    end
+  end
+
+  # Update last sync watermark in cache
+  def update_last_sync_time
+    now = Time.current
+    Rails.cache.write(WATERMARK_CACHE_KEY, now, expires_in: WATERMARK_TTL)
+    Rails.logger.debug "[DuckDB Sync] Updated watermark to #{now}"
+  end
+end
diff --git a/app/models/event.rb b/app/models/event.rb
index c68e338..a374900 100644
--- a/app/models/event.rb
+++ b/app/models/event.rb
@@ -10,6 +10,17 @@ class Event < ApplicationRecord
 
   # Enums for fixed value sets
   # Canonical WAF action order - aligned with Rule and Agent models
+  #
+  # IMPORTANT: These values were swapped to match baffle-agent convention:
+  # - deny: 0 (blocked traffic)
+  # - allow: 1 (allowed traffic)
+  #
+  # When using raw integer values in queries:
+  # - waf_action = 0 -> denied/blocked requests
+  # - waf_action = 1 -> allowed requests
+  # - waf_action = 2 -> redirect requests
+  # - waf_action = 3 -> challenge requests
+  # - waf_action = 4 -> log-only requests
   enum :waf_action, {
     deny: 0,     # deny/block
     allow: 1,    # allow/pass
@@ -341,11 +352,11 @@ class Event < ApplicationRecord
   end
 
   def blocked?
-    waf_action.in?(['block', 'deny'])
+    waf_action == 'deny' # deny = 0
   end
 
   def allowed?
-    waf_action.in?(['allow', 'pass'])
+    waf_action == 'allow' # allow = 1
   end
 
   def logged?
diff --git a/app/models/event_ddb.rb b/app/models/event_ddb.rb
new file mode 100644
index 0000000..564f268
--- /dev/null
+++ b/app/models/event_ddb.rb
@@ -0,0 +1,499 @@
+# frozen_string_literal: true
+
+require 'ostruct'
+
+# EventDdb - DuckDB-backed analytics queries for events
+# Provides an ActiveRecord-like interface for querying DuckDB events table
+# Falls back to PostgreSQL Event model if DuckDB is unavailable
+class EventDdb
+  class << self
+    # Get DuckDB service
+    def service
+      AnalyticsDuckdbService.instance
+    end
+
+    # Total events since timestamp
+    def count_since(start_time)
+      service.with_connection do |conn|
+        result = conn.query("SELECT COUNT(*) as count FROM events WHERE timestamp >= ?", start_time)
+        result.first&.first || 0
+      end
+    rescue StandardError => e
+      Rails.logger.error "[EventDdb] Error in count_since: #{e.message}"
+      nil # Fallback to PostgreSQL
+    end
+
+    # Event breakdown by WAF action
+    def breakdown_by_action(start_time)
+      service.with_connection do |conn|
+        result = conn.query(<<~SQL, start_time)
+          SELECT waf_action, COUNT(*) as count
+          FROM events
+          WHERE timestamp >= ?
+          GROUP BY waf_action
+        SQL
+
+        # ruby-duckdb yields rows as positional arrays. Map the integer
+        # waf_action back to its enum name so the keys match the PostgreSQL
+        # .group(:waf_action).count fallback ("deny", "allow", etc.)
+        result.to_a.to_h { |row| [Event.waf_actions.key(row[0]) || row[0], row[1]] }
+      end
+    rescue StandardError => e
+      Rails.logger.error "[EventDdb] Error in breakdown_by_action: #{e.message}"
+      nil
+    end
+
+    # Top countries with event counts
+    def top_countries(start_time, limit = 10)
+      service.with_connection do |conn|
+        result = conn.query(<<~SQL, start_time, limit)
+          SELECT country, COUNT(*) as count
+          FROM events
+          WHERE timestamp >= ? AND country IS NOT NULL
+          GROUP BY country
+          ORDER BY count DESC
+          LIMIT ?
+        SQL
+
+        # Return array of [country, count] tuples like ActiveRecord
+        # (rows come back as positional arrays)
+        result.to_a.map { |row| [row[0], row[1]] }
+      end
+    rescue StandardError => e
+      Rails.logger.error "[EventDdb] Error in top_countries: #{e.message}"
+      nil
+    end
+
+    # Top blocked IPs
+    def top_blocked_ips(start_time, limit = 10)
+      service.with_connection do |conn|
+        result = conn.query(<<~SQL, start_time, limit)
+          SELECT ip_address, COUNT(*) as count
+          FROM events
+          WHERE timestamp >= ? AND waf_action = 0
+          GROUP BY ip_address
+          ORDER BY count DESC
+          LIMIT ?
+        SQL
+
+        result.to_a.map { |row| [row[0], row[1]] }
+      end
+    rescue StandardError => e
+      Rails.logger.error "[EventDdb] Error in top_blocked_ips: #{e.message}"
+      nil
+    end
+
+    # Hourly timeline aggregation
+    def hourly_timeline(start_time, end_time)
+      service.with_connection do |conn|
+        result = conn.query(<<~SQL, start_time, end_time)
+          SELECT
+            DATE_TRUNC('hour', timestamp) as hour,
+            COUNT(*) as count
+          FROM events
+          WHERE timestamp >= ? AND timestamp < ?
+          GROUP BY hour
+          ORDER BY hour
+        SQL
+
+        # Convert to hash with Time keys like ActiveRecord
+        result.to_a.to_h { |row| [row[0], row[1]] }
+      end
+    rescue StandardError => e
+      Rails.logger.error "[EventDdb] Error in hourly_timeline: #{e.message}"
+      nil
+    end
+
+    # Top networks by traffic volume
+    # Returns array of arrays: [network_range_id, event_count, unique_ips]
+    def top_networks(start_time, limit = 50)
+      service.with_connection do |conn|
+        result = conn.query(<<~SQL, start_time, limit)
+          SELECT
+            network_range_id,
+            COUNT(*) as event_count,
+            COUNT(DISTINCT ip_address) as unique_ips
+          FROM events
+          WHERE timestamp >= ? AND network_range_id IS NOT NULL
+          GROUP BY network_range_id
+          ORDER BY event_count DESC
+          LIMIT ?
+        SQL
+
+        result.to_a
+      end
+    rescue StandardError => e
+      Rails.logger.error "[EventDdb] Error in top_networks: #{e.message}"
+      nil
+    end
+
+    # Top companies
+    # Returns array of OpenStruct objects with: company, event_count, unique_ips, network_count
+    def top_companies(start_time, limit = 20)
+      service.with_connection do |conn|
+        result = conn.query(<<~SQL, start_time, limit)
+          SELECT
+            company,
+            COUNT(*) as event_count,
+            COUNT(DISTINCT ip_address) as unique_ips,
+            COUNT(DISTINCT network_range_id) as network_count
+          FROM events
+          WHERE timestamp >= ? AND company IS NOT NULL
+          GROUP BY company
+          ORDER BY event_count DESC
+          LIMIT ?
+        SQL
+
+        # Convert arrays to OpenStruct for attribute access
+        result.to_a.map do |row|
+          OpenStruct.new(
+            company: row[0],
+            event_count: row[1],
+            unique_ips: row[2],
+            network_count: row[3]
+          )
+        end
+      end
+    rescue StandardError => e
+      Rails.logger.error "[EventDdb] Error in top_companies: #{e.message}"
+      nil
+    end
+
+    # Top ASNs
+    # Returns array of OpenStruct objects with: asn, asn_org, event_count, unique_ips, network_count
+    def top_asns(start_time, limit = 15)
+      service.with_connection do |conn|
+        result = conn.query(<<~SQL, start_time, limit)
+          SELECT
+            asn,
+            asn_org,
+            COUNT(*) as event_count,
+            COUNT(DISTINCT ip_address) as unique_ips,
+            COUNT(DISTINCT network_range_id) as network_count
+          FROM events
+          WHERE timestamp >= ? AND asn IS NOT NULL
+          GROUP BY asn, asn_org
+          ORDER BY event_count DESC
+          LIMIT ?
+ SQL + + # Convert arrays to OpenStruct for attribute access + result.to_a.map do |row| + OpenStruct.new( + asn: row[0], + asn_org: row[1], + event_count: row[2], + unique_ips: row[3], + network_count: row[4] + ) + end + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in top_asns: #{e.message}" + nil + end + + # Network type breakdown (datacenter, VPN, proxy, standard) + # Returns hash with network_type as key and hash of stats as value + def network_type_breakdown(start_time) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time) + SELECT + CASE + WHEN is_datacenter THEN 'datacenter' + WHEN is_vpn THEN 'vpn' + WHEN is_proxy THEN 'proxy' + ELSE 'standard' + END as network_type, + COUNT(*) as event_count, + COUNT(DISTINCT ip_address) as unique_ips, + COUNT(DISTINCT network_range_id) as network_count + FROM events + WHERE timestamp >= ? + GROUP BY network_type + SQL + + # Convert arrays to hash: network_type => { event_count, unique_ips, network_count } + result.to_a.to_h do |row| + [ + row[0], # network_type + { + "event_count" => row[1], + "unique_ips" => row[2], + "network_count" => row[3] + } + ] + end + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in network_type_breakdown: #{e.message}" + nil + end + + # Top countries with detailed stats (event count and unique IPs) + # Returns array of OpenStruct objects with: country, event_count, unique_ips + def top_countries_with_stats(start_time, limit = 15) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time, limit) + SELECT + country, + COUNT(*) as event_count, + COUNT(DISTINCT ip_address) as unique_ips + FROM events + WHERE timestamp >= ? AND country IS NOT NULL + GROUP BY country + ORDER BY event_count DESC + LIMIT ? + SQL + + # Convert arrays to OpenStruct for attribute access + result.to_a.map do |row| + OpenStruct.new( + country: row[0], + event_count: row[1], + unique_ips: row[2] + ) + end + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in top_countries_with_stats: #{e.message}" + nil + end + + # Network type stats with formatted output matching controller expectations + # Returns hash with type keys containing label, networks, events, unique_ips, percentage + def network_type_stats(start_time) + service.with_connection do |conn| + # Get total events for percentage calculation + total_result = conn.query("SELECT COUNT(*) as total FROM events WHERE timestamp >= ?", start_time) + total_events = total_result.first&.first || 0 + + # Get breakdown by network type + breakdown = network_type_breakdown(start_time) + return nil unless breakdown + + # Format results with labels and percentages + results = {} + + { + 'datacenter' => 'Datacenter', + 'vpn' => 'VPN', + 'proxy' => 'Proxy', + 'standard' => 'Standard' + }.each do |type, label| + stats = breakdown[type] + event_count = stats ? stats["event_count"] : 0 + + results[type] = { + label: label, + networks: stats ? stats["network_count"] : 0, + events: event_count, + unique_ips: stats ? stats["unique_ips"] : 0, + percentage: total_events > 0 ? ((event_count.to_f / total_events) * 100).round(1) : 0 + } + end + + results + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in network_type_stats: #{e.message}" + nil + end + + # Network range traffic statistics + # Returns comprehensive stats for a given network range ID(s) + def network_traffic_stats(network_range_ids) + network_range_ids = Array(network_range_ids) + return nil if network_range_ids.empty? 
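+      # A single aggregate scan returns all four counters; callers merge in
+      # top paths, user agents and recent activity separately (see
+      # NetworkRangesController#calculate_traffic_stats). Illustrative shape:
+      #   => { total_requests: 1200, unique_ips: 34,
+      #        blocked_requests: 87, allowed_requests: 1113 }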
+ + service.with_connection do |conn| + # Build IN clause with placeholders + placeholders = network_range_ids.map { "?" }.join(", ") + + # Get all stats in a single query + result = conn.query(<<~SQL, *network_range_ids) + SELECT + COUNT(*) as total_requests, + COUNT(DISTINCT ip_address) as unique_ips, + SUM(CASE WHEN waf_action = 0 THEN 1 ELSE 0 END) as blocked_requests, + SUM(CASE WHEN waf_action = 1 THEN 1 ELSE 0 END) as allowed_requests + FROM events + WHERE network_range_id IN (#{placeholders}) + SQL + + stats_row = result.first + return nil unless stats_row + + { + total_requests: stats_row[0] || 0, + unique_ips: stats_row[1] || 0, + blocked_requests: stats_row[2] || 0, + allowed_requests: stats_row[3] || 0 + } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in network_traffic_stats: #{e.message}" + nil + end + + # Top paths for network range(s) + def network_top_paths(network_range_ids, limit = 10) + network_range_ids = Array(network_range_ids) + return nil if network_range_ids.empty? + + service.with_connection do |conn| + # Build IN clause with placeholders + placeholders = network_range_ids.map { "?" }.join(", ") + + result = conn.query(<<~SQL, *network_range_ids, limit) + SELECT + request_path, + COUNT(*) as count + FROM events + WHERE network_range_id IN (#{placeholders}) + AND request_path IS NOT NULL + GROUP BY request_path + ORDER BY count DESC + LIMIT ? + SQL + + result.to_a.map { |row| [row[0], row[1]] } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in network_top_paths: #{e.message}" + nil + end + + # Top user agents for network range(s) + def network_top_user_agents(network_range_ids, limit = 5) + network_range_ids = Array(network_range_ids) + return nil if network_range_ids.empty? + + service.with_connection do |conn| + # Build IN clause with placeholders + placeholders = network_range_ids.map { "?" }.join(", ") + + result = conn.query(<<~SQL, *network_range_ids, limit) + SELECT + user_agent, + COUNT(*) as count + FROM events + WHERE network_range_id IN (#{placeholders}) + AND user_agent IS NOT NULL + GROUP BY user_agent + ORDER BY count DESC + LIMIT ? + SQL + + result.to_a.map { |row| [row[0], row[1]] } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in network_top_user_agents: #{e.message}" + nil + end + + # Full user agent tally for network range(s) + # Returns hash of user_agent => count for all agents in the network + def network_agent_tally(network_range_ids) + network_range_ids = Array(network_range_ids) + return nil if network_range_ids.empty? + + service.with_connection do |conn| + # Build IN clause with placeholders + placeholders = network_range_ids.map { "?" 
}.join(", ") + + result = conn.query(<<~SQL, *network_range_ids) + SELECT + user_agent, + COUNT(*) as count + FROM events + WHERE network_range_id IN (#{placeholders}) + AND user_agent IS NOT NULL + GROUP BY user_agent + SQL + + # Convert to hash matching Ruby .tally format + result.to_a.to_h { |row| [row[0], row[1]] } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in network_agent_tally: #{e.message}" + nil + end + + # Suspicious network activity patterns + # Detects high-volume networks, high deny rates, and distributed companies + def suspicious_patterns(start_time) + service.with_connection do |conn| + # High volume networks (5x average) + avg_query = conn.query(<<~SQL, start_time) + SELECT + AVG(event_count) as avg_events + FROM ( + SELECT network_range_id, COUNT(*) as event_count + FROM events + WHERE timestamp >= ? AND network_range_id IS NOT NULL + GROUP BY network_range_id + ) network_stats + SQL + + avg_events = avg_query.first&.first || 0 + threshold = avg_events * 5 + + high_volume = conn.query(<<~SQL, start_time, threshold) + SELECT + network_range_id, + COUNT(*) as event_count + FROM events + WHERE timestamp >= ? AND network_range_id IS NOT NULL + GROUP BY network_range_id + HAVING COUNT(*) > ? + ORDER BY event_count DESC + SQL + + # High deny rate networks (>50% blocked, min 10 requests) + high_deny = conn.query(<<~SQL, start_time) + SELECT + network_range_id, + SUM(CASE WHEN waf_action = 0 THEN 1 ELSE 0 END) as denied_count, + COUNT(*) as total_count + FROM events + WHERE timestamp >= ? AND network_range_id IS NOT NULL + GROUP BY network_range_id + HAVING CAST(SUM(CASE WHEN waf_action = 0 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*) > 0.5 + AND COUNT(*) >= 10 + ORDER BY denied_count DESC + SQL + + # Distributed companies (appearing with 5+ unique IPs) + distributed_companies = conn.query(<<~SQL, start_time) + SELECT + company, + COUNT(DISTINCT ip_address) as ip_count + FROM events + WHERE timestamp >= ? AND company IS NOT NULL + GROUP BY company + HAVING COUNT(DISTINCT ip_address) > 5 + ORDER BY ip_count DESC + LIMIT 10 + SQL + + { + high_volume: { + count: high_volume.to_a.length, + networks: high_volume.to_a.map { |row| row[0] } # network_range_id + }, + high_deny_rate: { + count: high_deny.to_a.length, + network_ids: high_deny.to_a.map { |row| row[0] } # network_range_id + }, + distributed_companies: distributed_companies.to_a.map { |row| + { + company: row[0], # company name + subnets: row[1] # ip_count + } + } + } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in suspicious_patterns: #{e.message}" + nil + end + end +end diff --git a/app/models/network_range.rb b/app/models/network_range.rb index 0cf96b5..503b018 100644 --- a/app/models/network_range.rb +++ b/app/models/network_range.rb @@ -158,13 +158,26 @@ class NetworkRange < ApplicationRecord end def mark_as_fetching_api_data!(source) - self.network_data ||= {} - self.network_data['fetching_status'] ||= {} - self.network_data['fetching_status'][source.to_s] = { - 'started_at' => Time.current.to_f, - 'job_id' => SecureRandom.hex(8) - } - save! + # Use database-level locking to prevent race conditions + transaction do + # Reload with lock to get fresh data + lock! 
+ + # Double-check that we're not already fetching + if is_fetching_api_data?(source) + Rails.logger.info "Another job already started fetching #{source} for #{cidr}" + return false + end + + self.network_data ||= {} + self.network_data['fetching_status'] ||= {} + self.network_data['fetching_status'][source.to_s] = { + 'started_at' => Time.current.to_f, + 'job_id' => SecureRandom.hex(8) + } + save! + true + end end def clear_fetching_status!(source) @@ -222,9 +235,29 @@ class NetworkRange < ApplicationRecord end def agent_tally - # Rails.cache.fetch("#{to_s}:agent_tally", expires_in: 5.minutes) do - events.map(&:user_agent).tally - # end + Rails.cache.fetch("#{cache_key}:agent_tally", expires_in: 5.minutes) do + # Use DuckDB for fast agent tally instead of loading all events into memory + if persisted? && events_count > 0 + # Include child network ranges to capture all traffic within this network block + network_ids = [id] + child_ranges.pluck(:id) + + # Try DuckDB first for much faster aggregation + duckdb_tally = with_duckdb_fallback { EventDdb.network_agent_tally(network_ids) } + duckdb_tally || {} + else + # Virtual network - fallback to PostgreSQL CIDR query + events.map(&:user_agent).tally + end + end + end + + # Helper method to try DuckDB first, fall back to PostgreSQL + def with_duckdb_fallback(&block) + result = yield + result.nil? ? nil : result # Return result or nil to trigger fallback + rescue StandardError => e + Rails.logger.warn "[NetworkRange] DuckDB query failed, falling back to PostgreSQL: #{e.message}" + nil # Return nil to trigger fallback end # Geographic lookup @@ -334,6 +367,9 @@ class NetworkRange < ApplicationRecord def self.should_fetch_ipapi_for_ip?(ip_address) tracking_network = find_or_create_tracking_network_for_ip(ip_address) + # Check if currently being fetched (prevents duplicate jobs) + return false if tracking_network.is_fetching_api_data?(:ipapi) + # Check if /24 has been queried recently queried_at = tracking_network.network_data&.dig('ipapi_queried_at') return true if queried_at.nil? diff --git a/app/models/rule.rb b/app/models/rule.rb index 58ac6f1..d5331e1 100644 --- a/app/models/rule.rb +++ b/app/models/rule.rb @@ -7,7 +7,7 @@ class Rule < ApplicationRecord # Rule enums (prefix needed to avoid rate_limit collision) # Canonical WAF action order - aligned with Agent and Event models - enum :waf_action, { deny: 0, allow: 1, redirect: 2, challenge: 3, log: 4 }, prefix: :action + enum :waf_action, { deny: 0, allow: 1, redirect: 2, challenge: 3, log: 4, add_header: 5 }, prefix: :action enum :waf_rule_type, { network: 0, rate_limit: 1, path_pattern: 2 }, prefix: :type SOURCES = %w[manual auto:scanner_detected auto:rate_limit_exceeded auto:bot_detected imported default manual:surgical_block manual:surgical_exception policy].freeze @@ -120,6 +120,10 @@ class Rule < ApplicationRecord action_challenge? end + def add_header_action? + action_add_header? + end + # Redirect/challenge convenience methods def redirect_url metadata_hash['redirect_url'] @@ -137,6 +141,14 @@ class Rule < ApplicationRecord metadata&.dig('challenge_message') end + def header_name + metadata&.dig('header_name') + end + + def header_value + metadata&.dig('header_value') + end + def related_surgical_rules if surgical_block? 
# Find the corresponding exception rule @@ -421,6 +433,12 @@ class Rule < ApplicationRecord if source&.start_with?('auto:') || source == 'default' self.user ||= User.find_by(role: 1) # admin role end + + # Set default header values for add_header action + if add_header_action? + self.metadata['header_name'] ||= 'X-Bot-Agent' + self.metadata['header_value'] ||= 'Unknown' + end end def calculate_priority_for_network_rules @@ -504,6 +522,13 @@ class Rule < ApplicationRecord if challenge_type_value && !%w[captcha javascript proof_of_work].include?(challenge_type_value) errors.add(:metadata, "challenge_type must be one of: captcha, javascript, proof_of_work") end + when "add_header" + unless metadata&.dig("header_name").present? + errors.add(:metadata, "must include 'header_name' for add_header action") + end + unless metadata&.dig("header_value").present? + errors.add(:metadata, "must include 'header_value' for add_header action") + end end end diff --git a/app/models/waf_policy.rb b/app/models/waf_policy.rb index ff39972..b5f4017 100644 --- a/app/models/waf_policy.rb +++ b/app/models/waf_policy.rb @@ -9,7 +9,7 @@ class WafPolicy < ApplicationRecord POLICY_TYPES = %w[country asn company network_type path_pattern].freeze # Actions - what to do when traffic matches this policy - ACTIONS = %w[allow deny redirect challenge].freeze + ACTIONS = %w[allow deny redirect challenge add_header].freeze # Associations belongs_to :user @@ -25,6 +25,7 @@ validate :targets_must_be_array validate :validate_targets_by_type validate :validate_redirect_configuration, if: :redirect_policy_action? validate :validate_challenge_configuration, if: :challenge_policy_action? + validate :validate_add_header_configuration, if: :add_header_policy_action? # Scopes scope :enabled, -> { where(enabled: true) } @@ -95,6 +96,10 @@ validate :targets_must_be_array policy_action == 'challenge' end + def add_header_policy_action? + policy_action == 'add_header' + end + # Lifecycle methods def active? enabled? && !expired? @@ -163,7 +168,7 @@ validate :targets_must_be_array priority: network_range.prefix_length ) - # Handle redirect/challenge specific data + # Handle redirect/challenge/add_header specific data if redirect_action? && additional_data['redirect_url'] rule.update!( metadata: rule.metadata.merge( @@ -178,6 +183,13 @@ validate :targets_must_be_array challenge_message: additional_data['challenge_message'] ) ) + elsif add_header_action? + rule.update!( + metadata: rule.metadata.merge( + header_name: additional_data['header_name'], + header_value: additional_data['header_value'] + ) + ) end rule @@ -212,7 +224,7 @@ validate :targets_must_be_array priority: 50 # Default priority for path rules ) - # Handle redirect/challenge specific data + # Handle redirect/challenge/add_header specific data if redirect_action? && additional_data['redirect_url'] rule.update!( metadata: rule.metadata.merge( @@ -227,6 +239,13 @@ validate :targets_must_be_array challenge_message: additional_data['challenge_message'] ) ) + elsif add_header_action? + rule.update!( + metadata: rule.metadata.merge( + header_name: additional_data['header_name'], + header_value: additional_data['header_value'] + ) + ) end rule @@ -346,6 +365,12 @@ validate :targets_must_be_array self.targets ||= [] self.additional_data ||= {} self.enabled = true if enabled.nil? + + # Set default header values for add_header action + if add_header_policy_action? 
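+      # Mirrors the defaults in Rule#set_defaults so that rules generated from
+      # this policy carry the same fallback header metadata.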
+ self.additional_data['header_name'] ||= 'X-Bot-Agent' + self.additional_data['header_value'] ||= 'Unknown' + end end def targets_must_be_array @@ -430,6 +455,15 @@ validate :targets_must_be_array end end + def validate_add_header_configuration + if additional_data['header_name'].blank? + errors.add(:additional_data, "must include 'header_name' for add_header action") + end + if additional_data['header_value'].blank? + errors.add(:additional_data, "must include 'header_value' for add_header action") + end + end + # Matching logic for different policy types def matches_country?(network_range) country = network_range.country || network_range.inherited_intelligence[:country] diff --git a/app/services/analytics_duckdb_service.rb b/app/services/analytics_duckdb_service.rb new file mode 100644 index 0000000..8664707 --- /dev/null +++ b/app/services/analytics_duckdb_service.rb @@ -0,0 +1,284 @@ +# frozen_string_literal: true + +# Service for managing DuckDB analytics database +# Provides fast analytical queries on events data using columnar storage +class AnalyticsDuckdbService + include Singleton + + DUCKDB_PATH = Rails.root.join("storage", "analytics.duckdb").to_s + BATCH_SIZE = 10_000 + + # Execute block with connection, ensuring database and connection are closed afterward + def with_connection + db = DuckDB::Database.open(DUCKDB_PATH) + conn = db.connect + yield conn + ensure + conn&.close + db&.close + end + + # Create events table if it doesn't exist (must be called within with_connection block) + def setup_schema(conn) + conn.execute(<<~SQL) + CREATE TABLE IF NOT EXISTS events ( + id BIGINT PRIMARY KEY, + timestamp TIMESTAMP NOT NULL, + ip_address VARCHAR, + network_range_id BIGINT, + country VARCHAR, + company VARCHAR, + asn INTEGER, + asn_org VARCHAR, + is_datacenter BOOLEAN, + is_vpn BOOLEAN, + is_proxy BOOLEAN, + waf_action INTEGER, + request_path VARCHAR, + user_agent VARCHAR + ) + SQL + + Rails.logger.info "[DuckDB] Schema setup complete" + end + + # Get timestamp of oldest event in DuckDB + # Returns nil if table is empty + def oldest_event_timestamp + with_connection do |conn| + result = conn.query("SELECT MIN(timestamp) as oldest FROM events") + first_row = result.first + first_row&.first # Returns the value or nil + end + rescue StandardError => e + Rails.logger.error "[DuckDB] Error getting oldest timestamp: #{e.message}" + nil + end + + # Get timestamp of newest event in DuckDB + # Returns nil if table is empty + def newest_event_timestamp + with_connection do |conn| + result = conn.query("SELECT MAX(timestamp) as newest FROM events") + first_row = result.first + first_row&.first # Returns the value or nil + end + rescue StandardError => e + Rails.logger.error "[DuckDB] Error getting newest timestamp: #{e.message}" + nil + end + + # Get maximum event ID already synced to DuckDB + def max_synced_id + with_connection do |conn| + result = conn.query("SELECT COALESCE(MAX(id), 0) as max_id FROM events") + first_row = result.first + first_row&.first || 0 + end + rescue StandardError => e + Rails.logger.error "[DuckDB] Error getting max ID: #{e.message}" + 0 + end + + # Sync new events from PostgreSQL to DuckDB + # Uses PostgreSQL cursor for memory-efficient streaming + # Uses Appender API for fast bulk inserts + # Filters by ID to avoid duplicates + def sync_new_events(from_timestamp) + total_synced = 0 + + with_connection do |conn| + # Ensure table exists + setup_schema(conn) + + # Get max ID already in DuckDB to avoid duplicates + max_id_result = conn.query("SELECT 
COALESCE(MAX(id), 0) as max_id FROM events") + max_id = max_id_result.first&.first || 0 + Rails.logger.info "[DuckDB] Syncing events from #{from_timestamp}, max_id=#{max_id}" + + start_time = Time.current + appender = nil + batch_count = 0 + + begin + # Use PostgreSQL cursor for memory-efficient streaming + Event.where("timestamp >= ? AND id > ?", from_timestamp, max_id) + .select( + :id, + :timestamp, + :ip_address, + :network_range_id, + :country, + :company, + :asn, + :asn_org, + :is_datacenter, + :is_vpn, + :is_proxy, + :waf_action, + :request_path, + :user_agent + ) + .order(:id) + .each_row(block_size: BATCH_SIZE) do |event_data| + # Create new appender for each batch + if batch_count % BATCH_SIZE == 0 + appender&.close # Close previous appender + appender = conn.appender("events") + end + + # Unpack event data from cursor row (Hash from each_row) + begin + appender.append_row( + event_data["id"], + event_data["timestamp"], + event_data["ip_address"]&.to_s, + event_data["network_range_id"], + event_data["country"], + event_data["company"], + event_data["asn"], + event_data["asn_org"], + event_data["is_datacenter"], + event_data["is_vpn"], + event_data["is_proxy"], + event_data["waf_action"], + event_data["request_path"], + event_data["user_agent"] + ) + rescue StandardError => e + Rails.logger.error "[DuckDB] Error appending event #{event_data['id']}: #{e.message}" + Rails.logger.error "[DuckDB] event_data = #{event_data.inspect}" + raise + end + + batch_count += 1 + total_synced += 1 + + # Log progress every BATCH_SIZE events + if batch_count % BATCH_SIZE == 0 + Rails.logger.info "[DuckDB] Synced batch (total: #{total_synced} events)" + end + end + + # Close final appender + appender&.close + + duration = Time.current - start_time + rate = total_synced / duration if duration > 0 + Rails.logger.info "[DuckDB] Sync complete: #{total_synced} events in #{duration.round(2)}s (~#{rate&.round(0)} events/sec)" + rescue StandardError => e + appender&.close rescue nil # Ensure appender is closed on error + Rails.logger.error "[DuckDB] Error syncing events: #{e.message}" + Rails.logger.error e.backtrace.join("\n") + raise # Re-raise to be caught by outer rescue + end + end + + total_synced + rescue StandardError => e + Rails.logger.error "[DuckDB] Sync failed: #{e.message}" + 0 + end + + # Execute analytical query on DuckDB + def query(sql, *params) + with_connection do |conn| + conn.query(sql, *params) + end + rescue StandardError => e + Rails.logger.error "[DuckDB] Query error: #{e.message}" + Rails.logger.error "SQL: #{sql}" + raise + end + + # Get event count in DuckDB + def event_count + with_connection do |conn| + result = conn.query("SELECT COUNT(*) as count FROM events") + first_row = result.first + first_row&.first || 0 + end + rescue StandardError => e + Rails.logger.error "[DuckDB] Error getting event count: #{e.message}" + 0 + end + + # Analytics query: Total events since timestamp + def total_events_since(start_time) + with_connection do |conn| + result = conn.query("SELECT COUNT(*) as count FROM events WHERE timestamp >= ?", start_time) + result.first&.first || 0 + end + end + + # Analytics query: Event breakdown by WAF action + def event_breakdown_by_action(start_time) + with_connection do |conn| + result = conn.query(<<~SQL, start_time) + SELECT waf_action, COUNT(*) as count + FROM events + WHERE timestamp >= ? 
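+        -- waf_action uses the shared enum encoding: 0=deny, 1=allow, 2=redirect, 3=challenge, 4=log, 5=add_header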
+ GROUP BY waf_action + SQL + + # Convert to hash like PostgreSQL returns + result.to_a.to_h { |row| [row["waf_action"], row["count"]] } + end + end + + # Analytics query: Top countries + def top_countries(start_time, limit = 10) + with_connection do |conn| + result = conn.query(<<~SQL, start_time, limit) + SELECT country, COUNT(*) as count + FROM events + WHERE timestamp >= ? AND country IS NOT NULL + GROUP BY country + ORDER BY count DESC + LIMIT ? + SQL + + result.to_a.map { |row| [row["country"], row["count"]] } + end + end + + # Analytics query: Top blocked IPs + def top_blocked_ips(start_time, limit = 10) + with_connection do |conn| + result = conn.query(<<~SQL, start_time, limit) + SELECT ip_address, COUNT(*) as count + FROM events + WHERE timestamp >= ? AND waf_action = 0 + GROUP BY ip_address + ORDER BY count DESC + LIMIT ? + SQL + + result.to_a.map { |row| [row["ip_address"], row["count"]] } + end + end + + # Analytics query: Hourly timeline (events grouped by hour) + def hourly_timeline(start_time, end_time) + with_connection do |conn| + result = conn.query(<<~SQL, start_time, end_time) + SELECT + DATE_TRUNC('hour', timestamp) as hour, + COUNT(*) as count + FROM events + WHERE timestamp >= ? AND timestamp < ? + GROUP BY hour + ORDER BY hour + SQL + + # Convert to hash with Time keys like PostgreSQL + result.to_a.to_h { |row| [row["hour"], row["count"]] } + end + end + + # Close DuckDB connection (for cleanup/testing) + def close + @connection&.close + @connection = nil + end +end diff --git a/app/services/bot_network_range_importer.rb b/app/services/bot_network_range_importer.rb new file mode 100644 index 0000000..2bf0a01 --- /dev/null +++ b/app/services/bot_network_range_importer.rb @@ -0,0 +1,573 @@ +# frozen_string_literal: true + +# BotNetworkRangeImporter - Service for importing official bot network ranges +# +# Imports network ranges from official bot provider sources like: +# - Amazon AWS: https://ip-ranges.amazonaws.com/ip-ranges.json +# - Google: Official crawler IP lists +# - Microsoft/Bing: Bot network ranges +# - Anthropic: Service network ranges +# - OpenAI: Service network ranges +class BotNetworkRangeImporter + class ImportError < StandardError; end + + # Official sources for bot network ranges + BOT_SOURCES = { + amazon_aws: { + name: 'Amazon AWS', + url: 'https://ip-ranges.amazonaws.com/ip-ranges.json', + format: :json, + parser: :parse_aws_ranges, + description: 'Official AWS IP ranges including Amazonbot and other services' + }, + google: { + name: 'Google', + # Note: These URLs may need to be updated based on current Google documentation + urls: [ + 'https://developers.google.com/search/docs/files/googlebot.json', + 'https://developers.google.com/search/docs/files/special-crawlers.json' + ], + format: :json, + parser: :parse_google_ranges, + description: 'Googlebot and other Google crawler IP ranges' + }, + microsoft_bing: { + name: 'Microsoft Bing', + # Note: Microsoft may require web scraping or API access + url: 'https://www.bing.com/toolbox/bingbot.json', + format: :json, + parser: :parse_microsoft_ranges, + description: 'Bingbot and other Microsoft crawler IP ranges' + }, + anthropic: { + name: 'Anthropic Claude', + # Note: Anthropic ranges may need manual updates or different approach + url: 'https://docs.anthropic.com/claude/reference/ip_ranges', + format: :html, + parser: :parse_anthropic_ranges, + description: 'Anthropic Claude API service IP ranges' + }, + openai_searchbot: { + name: 'OpenAI SearchBot', + url: 'https://openai.com/searchbot.json', + 
format: :json, + parser: :parse_openai_ranges, + description: 'OpenAI SearchBot for ChatGPT search features' + }, + openai_chatgpt_user: { + name: 'OpenAI ChatGPT-User', + url: 'https://openai.com/chatgpt-user.json', + format: :json, + parser: :parse_openai_ranges, + description: 'OpenAI ChatGPT-User for user actions in ChatGPT and Custom GPTs' + }, + openai_gptbot: { + name: 'OpenAI GPTBot', + url: 'https://openai.com/gptbot.json', + format: :json, + parser: :parse_openai_ranges, + description: 'OpenAI GPTBot for training AI foundation models' + }, + cloudflare: { + name: 'Cloudflare', + urls: [ + 'https://www.cloudflare.com/ips-v4', + 'https://www.cloudflare.com/ips-v6' + ], + format: :text, + parser: :parse_cloudflare_ranges, + description: 'Cloudflare network ranges including their crawlers and services' + }, + facebook: { + name: 'Facebook/Meta', + url: 'https://developers.facebook.com/docs/sharing/webmasters/crawler/', + format: :html, + parser: :parse_facebook_ranges, + description: 'Facebook/Meta crawlers and bots' + }, + applebot: { + name: 'Applebot', + url: 'https://support.apple.com/en-us/HT204683', + format: :html, + parser: :parse_applebot_ranges, + description: 'Applebot crawler for Apple search and Siri' + }, + duckduckgo: { + name: 'DuckDuckBot', + url: 'https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/', + format: :html, + parser: :parse_duckduckgo_ranges, + description: 'DuckDuckGo search crawler' + } + }.freeze + + def self.import_from_source(source_key, options = {}) + source = BOT_SOURCES[source_key.to_sym] + raise ImportError, "Unknown source: #{source_key}" unless source + + puts "Importing bot network ranges from #{source[:name]}..." + + case source[:parser] + when :parse_aws_ranges + parse_aws_ranges(source, options) + when :parse_google_ranges + parse_google_ranges(source, options) + when :parse_microsoft_ranges + parse_microsoft_ranges(source, options) + when :parse_anthropic_ranges + parse_anthropic_ranges(source, options) + when :parse_openai_ranges + parse_openai_ranges(source, options) + when :parse_cloudflare_ranges + parse_cloudflare_ranges(source, options) + when :parse_facebook_ranges + parse_facebook_ranges(source, options) + when :parse_applebot_ranges + parse_applebot_ranges(source, options) + when :parse_duckduckgo_ranges + parse_duckduckgo_ranges(source, options) + else + raise ImportError, "Unknown parser: #{source[:parser]}" + end + end + + def self.import_all_sources(options = {}) + results = {} + + BOT_SOURCES.each do |source_key, source| + puts "\n" + "="*50 + puts "Processing #{source[:name]}..." 
+ puts "="*50 + + begin + results[source_key] = import_from_source(source_key, options) + rescue => e + Rails.logger.error "Failed to import from #{source[:name]}: #{e.message}" + results[source_key] = { error: e.message, imported: 0 } + end + end + + puts "\n" + "="*50 + puts "Import Summary" + puts "="*50 + + results.each do |source, result| + if result[:error] + puts "#{source}: FAILED - #{result[:error]}" + else + puts "#{source}: SUCCESS - #{result[:imported]} ranges imported" + end + end + + results + end + + private + + # Amazon AWS IP ranges parser + def self.parse_aws_ranges(source, options = {}) + require 'net/http' + require 'uri' + + uri = URI.parse(source[:url]) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = true + http.read_timeout = 30 + + response = http.get(uri.request_uri) + raise ImportError, "Failed to fetch AWS IP ranges: #{response.code}" unless response.code == '200' + + data = JSON.parse(response.body) + imported_count = 0 + batch_size = options[:batch_size] || 1000 + batch = [] + + # Filter for relevant services (can be customized) + relevant_services = options[:aws_services] || ['AMAZON', 'ROUTE53', 'EC2', 'CLOUDFRONT'] + + data['prefixes'].each do |prefix| + # Focus on relevant services and regions + next unless relevant_services.include?(prefix['service']) + + network_range = { + network: prefix['ip_prefix'], + source: 'bot_import_amazon_aws', + asn: nil, # AWS doesn't provide ASN in this feed + asn_org: 'Amazon Web Services', + company: 'Amazon', + country: nil, + is_datacenter: true, + is_proxy: false, + is_vpn: false, + additional_data: { + aws_service: prefix['service'], + aws_region: prefix['region'], + aws_network_border_group: prefix['network_border_group'], + import_date: Time.current.iso8601 + }.to_json + } + + batch << network_range + + if batch.size >= batch_size + imported_count += import_batch(batch, 'Amazon AWS') + batch = [] + puts "Imported #{imported_count} AWS ranges..." + end + end + + # Import remaining records + if batch.any? 
+ imported_count += import_batch(batch, 'Amazon AWS') + end + + puts "Amazon AWS import completed: #{imported_count} ranges imported" + { imported: imported_count, source: 'Amazon AWS' } + rescue Net::TimeoutError, Net::OpenTimeout => e + raise ImportError, "Network timeout while fetching AWS ranges: #{e.message}" + rescue JSON::ParserError => e + raise ImportError, "Failed to parse AWS JSON response: #{e.message}" + end + + # Google crawler IP ranges parser + def self.parse_google_ranges(source, options = {}) + imported_count = 0 + + # Try each potential URL + urls = Array(source[:urls] || source[:url]) + + urls.each do |url| + begin + puts "Attempting to fetch Google ranges from: #{url}" + + uri = URI.parse(url) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = true + http.read_timeout = 30 + + response = http.get(uri.request_uri) + next unless response.code == '200' + + data = JSON.parse(response.body) + + batch_size = options[:batch_size] || 1000 + batch = [] + + # Parse Google crawler format (varies by file type) + if data.is_a?(Array) + data.each do |entry| + next unless entry['cidr'] || entry['prefix'] + + network_range = { + network: entry['cidr'] || entry['prefix'], + source: 'bot_import_google', + asn: nil, + asn_org: 'Google LLC', + company: 'Google', + country: nil, + is_datacenter: true, + is_proxy: false, + is_vpn: false, + additional_data: { + crawler_type: entry['crawler_type'] || 'unknown', + user_agent: entry['user_agent'], + import_date: Time.current.iso8601 + }.to_json + } + + batch << network_range + + if batch.size >= batch_size + imported_count += import_batch(batch, 'Google') + batch = [] + puts "Imported #{imported_count} Google ranges..." + end + end + end + + # Import remaining records + if batch.any? + imported_count += import_batch(batch, 'Google') + end + + puts "Google import completed: #{imported_count} ranges imported" + return { imported: imported_count, source: 'Google' } + + rescue => e + Rails.logger.warn "Failed to fetch from #{url}: #{e.message}" + next + end + end + + raise ImportError, "Failed to fetch Google crawler ranges from any URL" + end + + # Microsoft Bing crawler IP ranges parser + def self.parse_microsoft_ranges(source, options = {}) + # Microsoft requires special handling as they may not provide direct JSON + # This is a placeholder implementation + + puts "Microsoft Bing crawler import requires manual configuration or web scraping" + puts "Refer to: https://www.bing.com/webmaster/help/which-crawlers-does-bing-use" + + { + imported: 0, + source: 'Microsoft Bing', + note: 'Manual configuration required - Microsoft does not provide direct IP range feeds' + } + end + + # Anthropic service IP ranges parser + def self.parse_anthropic_ranges(source, options = {}) + # Anthropic ranges may need to be manually configured + # This is a placeholder implementation + + puts "Anthropic Claude service ranges require manual configuration" + puts "Refer to: https://docs.anthropic.com/claude/reference/ip_ranges" + + { + imported: 0, + source: 'Anthropic', + note: 'Manual configuration required - Anthropic does not provide automated IP range feeds' + } + end + + # OpenAI crawler IP ranges parser + def self.parse_openai_ranges(source, options = {}) + require 'net/http' + require 'uri' + + uri = URI.parse(source[:url]) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = true + http.read_timeout = 30 + + response = http.get(uri.request_uri) + raise ImportError, "Failed to fetch OpenAI IP ranges: #{response.code}" unless response.code == '200' 
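+    # Feed shape is assumed here: a JSON array whose entries carry the range
+    # under "cidr", "ip_prefix", or "ip"; bare IPs are normalized to /32 below.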
+ + data = JSON.parse(response.body) + imported_count = 0 + batch_size = options[:batch_size] || 1000 + batch = [] + + # Determine crawler type from source name + crawler_type = source[:name].gsub('OpenAI ', '').downcase + + data.each do |entry| + # OpenAI provides IP ranges as either CIDR notation or single IPs + ip_range = entry['cidr'] || entry['ip_prefix'] || entry['ip'] + next unless ip_range + + # Convert single IPs to /32 + network = ip_range.include?('/') ? ip_range : "#{ip_range}/32" + + network_range = { + network: network, + source: "bot_import_openai_#{crawler_type}", + asn: nil, + asn_org: 'OpenAI', + company: 'OpenAI', + country: nil, + is_datacenter: true, + is_proxy: false, + is_vpn: false, + additional_data: { + crawler_type: crawler_type, + crawler_purpose: crawler_purpose(crawler_type), + user_agent: openai_user_agent(crawler_type), + import_date: Time.current.iso8601, + source_url: source[:url] + }.to_json + } + + batch << network_range + + if batch.size >= batch_size + imported_count += import_batch(batch, "OpenAI #{crawler_type}") + batch = [] + puts "Imported #{imported_count} OpenAI #{crawler_type} ranges..." + end + end + + # Import remaining records + if batch.any? + imported_count += import_batch(batch, "OpenAI #{crawler_type}") + end + + puts "OpenAI #{crawler_type} import completed: #{imported_count} ranges imported" + { imported: imported_count, source: "OpenAI #{crawler_type}" } + rescue Net::TimeoutError, Net::OpenTimeout => e + raise ImportError, "Network timeout while fetching OpenAI #{crawler_type} ranges: #{e.message}" + rescue JSON::ParserError => e + raise ImportError, "Failed to parse OpenAI #{crawler_type} JSON response: #{e.message}" + end + + def self.import_batch(batch_data, source_name) + # Check for existing ranges to avoid duplicates + existing_networks = NetworkRange.where(network: batch_data.map { |d| d[:network] }).pluck(:network) + new_ranges = batch_data.reject { |d| existing_networks.include?(d[:network]) } + + if new_ranges.any? 
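+      # insert_all skips validations and callbacks for speed; the rescue below
+      # retries row-by-row with create! if the bulk insert fails.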
+ NetworkRange.insert_all(new_ranges) + puts "Imported #{new_ranges.size} new #{source_name} ranges (#{batch_data.size - new_ranges.size} duplicates skipped)" + else + puts "No new #{source_name} ranges to import (all duplicates)" + end + + new_ranges.size + rescue => e + Rails.logger.error "Failed to import #{source_name} batch: #{e.message}" + + # Fallback to individual imports + imported = 0 + new_ranges.each do |data| + begin + NetworkRange.create!(data) + imported += 1 + rescue => individual_error + Rails.logger.error "Failed to import individual #{source_name} record: #{individual_error.message}" + end + end + + imported + end + + # Helper method to determine crawler purpose based on type + def self.crawler_purpose(crawler_type) + case crawler_type + when 'searchbot' + 'Used to link to and surface websites in search results in ChatGPT\'s search features' + when 'chatgpt-user' + 'User actions in ChatGPT and Custom GPTs, including GPT Actions' + when 'gptbot' + 'Used to crawl content for training OpenAI\'s generative AI foundation models' + else + 'Unknown purpose' + end + end + + # Helper method to get OpenAI user agent strings + def self.openai_user_agent(crawler_type) + case crawler_type + when 'searchbot' + 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot' + when 'chatgpt-user' + 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot' + when 'gptbot' + 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot' + else + 'Unknown user agent' + end + end + + # Cloudflare IP ranges parser + def self.parse_cloudflare_ranges(source, options = {}) + require 'net/http' + require 'uri' + + imported_count = 0 + urls = Array(source[:urls]) + batch_size = options[:batch_size] || 1000 + batch = [] + + urls.each do |url| + begin + puts "Fetching Cloudflare ranges from: #{url}" + + uri = URI.parse(url) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = true + http.read_timeout = 30 + + response = http.get(uri.request_uri) + raise ImportError, "Failed to fetch Cloudflare ranges: #{response.code}" unless response.code == '200' + + # Cloudflare provides plain text CIDR lists + lines = response.body.split("\n") + ip_version = url.include?('ips-v4') ? 4 : 6 + + lines.each do |line| + line = line.strip + next if line.empty? || line.start_with?('#') + + # Validate CIDR format + next unless line.match?(/\A[0-9a-fA-F:.]+\/\d+\z/) + + network_range = { + network: line, + source: 'bot_import_cloudflare', + asn: nil, + asn_org: 'Cloudflare', + company: 'Cloudflare', + country: nil, + is_datacenter: true, + is_proxy: false, + is_vpn: false, + additional_data: { + ip_version: ip_version, + import_date: Time.current.iso8601, + source_url: url, + service_type: 'cdn_and_security' + }.to_json + } + + batch << network_range + + if batch.size >= batch_size + imported_count += import_batch(batch, 'Cloudflare') + batch = [] + puts "Imported #{imported_count} Cloudflare ranges..." + end + end + + rescue => e + Rails.logger.warn "Failed to fetch Cloudflare ranges from #{url}: #{e.message}" + next + end + end + + # Import remaining records + if batch.any? 
+ imported_count += import_batch(batch, 'Cloudflare') + end + + puts "Cloudflare import completed: #{imported_count} ranges imported" + { imported: imported_count, source: 'Cloudflare' } + end + + # Facebook/Meta crawler ranges parser (placeholder) + def self.parse_facebook_ranges(source, options = {}) + puts "Facebook/Meta crawler ranges require web scraping or manual configuration" + puts "Refer to: https://developers.facebook.com/docs/sharing/webmasters/crawler/" + + { + imported: 0, + source: 'Facebook/Meta', + note: 'Manual configuration required - Facebook does not provide automated IP range feeds' + } + end + + # Applebot crawler ranges parser (placeholder) + def self.parse_applebot_ranges(source, options = {}) + puts "Applebot ranges require web scraping or manual configuration" + puts "Refer to: https://support.apple.com/en-us/HT204683" + + { + imported: 0, + source: 'Applebot', + note: 'Manual configuration required - Apple does not provide automated IP range feeds' + } + end + + # DuckDuckBot crawler ranges parser (placeholder) + def self.parse_duckduckgo_ranges(source, options = {}) + puts "DuckDuckBot ranges require web scraping or manual configuration" + puts "Refer to: https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/" + + { + imported: 0, + source: 'DuckDuckBot', + note: 'Manual configuration required - DuckDuckGo does not provide automated IP range feeds' + } + end +end \ No newline at end of file diff --git a/app/services/ipapi.rb b/app/services/ipapi.rb index 1afc1bd..2f5ed80 100644 --- a/app/services/ipapi.rb +++ b/app/services/ipapi.rb @@ -53,4 +53,107 @@ class Ipapi next end end + + # Parse company/datacenter network range from IPAPI data + # Handles "X.X.X.X - Y.Y.Y.Y" format and converts to CIDR + def self.parse_company_network_range(ipapi_data) + # Try company.network first, then datacenter.network + network_range = ipapi_data.dig('company', 'network') || ipapi_data.dig('datacenter', 'network') + return nil if network_range.blank? 
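+    # Worked example (matches the IPAPI test fixture): "91.84.96.0 - 91.84.127.255"
+    # spans 8,192 addresses, so prefix_length = 32 - log2(8192) = 19 => 91.84.96.0/19.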
+ + # Parse "X.X.X.X - Y.Y.Y.Y" format + if network_range.include?(' - ') + start_ip_str, end_ip_str = network_range.split(' - ').map(&:strip) + + begin + start_ip = IPAddr.new(start_ip_str) + end_ip = IPAddr.new(end_ip_str) + + # Calculate the number of IPs in the range + num_ips = end_ip.to_i - start_ip.to_i + 1 + + # Calculate prefix length from number of IPs + # num_ips = 2^(32 - prefix_length) for IPv4 + prefix_length = 32 - Math.log2(num_ips).to_i + + # Verify it's a valid CIDR block (power of 2) + if 2**(32 - prefix_length) == num_ips + cidr = "#{start_ip_str}/#{prefix_length}" + Rails.logger.debug "Parsed company network range: #{network_range} -> #{cidr}" + return cidr + else + Rails.logger.warn "Network range #{network_range} is not a valid CIDR block (#{num_ips} IPs)" + return nil + end + rescue IPAddr::InvalidAddressError => e + Rails.logger.error "Invalid IP in company network range: #{network_range} (#{e.message})" + return nil + end + elsif network_range.include?('/') + # Already in CIDR format + return network_range + else + Rails.logger.warn "Unknown network range format: #{network_range}" + return nil + end + end + + # Populate NetworkRange attributes from IPAPI data + def self.populate_network_attributes(network_range, ipapi_data) + network_range.asn = ipapi_data.dig('asn', 'asn') + network_range.asn_org = ipapi_data.dig('asn', 'org') || ipapi_data.dig('company', 'name') + network_range.company = ipapi_data.dig('company', 'name') + network_range.country = ipapi_data.dig('location', 'country_code') + network_range.is_datacenter = ipapi_data['is_datacenter'] || false + network_range.is_vpn = ipapi_data['is_vpn'] || false + network_range.is_proxy = ipapi_data['is_proxy'] || false + end + + # Process IPAPI data and create network ranges + # Returns array of created/updated NetworkRange objects + def self.process_ipapi_data(ipapi_data, tracking_network) + created_networks = [] + + # Extract and create company/datacenter network range if present + company_network_cidr = parse_company_network_range(ipapi_data) + if company_network_cidr.present? + company_range = NetworkRange.find_or_create_by(network: company_network_cidr) do |nr| + nr.source = 'api_imported' + nr.creation_reason = "Company allocation from IPAPI for #{tracking_network.cidr}" + end + + # Always update attributes (whether new or existing) + populate_network_attributes(company_range, ipapi_data) + company_range.set_network_data(:ipapi, ipapi_data) + company_range.last_api_fetch = Time.current + company_range.save! + + created_networks << company_range + Rails.logger.info "Created/updated company network: #{company_range.cidr}" + end + + # Extract and create ASN route network if present + ipapi_route = ipapi_data.dig('asn', 'route') + if ipapi_route.present? && ipapi_route != tracking_network.cidr + route_network = NetworkRange.find_or_create_by(network: ipapi_route) do |nr| + nr.source = 'api_imported' + nr.creation_reason = "BGP route from IPAPI lookup for #{tracking_network.cidr}" + end + + # Always update attributes (whether new or existing) + populate_network_attributes(route_network, ipapi_data) + route_network.set_network_data(:ipapi, ipapi_data) + route_network.last_api_fetch = Time.current + route_network.save! 
+ + created_networks << route_network + Rails.logger.info "Created/updated BGP route network: #{route_network.cidr}" + end + + # Return both the created networks and the broadest CIDR for deduplication + { + networks: created_networks, + broadest_cidr: company_network_cidr.presence || ipapi_route || tracking_network.cidr + } + end end \ No newline at end of file diff --git a/app/views/analytics/networks.html.erb b/app/views/analytics/networks.html.erb index 8dd7f37..7c8fa8e 100644 --- a/app/views/analytics/networks.html.erb +++ b/app/views/analytics/networks.html.erb @@ -141,8 +141,11 @@ class: "text-blue-600 hover:text-blue-800 hover:underline font-mono font-medium" %>
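+                <%# An asterisk marks a country inherited from a covering parent range %>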
- <% if network.country.present? %> - 🏳️ <%= network.country %> + <% if network.display_country.present? %> + 🏳️ <%= network.display_country %> + <% if network.has_inherited_data? && network.display_country != network.country %> + * + <% end %> <% end %> <% if network.asn.present? %> • ASN <%= network.asn %> @@ -150,7 +153,15 @@
- <%= network.company || 'Unknown' %> +
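+                <%# display_company may be inherited from a covering parent range; the "from" link below points to that range %>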
+ <%= network.display_company || 'Unknown' %> + <% if network.has_inherited_data? %> +
+ from <%= link_to network.inherited_from, network_range_path(NetworkRange.find_by(network: network.inherited_from)), + class: "text-blue-600 hover:text-blue-800 hover:underline" %> +
+ <% end %> +
<% if network.is_datacenter? %> diff --git a/app/views/bot_network_ranges/index.html.erb b/app/views/bot_network_ranges/index.html.erb new file mode 100644 index 0000000..ac08cc6 --- /dev/null +++ b/app/views/bot_network_ranges/index.html.erb @@ -0,0 +1,171 @@ +<% content_for :title, "Bot Network Ranges" %> + +
+ +
+

Bot Network Ranges

+

Import and manage official network ranges for search crawlers and API bots

+
+ + +
+
+

Available Sources

+
+
+
+ <% @bot_sources.each do |key, source| %> +
+
+

<%= source[:name] %>

+ + <%= source[:url] ? 'Available' : 'Manual' %> + +
+

<%= source[:description] %>

+ +
+ <%= form_with url: import_bot_network_ranges_path, method: :post, class: "inline" do |f| %> + <%= hidden_field_tag :source, key %> + <%= f.submit "Import Now", + class: "px-3 py-1 text-xs font-medium text-white bg-blue-600 rounded hover:bg-blue-700 transition-colors disabled:opacity-50", + disabled: !source[:url] %> + <% end %> + + <%= form_with url: import_async_bot_network_ranges_path, method: :post, class: "inline" do |f| %> + <%= hidden_field_tag :source, key %> + <%= f.submit "Import Async", + class: "px-3 py-1 text-xs font-medium text-white bg-purple-600 rounded hover:bg-purple-700 transition-colors disabled:opacity-50", + disabled: !source[:url] %> + <% end %> + + <%= link_to "View", bot_network_range_path(key), + class: "px-3 py-1 text-xs font-medium text-gray-700 bg-gray-200 rounded hover:bg-gray-300 transition-colors" %> +
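+            <%# "Import Now" runs the import inline; "Import Async" defers it (presumably to a background job) %>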
+
+ <% end %> +
+
+
+ + +
+
+

Batch Import

+
+
+

Import from all available sources (this may take several minutes).

+ + <%= form_with url: import_all_bot_network_ranges_path, method: :post do |f| %> +
+
+        <%= f.submit "Import All Sources",
+              class: "px-6 py-2 font-medium text-white bg-green-600 rounded hover:bg-green-700 transition-colors",
+              data: { turbo_confirm: "This will import from all available sources and may take several minutes. Continue?" } %>
+
+ <% end %> +
+
+ + + <% if @recent_imports.any? %> +
+
+

Recent Imports

+
+
+ + + + + + + + + + + + <% @recent_imports.each do |import| %> + + + + + + + + <% end %> + +
SourceStatusRecordsDateNotes
+ <%= import.source.titleize %> + + + <%= import.status.titleize %> + + + <%= import.records_processed&.to_s || '0' %> + + <%= import.created_at.strftime('%Y-%m-%d %H:%M') %> + + <%= import.notes %> +
+
+
+ <% end %> + + + <% if @bot_network_ranges.any? %> +
+
+

Recently Imported Bot Ranges

+
+
+ + + + + + + + + + + + <% @bot_network_ranges.each do |range| %> + + + + + + + + <% end %> + +
NetworkSourceCompanyCreatedDetails
+ <%= range.network %> + + <%= range.source.gsub('bot_import_', '').titleize %> + + <%= range.company || 'Unknown' %> + + <%= range.created_at.strftime('%Y-%m-%d %H:%M') %> + + <% if range.additional_data.present? %> + <% data = JSON.parse(range.additional_data) rescue {} %> + <% if data['crawler_type'] %> + + <%= data['crawler_type'].titleize %> + + <% end %> + <% if data['aws_service'] %> + + <%= data['aws_service'] %> + + <% end %> + <% end %> +
+
+
+ <% end %> +
+ + + \ No newline at end of file diff --git a/app/views/bot_network_ranges/show.html.erb b/app/views/bot_network_ranges/show.html.erb new file mode 100644 index 0000000..1f21487 --- /dev/null +++ b/app/views/bot_network_ranges/show.html.erb @@ -0,0 +1,175 @@ +<% content_for :title, "#{@source_name} Network Ranges" %> + +
+ +
+
+
+

<%= @source_name %> Network Ranges

+

Network ranges imported from <%= @source_name %> official sources

+
+
+
+      <%= link_to "Back to Sources", bot_network_ranges_path,
+            class: "px-4 py-2 text-sm font-medium text-gray-700 bg-gray-200 rounded hover:bg-gray-300 transition-colors" %>
+
+      <%= form_with url: bot_network_range_path(params[:source]), method: :delete, class: "inline" do |f| %>
+        <%= f.submit "Delete All Ranges",
+              class: "px-4 py-2 text-sm font-medium text-white bg-red-600 rounded hover:bg-red-700 transition-colors",
+              data: { turbo_confirm: "Are you sure you want to delete all #{@source_name} network ranges? This action cannot be undone." } %>
+      <% end %>
+
+
+ + + <% if @import_stats.any? %> +
+
+

Import Statistics

+
+
+
+ <% @import_stats.each do |source, count| %> +
+
<%= count %>
+
<%= source.gsub('bot_import_', '').titleize %>
+
+ <% end %> +
+
+
+ <% end %> + + +
+
+
+

Network Ranges

+
+ Showing <%= @network_ranges.offset_value + 1 %> to <%= [@network_ranges.offset_value + @network_ranges.current_page_count, @network_ranges.total_count].min %> + of <%= @network_ranges.total_count %> ranges +
+
+
+ +
+ + + + + + + + + + + + + <% @network_ranges.each do |range| %> + + + + + + + + + <% end %> + +
NetworkSourceCompanyCountryCreatedDetails
+ <%= link_to range.network, network_range_path(range), class: "text-blue-600 hover:text-blue-800" %> + + <%= range.source.gsub('bot_import_', '').titleize %> + + <%= range.company || 'Unknown' %> + + <%= range.country || 'Unknown' %> + + <%= range.created_at.strftime('%Y-%m-%d %H:%M') %> + + <% if range.additional_data.present? %> + <% data = JSON.parse(range.additional_data) rescue {} %> +
+ <% if data['crawler_type'] %> + + <%= data['crawler_type'].titleize %> + + <% end %> + <% if data['crawler_purpose'] %> + + Purpose + + <% end %> + <% if data['aws_service'] %> + + <%= data['aws_service'] %> + + <% end %> + <% if data['aws_region'] %> + + <%= data['aws_region'] %> + + <% end %> + <% if data['ip_version'] %> + + IPv<%= data['ip_version'] %> + + <% end %> +
+ <% end %> +
+
+ + + <% if @network_ranges.total_pages > 1 %> +
+
+
+ Page <%= @network_ranges.current_page %> of <%= @network_ranges.total_pages %> +
+
+ <% if @network_ranges.prev_page %> + <%= link_to "Previous", bot_network_range_path(params[:source], page: @network_ranges.prev_page), + class: "px-3 py-1 text-sm font-medium text-gray-700 bg-white border border-gray-300 rounded hover:bg-gray-50" %> + <% end %> + + <%# Show page numbers %> + <% (1..@network_ranges.total_pages).select { |p| p == 1 || p == @network_ranges.total_pages || (p - @network_ranges.current_page).abs <= 2 }.each do |page| %> + <% if page == @network_ranges.current_page %> + + <%= page %> + + <% else %> + <%= link_to page, bot_network_range_path(params[:source], page: page), + class: "px-3 py-1 text-sm font-medium text-gray-700 bg-white border border-gray-300 rounded hover:bg-gray-50" %> + <% end %> + <% end %> + + <% if @network_ranges.next_page %> + <%= link_to "Next", bot_network_range_path(params[:source], page: @network_ranges.next_page), + class: "px-3 py-1 text-sm font-medium text-gray-700 bg-white border border-gray-300 rounded hover:bg-gray-50" %> + <% end %> +
+
+
+ <% end %> +
+ + <% if @network_ranges.empty? %> +
+
+
+ + + +
+

No network ranges found

+

+ No <%= @source_name %> network ranges have been imported yet. +

+ <%= link_to "Import #{@source_name} Ranges", bot_network_ranges_path, + class: "inline-flex items-center px-4 py-2 border border-transparent text-sm font-medium rounded-md shadow-sm text-white bg-blue-600 hover:bg-blue-700 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-blue-500" %> +
+
+ <% end %> +
\ No newline at end of file diff --git a/app/views/events/index.html.erb b/app/views/events/index.html.erb index 7f3650b..3ac6054 100644 --- a/app/views/events/index.html.erb +++ b/app/views/events/index.html.erb @@ -25,7 +25,7 @@
<%= form.label :waf_action, "Action", class: "block text-sm font-medium text-gray-700" %> <%= form.select :waf_action, - options_for_select([['All', ''], ['Allow', 'allow'], ['Deny', 'deny'], ['Redirect', 'redirect'], ['Challenge', 'challenge']], params[:waf_action]), + options_for_select([['All', ''], ['Allow', 'allow'], ['Deny', 'deny'], ['Redirect', 'redirect'], ['Challenge', 'challenge'], ['Add Header', 'add_header']], params[:waf_action]), { }, { class: "mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 sm:text-sm" } %>
@@ -178,9 +178,10 @@ when 'deny' then 'bg-red-100 text-red-800' when 'redirect' then 'bg-blue-100 text-blue-800' when 'challenge' then 'bg-yellow-100 text-yellow-800' + when 'add_header' then 'bg-purple-100 text-purple-800' else 'bg-gray-100 text-gray-800' end %>"> - <%= event.waf_action %> + <%= event.waf_action.humanize %> diff --git a/app/views/layouts/application.html.erb b/app/views/layouts/application.html.erb index 1771240..0bfe49f 100644 --- a/app/views/layouts/application.html.erb +++ b/app/views/layouts/application.html.erb @@ -75,6 +75,8 @@ class: nav_link_class(network_ranges_path) %> <% if user_signed_in? && current_user_admin? %> + <%= link_to "🤖 Bot Ranges", bot_network_ranges_path, + class: nav_link_class(bot_network_ranges_path) %> <%= link_to "📊 Data Imports", data_imports_path, class: nav_link_class(data_imports_path) %> <%= link_to "🔗 DSNs", dsns_path, @@ -172,6 +174,8 @@ class: mobile_nav_link_class(network_ranges_path) %> <% if user_signed_in? && current_user_admin? %> + <%= link_to "🤖 Bot Ranges", bot_network_ranges_path, + class: mobile_nav_link_class(bot_network_ranges_path) %> <%= link_to "📊 Data Imports", data_imports_path, class: mobile_nav_link_class(data_imports_path) %> <%= link_to "🔗 DSNs", dsns_path, diff --git a/app/views/rules/index.html.erb b/app/views/rules/index.html.erb index 21808e6..c0b6fbd 100644 --- a/app/views/rules/index.html.erb +++ b/app/views/rules/index.html.erb @@ -225,14 +225,16 @@ <%= link_to "View", rule_path(rule), class: "text-blue-600 hover:text-blue-900 mr-3" %> <% if rule.enabled? %> - <%= link_to "Disable", disable_rule_path(rule), + <%= button_to "Disable", disable_rule_path(rule), method: :post, - data: { confirm: "Are you sure you want to disable this rule?" }, - class: "text-yellow-600 hover:text-yellow-900 mr-3" %> + form: { style: "display: inline;" }, + data: { turbo_confirm: "Are you sure you want to disable this rule?" }, + class: "text-yellow-600 hover:text-yellow-900 mr-3 bg-transparent border-0 p-0 cursor-pointer" %> <% else %> - <%= link_to "Enable", enable_rule_path(rule), + <%= button_to "Enable", enable_rule_path(rule), method: :post, - class: "text-green-600 hover:text-green-900 mr-3" %> + form: { style: "display: inline;" }, + class: "text-green-600 hover:text-green-900 mr-3 bg-transparent border-0 p-0 cursor-pointer" %> <% end %> <%= link_to "Edit", edit_rule_path(rule), class: "text-indigo-600 hover:text-indigo-900" %> diff --git a/app/views/rules/new.html.erb b/app/views/rules/new.html.erb index c591e1e..2327ecf 100644 --- a/app/views/rules/new.html.erb +++ b/app/views/rules/new.html.erb @@ -6,7 +6,7 @@

Create a WAF rule to allow, block, or rate limit traffic

-
+
<%= form_with(model: @rule, local: true, class: "space-y-6") do |form| %> <% if @rule.errors.any? %>
@@ -54,7 +54,8 @@ <%= form.select :waf_action, options_for_select(@waf_actions.map { |action, _| [action.humanize, action] }, @rule.waf_action), { prompt: "Select action" }, - { class: "mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 sm:text-sm" } %> + { class: "mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 sm:text-sm", + data: { rule_form_target: "actionSelect", action: "change->rule-form#updateActionSections" } } %>

What action to take when this rule matches

@@ -158,6 +159,27 @@
+ + +
<%= form.label :metadata, "Metadata", class: "block text-sm font-medium text-gray-700" %> @@ -197,10 +219,18 @@
- <%= form.label :expires_at, "Expires At", class: "block text-sm font-medium text-gray-700" %> - <%= form.datetime_local_field :expires_at, - class: "mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 sm:text-sm" %> -

Leave blank for permanent rule

+
+ <%= check_box_tag :set_expiration, "1", false, + class: "h-4 w-4 rounded border-gray-300 text-blue-600 focus:ring-blue-500", + data: { rule_form_target: "expirationCheckbox", action: "change->rule-form#toggleExpiration" } %> + <%= label_tag :set_expiration, "Set expiration", class: "ml-2 block text-sm font-medium text-gray-700" %> +
+
diff --git a/app/views/rules/show.html.erb b/app/views/rules/show.html.erb index 43687c9..e49209a 100644 --- a/app/views/rules/show.html.erb +++ b/app/views/rules/show.html.erb @@ -39,12 +39,12 @@
<%= link_to "Edit", edit_rule_path(@rule), class: "inline-flex items-center px-4 py-2 border border-gray-300 rounded-md shadow-sm text-sm font-medium text-gray-700 bg-white hover:bg-gray-50" %> <% if @rule.enabled? %> - <%= link_to "Disable", disable_rule_path(@rule), + <%= button_to "Disable", disable_rule_path(@rule), method: :post, - data: { confirm: "Are you sure you want to disable this rule?" }, + data: { turbo_confirm: "Are you sure you want to disable this rule?" }, class: "inline-flex items-center px-4 py-2 border border-yellow-300 rounded-md shadow-sm text-sm font-medium text-yellow-700 bg-yellow-50 hover:bg-yellow-100" %> <% else %> - <%= link_to "Enable", enable_rule_path(@rule), + <%= button_to "Enable", enable_rule_path(@rule), method: :post, class: "inline-flex items-center px-4 py-2 border border-green-300 rounded-md shadow-sm text-sm font-medium text-green-700 bg-green-50 hover:bg-green-100" %> <% end %> diff --git a/config/recurring.yml b/config/recurring.yml index ca8d105..896a13b 100644 --- a/config/recurring.yml +++ b/config/recurring.yml @@ -29,3 +29,9 @@ cleanup_old_events: class: CleanupOldEventsJob queue: background schedule: every hour + +# Sync events from PostgreSQL to DuckDB for fast analytics +sync_events_to_duckdb: + class: SyncEventsToDuckdbJob + queue: default + schedule: every 1 minutes diff --git a/config/routes.rb b/config/routes.rb index 81e1441..49a96b9 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -93,4 +93,16 @@ Rails.application.routes.draw do get :progress end end + + # Bot network range management (admin only) + resources :bot_network_ranges, only: [:index, :show] do + collection do + post :import + post :import_async + post :import_all + end + member do + delete :destroy + end + end end diff --git a/docs/rule-architecture.md b/docs/rule-architecture.md index 890eaa6..fb8cfa6 100644 --- a/docs/rule-architecture.md +++ b/docs/rule-architecture.md @@ -437,7 +437,7 @@ Ipv4Range.upsert({ network_start: cidr.to_i, network_end: cidr.to_range.end.to_i, network_prefix: 8, - waf_action: 1, # deny + waf_action: 0, # deny priority: 8 }, unique_by: :source) diff --git a/test/controllers/rules_controller_test.rb b/test/controllers/rules_controller_test.rb new file mode 100644 index 0000000..051ff68 --- /dev/null +++ b/test/controllers/rules_controller_test.rb @@ -0,0 +1,65 @@ +require "test_helper" + +class RulesControllerTest < ActionDispatch::IntegrationTest + setup do + @user = users(:one) + sign_in_as(@user) + end + + test "should create network rule with add_header action" do + assert_difference('Rule.count') do + post rules_path, params: { + rule: { + waf_rule_type: "network", + waf_action: "add_header", + network_range_id: "", + conditions: "{}", + metadata: "{}", + source: "manual", + expires_at: "", + enabled: "1" + }, + new_cidr: "52.167.145.0/24", + path_pattern: "", + match_type: "exact", + header_name: "X-Bot-Agent", + header_value: "Blah" + } + end + + rule = Rule.last + assert_equal "network", rule.waf_rule_type + assert_equal "add_header", rule.waf_action, "waf_action should be 'add_header' but was #{rule.waf_action.inspect}" + assert_equal "X-Bot-Agent", rule.metadata["header_name"] + assert_equal "Blah", rule.metadata["header_value"] + assert_not_nil rule.network_range + # Network range stores as /32 if no prefix given + assert_match /52\.167\.145\./, rule.network_range.network.to_s + + # Verify metadata JSON doesn't have duplicate keys + metadata_json = rule.metadata.to_json + refute_includes metadata_json, 
'"header_name":"X-Bot-Agent","header_value":"Blah","reason":"{}","header_name"', + "Metadata should not have duplicate keys" + end + + test "should create rule with waf_action properly set from string parameter" do + assert_difference('Rule.count') do + post rules_path, params: { + rule: { + waf_rule_type: "network", + waf_action: "deny", # Test with different action + network_range_id: "", + conditions: "{}", + metadata: '{"reason": "test"}', + source: "manual", + enabled: "1" + }, + new_cidr: "10.0.0.1/32" + } + end + + rule = Rule.last + assert_equal "deny", rule.waf_action, "waf_action should be 'deny'" + assert_equal "network", rule.waf_rule_type + end +end diff --git a/test/fixtures/files/ipapi_91_84_96_0.json b/test/fixtures/files/ipapi_91_84_96_0.json new file mode 100644 index 0000000..5363211 --- /dev/null +++ b/test/fixtures/files/ipapi_91_84_96_0.json @@ -0,0 +1,66 @@ +{ + "ip": "91.84.96.0", + "rir": "RIPE", + "is_bogon": false, + "is_mobile": false, + "is_satellite": false, + "is_crawler": false, + "is_datacenter": true, + "is_tor": false, + "is_proxy": false, + "is_vpn": false, + "is_abuser": false, + "datacenter": { + "datacenter": "SERVERS TECH FZCO", + "domain": "vdsina.com", + "network": "91.84.96.0 - 91.84.127.255" + }, + "company": { + "name": "SERVERS TECH FZCO", + "abuser_score": "0.0162 (Elevated)", + "domain": "vdsina.com", + "type": "hosting", + "network": "91.84.96.0 - 91.84.127.255", + "whois": "https://api.ipapi.is/?whois=91.84.96.0" + }, + "abuse": { + "name": "SERVERS TECH FZCO", + "address": "UNITED ARAB EMIRATES, Dubai, 336469, Ifza Business Park DDP, Building 1, office number 36298-001", + "email": "abuse@vdsina.com", + "phone": "+971525386329" + }, + "asn": { + "asn": 216071, + "abuser_score": "0.0181 (Elevated)", + "route": "91.84.96.0/24", + "descr": "VDSINA, AE", + "country": "ae", + "active": true, + "org": "SERVERS TECH FZCO", + "domain": "vdsina.com", + "abuse": "abuse@vdsina.com", + "type": "hosting", + "created": "2023-10-30", + "updated": "2023-10-30", + "rir": "RIPE", + "whois": "https://api.ipapi.is/?whois=AS216071" + }, + "location": { + "is_eu_member": true, + "calling_code": "31", + "currency_code": "EUR", + "continent": "EU", + "country": "The Netherlands", + "country_code": "NL", + "state": "North Holland", + "city": "Amsterdam", + "latitude": 52.37403, + "longitude": 4.88969, + "zip": "1384", + "timezone": "Europe/Brussels", + "local_time": "2025-11-17T22:21:06+01:00", + "local_time_unix": 1763414466, + "is_dst": false + }, + "elapsed_ms": 0.5 +} diff --git a/test/jobs/cleanup_old_events_job_test.rb b/test/jobs/cleanup_old_events_job_test.rb new file mode 100644 index 0000000..ffb73cf --- /dev/null +++ b/test/jobs/cleanup_old_events_job_test.rb @@ -0,0 +1,195 @@ +# frozen_string_literal: true + +require "test_helper" + +class CleanupOldEventsJobTest < ActiveJob::TestCase + setup do + # Clear any existing events + Event.delete_all + # Set default retention to 90 days + Setting.set('event_retention_days', '90') + end + + test "deletes events older than retention period" do + # Create old event (100 days ago - should be deleted) + old_event = Event.create!( + request_id: "old-request-#{SecureRandom.uuid}", + timestamp: 100.days.ago, + ip_address: "1.2.3.4", + payload: { request: { ip: "1.2.3.4" } } + ) + + # Create recent event (30 days ago - should be kept) + recent_event = Event.create!( + request_id: "recent-request-#{SecureRandom.uuid}", + timestamp: 30.days.ago, + ip_address: "5.6.7.8", + payload: { request: { ip: "5.6.7.8" } } + ) + + 
count = CleanupOldEventsJob.perform_now + + assert_equal 1, count + assert_raises(ActiveRecord::RecordNotFound) { old_event.reload } + assert_nothing_raised { recent_event.reload } + end + + test "respects custom retention period" do + # Set retention to 30 days + Setting.set('event_retention_days', '30') + + # Create event that's 40 days old (should be deleted with 30-day retention) + old_event = Event.create!( + request_id: "old-request-#{SecureRandom.uuid}", + timestamp: 40.days.ago, + ip_address: "1.2.3.4", + payload: { request: { ip: "1.2.3.4" } } + ) + + # Create event that's 20 days old (should be kept) + recent_event = Event.create!( + request_id: "recent-request-#{SecureRandom.uuid}", + timestamp: 20.days.ago, + ip_address: "5.6.7.8", + payload: { request: { ip: "5.6.7.8" } } + ) + + count = CleanupOldEventsJob.perform_now + + assert_equal 1, count + assert_raises(ActiveRecord::RecordNotFound) { old_event.reload } + assert_nothing_raised { recent_event.reload } + end + + test "does not delete when retention is zero" do + Setting.set('event_retention_days', '0') + + old_event = Event.create!( + request_id: "old-request-#{SecureRandom.uuid}", + timestamp: 100.days.ago, + ip_address: "1.2.3.4", + payload: { request: { ip: "1.2.3.4" } } + ) + + count = CleanupOldEventsJob.perform_now + + assert_equal 0, count + assert_nothing_raised { old_event.reload } + end + + test "does not delete when retention is negative" do + Setting.set('event_retention_days', '-1') + + old_event = Event.create!( + request_id: "old-request-#{SecureRandom.uuid}", + timestamp: 100.days.ago, + ip_address: "1.2.3.4", + payload: { request: { ip: "1.2.3.4" } } + ) + + count = CleanupOldEventsJob.perform_now + + assert_equal 0, count + assert_nothing_raised { old_event.reload } + end + + test "returns zero when no old events exist" do + # Create only recent events + Event.create!( + request_id: "recent-request-#{SecureRandom.uuid}", + timestamp: 30.days.ago, + ip_address: "1.2.3.4", + payload: { request: { ip: "1.2.3.4" } } + ) + + count = CleanupOldEventsJob.perform_now + + assert_equal 0, count + end + + test "returns zero when no events exist" do + count = CleanupOldEventsJob.perform_now + + assert_equal 0, count + end + + test "deletes multiple old events" do + # Create 5 old events + 5.times do |i| + Event.create!( + request_id: "old-request-#{i}-#{SecureRandom.uuid}", + timestamp: 100.days.ago, + ip_address: "1.2.3.#{i}", + payload: { request: { ip: "1.2.3.#{i}" } } + ) + end + + # Create 3 recent events + 3.times do |i| + Event.create!( + request_id: "recent-request-#{i}-#{SecureRandom.uuid}", + timestamp: 30.days.ago, + ip_address: "5.6.7.#{i}", + payload: { request: { ip: "5.6.7.#{i}" } } + ) + end + + count = CleanupOldEventsJob.perform_now + + assert_equal 5, count + assert_equal 3, Event.count + end + + test "uses default retention when setting not configured" do + # Remove the setting + Setting.find_by(key: 'event_retention_days')&.destroy + + # Create event that's 100 days old (should be deleted with default 90-day retention) + old_event = Event.create!( + request_id: "old-request-#{SecureRandom.uuid}", + timestamp: 100.days.ago, + ip_address: "1.2.3.4", + payload: { request: { ip: "1.2.3.4" } } + ) + + # Create event that's 80 days old (should be kept with default 90-day retention) + recent_event = Event.create!( + request_id: "recent-request-#{SecureRandom.uuid}", + timestamp: 80.days.ago, + ip_address: "5.6.7.8", + payload: { request: { ip: "5.6.7.8" } } + ) + + count = 
CleanupOldEventsJob.perform_now + + assert_equal 1, count + assert_raises(ActiveRecord::RecordNotFound) { old_event.reload } + assert_nothing_raised { recent_event.reload } + end + + test "handles events at exact cutoff boundary correctly" do + Setting.set('event_retention_days', '90') + + # Create event exactly at cutoff (should be deleted - uses < comparison) + cutoff_event = Event.create!( + request_id: "cutoff-request-#{SecureRandom.uuid}", + timestamp: 90.days.ago, + ip_address: "1.2.3.4", + payload: { request: { ip: "1.2.3.4" } } + ) + + # Create event just inside cutoff (should be kept) + inside_event = Event.create!( + request_id: "inside-request-#{SecureRandom.uuid}", + timestamp: 89.days.ago, + ip_address: "5.6.7.8", + payload: { request: { ip: "5.6.7.8" } } + ) + + count = CleanupOldEventsJob.perform_now + + assert_equal 1, count + assert_raises(ActiveRecord::RecordNotFound) { cutoff_event.reload } + assert_nothing_raised { inside_event.reload } + end +end diff --git a/test/models/event_test.rb b/test/models/event_test.rb index 90749fa..cc1a437 100644 --- a/test/models/event_test.rb +++ b/test/models/event_test.rb @@ -228,7 +228,7 @@ class EventTest < ActiveSupport::TestCase assert_equal "post", event.request_method assert_equal "deny", event.waf_action assert_equal 1, event.request_method_before_type_cast # POST = 1 - assert_equal 1, event.waf_action_before_type_cast # DENY = 1 + assert_equal 0, event.waf_action_before_type_cast # DENY = 0 end test "payload extraction methods work correctly" do diff --git a/test/models/rule_path_pattern_test.rb b/test/models/rule_path_pattern_test.rb new file mode 100644 index 0000000..80eb41d --- /dev/null +++ b/test/models/rule_path_pattern_test.rb @@ -0,0 +1,233 @@ +# frozen_string_literal: true + +require "test_helper" + +class RulePathPatternTest < ActiveSupport::TestCase + setup do + @user = User.create!(email_address: "test@example.com", password: "password123") + end + + test "create_path_pattern_rule creates valid rule" do + rule = Rule.create_path_pattern_rule( + pattern: "/admin/users", + match_type: "exact", + action: "deny", + user: @user + ) + + assert rule.persisted?, "Rule should be persisted" + assert_equal "path_pattern", rule.waf_rule_type + assert_equal "deny", rule.waf_action + assert_equal "exact", rule.path_match_type + assert_equal 2, rule.path_segment_ids.length + end + + test "create_path_pattern_rule auto-creates PathSegments" do + initial_count = PathSegment.count + + rule = Rule.create_path_pattern_rule( + pattern: "/new/path/here", + match_type: "prefix", + user: @user + ) + + assert_equal initial_count + 3, PathSegment.count, "Should create 3 new segments" + assert_equal 3, rule.path_segment_ids.length + end + + test "create_path_pattern_rule normalizes to lowercase" do + rule = Rule.create_path_pattern_rule( + pattern: "/Admin/Users", + match_type: "exact", + user: @user + ) + + segments = rule.path_segments_text + assert_equal ["admin", "users"], segments, "Segments should be lowercase" + end + + test "create_path_pattern_rule reuses existing PathSegments" do + # Create segment first + PathSegment.find_or_create_segment("admin") + initial_count = PathSegment.count + + rule = Rule.create_path_pattern_rule( + pattern: "/admin", + match_type: "exact", + user: @user + ) + + assert_equal initial_count, PathSegment.count, "Should not create duplicate segment" + assert_equal 1, rule.path_segment_ids.length + end + + test "create_path_pattern_rule validates match_type" do + assert_raises(ArgumentError, "Should raise for 
invalid match_type") do + Rule.create_path_pattern_rule( + pattern: "/admin", + match_type: "invalid", + user: @user + ) + end + end + + test "create_path_pattern_rule validates pattern not empty" do + assert_raises(ArgumentError, "Should raise for empty pattern") do + Rule.create_path_pattern_rule( + pattern: "/", + match_type: "exact", + user: @user + ) + end + end + + test "validation requires segment_ids for path_pattern rules" do + rule = Rule.new( + waf_rule_type: "path_pattern", + waf_action: "deny", + conditions: { match_type: "exact" }, # Missing segment_ids + user: @user + ) + + refute rule.valid?, "Rule should be invalid without segment_ids" + assert_includes rule.errors[:conditions], "must include 'segment_ids' array for path_pattern rules" + end + + test "validation requires match_type for path_pattern rules" do + admin_seg = PathSegment.find_or_create_segment("admin") + + rule = Rule.new( + waf_rule_type: "path_pattern", + waf_action: "deny", + conditions: { segment_ids: [admin_seg.id] }, # Missing match_type + user: @user + ) + + refute rule.valid?, "Rule should be invalid without match_type" + assert_includes rule.errors[:conditions], "match_type must be one of: exact, prefix, suffix, contains" + end + + test "validation checks match_type is valid" do + admin_seg = PathSegment.find_or_create_segment("admin") + + rule = Rule.new( + waf_rule_type: "path_pattern", + waf_action: "deny", + conditions: { segment_ids: [admin_seg.id], match_type: "invalid" }, + user: @user + ) + + refute rule.valid?, "Rule should be invalid with invalid match_type" + assert_includes rule.errors[:conditions], "match_type must be one of: exact, prefix, suffix, contains" + end + + test "validation checks segment IDs exist" do + rule = Rule.new( + waf_rule_type: "path_pattern", + waf_action: "deny", + conditions: { segment_ids: [99999], match_type: "exact" }, # Non-existent ID + user: @user + ) + + refute rule.valid?, "Rule should be invalid with non-existent segment IDs" + assert_match(/non-existent path segment IDs/, rule.errors[:conditions].first) + end + + test "path_pattern_display returns human-readable path" do + rule = Rule.create_path_pattern_rule( + pattern: "/admin/users", + match_type: "exact", + user: @user + ) + + assert_equal "/admin/users", rule.path_pattern_display + end + + test "path_segments_text returns segment text array" do + rule = Rule.create_path_pattern_rule( + pattern: "/api/v1/users", + match_type: "prefix", + user: @user + ) + + assert_equal ["api", "v1", "users"], rule.path_segments_text + end + + test "to_agent_format includes segment_ids and match_type for path rules" do + rule = Rule.create_path_pattern_rule( + pattern: "/admin", + match_type: "prefix", + action: "deny", + user: @user + ) + + agent_format = rule.to_agent_format + + assert_equal "path_pattern", agent_format[:waf_rule_type] + assert_equal "deny", agent_format[:waf_action] + assert agent_format[:conditions].key?(:segment_ids), "Should include segment_ids" + assert_equal "prefix", agent_format[:conditions][:match_type] + assert_kind_of Array, agent_format[:conditions][:segment_ids] + end + + test "supports all four match types" do + %w[exact prefix suffix contains].each do |match_type| + rule = Rule.create_path_pattern_rule( + pattern: "/admin", + match_type: match_type, + user: @user + ) + + assert rule.persisted?, "Should create rule with #{match_type} match type" + assert_equal match_type, rule.path_match_type + end + end + + test "supports all action types" do + %w[allow deny challenge].each do 
diff --git a/test/services/ipapi_test.rb b/test/services/ipapi_test.rb
new file mode 100644
index 0000000..9da4dfa
--- /dev/null
+++ b/test/services/ipapi_test.rb
@@ -0,0 +1,134 @@
+require "test_helper"
+
+class IpapiTest < ActiveSupport::TestCase
+  def setup
+    @ipapi_data = JSON.parse(
+      File.read(Rails.root.join("test/fixtures/files/ipapi_91_84_96_0.json"))
+    )
+  end
+
+  test "parse_company_network_range extracts and converts IP range to CIDR" do
+    cidr = Ipapi.parse_company_network_range(@ipapi_data)
+
+    assert_equal "91.84.96.0/19", cidr
+  end
+
+  test "parse_company_network_range handles already formatted CIDR" do
+    data = { "company" => { "network" => "1.2.3.0/24" } }
+    cidr = Ipapi.parse_company_network_range(data)
+
+    assert_equal "1.2.3.0/24", cidr
+  end
+
+  test "parse_company_network_range returns nil for invalid range" do
+    data = { "company" => { "network" => "invalid" } }
+    cidr = Ipapi.parse_company_network_range(data)
+
+    assert_nil cidr
+  end
+
+  test "parse_company_network_range returns nil when no network data present" do
+    data = { "company" => {} }
+    cidr = Ipapi.parse_company_network_range(data)
+
+    assert_nil cidr
+  end
+
+  test "parse_company_network_range falls back to datacenter.network" do
+    data = { "datacenter" => { "network" => "1.2.3.0 - 1.2.3.255" } }
+    cidr = Ipapi.parse_company_network_range(data)
+
+    assert_equal "1.2.3.0/24", cidr
+  end
+
+  test "populate_network_attributes sets all network attributes" do
+    network_range = NetworkRange.new(network: "91.84.96.0/24")
+    Ipapi.populate_network_attributes(network_range, @ipapi_data)
+
+    assert_equal 216071, network_range.asn
+    assert_equal "SERVERS TECH FZCO", network_range.asn_org
+    assert_equal "SERVERS TECH FZCO", network_range.company
+    assert_equal "NL", network_range.country
+    assert network_range.is_datacenter
+    refute network_range.is_vpn
+    refute network_range.is_proxy
+  end
+
+  test "process_ipapi_data creates both company and BGP route networks" do
+    # Use a different tracking network so BGP route gets created
+    tracking_network = NetworkRange.create!(
+      network: "91.84.97.0/24",
+      source: "auto_generated"
+    )
+
+    assert_difference "NetworkRange.count", 2 do
+      result = Ipapi.process_ipapi_data(@ipapi_data, tracking_network)
+
+      assert_equal 2, result[:networks].length
+      assert_equal "91.84.96.0/19", result[:broadest_cidr]
+    end
+
+    # Verify company network was created
+    company_network = NetworkRange.find_by(network: "91.84.96.0/19")
+    assert_not_nil company_network
+    assert_equal "api_imported", company_network.source
+    assert_equal "SERVERS TECH FZCO", company_network.company
+    assert company_network.is_datacenter
+
+    # Verify BGP route network was created
+    bgp_network = NetworkRange.find_by(network: "91.84.96.0/24")
+    assert_not_nil bgp_network
+    assert_equal "SERVERS TECH FZCO", bgp_network.company
+  end
+
+  test "process_ipapi_data handles missing company network gracefully" do
+    # Create data without company network range
+    data = @ipapi_data.deep_dup
+    data["company"].delete("network")
+    data["datacenter"].delete("network")
+
+    tracking_network = NetworkRange.create!(
+      network: "91.84.96.0/24",
+      source: "auto_generated"
+    )
+
+    # Should only create the BGP route network (which matches tracking, so 0 new)
+    assert_no_difference "NetworkRange.count" do
+      result = Ipapi.process_ipapi_data(data, tracking_network)
+
+      assert_equal 0, result[:networks].length
+      assert_equal "91.84.96.0/24", result[:broadest_cidr]
+    end
+  end
+
+  test "process_ipapi_data updates existing networks instead of creating duplicates" do
+    # Pre-create both networks
+    company_network = NetworkRange.create!(
+      network: "91.84.96.0/19",
+      source: "manual",
+      company: "Old Company"
+    )
+
+    bgp_network = NetworkRange.create!(
+      network: "91.84.96.0/24",
+      source: "manual"
+    )
+
+    tracking_network = NetworkRange.create!(
+      network: "91.84.97.0/24",
+      source: "auto_generated"
+    )
+
+    # Should not create new networks, just update existing ones
+    assert_no_difference "NetworkRange.count" do
+      result = Ipapi.process_ipapi_data(@ipapi_data, tracking_network)
+
+      assert_equal 2, result[:networks].length
+    end
+
+    # Verify updates
+    company_network.reload
+    assert_equal "SERVERS TECH FZCO", company_network.company
+    assert company_network.network_data.key?("ipapi")
+  end
+end
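These tests require `Ipapi.parse_company_network_range` to accept either a ready-made CIDR or an "a.b.c.d - e.f.g.h" range, and to emit a CIDR only when the range is a properly sized, aligned block ("91.84.96.0 - 91.84.127.255" spans 8,192 addresses, hence /19). A standalone sketch of that conversion using only Ruby's stdlib; `range_to_cidr` is a hypothetical name, not the service's actual API:

require "ipaddr"

# Hypothetical helper mirroring the tested conversion, not the service's code.
def range_to_cidr(text)
  return text if text.include?("/")            # already CIDR, pass through
  parts = text.split("-").map(&:strip)
  return nil unless parts.length == 2

  first, last = parts.map { |p| IPAddr.new(p) }
  size = last.to_i - first.to_i + 1
  return nil if size <= 0
  bits = Math.log2(size)
  return nil unless bits == bits.floor         # must span a power-of-two block
  return nil unless (first.to_i % size).zero?  # block must be CIDR-aligned

  "#{first}/#{32 - bits.to_i}"
rescue IPAddr::InvalidAddressError
  nil
end

range_to_cidr("91.84.96.0 - 91.84.127.255")  # => "91.84.96.0/19"
range_to_cidr("1.2.3.0/24")                  # => "1.2.3.0/24"
range_to_cidr("invalid")                     # => nil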
diff --git a/test/services/path_rule_matcher_test.rb b/test/services/path_rule_matcher_test.rb
new file mode 100644
index 0000000..d45ba5a
--- /dev/null
+++ b/test/services/path_rule_matcher_test.rb
@@ -0,0 +1,216 @@
+# frozen_string_literal: true
+
+require "test_helper"
+
+class PathRuleMatcherTest < ActiveSupport::TestCase
+  setup do
+    @user = User.create!(email_address: "test@example.com", password: "password123")
+
+    # Create path segments
+    @admin_segment = PathSegment.find_or_create_segment("admin")
+    @wp_login_segment = PathSegment.find_or_create_segment("wp-login.php")
+    @api_segment = PathSegment.find_or_create_segment("api")
+    @v1_segment = PathSegment.find_or_create_segment("v1")
+    @users_segment = PathSegment.find_or_create_segment("users")
+    @dashboard_segment = PathSegment.find_or_create_segment("dashboard")
+  end
+
+  test "exact match - matches exact path only" do
+    rule = Rule.create_path_pattern_rule(
+      pattern: "/wp-login.php",
+      match_type: "exact",
+      action: "deny",
+      user: @user
+    )
+
+    # Create matching event
+    matching_event = create_event_with_segments([@wp_login_segment.id])
+    assert PathRuleMatcher.matches?(rule, matching_event), "Should match exact path"
+
+    # Create non-matching event (extra segment)
+    non_matching_event = create_event_with_segments([@admin_segment.id, @wp_login_segment.id])
+    refute PathRuleMatcher.matches?(rule, non_matching_event), "Should not match path with extra segments"
+  end
+
+  test "prefix match - matches paths starting with pattern" do
+    rule = Rule.create_path_pattern_rule(
+      pattern: "/admin",
+      match_type: "prefix",
+      action: "deny",
+      user: @user
+    )
+
+    # Should match /admin
+    event1 = create_event_with_segments([@admin_segment.id])
+    assert PathRuleMatcher.matches?(rule, event1), "Should match exact prefix"
+
+    # Should match /admin/dashboard
+    event2 = create_event_with_segments([@admin_segment.id, @dashboard_segment.id])
+    assert PathRuleMatcher.matches?(rule, event2), "Should match prefix with additional segments"
+
+    # Should match /admin/users/123
+    event3 = create_event_with_segments([@admin_segment.id, @users_segment.id, create_segment("123").id])
+    assert PathRuleMatcher.matches?(rule, event3), "Should match prefix with multiple additional segments"
+
+    # Should NOT match /api/admin (admin not at start)
+    event4 = create_event_with_segments([@api_segment.id, @admin_segment.id])
+    refute PathRuleMatcher.matches?(rule, event4), "Should not match when pattern not at start"
+  end
+
+  test "suffix match - matches paths ending with pattern" do
+    rule = Rule.create_path_pattern_rule(
+      pattern: "/wp-login.php",
+      match_type: "suffix",
+      action: "deny",
+      user: @user
+    )
+
+    # Should match /wp-login.php
+    event1 = create_event_with_segments([@wp_login_segment.id])
+    assert PathRuleMatcher.matches?(rule, event1), "Should match exact suffix"
+
+    # Should match /admin/wp-login.php
+    event2 = create_event_with_segments([@admin_segment.id, @wp_login_segment.id])
+    assert PathRuleMatcher.matches?(rule, event2), "Should match suffix with prefix segments"
+
+    # Should match /backup/admin/wp-login.php
+    backup_seg = create_segment("backup")
+    event3 = create_event_with_segments([backup_seg.id, @admin_segment.id, @wp_login_segment.id])
+    assert PathRuleMatcher.matches?(rule, event3), "Should match suffix with multiple prefix segments"
+
+    # Should NOT match /wp-login.php/test (suffix has extra segment)
+    test_seg = create_segment("test")
+    event4 = create_event_with_segments([@wp_login_segment.id, test_seg.id])
+    refute PathRuleMatcher.matches?(rule, event4), "Should not match when pattern not at end"
+  end
+
+  test "contains match - matches paths containing pattern" do
+    rule = Rule.create_path_pattern_rule(
+      pattern: "/admin",
+      match_type: "contains",
+      action: "deny",
+      user: @user
+    )
+
+    # Should match /admin
+    event1 = create_event_with_segments([@admin_segment.id])
+    assert PathRuleMatcher.matches?(rule, event1), "Should match exact contains"
+
+    # Should match /api/admin/users
+    event2 = create_event_with_segments([@api_segment.id, @admin_segment.id, @users_segment.id])
+    assert PathRuleMatcher.matches?(rule, event2), "Should match contains in middle"
+
+    # Should match /super/secret/admin/panel
+    super_seg = create_segment("super")
+    secret_seg = create_segment("secret")
+    panel_seg = create_segment("panel")
+    event3 = create_event_with_segments([super_seg.id, secret_seg.id, @admin_segment.id, panel_seg.id])
+    assert PathRuleMatcher.matches?(rule, event3), "Should match contains with prefix and suffix"
+
+    # Should NOT match /administrator (different segment)
+    administrator_seg = create_segment("administrator")
+    event4 = create_event_with_segments([administrator_seg.id])
+    refute PathRuleMatcher.matches?(rule, event4), "Should not match different segment"
+  end
+
+  test "contains match with multi-segment pattern" do
+    rule = Rule.create_path_pattern_rule(
+      pattern: "/api/admin",
+      match_type: "contains",
+      action: "deny",
+      user: @user
+    )
+
+    # Should match /api/admin
+    event1 = create_event_with_segments([@api_segment.id, @admin_segment.id])
+    assert PathRuleMatcher.matches?(rule, event1), "Should match exact contains"
+
+    # Should match /v1/api/admin/users
+    event2 = create_event_with_segments([@v1_segment.id, @api_segment.id, @admin_segment.id, @users_segment.id])
+    assert PathRuleMatcher.matches?(rule, event2), "Should match consecutive segments in middle"
+
+    # Should NOT match /api/v1/admin (segments not consecutive)
+    event3 = create_event_with_segments([@api_segment.id, @v1_segment.id, @admin_segment.id])
+    refute PathRuleMatcher.matches?(rule, event3), "Should not match non-consecutive segments"
+  end
+
+  test "case insensitive matching through PathSegment normalization" do
+    # PathSegment.find_or_create_segment normalizes to lowercase
+    rule = Rule.create_path_pattern_rule(
+      pattern: "/Admin/Users",  # Mixed case
+      match_type: "exact",
+      action: "deny",
+      user: @user
+    )
+
+    # Event with lowercase path should match
+    event = create_event_with_segments([@admin_segment.id, @users_segment.id])
+    assert PathRuleMatcher.matches?(rule, event), "Should match case-insensitively"
+  end
+
+  test "matching_rules returns all matching rules" do
+    rule1 = Rule.create_path_pattern_rule(pattern: "/admin", match_type: "prefix", action: "deny", user: @user)
+    rule2 = Rule.create_path_pattern_rule(pattern: "/admin/users", match_type: "exact", action: "allow", user: @user)
+    rule3 = Rule.create_path_pattern_rule(pattern: "/api", match_type: "prefix", action: "deny", user: @user)
+
+    event = create_event_with_segments([@admin_segment.id, @users_segment.id])
+
+    matching = PathRuleMatcher.matching_rules(event)
+    assert_includes matching, rule1, "Should include prefix rule"
+    assert_includes matching, rule2, "Should include exact rule"
+    refute_includes matching, rule3, "Should not include non-matching rule"
+  end
+
+  test "evaluate returns first matching action" do
+    Rule.create_path_pattern_rule(pattern: "/admin", match_type: "prefix", action: "deny", user: @user)
+
+    event = create_event_with_segments([@admin_segment.id, @dashboard_segment.id])
+
+    action = PathRuleMatcher.evaluate(event)
+    assert_equal "deny", action, "Should return deny action"
+  end
+
+  test "evaluate returns allow for non-matching event" do
+    Rule.create_path_pattern_rule(pattern: "/admin", match_type: "exact", action: "deny", user: @user)
+
+    event = create_event_with_segments([@api_segment.id])
+
+    action = PathRuleMatcher.evaluate(event)
+    assert_equal "allow", action, "Should return allow for non-matching event"
+  end
+
+  test "does not match disabled rules" do
+    rule = Rule.create_path_pattern_rule(pattern: "/admin", match_type: "exact", action: "deny", user: @user)
+    rule.update!(enabled: false)
+
+    event = create_event_with_segments([@admin_segment.id])
+
+    matching = PathRuleMatcher.matching_rules(event)
+    assert_empty matching, "Should not match disabled rules"
+  end
+
+  test "does not match expired rules" do
+    rule = Rule.create_path_pattern_rule(pattern: "/admin", match_type: "exact", action: "deny", user: @user)
+    rule.update!(expires_at: 1.hour.ago)
+
+    event = create_event_with_segments([@admin_segment.id])
+
+    matching = PathRuleMatcher.matching_rules(event)
+    assert_empty matching, "Should not match expired rules"
+  end
+
+  private
+
+  def create_event_with_segments(segment_ids)
+    Event.new(
+      request_id: SecureRandom.uuid,
+      timestamp: Time.current,
+      request_segment_ids: segment_ids,
+      ip_address: "1.2.3.4"
+    )
+  end
+
+  def create_segment(text)
+    PathSegment.find_or_create_segment(text)
+  end
+end