Fix some blocked/allowed laggards after migrating. Add DuckDB for outstanding analytics performance. Start adding an importer for all bot networks

app/services/analytics_duckdb_service.rb (new file, 284 lines)
@@ -0,0 +1,284 @@
# frozen_string_literal: true

# Service for managing DuckDB analytics database
# Provides fast analytical queries on events data using columnar storage
class AnalyticsDuckdbService
  include Singleton

  DUCKDB_PATH = Rails.root.join("storage", "analytics.duckdb").to_s
  BATCH_SIZE = 10_000

  # Execute block with connection, ensuring database and connection are closed afterward
  def with_connection
    db = DuckDB::Database.open(DUCKDB_PATH)
    conn = db.connect
    yield conn
  ensure
    conn&.close
    db&.close
  end

  # Create events table if it doesn't exist (must be called within with_connection block)
  def setup_schema(conn)
    conn.execute(<<~SQL)
      CREATE TABLE IF NOT EXISTS events (
        id BIGINT PRIMARY KEY,
        timestamp TIMESTAMP NOT NULL,
        ip_address VARCHAR,
        network_range_id BIGINT,
        country VARCHAR,
        company VARCHAR,
        asn INTEGER,
        asn_org VARCHAR,
        is_datacenter BOOLEAN,
        is_vpn BOOLEAN,
        is_proxy BOOLEAN,
        waf_action INTEGER,
        request_path VARCHAR,
        user_agent VARCHAR
      )
    SQL

    Rails.logger.info "[DuckDB] Schema setup complete"
  end

  # Get timestamp of oldest event in DuckDB
  # Returns nil if table is empty
  def oldest_event_timestamp
    with_connection do |conn|
      result = conn.query("SELECT MIN(timestamp) as oldest FROM events")
      first_row = result.first
      first_row&.first # Returns the value or nil
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB] Error getting oldest timestamp: #{e.message}"
    nil
  end

  # Get timestamp of newest event in DuckDB
  # Returns nil if table is empty
  def newest_event_timestamp
    with_connection do |conn|
      result = conn.query("SELECT MAX(timestamp) as newest FROM events")
      first_row = result.first
      first_row&.first # Returns the value or nil
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB] Error getting newest timestamp: #{e.message}"
    nil
  end

  # Get maximum event ID already synced to DuckDB
  def max_synced_id
    with_connection do |conn|
      result = conn.query("SELECT COALESCE(MAX(id), 0) as max_id FROM events")
      first_row = result.first
      first_row&.first || 0
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB] Error getting max ID: #{e.message}"
    0
  end

  # Sync new events from PostgreSQL to DuckDB
  # Uses PostgreSQL cursor for memory-efficient streaming
  # Uses Appender API for fast bulk inserts
  # Filters by ID to avoid duplicates
  def sync_new_events(from_timestamp)
    total_synced = 0

    with_connection do |conn|
      # Ensure table exists
      setup_schema(conn)

      # Get max ID already in DuckDB to avoid duplicates
      max_id_result = conn.query("SELECT COALESCE(MAX(id), 0) as max_id FROM events")
      max_id = max_id_result.first&.first || 0
      Rails.logger.info "[DuckDB] Syncing events from #{from_timestamp}, max_id=#{max_id}"

      start_time = Time.current
      appender = nil
      batch_count = 0

      begin
        # Use PostgreSQL cursor for memory-efficient streaming
        Event.where("timestamp >= ? AND id > ?", from_timestamp, max_id)
             .select(
               :id,
               :timestamp,
               :ip_address,
               :network_range_id,
               :country,
               :company,
               :asn,
               :asn_org,
               :is_datacenter,
               :is_vpn,
               :is_proxy,
               :waf_action,
               :request_path,
               :user_agent
             )
             .order(:id)
             .each_row(block_size: BATCH_SIZE) do |event_data|
          # Create new appender for each batch
          if batch_count % BATCH_SIZE == 0
            appender&.close # Close previous appender
            appender = conn.appender("events")
          end

          # Unpack event data from cursor row (Hash from each_row)
          begin
            appender.append_row(
              event_data["id"],
              event_data["timestamp"],
              event_data["ip_address"]&.to_s,
              event_data["network_range_id"],
              event_data["country"],
              event_data["company"],
              event_data["asn"],
              event_data["asn_org"],
              event_data["is_datacenter"],
              event_data["is_vpn"],
              event_data["is_proxy"],
              event_data["waf_action"],
              event_data["request_path"],
              event_data["user_agent"]
            )
          rescue StandardError => e
            Rails.logger.error "[DuckDB] Error appending event #{event_data['id']}: #{e.message}"
            Rails.logger.error "[DuckDB] event_data = #{event_data.inspect}"
            raise
          end

          batch_count += 1
          total_synced += 1

          # Log progress every BATCH_SIZE events
          if batch_count % BATCH_SIZE == 0
            Rails.logger.info "[DuckDB] Synced batch (total: #{total_synced} events)"
          end
        end

        # Close final appender
        appender&.close

        duration = Time.current - start_time
        rate = total_synced / duration if duration > 0
        Rails.logger.info "[DuckDB] Sync complete: #{total_synced} events in #{duration.round(2)}s (~#{rate&.round(0)} events/sec)"
      rescue StandardError => e
        appender&.close rescue nil # Ensure appender is closed on error
        Rails.logger.error "[DuckDB] Error syncing events: #{e.message}"
        Rails.logger.error e.backtrace.join("\n")
        raise # Re-raise to be caught by outer rescue
      end
    end

    total_synced
  rescue StandardError => e
    Rails.logger.error "[DuckDB] Sync failed: #{e.message}"
    0
  end

  # Execute analytical query on DuckDB
  def query(sql, *params)
    with_connection do |conn|
      conn.query(sql, *params)
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB] Query error: #{e.message}"
    Rails.logger.error "SQL: #{sql}"
    raise
  end

  # Get event count in DuckDB
  def event_count
    with_connection do |conn|
      result = conn.query("SELECT COUNT(*) as count FROM events")
      first_row = result.first
      first_row&.first || 0
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB] Error getting event count: #{e.message}"
    0
  end

  # Analytics query: Total events since timestamp
  def total_events_since(start_time)
    with_connection do |conn|
      result = conn.query("SELECT COUNT(*) as count FROM events WHERE timestamp >= ?", start_time)
      result.first&.first || 0
    end
  end

  # Analytics query: Event breakdown by WAF action
  def event_breakdown_by_action(start_time)
    with_connection do |conn|
      result = conn.query(<<~SQL, start_time)
        SELECT waf_action, COUNT(*) as count
        FROM events
        WHERE timestamp >= ?
        GROUP BY waf_action
      SQL

      # Convert to a hash like PostgreSQL returns
      # (ruby-duckdb yields rows as arrays, so columns are accessed by position)
      result.to_a.to_h { |row| [row[0], row[1]] }
    end
  end

  # Analytics query: Top countries
  def top_countries(start_time, limit = 10)
    with_connection do |conn|
      result = conn.query(<<~SQL, start_time, limit)
        SELECT country, COUNT(*) as count
        FROM events
        WHERE timestamp >= ? AND country IS NOT NULL
        GROUP BY country
        ORDER BY count DESC
        LIMIT ?
      SQL

      result.to_a.map { |row| [row[0], row[1]] }
    end
  end

  # Analytics query: Top blocked IPs
  def top_blocked_ips(start_time, limit = 10)
    with_connection do |conn|
      result = conn.query(<<~SQL, start_time, limit)
        SELECT ip_address, COUNT(*) as count
        FROM events
        WHERE timestamp >= ? AND waf_action = 0
        GROUP BY ip_address
        ORDER BY count DESC
        LIMIT ?
      SQL

      result.to_a.map { |row| [row[0], row[1]] }
    end
  end

  # Analytics query: Hourly timeline (events grouped by hour)
  def hourly_timeline(start_time, end_time)
    with_connection do |conn|
      result = conn.query(<<~SQL, start_time, end_time)
        SELECT
          DATE_TRUNC('hour', timestamp) as hour,
          COUNT(*) as count
        FROM events
        WHERE timestamp >= ? AND timestamp < ?
        GROUP BY hour
        ORDER BY hour
      SQL

      # Convert to a hash with Time keys like PostgreSQL
      result.to_a.to_h { |row| [row[0], row[1]] }
    end
  end

  # Close any memoized connection (for cleanup/testing)
  # NOTE: with_connection opens and closes a connection per call, so this is
  # only a safety net; @connection is not currently set anywhere in this service
  def close
    @connection&.close
    @connection = nil
  end
end
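
For context, a minimal usage sketch of the service above (not part of this commit). It assumes the ruby-duckdb gem is installed and an Event model exposing the selected columns; the SyncAnalyticsJob name is a hypothetical placeholder:

class SyncAnalyticsJob < ApplicationJob
  # Hypothetical job illustrating the intended call pattern
  def perform
    service = AnalyticsDuckdbService.instance

    # Incremental sync: resume from the newest event already in DuckDB,
    # falling back to the last 24 hours for an empty database
    from = service.newest_event_timestamp || 24.hours.ago
    synced = service.sync_new_events(from)
    Rails.logger.info "[DuckDB] job synced #{synced} events"

    # Analytical reads then hit the columnar store instead of PostgreSQL
    service.total_events_since(1.day.ago)  # => Integer
    service.top_countries(7.days.ago, 5)   # => [["US", 1234], ...]
  end
end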

app/services/bot_network_range_importer.rb (new file, 573 lines)
@@ -0,0 +1,573 @@
# frozen_string_literal: true

# BotNetworkRangeImporter - Service for importing official bot network ranges
#
# Imports network ranges from official bot provider sources like:
# - Amazon AWS: https://ip-ranges.amazonaws.com/ip-ranges.json
# - Google: Official crawler IP lists
# - Microsoft/Bing: Bot network ranges
# - Anthropic: Service network ranges
# - OpenAI: Service network ranges
class BotNetworkRangeImporter
  class ImportError < StandardError; end

  # Official sources for bot network ranges
  BOT_SOURCES = {
    amazon_aws: {
      name: 'Amazon AWS',
      url: 'https://ip-ranges.amazonaws.com/ip-ranges.json',
      format: :json,
      parser: :parse_aws_ranges,
      description: 'Official AWS IP ranges including Amazonbot and other services'
    },
    google: {
      name: 'Google',
      # Note: These URLs may need to be updated based on current Google documentation
      urls: [
        'https://developers.google.com/search/docs/files/googlebot.json',
        'https://developers.google.com/search/docs/files/special-crawlers.json'
      ],
      format: :json,
      parser: :parse_google_ranges,
      description: 'Googlebot and other Google crawler IP ranges'
    },
    microsoft_bing: {
      name: 'Microsoft Bing',
      # Note: Microsoft may require web scraping or API access
      url: 'https://www.bing.com/toolbox/bingbot.json',
      format: :json,
      parser: :parse_microsoft_ranges,
      description: 'Bingbot and other Microsoft crawler IP ranges'
    },
    anthropic: {
      name: 'Anthropic Claude',
      # Note: Anthropic ranges may need manual updates or a different approach
      url: 'https://docs.anthropic.com/claude/reference/ip_ranges',
      format: :html,
      parser: :parse_anthropic_ranges,
      description: 'Anthropic Claude API service IP ranges'
    },
    openai_searchbot: {
      name: 'OpenAI SearchBot',
      url: 'https://openai.com/searchbot.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI SearchBot for ChatGPT search features'
    },
    openai_chatgpt_user: {
      name: 'OpenAI ChatGPT-User',
      url: 'https://openai.com/chatgpt-user.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI ChatGPT-User for user actions in ChatGPT and Custom GPTs'
    },
    openai_gptbot: {
      name: 'OpenAI GPTBot',
      url: 'https://openai.com/gptbot.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI GPTBot for training AI foundation models'
    },
    cloudflare: {
      name: 'Cloudflare',
      urls: [
        'https://www.cloudflare.com/ips-v4',
        'https://www.cloudflare.com/ips-v6'
      ],
      format: :text,
      parser: :parse_cloudflare_ranges,
      description: 'Cloudflare network ranges including their crawlers and services'
    },
    facebook: {
      name: 'Facebook/Meta',
      url: 'https://developers.facebook.com/docs/sharing/webmasters/crawler/',
      format: :html,
      parser: :parse_facebook_ranges,
      description: 'Facebook/Meta crawlers and bots'
    },
    applebot: {
      name: 'Applebot',
      url: 'https://support.apple.com/en-us/HT204683',
      format: :html,
      parser: :parse_applebot_ranges,
      description: 'Applebot crawler for Apple search and Siri'
    },
    duckduckgo: {
      name: 'DuckDuckBot',
      url: 'https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/',
      format: :html,
      parser: :parse_duckduckgo_ranges,
      description: 'DuckDuckGo search crawler'
    }
  }.freeze
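
  # Adding a provider is a registry edit: a new BOT_SOURCES entry plus a
  # matching parse_* class method. Illustrative sketch only (not a real
  # source; the URL is hypothetical):
  #
  #   common_crawl: {
  #     name: 'Common Crawl',
  #     url: 'https://example.org/ccbot-ranges.json',
  #     format: :json,
  #     parser: :parse_common_crawl_ranges,
  #     description: 'CCBot crawler IP ranges'
  #   }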

  def self.import_from_source(source_key, options = {})
    source = BOT_SOURCES[source_key.to_sym]
    raise ImportError, "Unknown source: #{source_key}" unless source

    puts "Importing bot network ranges from #{source[:name]}..."

    case source[:parser]
    when :parse_aws_ranges
      parse_aws_ranges(source, options)
    when :parse_google_ranges
      parse_google_ranges(source, options)
    when :parse_microsoft_ranges
      parse_microsoft_ranges(source, options)
    when :parse_anthropic_ranges
      parse_anthropic_ranges(source, options)
    when :parse_openai_ranges
      parse_openai_ranges(source, options)
    when :parse_cloudflare_ranges
      parse_cloudflare_ranges(source, options)
    when :parse_facebook_ranges
      parse_facebook_ranges(source, options)
    when :parse_applebot_ranges
      parse_applebot_ranges(source, options)
    when :parse_duckduckgo_ranges
      parse_duckduckgo_ranges(source, options)
    else
      raise ImportError, "Unknown parser: #{source[:parser]}"
    end
  end

  def self.import_all_sources(options = {})
    results = {}

    BOT_SOURCES.each do |source_key, source|
      puts "\n" + "=" * 50
      puts "Processing #{source[:name]}..."
      puts "=" * 50

      begin
        results[source_key] = import_from_source(source_key, options)
      rescue => e
        Rails.logger.error "Failed to import from #{source[:name]}: #{e.message}"
        results[source_key] = { error: e.message, imported: 0 }
      end
    end

    puts "\n" + "=" * 50
    puts "Import Summary"
    puts "=" * 50

    results.each do |source, result|
      if result[:error]
        puts "#{source}: FAILED - #{result[:error]}"
      else
        puts "#{source}: SUCCESS - #{result[:imported]} ranges imported"
      end
    end

    results
  end

  # NOTE: `private` has no effect on `def self.` class methods; the parsers
  # below stay public until moved behind private_class_method or class << self
  private

  # Amazon AWS IP ranges parser
  def self.parse_aws_ranges(source, options = {})
    require 'net/http'
    require 'uri'

    uri = URI.parse(source[:url])
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30

    response = http.get(uri.request_uri)
    raise ImportError, "Failed to fetch AWS IP ranges: #{response.code}" unless response.code == '200'

    data = JSON.parse(response.body)
    imported_count = 0
    batch_size = options[:batch_size] || 1000
    batch = []

    # Filter for relevant services (can be customized)
    relevant_services = options[:aws_services] || ['AMAZON', 'ROUTE53', 'EC2', 'CLOUDFRONT']

    data['prefixes'].each do |prefix|
      # Focus on relevant services and regions
      next unless relevant_services.include?(prefix['service'])

      network_range = {
        network: prefix['ip_prefix'],
        source: 'bot_import_amazon_aws',
        asn: nil, # AWS doesn't provide ASN in this feed
        asn_org: 'Amazon Web Services',
        company: 'Amazon',
        country: nil,
        is_datacenter: true,
        is_proxy: false,
        is_vpn: false,
        additional_data: {
          aws_service: prefix['service'],
          aws_region: prefix['region'],
          aws_network_border_group: prefix['network_border_group'],
          import_date: Time.current.iso8601
        }.to_json
      }

      batch << network_range

      if batch.size >= batch_size
        imported_count += import_batch(batch, 'Amazon AWS')
        batch = []
        puts "Imported #{imported_count} AWS ranges..."
      end
    end

    # Import remaining records
    if batch.any?
      imported_count += import_batch(batch, 'Amazon AWS')
    end

    puts "Amazon AWS import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: 'Amazon AWS' }
  rescue Net::OpenTimeout, Net::ReadTimeout => e
    raise ImportError, "Network timeout while fetching AWS ranges: #{e.message}"
  rescue JSON::ParserError => e
    raise ImportError, "Failed to parse AWS JSON response: #{e.message}"
  end

  # Google crawler IP ranges parser
  def self.parse_google_ranges(source, options = {})
    require 'net/http'
    require 'uri'

    imported_count = 0

    # Try each potential URL; the first one that succeeds wins
    urls = Array(source[:urls] || source[:url])

    urls.each do |url|
      begin
        puts "Attempting to fetch Google ranges from: #{url}"

        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true
        http.read_timeout = 30

        response = http.get(uri.request_uri)
        next unless response.code == '200'

        data = JSON.parse(response.body)

        batch_size = options[:batch_size] || 1000
        batch = []

        # Parse Google crawler format (varies by file type)
        if data.is_a?(Array)
          data.each do |entry|
            next unless entry['cidr'] || entry['prefix']

            network_range = {
              network: entry['cidr'] || entry['prefix'],
              source: 'bot_import_google',
              asn: nil,
              asn_org: 'Google LLC',
              company: 'Google',
              country: nil,
              is_datacenter: true,
              is_proxy: false,
              is_vpn: false,
              additional_data: {
                crawler_type: entry['crawler_type'] || 'unknown',
                user_agent: entry['user_agent'],
                import_date: Time.current.iso8601
              }.to_json
            }

            batch << network_range

            if batch.size >= batch_size
              imported_count += import_batch(batch, 'Google')
              batch = []
              puts "Imported #{imported_count} Google ranges..."
            end
          end
        end

        # Import remaining records
        if batch.any?
          imported_count += import_batch(batch, 'Google')
        end

        puts "Google import completed: #{imported_count} ranges imported"
        return { imported: imported_count, source: 'Google' }
      rescue => e
        Rails.logger.warn "Failed to fetch from #{url}: #{e.message}"
        next
      end
    end

    raise ImportError, "Failed to fetch Google crawler ranges from any URL"
  end

  # Microsoft Bing crawler IP ranges parser
  def self.parse_microsoft_ranges(source, options = {})
    # Microsoft requires special handling as they may not provide direct JSON
    # This is a placeholder implementation
    puts "Microsoft Bing crawler import requires manual configuration or web scraping"
    puts "Refer to: https://www.bing.com/webmaster/help/which-crawlers-does-bing-use"

    {
      imported: 0,
      source: 'Microsoft Bing',
      note: 'Manual configuration required - Microsoft does not provide direct IP range feeds'
    }
  end

  # Anthropic service IP ranges parser
  def self.parse_anthropic_ranges(source, options = {})
    # Anthropic ranges may need to be manually configured
    # This is a placeholder implementation
    puts "Anthropic Claude service ranges require manual configuration"
    puts "Refer to: https://docs.anthropic.com/claude/reference/ip_ranges"

    {
      imported: 0,
      source: 'Anthropic',
      note: 'Manual configuration required - Anthropic does not provide automated IP range feeds'
    }
  end

  # OpenAI crawler IP ranges parser
  def self.parse_openai_ranges(source, options = {})
    require 'net/http'
    require 'uri'

    # Determine crawler type from source name up front, so the rescue
    # clauses below can reference it even when the HTTP request fails
    crawler_type = source[:name].gsub('OpenAI ', '').downcase

    uri = URI.parse(source[:url])
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30

    response = http.get(uri.request_uri)
    raise ImportError, "Failed to fetch OpenAI IP ranges: #{response.code}" unless response.code == '200'

    data = JSON.parse(response.body)
    imported_count = 0
    batch_size = options[:batch_size] || 1000
    batch = []

    data.each do |entry|
      # OpenAI provides IP ranges as either CIDR notation or single IPs
      ip_range = entry['cidr'] || entry['ip_prefix'] || entry['ip']
      next unless ip_range

      # Convert single IPs to /32
      network = ip_range.include?('/') ? ip_range : "#{ip_range}/32"

      network_range = {
        network: network,
        source: "bot_import_openai_#{crawler_type}",
        asn: nil,
        asn_org: 'OpenAI',
        company: 'OpenAI',
        country: nil,
        is_datacenter: true,
        is_proxy: false,
        is_vpn: false,
        additional_data: {
          crawler_type: crawler_type,
          crawler_purpose: crawler_purpose(crawler_type),
          user_agent: openai_user_agent(crawler_type),
          import_date: Time.current.iso8601,
          source_url: source[:url]
        }.to_json
      }

      batch << network_range

      if batch.size >= batch_size
        imported_count += import_batch(batch, "OpenAI #{crawler_type}")
        batch = []
        puts "Imported #{imported_count} OpenAI #{crawler_type} ranges..."
      end
    end

    # Import remaining records
    if batch.any?
      imported_count += import_batch(batch, "OpenAI #{crawler_type}")
    end

    puts "OpenAI #{crawler_type} import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: "OpenAI #{crawler_type}" }
  rescue Net::OpenTimeout, Net::ReadTimeout => e
    raise ImportError, "Network timeout while fetching OpenAI #{crawler_type} ranges: #{e.message}"
  rescue JSON::ParserError => e
    raise ImportError, "Failed to parse OpenAI #{crawler_type} JSON response: #{e.message}"
  end

  def self.import_batch(batch_data, source_name)
    # Check for existing ranges to avoid duplicates
    existing_networks = NetworkRange.where(network: batch_data.map { |d| d[:network] }).pluck(:network)
    new_ranges = batch_data.reject { |d| existing_networks.include?(d[:network]) }

    if new_ranges.any?
      NetworkRange.insert_all(new_ranges)
      puts "Imported #{new_ranges.size} new #{source_name} ranges (#{batch_data.size - new_ranges.size} duplicates skipped)"
    else
      puts "No new #{source_name} ranges to import (all duplicates)"
    end

    new_ranges.size
  rescue => e
    Rails.logger.error "Failed to import #{source_name} batch: #{e.message}"

    # Fallback to individual imports (new_ranges may be nil if the duplicate
    # check itself failed, so fall back to the full batch)
    imported = 0
    (new_ranges || batch_data).each do |data|
      begin
        NetworkRange.create!(data)
        imported += 1
      rescue => individual_error
        Rails.logger.error "Failed to import individual #{source_name} record: #{individual_error.message}"
      end
    end

    imported
  end

  # Helper method to determine crawler purpose based on type
  def self.crawler_purpose(crawler_type)
    case crawler_type
    when 'searchbot'
      'Used to link to and surface websites in search results in ChatGPT\'s search features'
    when 'chatgpt-user'
      'User actions in ChatGPT and Custom GPTs, including GPT Actions'
    when 'gptbot'
      'Used to crawl content for training OpenAI\'s generative AI foundation models'
    else
      'Unknown purpose'
    end
  end

  # Helper method to get OpenAI user agent strings
  def self.openai_user_agent(crawler_type)
    case crawler_type
    when 'searchbot'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot'
    when 'chatgpt-user'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot'
    when 'gptbot'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot'
    else
      'Unknown user agent'
    end
  end

  # Cloudflare IP ranges parser
  def self.parse_cloudflare_ranges(source, options = {})
    require 'net/http'
    require 'uri'

    imported_count = 0
    urls = Array(source[:urls])
    batch_size = options[:batch_size] || 1000
    batch = []

    urls.each do |url|
      begin
        puts "Fetching Cloudflare ranges from: #{url}"

        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true
        http.read_timeout = 30

        response = http.get(uri.request_uri)
        raise ImportError, "Failed to fetch Cloudflare ranges: #{response.code}" unless response.code == '200'

        # Cloudflare provides plain text CIDR lists
        lines = response.body.split("\n")
        ip_version = url.include?('ips-v4') ? 4 : 6

        lines.each do |line|
          line = line.strip
          next if line.empty? || line.start_with?('#')

          # Validate CIDR format
          next unless line.match?(/\A[0-9a-fA-F:.]+\/\d+\z/)

          network_range = {
            network: line,
            source: 'bot_import_cloudflare',
            asn: nil,
            asn_org: 'Cloudflare',
            company: 'Cloudflare',
            country: nil,
            is_datacenter: true,
            is_proxy: false,
            is_vpn: false,
            additional_data: {
              ip_version: ip_version,
              import_date: Time.current.iso8601,
              source_url: url,
              service_type: 'cdn_and_security'
            }.to_json
          }

          batch << network_range

          if batch.size >= batch_size
            imported_count += import_batch(batch, 'Cloudflare')
            batch = []
            puts "Imported #{imported_count} Cloudflare ranges..."
          end
        end
      rescue => e
        Rails.logger.warn "Failed to fetch Cloudflare ranges from #{url}: #{e.message}"
        next
      end
    end

    # Import remaining records
    if batch.any?
      imported_count += import_batch(batch, 'Cloudflare')
    end

    puts "Cloudflare import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: 'Cloudflare' }
  end

  # Facebook/Meta crawler ranges parser (placeholder)
  def self.parse_facebook_ranges(source, options = {})
    puts "Facebook/Meta crawler ranges require web scraping or manual configuration"
    puts "Refer to: https://developers.facebook.com/docs/sharing/webmasters/crawler/"

    {
      imported: 0,
      source: 'Facebook/Meta',
      note: 'Manual configuration required - Facebook does not provide automated IP range feeds'
    }
  end

  # Applebot crawler ranges parser (placeholder)
  def self.parse_applebot_ranges(source, options = {})
    puts "Applebot ranges require web scraping or manual configuration"
    puts "Refer to: https://support.apple.com/en-us/HT204683"

    {
      imported: 0,
      source: 'Applebot',
      note: 'Manual configuration required - Apple does not provide automated IP range feeds'
    }
  end

  # DuckDuckBot crawler ranges parser (placeholder)
  def self.parse_duckduckgo_ranges(source, options = {})
    puts "DuckDuckBot ranges require web scraping or manual configuration"
    puts "Refer to: https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/"

    {
      imported: 0,
      source: 'DuckDuckBot',
      note: 'Manual configuration required - DuckDuckGo does not provide automated IP range feeds'
    }
  end
end
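
A minimal usage sketch for the importer (not part of this commit); the rake task name is a hypothetical placeholder:

namespace :bot_networks do
  desc 'Import official bot network ranges'
  task import: :environment do
    # Import one source with a custom batch size...
    BotNetworkRangeImporter.import_from_source(:amazon_aws, batch_size: 500)

    # ...or walk every registered source; per-source failures are collected
    # in the results hash rather than aborting the run
    results = BotNetworkRangeImporter.import_all_sources
    results.each { |source, result| puts "#{source}: #{result[:imported] || 0} imported" }
  end
end
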
@@ -53,4 +53,107 @@ class Ipapi
        next
      end
    end
  end

  # Parse company/datacenter network range from IPAPI data
  # Handles "X.X.X.X - Y.Y.Y.Y" format and converts to CIDR
  def self.parse_company_network_range(ipapi_data)
    # Try company.network first, then datacenter.network
    network_range = ipapi_data.dig('company', 'network') || ipapi_data.dig('datacenter', 'network')
    return nil if network_range.blank?

    # Parse "X.X.X.X - Y.Y.Y.Y" format
    if network_range.include?(' - ')
      start_ip_str, end_ip_str = network_range.split(' - ').map(&:strip)

      begin
        start_ip = IPAddr.new(start_ip_str)
        end_ip = IPAddr.new(end_ip_str)

        # Calculate the number of IPs in the range
        num_ips = end_ip.to_i - start_ip.to_i + 1

        # Calculate prefix length from the number of IPs:
        # num_ips = 2^(32 - prefix_length) for IPv4
        prefix_length = 32 - Math.log2(num_ips).to_i

        # Verify it's a valid CIDR block (power of 2)
        if 2**(32 - prefix_length) == num_ips
          cidr = "#{start_ip_str}/#{prefix_length}"
          Rails.logger.debug "Parsed company network range: #{network_range} -> #{cidr}"
          return cidr
        else
          Rails.logger.warn "Network range #{network_range} is not a valid CIDR block (#{num_ips} IPs)"
          return nil
        end
      rescue IPAddr::InvalidAddressError => e
        Rails.logger.error "Invalid IP in company network range: #{network_range} (#{e.message})"
        return nil
      end
    elsif network_range.include?('/')
      # Already in CIDR format
      return network_range
    else
      Rails.logger.warn "Unknown network range format: #{network_range}"
      return nil
    end
  end
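
  # Worked example (illustrative): "52.95.0.0 - 52.95.3.255" spans 1024
  # addresses, so prefix_length = 32 - log2(1024) = 22 and the method returns
  # "52.95.0.0/22". A range like "10.0.0.0 - 10.0.0.99" covers 100 addresses,
  # which is not a power of two, so it is rejected with a warning instead of
  # being forced into an ill-fitting CIDR block.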

  # Populate NetworkRange attributes from IPAPI data
  def self.populate_network_attributes(network_range, ipapi_data)
    network_range.asn = ipapi_data.dig('asn', 'asn')
    network_range.asn_org = ipapi_data.dig('asn', 'org') || ipapi_data.dig('company', 'name')
    network_range.company = ipapi_data.dig('company', 'name')
    network_range.country = ipapi_data.dig('location', 'country_code')
    network_range.is_datacenter = ipapi_data['is_datacenter'] || false
    network_range.is_vpn = ipapi_data['is_vpn'] || false
    network_range.is_proxy = ipapi_data['is_proxy'] || false
  end

  # Process IPAPI data and create network ranges
  # Returns a hash with the created/updated NetworkRange objects and the
  # broadest CIDR found, for deduplication
  def self.process_ipapi_data(ipapi_data, tracking_network)
    created_networks = []

    # Extract and create company/datacenter network range if present
    company_network_cidr = parse_company_network_range(ipapi_data)
    if company_network_cidr.present?
      company_range = NetworkRange.find_or_create_by(network: company_network_cidr) do |nr|
        nr.source = 'api_imported'
        nr.creation_reason = "Company allocation from IPAPI for #{tracking_network.cidr}"
      end

      # Always update attributes (whether new or existing)
      populate_network_attributes(company_range, ipapi_data)
      company_range.set_network_data(:ipapi, ipapi_data)
      company_range.last_api_fetch = Time.current
      company_range.save!

      created_networks << company_range
      Rails.logger.info "Created/updated company network: #{company_range.cidr}"
    end

    # Extract and create ASN route network if present
    ipapi_route = ipapi_data.dig('asn', 'route')
    if ipapi_route.present? && ipapi_route != tracking_network.cidr
      route_network = NetworkRange.find_or_create_by(network: ipapi_route) do |nr|
        nr.source = 'api_imported'
        nr.creation_reason = "BGP route from IPAPI lookup for #{tracking_network.cidr}"
      end

      # Always update attributes (whether new or existing)
      populate_network_attributes(route_network, ipapi_data)
      route_network.set_network_data(:ipapi, ipapi_data)
      route_network.last_api_fetch = Time.current
      route_network.save!

      created_networks << route_network
      Rails.logger.info "Created/updated BGP route network: #{route_network.cidr}"
    end

    # Return both the created networks and the broadest CIDR for deduplication
    {
      networks: created_networks,
      broadest_cidr: company_network_cidr.presence || ipapi_route || tracking_network.cidr
    }
  end
end