# frozen_string_literal: true

require 'net/http'
require 'uri'
require 'json'

# BotNetworkRangeImporter - Service for importing official bot network ranges
#
# Imports network ranges from official bot provider sources such as:
# - Amazon AWS: https://ip-ranges.amazonaws.com/ip-ranges.json
# - Google: official crawler IP lists
# - Microsoft/Bing: bot network ranges
# - Anthropic: service network ranges
# - OpenAI: service network ranges
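#
# Usage (a sketch; assumes a Rails app with a NetworkRange model, which the
# import helpers below write to):
#
#   BotNetworkRangeImporter.import_from_source(:amazon_aws, batch_size: 500)
#   BotNetworkRangeImporter.import_all_sources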

class BotNetworkRangeImporter
  class ImportError < StandardError; end

  # Official sources for bot network ranges
  BOT_SOURCES = {
    amazon_aws: {
      name: 'Amazon AWS',
      url: 'https://ip-ranges.amazonaws.com/ip-ranges.json',
      format: :json,
      parser: :parse_aws_ranges,
      description: 'Official AWS IP ranges including Amazonbot and other services'
    },
    google: {
      name: 'Google',
      # NOTE: these URLs may need to be updated against current Google documentation
      urls: [
        'https://developers.google.com/search/docs/files/googlebot.json',
        'https://developers.google.com/search/docs/files/special-crawlers.json'
      ],
      format: :json,
      parser: :parse_google_ranges,
      description: 'Googlebot and other Google crawler IP ranges'
    },
    microsoft_bing: {
      name: 'Microsoft Bing',
      # NOTE: Microsoft may require web scraping or API access
      url: 'https://www.bing.com/toolbox/bingbot.json',
      format: :json,
      parser: :parse_microsoft_ranges,
      description: 'Bingbot and other Microsoft crawler IP ranges'
    },
    anthropic: {
      name: 'Anthropic Claude',
      # NOTE: Anthropic ranges may need manual updates or a different approach
      url: 'https://docs.anthropic.com/claude/reference/ip_ranges',
      format: :html,
      parser: :parse_anthropic_ranges,
      description: 'Anthropic Claude API service IP ranges'
    },
    openai_searchbot: {
      name: 'OpenAI SearchBot',
      url: 'https://openai.com/searchbot.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI SearchBot for ChatGPT search features'
    },
    openai_chatgpt_user: {
      name: 'OpenAI ChatGPT-User',
      url: 'https://openai.com/chatgpt-user.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI ChatGPT-User for user actions in ChatGPT and Custom GPTs'
    },
    openai_gptbot: {
      name: 'OpenAI GPTBot',
      url: 'https://openai.com/gptbot.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI GPTBot for training AI foundation models'
    },
    cloudflare: {
      name: 'Cloudflare',
      urls: [
        'https://www.cloudflare.com/ips-v4',
        'https://www.cloudflare.com/ips-v6'
      ],
      format: :text,
      parser: :parse_cloudflare_ranges,
      description: 'Cloudflare network ranges including their crawlers and services'
    },
    facebook: {
      name: 'Facebook/Meta',
      url: 'https://developers.facebook.com/docs/sharing/webmasters/crawler/',
      format: :html,
      parser: :parse_facebook_ranges,
      description: 'Facebook/Meta crawlers and bots'
    },
    applebot: {
      name: 'Applebot',
      url: 'https://support.apple.com/en-us/HT204683',
      format: :html,
      parser: :parse_applebot_ranges,
      description: 'Applebot crawler for Apple search and Siri'
    },
    duckduckgo: {
      name: 'DuckDuckBot',
      url: 'https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/',
      format: :html,
      parser: :parse_duckduckgo_ranges,
      description: 'DuckDuckGo search crawler'
    }
  }.freeze

  def self.import_from_source(source_key, options = {})
    source = BOT_SOURCES[source_key.to_sym]
    raise ImportError, "Unknown source: #{source_key}" unless source

    puts "Importing bot network ranges from #{source[:name]}..."

    parser = source[:parser]
    raise ImportError, "Unknown parser: #{parser}" unless respond_to?(parser, true)

    # Every parser shares the same (source, options) signature, so dispatch
    # directly instead of enumerating each one in a case statement
    send(parser, source, options)
  end

  def self.import_all_sources(options = {})
    results = {}

    BOT_SOURCES.each do |source_key, source|
      puts "\n" + ('=' * 50)
      puts "Processing #{source[:name]}..."
      puts '=' * 50

      begin
        results[source_key] = import_from_source(source_key, options)
      rescue StandardError => e
        Rails.logger.error "Failed to import from #{source[:name]}: #{e.message}"
        results[source_key] = { error: e.message, imported: 0 }
      end
    end

    puts "\n" + ('=' * 50)
    puts 'Import Summary'
    puts '=' * 50

    results.each do |source, result|
      if result[:error]
        puts "#{source}: FAILED - #{result[:error]}"
      else
        puts "#{source}: SUCCESS - #{result[:imported]} ranges imported"
      end
    end

    results
  end

  # -- Parsers and helpers --------------------------------------------------
  #
  # NOTE: a bare `private` keyword does not apply to methods defined with
  # `def self.`, so these helpers are hidden via `private_class_method` at
  # the bottom of the class instead.

  # Amazon AWS IP ranges parser
  def self.parse_aws_ranges(source, options = {})
    uri = URI.parse(source[:url])
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30

    response = http.get(uri.request_uri)
    raise ImportError, "Failed to fetch AWS IP ranges: #{response.code}" unless response.code == '200'

    data = JSON.parse(response.body)
    imported_count = 0
    batch_size = options[:batch_size] || 1000
    batch = []

    # Filter for relevant services (can be customized via options[:aws_services])
    relevant_services = options[:aws_services] || %w[AMAZON ROUTE53 EC2 CLOUDFRONT]

    # Only the IPv4 `prefixes` array is consumed here; the feed also publishes
    # `ipv6_prefixes`, which this parser currently ignores
    Array(data['prefixes']).each do |prefix|
      # Focus on relevant services and regions
      next unless relevant_services.include?(prefix['service'])

      network_range = {
        network: prefix['ip_prefix'],
        source: 'bot_import_amazon_aws',
        asn: nil, # AWS does not provide ASNs in this feed
        asn_org: 'Amazon Web Services',
        company: 'Amazon',
        country: nil,
        is_datacenter: true,
        is_proxy: false,
        is_vpn: false,
        additional_data: {
          aws_service: prefix['service'],
          aws_region: prefix['region'],
          aws_network_border_group: prefix['network_border_group'],
          import_date: Time.current.iso8601
        }.to_json
      }

      batch << network_range

      if batch.size >= batch_size
        imported_count += import_batch(batch, 'Amazon AWS')
        batch = []
        puts "Imported #{imported_count} AWS ranges..."
      end
    end

    # Import any remaining records
    imported_count += import_batch(batch, 'Amazon AWS') if batch.any?

    puts "Amazon AWS import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: 'Amazon AWS' }
  rescue Net::OpenTimeout, Net::ReadTimeout => e
    raise ImportError, "Network timeout while fetching AWS ranges: #{e.message}"
  rescue JSON::ParserError => e
    raise ImportError, "Failed to parse AWS JSON response: #{e.message}"
  end
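  # For reference, the AWS feed is shaped roughly like this (abridged; the
  # values are illustrative, not current data):
  #
  #   {
  #     "syncToken": "...",
  #     "createDate": "...",
  #     "prefixes": [
  #       {
  #         "ip_prefix": "3.5.140.0/22",
  #         "region": "ap-northeast-2",
  #         "service": "AMAZON",
  #         "network_border_group": "ap-northeast-2"
  #       }
  #     ],
  #     "ipv6_prefixes": [ ... ]
  #   }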

  # Google crawler IP ranges parser
  def self.parse_google_ranges(source, options = {})
    imported_count = 0

    # Try each candidate URL until one succeeds
    urls = Array(source[:urls] || source[:url])

    urls.each do |url|
      begin
        puts "Attempting to fetch Google ranges from: #{url}"

        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true
        http.read_timeout = 30

        response = http.get(uri.request_uri)
        next unless response.code == '200'

        data = JSON.parse(response.body)
        batch_size = options[:batch_size] || 1000
        batch = []

        # Google's published files wrap entries in a top-level "prefixes"
        # array keyed as ipv4Prefix/ipv6Prefix; also accept a bare array
        # with cidr/prefix keys, since the format varies by file type
        entries = data.is_a?(Hash) ? Array(data['prefixes']) : Array(data)

        entries.each do |entry|
          cidr = entry['cidr'] || entry['prefix'] || entry['ipv4Prefix'] || entry['ipv6Prefix']
          next unless cidr

          network_range = {
            network: cidr,
            source: 'bot_import_google',
            asn: nil,
            asn_org: 'Google LLC',
            company: 'Google',
            country: nil,
            is_datacenter: true,
            is_proxy: false,
            is_vpn: false,
            additional_data: {
              crawler_type: entry['crawler_type'] || 'unknown',
              user_agent: entry['user_agent'],
              import_date: Time.current.iso8601
            }.to_json
          }

          batch << network_range

          if batch.size >= batch_size
            imported_count += import_batch(batch, 'Google')
            batch = []
            puts "Imported #{imported_count} Google ranges..."
          end
        end

        # Import any remaining records
        imported_count += import_batch(batch, 'Google') if batch.any?

        puts "Google import completed: #{imported_count} ranges imported"
        return { imported: imported_count, source: 'Google' }
      rescue StandardError => e
        Rails.logger.warn "Failed to fetch from #{url}: #{e.message}"
        next
      end
    end

    raise ImportError, 'Failed to fetch Google crawler ranges from any URL'
  end
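  # For reference, googlebot.json wraps entries in a top-level "prefixes"
  # array, and the OpenAI feeds below appear to follow the same shape
  # (abridged; values illustrative):
  #
  #   {
  #     "creationTime": "2025-01-01T00:00:00.000000",
  #     "prefixes": [
  #       { "ipv4Prefix": "66.249.64.0/27" },
  #       { "ipv6Prefix": "2001:4860:4801:10::/64" }
  #     ]
  #   }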

  # Microsoft Bing crawler IP ranges parser (placeholder)
  def self.parse_microsoft_ranges(source, options = {})
    # Microsoft may not provide a directly consumable JSON feed, so this
    # source currently requires manual configuration or web scraping
    puts 'Microsoft Bing crawler import requires manual configuration or web scraping'
    puts 'Refer to: https://www.bing.com/webmaster/help/which-crawlers-does-bing-use'

    {
      imported: 0,
      source: 'Microsoft Bing',
      note: 'Manual configuration required - Microsoft does not provide direct IP range feeds'
    }
  end

  # Anthropic service IP ranges parser (placeholder)
  def self.parse_anthropic_ranges(source, options = {})
    # Anthropic publishes its ranges in documentation rather than a
    # machine-readable feed, so this source requires manual configuration
    puts 'Anthropic Claude service ranges require manual configuration'
    puts 'Refer to: https://docs.anthropic.com/claude/reference/ip_ranges'

    {
      imported: 0,
      source: 'Anthropic',
      note: 'Manual configuration required - Anthropic does not provide automated IP range feeds'
    }
  end

  # OpenAI crawler IP ranges parser
  def self.parse_openai_ranges(source, options = {})
    # Determine the crawler type from the source name up front so it is
    # available in the rescue clauses below
    crawler_type = source[:name].sub('OpenAI ', '').downcase

    uri = URI.parse(source[:url])
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30

    response = http.get(uri.request_uri)
    raise ImportError, "Failed to fetch OpenAI IP ranges: #{response.code}" unless response.code == '200'

    data = JSON.parse(response.body)
    imported_count = 0
    batch_size = options[:batch_size] || 1000
    batch = []

    # OpenAI provides IP ranges as either CIDR notation or single IPs;
    # accept both a bare array and a Google-style "prefixes" wrapper
    entries = data.is_a?(Hash) ? Array(data['prefixes']) : Array(data)

    entries.each do |entry|
      ip_range = entry['cidr'] || entry['ip_prefix'] || entry['ipv4Prefix'] ||
                 entry['ipv6Prefix'] || entry['ip']
      next unless ip_range

      # Normalize bare IPs to CIDR (/32 for IPv4, /128 for IPv6)
      network =
        if ip_range.include?('/')
          ip_range
        elsif ip_range.include?(':')
          "#{ip_range}/128"
        else
          "#{ip_range}/32"
        end

      network_range = {
        network: network,
        source: "bot_import_openai_#{crawler_type}",
        asn: nil,
        asn_org: 'OpenAI',
        company: 'OpenAI',
        country: nil,
        is_datacenter: true,
        is_proxy: false,
        is_vpn: false,
        additional_data: {
          crawler_type: crawler_type,
          crawler_purpose: crawler_purpose(crawler_type),
          user_agent: openai_user_agent(crawler_type),
          import_date: Time.current.iso8601,
          source_url: source[:url]
        }.to_json
      }

      batch << network_range

      if batch.size >= batch_size
        imported_count += import_batch(batch, "OpenAI #{crawler_type}")
        batch = []
        puts "Imported #{imported_count} OpenAI #{crawler_type} ranges..."
      end
    end

    # Import any remaining records
    imported_count += import_batch(batch, "OpenAI #{crawler_type}") if batch.any?

    puts "OpenAI #{crawler_type} import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: "OpenAI #{crawler_type}" }
  rescue Net::OpenTimeout, Net::ReadTimeout => e
    raise ImportError, "Network timeout while fetching OpenAI #{crawler_type} ranges: #{e.message}"
  rescue JSON::ParserError => e
    raise ImportError, "Failed to parse OpenAI #{crawler_type} JSON response: #{e.message}"
  end

  def self.import_batch(batch_data, source_name)
    # Check for existing ranges to avoid duplicates
    existing_networks = NetworkRange.where(network: batch_data.map { |d| d[:network] }).pluck(:network)
    new_ranges = batch_data.reject { |d| existing_networks.include?(d[:network]) }

    if new_ranges.any?
      # insert_all skips validations and callbacks; duplicates were filtered above
      NetworkRange.insert_all(new_ranges)
      puts "Imported #{new_ranges.size} new #{source_name} ranges " \
           "(#{batch_data.size - new_ranges.size} duplicates skipped)"
    else
      puts "No new #{source_name} ranges to import (all duplicates)"
    end

    new_ranges.size
  rescue StandardError => e
    Rails.logger.error "Failed to import #{source_name} batch: #{e.message}"

    # Fall back to individual inserts; new_ranges is nil when the failure
    # happened before it was assigned, so retry the whole batch in that case
    imported = 0
    (new_ranges || batch_data).each do |data|
      begin
        NetworkRange.create!(data)
        imported += 1
      rescue StandardError => individual_error
        Rails.logger.error "Failed to import individual #{source_name} record: #{individual_error.message}"
      end
    end
    imported
  end
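  # import_batch assumes a NetworkRange model backed by a table roughly like
  # the sketch below (a hypothetical migration; the real schema may differ):
  #
  #   create_table :network_ranges do |t|
  #     t.string  :network, null: false, index: { unique: true }
  #     t.string  :source
  #     t.integer :asn
  #     t.string  :asn_org
  #     t.string  :company
  #     t.string  :country
  #     t.boolean :is_datacenter, default: false
  #     t.boolean :is_proxy, default: false
  #     t.boolean :is_vpn, default: false
  #     t.text    :additional_data
  #     t.timestamps
  #   end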

  # Helper to describe the crawler purpose for a given type
  def self.crawler_purpose(crawler_type)
    case crawler_type
    when 'searchbot'
      'Used to link to and surface websites in search results in ChatGPT\'s search features'
    when 'chatgpt-user'
      'User actions in ChatGPT and Custom GPTs, including GPT Actions'
    when 'gptbot'
      'Used to crawl content for training OpenAI\'s generative AI foundation models'
    else
      'Unknown purpose'
    end
  end

  # Helper to get OpenAI user agent strings
  def self.openai_user_agent(crawler_type)
    case crawler_type
    when 'searchbot'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot'
    when 'chatgpt-user'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot'
    when 'gptbot'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot'
    else
      'Unknown user agent'
    end
  end

  # Cloudflare IP ranges parser
  def self.parse_cloudflare_ranges(source, options = {})
    imported_count = 0
    urls = Array(source[:urls])
    batch_size = options[:batch_size] || 1000
    batch = []

    urls.each do |url|
      begin
        puts "Fetching Cloudflare ranges from: #{url}"

        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true
        http.read_timeout = 30

        response = http.get(uri.request_uri)
        raise ImportError, "Failed to fetch Cloudflare ranges: #{response.code}" unless response.code == '200'

        # Cloudflare publishes plain-text CIDR lists, one range per line
        lines = response.body.split("\n")
        ip_version = url.include?('ips-v4') ? 4 : 6

        lines.each do |line|
          line = line.strip
          next if line.empty? || line.start_with?('#')

          # Validate CIDR format
          next unless line.match?(%r{\A[0-9a-fA-F:.]+/\d+\z})

          network_range = {
            network: line,
            source: 'bot_import_cloudflare',
            asn: nil,
            asn_org: 'Cloudflare',
            company: 'Cloudflare',
            country: nil,
            is_datacenter: true,
            is_proxy: false,
            is_vpn: false,
            additional_data: {
              ip_version: ip_version,
              import_date: Time.current.iso8601,
              source_url: url,
              service_type: 'cdn_and_security'
            }.to_json
          }

          batch << network_range

          if batch.size >= batch_size
            imported_count += import_batch(batch, 'Cloudflare')
            batch = []
            puts "Imported #{imported_count} Cloudflare ranges..."
          end
        end
      rescue StandardError => e
        Rails.logger.warn "Failed to fetch Cloudflare ranges from #{url}: #{e.message}"
        next
      end
    end

    # Import any remaining records
    imported_count += import_batch(batch, 'Cloudflare') if batch.any?

    puts "Cloudflare import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: 'Cloudflare' }
  end

  # Facebook/Meta crawler ranges parser (placeholder)
  def self.parse_facebook_ranges(source, options = {})
    puts 'Facebook/Meta crawler ranges require web scraping or manual configuration'
    puts 'Refer to: https://developers.facebook.com/docs/sharing/webmasters/crawler/'

    {
      imported: 0,
      source: 'Facebook/Meta',
      note: 'Manual configuration required - Facebook does not provide automated IP range feeds'
    }
  end

  # Applebot crawler ranges parser (placeholder)
  def self.parse_applebot_ranges(source, options = {})
    puts 'Applebot ranges require web scraping or manual configuration'
    puts 'Refer to: https://support.apple.com/en-us/HT204683'

    {
      imported: 0,
      source: 'Applebot',
      note: 'Manual configuration required - Apple does not provide automated IP range feeds'
    }
  end

  # DuckDuckBot crawler ranges parser (placeholder)
  def self.parse_duckduckgo_ranges(source, options = {})
    puts 'DuckDuckBot ranges require web scraping or manual configuration'
    puts 'Refer to: https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/'

    {
      imported: 0,
      source: 'DuckDuckBot',
      note: 'Manual configuration required - DuckDuckGo does not provide automated IP range feeds'
    }
  end

  private_class_method :parse_aws_ranges, :parse_google_ranges,
                       :parse_microsoft_ranges, :parse_anthropic_ranges,
                       :parse_openai_ranges, :parse_cloudflare_ranges,
                       :parse_facebook_ranges, :parse_applebot_ranges,
                       :parse_duckduckgo_ranges, :import_batch,
                       :crawler_purpose, :openai_user_agent
end
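
# A minimal wiring sketch for running imports on a schedule (a hypothetical
# lib/tasks/bot_ranges.rake; the namespace and task names are assumptions,
# not part of this service):
#
#   namespace :bot_ranges do
#     desc 'Import bot network ranges from all configured sources'
#     task import: :environment do
#       BotNetworkRangeImporter.import_all_sources(batch_size: 1000)
#     end
#   end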