Fix some blocked/allowed laggards after migrating. Add DuckDB for outstanding analytics performance. Start adding an importer for all bot networks

app/services/analytics_duckdb_service.rb (new file, 284 lines)
@@ -0,0 +1,284 @@
# frozen_string_literal: true

# Service for managing DuckDB analytics database
# Provides fast analytical queries on events data using columnar storage
class AnalyticsDuckdbService
  include Singleton

  DUCKDB_PATH = Rails.root.join("storage", "analytics.duckdb").to_s
  BATCH_SIZE = 10_000

  # Execute block with connection, ensuring database and connection are closed afterward
  def with_connection
    db = DuckDB::Database.open(DUCKDB_PATH)
    conn = db.connect
    yield conn
  ensure
    conn&.close
    db&.close
  end

  # Create events table if it doesn't exist (must be called within with_connection block)
  def setup_schema(conn)
    conn.execute(<<~SQL)
      CREATE TABLE IF NOT EXISTS events (
        id BIGINT PRIMARY KEY,
        timestamp TIMESTAMP NOT NULL,
        ip_address VARCHAR,
        network_range_id BIGINT,
        country VARCHAR,
        company VARCHAR,
        asn INTEGER,
        asn_org VARCHAR,
        is_datacenter BOOLEAN,
        is_vpn BOOLEAN,
        is_proxy BOOLEAN,
        waf_action INTEGER,
        request_path VARCHAR,
        user_agent VARCHAR
      )
    SQL

    Rails.logger.info "[DuckDB] Schema setup complete"
  end

  # Get timestamp of oldest event in DuckDB
  # Returns nil if table is empty
  def oldest_event_timestamp
    with_connection do |conn|
      result = conn.query("SELECT MIN(timestamp) as oldest FROM events")
      first_row = result.first
      first_row&.first # Returns the value or nil
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB] Error getting oldest timestamp: #{e.message}"
    nil
  end

  # Get timestamp of newest event in DuckDB
  # Returns nil if table is empty
  def newest_event_timestamp
    with_connection do |conn|
      result = conn.query("SELECT MAX(timestamp) as newest FROM events")
      first_row = result.first
      first_row&.first # Returns the value or nil
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB] Error getting newest timestamp: #{e.message}"
    nil
  end

  # Get maximum event ID already synced to DuckDB
  def max_synced_id
    with_connection do |conn|
      result = conn.query("SELECT COALESCE(MAX(id), 0) as max_id FROM events")
      first_row = result.first
      first_row&.first || 0
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB] Error getting max ID: #{e.message}"
    0
  end

  # Sync new events from PostgreSQL to DuckDB
  # Uses PostgreSQL cursor for memory-efficient streaming
  # Uses Appender API for fast bulk inserts
  # Filters by ID to avoid duplicates
  def sync_new_events(from_timestamp)
    total_synced = 0

    with_connection do |conn|
      # Ensure table exists
      setup_schema(conn)

      # Get max ID already in DuckDB to avoid duplicates
      max_id_result = conn.query("SELECT COALESCE(MAX(id), 0) as max_id FROM events")
      max_id = max_id_result.first&.first || 0
      Rails.logger.info "[DuckDB] Syncing events from #{from_timestamp}, max_id=#{max_id}"

      start_time = Time.current
      appender = nil
      batch_count = 0

      begin
        # Use PostgreSQL cursor for memory-efficient streaming
        Event.where("timestamp >= ? AND id > ?", from_timestamp, max_id)
             .select(
               :id,
               :timestamp,
               :ip_address,
               :network_range_id,
               :country,
               :company,
               :asn,
               :asn_org,
               :is_datacenter,
               :is_vpn,
               :is_proxy,
               :waf_action,
               :request_path,
               :user_agent
             )
             .order(:id)
             .each_row(block_size: BATCH_SIZE) do |event_data|
          # Create new appender for each batch
          if batch_count % BATCH_SIZE == 0
            appender&.close # Close previous appender
            appender = conn.appender("events")
          end

          # Unpack event data from cursor row (Hash from each_row)
          begin
            appender.append_row(
              event_data["id"],
              event_data["timestamp"],
              event_data["ip_address"]&.to_s,
              event_data["network_range_id"],
              event_data["country"],
              event_data["company"],
              event_data["asn"],
              event_data["asn_org"],
              event_data["is_datacenter"],
              event_data["is_vpn"],
              event_data["is_proxy"],
              event_data["waf_action"],
              event_data["request_path"],
              event_data["user_agent"]
            )
          rescue StandardError => e
            Rails.logger.error "[DuckDB] Error appending event #{event_data['id']}: #{e.message}"
            Rails.logger.error "[DuckDB] event_data = #{event_data.inspect}"
            raise
          end

          batch_count += 1
          total_synced += 1

          # Log progress every BATCH_SIZE events
          if batch_count % BATCH_SIZE == 0
            Rails.logger.info "[DuckDB] Synced batch (total: #{total_synced} events)"
          end
        end

        # Close final appender
        appender&.close

        duration = Time.current - start_time
        rate = total_synced / duration if duration > 0
        Rails.logger.info "[DuckDB] Sync complete: #{total_synced} events in #{duration.round(2)}s (~#{rate&.round(0)} events/sec)"
      rescue StandardError => e
        appender&.close rescue nil # Ensure appender is closed on error
        Rails.logger.error "[DuckDB] Error syncing events: #{e.message}"
        Rails.logger.error e.backtrace.join("\n")
        raise # Re-raise to be caught by outer rescue
      end
    end

    total_synced
  rescue StandardError => e
    Rails.logger.error "[DuckDB] Sync failed: #{e.message}"
    0
  end

  # Execute analytical query on DuckDB
  def query(sql, *params)
    with_connection do |conn|
      conn.query(sql, *params)
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB] Query error: #{e.message}"
    Rails.logger.error "SQL: #{sql}"
    raise
  end

  # Get event count in DuckDB
  def event_count
    with_connection do |conn|
      result = conn.query("SELECT COUNT(*) as count FROM events")
      first_row = result.first
      first_row&.first || 0
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB] Error getting event count: #{e.message}"
    0
  end

  # Analytics query: Total events since timestamp
  def total_events_since(start_time)
    with_connection do |conn|
      result = conn.query("SELECT COUNT(*) as count FROM events WHERE timestamp >= ?", start_time)
      result.first&.first || 0
    end
  end

  # Analytics query: Event breakdown by WAF action
  def event_breakdown_by_action(start_time)
    with_connection do |conn|
      result = conn.query(<<~SQL, start_time)
        SELECT waf_action, COUNT(*) as count
        FROM events
        WHERE timestamp >= ?
        GROUP BY waf_action
      SQL

      # Convert to a hash like PostgreSQL returns
      # (ruby-duckdb yields rows as arrays, so columns are accessed by position)
      result.to_a.to_h { |row| [row[0], row[1]] }
    end
  end

  # Analytics query: Top countries
  def top_countries(start_time, limit = 10)
    with_connection do |conn|
      result = conn.query(<<~SQL, start_time, limit)
        SELECT country, COUNT(*) as count
        FROM events
        WHERE timestamp >= ? AND country IS NOT NULL
        GROUP BY country
        ORDER BY count DESC
        LIMIT ?
      SQL

      result.to_a.map { |row| [row[0], row[1]] }
    end
  end

  # Analytics query: Top blocked IPs
  def top_blocked_ips(start_time, limit = 10)
    with_connection do |conn|
      result = conn.query(<<~SQL, start_time, limit)
        SELECT ip_address, COUNT(*) as count
        FROM events
        WHERE timestamp >= ? AND waf_action = 0
        GROUP BY ip_address
        ORDER BY count DESC
        LIMIT ?
      SQL

      result.to_a.map { |row| [row[0], row[1]] }
    end
  end

  # Analytics query: Hourly timeline (events grouped by hour)
  def hourly_timeline(start_time, end_time)
    with_connection do |conn|
      result = conn.query(<<~SQL, start_time, end_time)
        SELECT
          DATE_TRUNC('hour', timestamp) as hour,
          COUNT(*) as count
        FROM events
        WHERE timestamp >= ? AND timestamp < ?
        GROUP BY hour
        ORDER BY hour
      SQL

      # Convert to a hash with Time keys like PostgreSQL
      result.to_a.to_h { |row| [row[0], row[1]] }
    end
  end

  # Close any memoized connection (for cleanup/testing)
  # NOTE: with_connection opens and closes a connection per call, so this is
  # only a safety net; @connection is not currently set anywhere in this service
  def close
    @connection&.close
    @connection = nil
  end
end
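
For context, a minimal usage sketch of the service above (not part of this commit). It assumes the ruby-duckdb gem is installed and an Event model exposing the selected columns; the SyncAnalyticsJob name is a hypothetical placeholder:

class SyncAnalyticsJob < ApplicationJob
  # Hypothetical job illustrating the intended call pattern
  def perform
    service = AnalyticsDuckdbService.instance

    # Incremental sync: resume from the newest event already in DuckDB,
    # falling back to the last 24 hours for an empty database
    from = service.newest_event_timestamp || 24.hours.ago
    synced = service.sync_new_events(from)
    Rails.logger.info "[DuckDB] job synced #{synced} events"

    # Analytical reads then hit the columnar store instead of PostgreSQL
    service.total_events_since(1.day.ago)  # => Integer
    service.top_countries(7.days.ago, 5)   # => [["US", 1234], ...]
  end
end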

app/services/bot_network_range_importer.rb (new file, 573 lines)
@@ -0,0 +1,573 @@
# frozen_string_literal: true

# BotNetworkRangeImporter - Service for importing official bot network ranges
#
# Imports network ranges from official bot provider sources like:
# - Amazon AWS: https://ip-ranges.amazonaws.com/ip-ranges.json
# - Google: Official crawler IP lists
# - Microsoft/Bing: Bot network ranges
# - Anthropic: Service network ranges
# - OpenAI: Service network ranges
class BotNetworkRangeImporter
  class ImportError < StandardError; end

  # Official sources for bot network ranges
  BOT_SOURCES = {
    amazon_aws: {
      name: 'Amazon AWS',
      url: 'https://ip-ranges.amazonaws.com/ip-ranges.json',
      format: :json,
      parser: :parse_aws_ranges,
      description: 'Official AWS IP ranges including Amazonbot and other services'
    },
    google: {
      name: 'Google',
      # Note: These URLs may need to be updated based on current Google documentation
      urls: [
        'https://developers.google.com/search/docs/files/googlebot.json',
        'https://developers.google.com/search/docs/files/special-crawlers.json'
      ],
      format: :json,
      parser: :parse_google_ranges,
      description: 'Googlebot and other Google crawler IP ranges'
    },
    microsoft_bing: {
      name: 'Microsoft Bing',
      # Note: Microsoft may require web scraping or API access
      url: 'https://www.bing.com/toolbox/bingbot.json',
      format: :json,
      parser: :parse_microsoft_ranges,
      description: 'Bingbot and other Microsoft crawler IP ranges'
    },
    anthropic: {
      name: 'Anthropic Claude',
      # Note: Anthropic ranges may need manual updates or a different approach
      url: 'https://docs.anthropic.com/claude/reference/ip_ranges',
      format: :html,
      parser: :parse_anthropic_ranges,
      description: 'Anthropic Claude API service IP ranges'
    },
    openai_searchbot: {
      name: 'OpenAI SearchBot',
      url: 'https://openai.com/searchbot.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI SearchBot for ChatGPT search features'
    },
    openai_chatgpt_user: {
      name: 'OpenAI ChatGPT-User',
      url: 'https://openai.com/chatgpt-user.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI ChatGPT-User for user actions in ChatGPT and Custom GPTs'
    },
    openai_gptbot: {
      name: 'OpenAI GPTBot',
      url: 'https://openai.com/gptbot.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI GPTBot for training AI foundation models'
    },
    cloudflare: {
      name: 'Cloudflare',
      urls: [
        'https://www.cloudflare.com/ips-v4',
        'https://www.cloudflare.com/ips-v6'
      ],
      format: :text,
      parser: :parse_cloudflare_ranges,
      description: 'Cloudflare network ranges including their crawlers and services'
    },
    facebook: {
      name: 'Facebook/Meta',
      url: 'https://developers.facebook.com/docs/sharing/webmasters/crawler/',
      format: :html,
      parser: :parse_facebook_ranges,
      description: 'Facebook/Meta crawlers and bots'
    },
    applebot: {
      name: 'Applebot',
      url: 'https://support.apple.com/en-us/HT204683',
      format: :html,
      parser: :parse_applebot_ranges,
      description: 'Applebot crawler for Apple search and Siri'
    },
    duckduckgo: {
      name: 'DuckDuckBot',
      url: 'https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/',
      format: :html,
      parser: :parse_duckduckgo_ranges,
      description: 'DuckDuckGo search crawler'
    }
  }.freeze
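
  # Adding a provider is a registry edit: a new BOT_SOURCES entry plus a
  # matching parse_* class method. Illustrative sketch only (not a real
  # source; the URL is hypothetical):
  #
  #   common_crawl: {
  #     name: 'Common Crawl',
  #     url: 'https://example.org/ccbot-ranges.json',
  #     format: :json,
  #     parser: :parse_common_crawl_ranges,
  #     description: 'CCBot crawler IP ranges'
  #   }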

  def self.import_from_source(source_key, options = {})
    source = BOT_SOURCES[source_key.to_sym]
    raise ImportError, "Unknown source: #{source_key}" unless source

    puts "Importing bot network ranges from #{source[:name]}..."

    case source[:parser]
    when :parse_aws_ranges
      parse_aws_ranges(source, options)
    when :parse_google_ranges
      parse_google_ranges(source, options)
    when :parse_microsoft_ranges
      parse_microsoft_ranges(source, options)
    when :parse_anthropic_ranges
      parse_anthropic_ranges(source, options)
    when :parse_openai_ranges
      parse_openai_ranges(source, options)
    when :parse_cloudflare_ranges
      parse_cloudflare_ranges(source, options)
    when :parse_facebook_ranges
      parse_facebook_ranges(source, options)
    when :parse_applebot_ranges
      parse_applebot_ranges(source, options)
    when :parse_duckduckgo_ranges
      parse_duckduckgo_ranges(source, options)
    else
      raise ImportError, "Unknown parser: #{source[:parser]}"
    end
  end

  def self.import_all_sources(options = {})
    results = {}

    BOT_SOURCES.each do |source_key, source|
      puts "\n" + "=" * 50
      puts "Processing #{source[:name]}..."
      puts "=" * 50

      begin
        results[source_key] = import_from_source(source_key, options)
      rescue => e
        Rails.logger.error "Failed to import from #{source[:name]}: #{e.message}"
        results[source_key] = { error: e.message, imported: 0 }
      end
    end

    puts "\n" + "=" * 50
    puts "Import Summary"
    puts "=" * 50

    results.each do |source, result|
      if result[:error]
        puts "#{source}: FAILED - #{result[:error]}"
      else
        puts "#{source}: SUCCESS - #{result[:imported]} ranges imported"
      end
    end

    results
  end

  # NOTE: `private` has no effect on `def self.` class methods; the parsers
  # below stay public until moved behind private_class_method or class << self
  private

  # Amazon AWS IP ranges parser
  def self.parse_aws_ranges(source, options = {})
    require 'net/http'
    require 'uri'

    uri = URI.parse(source[:url])
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30

    response = http.get(uri.request_uri)
    raise ImportError, "Failed to fetch AWS IP ranges: #{response.code}" unless response.code == '200'

    data = JSON.parse(response.body)
    imported_count = 0
    batch_size = options[:batch_size] || 1000
    batch = []

    # Filter for relevant services (can be customized)
    relevant_services = options[:aws_services] || ['AMAZON', 'ROUTE53', 'EC2', 'CLOUDFRONT']

    data['prefixes'].each do |prefix|
      # Focus on relevant services and regions
      next unless relevant_services.include?(prefix['service'])

      network_range = {
        network: prefix['ip_prefix'],
        source: 'bot_import_amazon_aws',
        asn: nil, # AWS doesn't provide ASN in this feed
        asn_org: 'Amazon Web Services',
        company: 'Amazon',
        country: nil,
        is_datacenter: true,
        is_proxy: false,
        is_vpn: false,
        additional_data: {
          aws_service: prefix['service'],
          aws_region: prefix['region'],
          aws_network_border_group: prefix['network_border_group'],
          import_date: Time.current.iso8601
        }.to_json
      }

      batch << network_range

      if batch.size >= batch_size
        imported_count += import_batch(batch, 'Amazon AWS')
        batch = []
        puts "Imported #{imported_count} AWS ranges..."
      end
    end

    # Import remaining records
    if batch.any?
      imported_count += import_batch(batch, 'Amazon AWS')
    end

    puts "Amazon AWS import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: 'Amazon AWS' }
  rescue Net::OpenTimeout, Net::ReadTimeout => e
    raise ImportError, "Network timeout while fetching AWS ranges: #{e.message}"
  rescue JSON::ParserError => e
    raise ImportError, "Failed to parse AWS JSON response: #{e.message}"
  end

  # Google crawler IP ranges parser
  def self.parse_google_ranges(source, options = {})
    require 'net/http'
    require 'uri'

    imported_count = 0

    # Try each potential URL; the first one that succeeds wins
    urls = Array(source[:urls] || source[:url])

    urls.each do |url|
      begin
        puts "Attempting to fetch Google ranges from: #{url}"

        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true
        http.read_timeout = 30

        response = http.get(uri.request_uri)
        next unless response.code == '200'

        data = JSON.parse(response.body)

        batch_size = options[:batch_size] || 1000
        batch = []

        # Parse Google crawler format (varies by file type)
        if data.is_a?(Array)
          data.each do |entry|
            next unless entry['cidr'] || entry['prefix']

            network_range = {
              network: entry['cidr'] || entry['prefix'],
              source: 'bot_import_google',
              asn: nil,
              asn_org: 'Google LLC',
              company: 'Google',
              country: nil,
              is_datacenter: true,
              is_proxy: false,
              is_vpn: false,
              additional_data: {
                crawler_type: entry['crawler_type'] || 'unknown',
                user_agent: entry['user_agent'],
                import_date: Time.current.iso8601
              }.to_json
            }

            batch << network_range

            if batch.size >= batch_size
              imported_count += import_batch(batch, 'Google')
              batch = []
              puts "Imported #{imported_count} Google ranges..."
            end
          end
        end

        # Import remaining records
        if batch.any?
          imported_count += import_batch(batch, 'Google')
        end

        puts "Google import completed: #{imported_count} ranges imported"
        return { imported: imported_count, source: 'Google' }
      rescue => e
        Rails.logger.warn "Failed to fetch from #{url}: #{e.message}"
        next
      end
    end

    raise ImportError, "Failed to fetch Google crawler ranges from any URL"
  end

  # Microsoft Bing crawler IP ranges parser
  def self.parse_microsoft_ranges(source, options = {})
    # Microsoft requires special handling as they may not provide direct JSON
    # This is a placeholder implementation
    puts "Microsoft Bing crawler import requires manual configuration or web scraping"
    puts "Refer to: https://www.bing.com/webmaster/help/which-crawlers-does-bing-use"

    {
      imported: 0,
      source: 'Microsoft Bing',
      note: 'Manual configuration required - Microsoft does not provide direct IP range feeds'
    }
  end

  # Anthropic service IP ranges parser
  def self.parse_anthropic_ranges(source, options = {})
    # Anthropic ranges may need to be manually configured
    # This is a placeholder implementation
    puts "Anthropic Claude service ranges require manual configuration"
    puts "Refer to: https://docs.anthropic.com/claude/reference/ip_ranges"

    {
      imported: 0,
      source: 'Anthropic',
      note: 'Manual configuration required - Anthropic does not provide automated IP range feeds'
    }
  end

  # OpenAI crawler IP ranges parser
  def self.parse_openai_ranges(source, options = {})
    require 'net/http'
    require 'uri'

    # Determine crawler type from source name up front, so the rescue
    # clauses below can reference it even when the HTTP request fails
    crawler_type = source[:name].gsub('OpenAI ', '').downcase

    uri = URI.parse(source[:url])
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30

    response = http.get(uri.request_uri)
    raise ImportError, "Failed to fetch OpenAI IP ranges: #{response.code}" unless response.code == '200'

    data = JSON.parse(response.body)
    imported_count = 0
    batch_size = options[:batch_size] || 1000
    batch = []

    data.each do |entry|
      # OpenAI provides IP ranges as either CIDR notation or single IPs
      ip_range = entry['cidr'] || entry['ip_prefix'] || entry['ip']
      next unless ip_range

      # Convert single IPs to /32
      network = ip_range.include?('/') ? ip_range : "#{ip_range}/32"

      network_range = {
        network: network,
        source: "bot_import_openai_#{crawler_type}",
        asn: nil,
        asn_org: 'OpenAI',
        company: 'OpenAI',
        country: nil,
        is_datacenter: true,
        is_proxy: false,
        is_vpn: false,
        additional_data: {
          crawler_type: crawler_type,
          crawler_purpose: crawler_purpose(crawler_type),
          user_agent: openai_user_agent(crawler_type),
          import_date: Time.current.iso8601,
          source_url: source[:url]
        }.to_json
      }

      batch << network_range

      if batch.size >= batch_size
        imported_count += import_batch(batch, "OpenAI #{crawler_type}")
        batch = []
        puts "Imported #{imported_count} OpenAI #{crawler_type} ranges..."
      end
    end

    # Import remaining records
    if batch.any?
      imported_count += import_batch(batch, "OpenAI #{crawler_type}")
    end

    puts "OpenAI #{crawler_type} import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: "OpenAI #{crawler_type}" }
  rescue Net::OpenTimeout, Net::ReadTimeout => e
    raise ImportError, "Network timeout while fetching OpenAI #{crawler_type} ranges: #{e.message}"
  rescue JSON::ParserError => e
    raise ImportError, "Failed to parse OpenAI #{crawler_type} JSON response: #{e.message}"
  end

  def self.import_batch(batch_data, source_name)
    # Check for existing ranges to avoid duplicates
    existing_networks = NetworkRange.where(network: batch_data.map { |d| d[:network] }).pluck(:network)
    new_ranges = batch_data.reject { |d| existing_networks.include?(d[:network]) }

    if new_ranges.any?
      NetworkRange.insert_all(new_ranges)
      puts "Imported #{new_ranges.size} new #{source_name} ranges (#{batch_data.size - new_ranges.size} duplicates skipped)"
    else
      puts "No new #{source_name} ranges to import (all duplicates)"
    end

    new_ranges.size
  rescue => e
    Rails.logger.error "Failed to import #{source_name} batch: #{e.message}"

    # Fallback to individual imports (new_ranges may be nil if the duplicate
    # check itself failed, so fall back to the full batch)
    imported = 0
    (new_ranges || batch_data).each do |data|
      begin
        NetworkRange.create!(data)
        imported += 1
      rescue => individual_error
        Rails.logger.error "Failed to import individual #{source_name} record: #{individual_error.message}"
      end
    end

    imported
  end

  # Helper method to determine crawler purpose based on type
  def self.crawler_purpose(crawler_type)
    case crawler_type
    when 'searchbot'
      'Used to link to and surface websites in search results in ChatGPT\'s search features'
    when 'chatgpt-user'
      'User actions in ChatGPT and Custom GPTs, including GPT Actions'
    when 'gptbot'
      'Used to crawl content for training OpenAI\'s generative AI foundation models'
    else
      'Unknown purpose'
    end
  end

  # Helper method to get OpenAI user agent strings
  def self.openai_user_agent(crawler_type)
    case crawler_type
    when 'searchbot'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot'
    when 'chatgpt-user'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot'
    when 'gptbot'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot'
    else
      'Unknown user agent'
    end
  end

  # Cloudflare IP ranges parser
  def self.parse_cloudflare_ranges(source, options = {})
    require 'net/http'
    require 'uri'

    imported_count = 0
    urls = Array(source[:urls])
    batch_size = options[:batch_size] || 1000
    batch = []

    urls.each do |url|
      begin
        puts "Fetching Cloudflare ranges from: #{url}"

        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true
        http.read_timeout = 30

        response = http.get(uri.request_uri)
        raise ImportError, "Failed to fetch Cloudflare ranges: #{response.code}" unless response.code == '200'

        # Cloudflare provides plain text CIDR lists
        lines = response.body.split("\n")
        ip_version = url.include?('ips-v4') ? 4 : 6

        lines.each do |line|
          line = line.strip
          next if line.empty? || line.start_with?('#')

          # Validate CIDR format
          next unless line.match?(/\A[0-9a-fA-F:.]+\/\d+\z/)

          network_range = {
            network: line,
            source: 'bot_import_cloudflare',
            asn: nil,
            asn_org: 'Cloudflare',
            company: 'Cloudflare',
            country: nil,
            is_datacenter: true,
            is_proxy: false,
            is_vpn: false,
            additional_data: {
              ip_version: ip_version,
              import_date: Time.current.iso8601,
              source_url: url,
              service_type: 'cdn_and_security'
            }.to_json
          }

          batch << network_range

          if batch.size >= batch_size
            imported_count += import_batch(batch, 'Cloudflare')
            batch = []
            puts "Imported #{imported_count} Cloudflare ranges..."
          end
        end
      rescue => e
        Rails.logger.warn "Failed to fetch Cloudflare ranges from #{url}: #{e.message}"
        next
      end
    end

    # Import remaining records
    if batch.any?
      imported_count += import_batch(batch, 'Cloudflare')
    end

    puts "Cloudflare import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: 'Cloudflare' }
  end

  # Facebook/Meta crawler ranges parser (placeholder)
  def self.parse_facebook_ranges(source, options = {})
    puts "Facebook/Meta crawler ranges require web scraping or manual configuration"
    puts "Refer to: https://developers.facebook.com/docs/sharing/webmasters/crawler/"

    {
      imported: 0,
      source: 'Facebook/Meta',
      note: 'Manual configuration required - Facebook does not provide automated IP range feeds'
    }
  end

  # Applebot crawler ranges parser (placeholder)
  def self.parse_applebot_ranges(source, options = {})
    puts "Applebot ranges require web scraping or manual configuration"
    puts "Refer to: https://support.apple.com/en-us/HT204683"

    {
      imported: 0,
      source: 'Applebot',
      note: 'Manual configuration required - Apple does not provide automated IP range feeds'
    }
  end

  # DuckDuckBot crawler ranges parser (placeholder)
  def self.parse_duckduckgo_ranges(source, options = {})
    puts "DuckDuckBot ranges require web scraping or manual configuration"
    puts "Refer to: https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/"

    {
      imported: 0,
      source: 'DuckDuckBot',
      note: 'Manual configuration required - DuckDuckGo does not provide automated IP range feeds'
    }
  end
end
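
A minimal usage sketch for the importer (not part of this commit); the rake task name is a hypothetical placeholder:

namespace :bot_networks do
  desc 'Import official bot network ranges'
  task import: :environment do
    # Import one source with a custom batch size...
    BotNetworkRangeImporter.import_from_source(:amazon_aws, batch_size: 500)

    # ...or walk every registered source; per-source failures are collected
    # in the results hash rather than aborting the run
    results = BotNetworkRangeImporter.import_all_sources
    results.each { |source, result| puts "#{source}: #{result[:imported] || 0} imported" }
  end
end
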
@@ -53,4 +53,107 @@ class Ipapi
        next
      end
    end
  end

  # Parse company/datacenter network range from IPAPI data
  # Handles "X.X.X.X - Y.Y.Y.Y" format and converts to CIDR
  def self.parse_company_network_range(ipapi_data)
    # Try company.network first, then datacenter.network
    network_range = ipapi_data.dig('company', 'network') || ipapi_data.dig('datacenter', 'network')
    return nil if network_range.blank?

    # Parse "X.X.X.X - Y.Y.Y.Y" format
    if network_range.include?(' - ')
      start_ip_str, end_ip_str = network_range.split(' - ').map(&:strip)

      begin
        start_ip = IPAddr.new(start_ip_str)
        end_ip = IPAddr.new(end_ip_str)

        # Calculate the number of IPs in the range
        num_ips = end_ip.to_i - start_ip.to_i + 1

        # Calculate prefix length from the number of IPs:
        # num_ips = 2^(32 - prefix_length) for IPv4
        prefix_length = 32 - Math.log2(num_ips).to_i

        # Verify it's a valid CIDR block (power of 2)
        if 2**(32 - prefix_length) == num_ips
          cidr = "#{start_ip_str}/#{prefix_length}"
          Rails.logger.debug "Parsed company network range: #{network_range} -> #{cidr}"
          return cidr
        else
          Rails.logger.warn "Network range #{network_range} is not a valid CIDR block (#{num_ips} IPs)"
          return nil
        end
      rescue IPAddr::InvalidAddressError => e
        Rails.logger.error "Invalid IP in company network range: #{network_range} (#{e.message})"
        return nil
      end
    elsif network_range.include?('/')
      # Already in CIDR format
      return network_range
    else
      Rails.logger.warn "Unknown network range format: #{network_range}"
      return nil
    end
  end
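
  # Worked example (illustrative): "52.95.0.0 - 52.95.3.255" spans 1024
  # addresses, so prefix_length = 32 - log2(1024) = 22 and the method returns
  # "52.95.0.0/22". A range like "10.0.0.0 - 10.0.0.99" covers 100 addresses,
  # which is not a power of two, so it is rejected with a warning instead of
  # being forced into an ill-fitting CIDR block.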

  # Populate NetworkRange attributes from IPAPI data
  def self.populate_network_attributes(network_range, ipapi_data)
    network_range.asn = ipapi_data.dig('asn', 'asn')
    network_range.asn_org = ipapi_data.dig('asn', 'org') || ipapi_data.dig('company', 'name')
    network_range.company = ipapi_data.dig('company', 'name')
    network_range.country = ipapi_data.dig('location', 'country_code')
    network_range.is_datacenter = ipapi_data['is_datacenter'] || false
    network_range.is_vpn = ipapi_data['is_vpn'] || false
    network_range.is_proxy = ipapi_data['is_proxy'] || false
  end

  # Process IPAPI data and create network ranges
  # Returns a hash with the created/updated NetworkRange objects and the
  # broadest CIDR found, for deduplication
  def self.process_ipapi_data(ipapi_data, tracking_network)
    created_networks = []

    # Extract and create company/datacenter network range if present
    company_network_cidr = parse_company_network_range(ipapi_data)
    if company_network_cidr.present?
      company_range = NetworkRange.find_or_create_by(network: company_network_cidr) do |nr|
        nr.source = 'api_imported'
        nr.creation_reason = "Company allocation from IPAPI for #{tracking_network.cidr}"
      end

      # Always update attributes (whether new or existing)
      populate_network_attributes(company_range, ipapi_data)
      company_range.set_network_data(:ipapi, ipapi_data)
      company_range.last_api_fetch = Time.current
      company_range.save!

      created_networks << company_range
      Rails.logger.info "Created/updated company network: #{company_range.cidr}"
    end

    # Extract and create ASN route network if present
    ipapi_route = ipapi_data.dig('asn', 'route')
    if ipapi_route.present? && ipapi_route != tracking_network.cidr
      route_network = NetworkRange.find_or_create_by(network: ipapi_route) do |nr|
        nr.source = 'api_imported'
        nr.creation_reason = "BGP route from IPAPI lookup for #{tracking_network.cidr}"
      end

      # Always update attributes (whether new or existing)
      populate_network_attributes(route_network, ipapi_data)
      route_network.set_network_data(:ipapi, ipapi_data)
      route_network.last_api_fetch = Time.current
      route_network.save!

      created_networks << route_network
      Rails.logger.info "Created/updated BGP route network: #{route_network.cidr}"
    end

    # Return both the created networks and the broadest CIDR for deduplication
    {
      networks: created_networks,
      broadest_cidr: company_network_cidr.presence || ipapi_route || tracking_network.cidr
    }
  end
end