Fix some blocked/allow laggards after migrating. Add DuckDB for outstanding analytics performance. Start adding an import for all bot networks
app/jobs/cleanup_old_events_job.rb (new file, 52 lines)
@@ -0,0 +1,52 @@
# frozen_string_literal: true

# CleanupOldEventsJob - Removes events older than the configured retention period
#
# This job runs periodically (hourly) to clean up old events based on the
# event_retention_days setting. This helps keep the database size manageable
# and improves query performance.
#
# The retention period is configurable via the 'event_retention_days' setting
# (default: 90 days). This allows administrators to balance between historical
# data retention and database performance.
#
# Schedule: Every hour (configured in config/recurring.yml)
class CleanupOldEventsJob < ApplicationJob
  queue_as :background

  def perform
    retention_days = Setting.event_retention_days

    # Don't delete if retention is set to 0 or negative (disabled)
    if retention_days <= 0
      Rails.logger.info "CleanupOldEventsJob: Event retention disabled (retention_days: #{retention_days})"
      return 0
    end

    cutoff_date = retention_days.days.ago

    # Count events to be deleted
    old_events = Event.where('timestamp < ?', cutoff_date)
    count = old_events.count

    if count.zero?
      Rails.logger.info "CleanupOldEventsJob: No events older than #{retention_days} days found"
      return 0
    end

    Rails.logger.info "CleanupOldEventsJob: Deleting #{count} events older than #{retention_days} days (before #{cutoff_date})"

    # Delete in batches to avoid long-running transactions
    deleted_count = 0
    batch_size = 10_000

    old_events.in_batches(of: batch_size) do |batch|
      batch_count = batch.delete_all
      deleted_count += batch_count
      Rails.logger.info "CleanupOldEventsJob: Deleted batch of #{batch_count} events (total: #{deleted_count}/#{count})"
    end

    Rails.logger.info "CleanupOldEventsJob: Successfully deleted #{deleted_count} events"
    deleted_count
  end
end
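Example usage from a Rails console (the hourly schedule in config/recurring.yml normally enqueues this job; perform_now returns the value of perform, i.e. the number of deleted rows):

# Run inline and inspect how many events were removed
deleted = CleanupOldEventsJob.perform_now
puts "Removed #{deleted} events"

# Or enqueue asynchronously on the :background queue
CleanupOldEventsJob.perform_later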
@@ -15,31 +15,14 @@ class FetchIpapiDataJob < ApplicationJob
    ipapi_data = Ipapi.lookup(sample_ip)

    if ipapi_data.present? && !ipapi_data.key?('error')
      # Check if IPAPI returned a different route than our tracking network
      ipapi_route = ipapi_data.dig('asn', 'route')
      target_network = tracking_network
      # Process IPAPI data and create network ranges
      result = Ipapi.process_ipapi_data(ipapi_data, tracking_network)

      if ipapi_route.present? && ipapi_route != tracking_network.cidr
        # IPAPI returned a different CIDR - find or create that network range
        Rails.logger.info "IPAPI returned different route: #{ipapi_route} (requested: #{tracking_network.cidr})"
        # Mark the tracking network as having been queried
        # Use the broadest CIDR returned for deduplication
        tracking_network.mark_ipapi_queried!(result[:broadest_cidr])

        target_network = NetworkRange.find_or_create_by(network: ipapi_route) do |nr|
          nr.source = 'api_imported'
          nr.creation_reason = "Created from IPAPI lookup for #{tracking_network.cidr}"
        end

        Rails.logger.info "Storing IPAPI data on correct network: #{target_network.cidr}"
      end

      # Store data on the target network (wherever IPAPI said it belongs)
      target_network.set_network_data(:ipapi, ipapi_data)
      target_network.last_api_fetch = Time.current
      target_network.save!

      # Mark the tracking network as having been queried, with the CIDR that was returned
      tracking_network.mark_ipapi_queried!(target_network.cidr)

      Rails.logger.info "Successfully fetched IPAPI data for #{tracking_network.cidr} (stored on #{target_network.cidr})"
      Rails.logger.info "Successfully fetched IPAPI data for #{tracking_network.cidr} (created #{result[:networks].length} networks)"

      # Broadcast to the tracking network
      broadcast_ipapi_update(tracking_network, ipapi_data)

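The hunk above leans on Ipapi.process_ipapi_data, which is not shown in this commit. A minimal sketch of the return shape the job appears to depend on, purely as an assumption for illustration:

# Assumed result shape (not part of this diff)
result = Ipapi.process_ipapi_data(ipapi_data, tracking_network)
result[:networks]      # Array of NetworkRange records created from the IPAPI response
result[:broadest_cidr] # widest CIDR seen in the response, e.g. "203.0.113.0/22",
                       # passed to mark_ipapi_queried! so later lookups dedupe against it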
app/jobs/import_all_bot_network_ranges_job.rb (new file, 26 lines)
@@ -0,0 +1,26 @@
# frozen_string_literal: true

# ImportAllBotNetworkRangesJob - Background job for importing from all bot sources
class ImportAllBotNetworkRangesJob < ApplicationJob
  queue_as :default

  def perform(options = {})
    Rails.logger.info "Starting batch import of all bot network ranges"

    results = BotNetworkRangeImporter.import_all_sources(options)

    # Send completion summary
    Rails.logger.info "Batch import completed. Summary: #{results}"

    # Broadcast summary to clients
    ActionCable.server.broadcast(
      "bot_imports",
      {
        type: 'batch_summary',
        status: 'completed',
        results: results,
        message: "Batch import completed for all sources"
      }
    )
  end
end
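BotNetworkRangeImporter.import_all_sources is not included in this diff. A rough sketch of what such a batch entry point could look like, assuming a SOURCES registry keyed by provider (the constant name and keys are assumptions here):

# Hypothetical shape of the batch importer used above
class BotNetworkRangeImporter
  def self.import_all_sources(options = {})
    results = {}
    SOURCES.each_key do |source_key|
      begin
        # Reuse the per-source path so both jobs share one code path
        results[source_key] = import_from_source(source_key, options)
      rescue StandardError => e
        # Record the failure but keep importing the remaining sources
        results[source_key] = { error: e.message }
      end
    end
    results
  end
end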
app/jobs/import_bot_network_ranges_job.rb (new file, 47 lines)
@@ -0,0 +1,47 @@
# frozen_string_literal: true

# ImportBotNetworkRangesJob - Background job for importing bot network ranges
#
# Imports network ranges from official bot provider sources.
# Runs asynchronously to avoid blocking the web interface.
class ImportBotNetworkRangesJob < ApplicationJob
  queue_as :default

  def perform(source_key, options = {})
    Rails.logger.info "Starting bot network range import for source: #{source_key}"

    begin
      result = BotNetworkRangeImporter.import_from_source(source_key, options)

      # Send notification or log completion
      Rails.logger.info "Successfully imported #{result[:imported]} ranges from #{result[:source]}"

      # Optionally broadcast via Turbo Streams for real-time updates
      ActionCable.server.broadcast(
        "bot_imports",
        {
          source: source_key,
          status: 'completed',
          imported: result[:imported],
          message: "Successfully imported #{result[:imported]} ranges from #{result[:source]}"
        }
      )
    rescue => e
      Rails.logger.error "Bot network range import failed for #{source_key}: #{e.message}"

      # Broadcast error notification
      ActionCable.server.broadcast(
        "bot_imports",
        {
          source: source_key,
          status: 'error',
          error: e.message,
          message: "Failed to import from #{source_key}: #{e.message}"
        }
      )

      raise e
    end
  end
end
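Example of kicking off imports, either per source or for everything at once (the :googlebot key is only an illustrative guess; actual source keys are defined by BotNetworkRangeImporter):

# Import ranges for a single provider; progress is broadcast on the "bot_imports" channel
ImportBotNetworkRangesJob.perform_later(:googlebot)

# Or import from every configured source in one batch job
ImportAllBotNetworkRangesJob.perform_later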
@@ -53,16 +53,15 @@ class ProcessWafEventJob < ApplicationJob
    # Queue IPAPI enrichment based on /24 tracking
    # The tracking network is the /24 that stores ipapi_queried_at
    if NetworkRange.should_fetch_ipapi_for_ip?(event.ip_address)
      # Use tracking network for fetch status to avoid race conditions
      if tracking_network.is_fetching_api_data?(:ipapi)
        Rails.logger.info "Skipping IPAPI fetch for #{tracking_network.cidr} - already being fetched"
      else
        tracking_network.mark_as_fetching_api_data!(:ipapi)
      # Atomically mark as fetching - this prevents duplicate jobs via database lock
      if tracking_network.mark_as_fetching_api_data!(:ipapi)
        Rails.logger.info "Queueing IPAPI fetch for IP #{event.ip_address} (tracking network: #{tracking_network.cidr})"
        FetchIpapiDataJob.perform_later(network_range_id: tracking_network.id)
      else
        Rails.logger.info "Skipping IPAPI fetch for #{tracking_network.cidr} - another job already started"
      end
    else
      Rails.logger.debug "Skipping IPAPI fetch for IP #{event.ip_address} - already queried recently"
      Rails.logger.debug "Skipping IPAPI fetch for IP #{event.ip_address} - already queried or being fetched"
    end

    # Evaluate WAF policies inline if needed (lazy evaluation)

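The new branch treats mark_as_fetching_api_data! as an atomic test-and-set whose boolean return says whether this process won the race. That method is not part of this hunk; one plausible implementation, with the column name as an assumption:

# Hypothetical sketch of the atomic guard relied on above (column name assumed)
class NetworkRange < ApplicationRecord
  def mark_as_fetching_api_data!(api)
    # A single UPDATE guarded by a NULL check: only one concurrent caller can
    # flip the flag, and update_all reports how many rows it actually touched.
    self.class
        .where(id: id, "fetching_#{api}_at": nil)
        .update_all("fetching_#{api}_at": Time.current)
        .positive?
  end
end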
app/jobs/sync_events_to_duckdb_job.rb (new file, 89 lines)
@@ -0,0 +1,89 @@
# frozen_string_literal: true

# Background job to sync events from PostgreSQL to DuckDB
# Runs every 5 minutes to keep analytics database up-to-date
# Uses watermark tracking to only sync new events
class SyncEventsToDuckdbJob < ApplicationJob
  queue_as :default

  # Key for storing last sync timestamp in Rails cache
  WATERMARK_CACHE_KEY = "duckdb_last_sync_time"
  WATERMARK_TTL = 1.week

  # Overlap window to catch late-arriving events
  SYNC_OVERLAP = 1.minute

  def perform
    service = AnalyticsDuckdbService.instance

    # Determine where to start syncing
    from_timestamp = determine_sync_start_time(service)

    Rails.logger.info "[DuckDB Sync] Starting sync from #{from_timestamp}"

    # Sync new events using PostgreSQL cursor + DuckDB Appender
    # (setup_schema is called internally within sync_new_events)
    count = service.sync_new_events(from_timestamp)

    # Update watermark if we synced any events
    if count > 0
      update_last_sync_time
      Rails.logger.info "[DuckDB Sync] Successfully synced #{count} events"
    else
      Rails.logger.info "[DuckDB Sync] No new events to sync"
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB Sync] Job failed: #{e.message}"
    Rails.logger.error e.backtrace.join("\n")
    raise # Re-raise to mark job as failed in Solid Queue
  end

  private

  # Determine timestamp to start syncing from
  # Strategy:
  # 1. First run (DuckDB empty): sync from oldest PostgreSQL event
  # 2. Subsequent runs: sync from last watermark with overlap
  def determine_sync_start_time(service)
    oldest_duckdb = service.oldest_event_timestamp

    if oldest_duckdb.nil?
      # DuckDB is empty - this is the first sync
      # Start from oldest PostgreSQL event (or reasonable cutoff)
      oldest_pg = Event.minimum(:timestamp)

      if oldest_pg.nil?
        # No events in PostgreSQL at all
        Rails.logger.warn "[DuckDB Sync] No events found in PostgreSQL"
        1.day.ago # Default to recent window
      else
        Rails.logger.info "[DuckDB Sync] First sync - starting from oldest event: #{oldest_pg}"
        oldest_pg
      end
    else
      # DuckDB has data - sync from last watermark with overlap
      last_sync = Rails.cache.read(WATERMARK_CACHE_KEY)

      if last_sync.nil?
        # Watermark not in cache (maybe cache expired or restarted)
        # Fall back to newest event in DuckDB
        newest_duckdb = service.newest_event_timestamp
        start_time = newest_duckdb ? newest_duckdb - SYNC_OVERLAP : oldest_duckdb
        Rails.logger.info "[DuckDB Sync] Watermark not found, using newest DuckDB event: #{start_time}"
        start_time
      else
        # Normal case: use watermark with overlap to catch late arrivals
        start_time = last_sync - SYNC_OVERLAP
        Rails.logger.debug "[DuckDB Sync] Using watermark: #{last_sync} (with #{SYNC_OVERLAP}s overlap)"
        start_time
      end
    end
  end

  # Update last sync watermark in cache
  def update_last_sync_time
    now = Time.current
    Rails.cache.write(WATERMARK_CACHE_KEY, now, expires_in: WATERMARK_TTL)
    Rails.logger.debug "[DuckDB Sync] Updated watermark to #{now}"
  end
end
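AnalyticsDuckdbService itself is added elsewhere; below is a minimal sketch of the "PostgreSQL cursor + DuckDB Appender" step referenced above, assuming the ruby duckdb gem, a local analytics.duckdb file, and an events table with just (id, ip_address, timestamp) columns - all of which are assumptions here:

# Illustrative sketch only - the real service and its schema are not in this diff
require "duckdb"

def sync_new_events(from_timestamp)
  db  = DuckDB::Database.open(Rails.root.join("storage", "analytics.duckdb").to_s)
  con = db.connect
  appender = con.appender("events")
  count = 0

  # Stream matching rows out of PostgreSQL in batches rather than loading them all
  Event.where("timestamp >= ?", from_timestamp).find_each(batch_size: 5_000) do |event|
    appender.append_row(event.id, event.ip_address.to_s, event.timestamp)
    count += 1
  end

  appender.flush
  appender.close
  con.disconnect
  count
end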