Use only parquet files for events

This commit is contained in:
Dan Milne
2025-12-03 17:16:38 +11:00
parent 032243ba6a
commit 693851f664
12 changed files with 673 additions and 165 deletions

View File

@@ -31,9 +31,31 @@ class EventDdb
AnalyticsDuckdbService.instance
end
# Helper to expose the parquet files as an `events` view so existing
# queries written against `events` keep working without modification.
#
# Obtains a connection from `service.with_connection`, (re)creates the
# `events` view over every file matching `<PARQUET_BASE_PATH>/**/*.parquet`,
# then yields the connection to the caller's block.
# Names that do not end in `.parquet` do not match the glob.
#
# Yields: the connection handed out by `service.with_connection`.
# Returns: the block's return value, or nil if anything raises.
#
# NOTE: the rescue below also catches errors raised inside the caller's
# block, so query failures are logged with the "Error loading parquet
# files" message and surface to the caller as a nil result.
def with_events_from_parquet(&block)
  service.with_connection do |conn|
    parquet_pattern = "#{AnalyticsDuckdbService::PARQUET_BASE_PATH}/**/*.parquet"
    # Double any single quote so the path is a valid SQL string literal.
    quoted_pattern = parquet_pattern.gsub("'", "''")

    # OR REPLACE keeps this working when the view already exists on the
    # connection/database (plain CREATE VIEW raises in that case).
    conn.execute(<<~SQL)
      CREATE OR REPLACE VIEW events AS
      SELECT * FROM read_parquet('#{quoted_pattern}')
    SQL

    yield conn
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error loading parquet files: #{e.message}"
  nil
end
# Total events since timestamp
def count_since(start_time)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query("SELECT COUNT(*) as count FROM events WHERE timestamp >= ?", start_time)
result.first&.first || 0
end
@@ -44,7 +66,7 @@ class EventDdb
# Event breakdown by WAF action
def breakdown_by_action(start_time)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time)
SELECT waf_action, COUNT(*) as count
FROM events
@@ -65,7 +87,7 @@ class EventDdb
# Top countries with event counts
def top_countries(start_time, limit = 10)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time, limit)
SELECT country, COUNT(*) as count
FROM events
@@ -86,7 +108,7 @@ class EventDdb
# Top blocked IPs
def top_blocked_ips(start_time, limit = 10)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time, limit)
SELECT ip_address, COUNT(*) as count
FROM events
@@ -106,7 +128,7 @@ class EventDdb
# Hourly timeline aggregation
def hourly_timeline(start_time, end_time)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time, end_time)
SELECT
DATE_TRUNC('hour', timestamp) as hour,
@@ -129,7 +151,7 @@ class EventDdb
# Top networks by traffic volume
# Returns array of arrays: [network_range_id, event_count, unique_ips]
def top_networks(start_time, limit = 50)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time, limit)
SELECT
network_range_id,
@@ -152,7 +174,7 @@ class EventDdb
# Top companies
# Returns array of OpenStruct objects with: company, event_count, unique_ips, network_count
def top_companies(start_time, limit = 20)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time, limit)
SELECT
company,
@@ -184,7 +206,7 @@ class EventDdb
# Top ASNs
# Returns array of OpenStruct objects with: asn, asn_org, event_count, unique_ips, network_count
def top_asns(start_time, limit = 15)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time, limit)
SELECT
asn,
@@ -218,7 +240,7 @@ class EventDdb
# Network type breakdown (datacenter, VPN, proxy, standard)
# Returns hash with network_type as key and hash of stats as value
def network_type_breakdown(start_time)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time)
SELECT
CASE
@@ -255,7 +277,7 @@ class EventDdb
# Top countries with detailed stats (event count and unique IPs)
# Returns array of OpenStruct objects with: country, event_count, unique_ips
def top_countries_with_stats(start_time, limit = 15)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time, limit)
SELECT
country,
@@ -285,7 +307,7 @@ class EventDdb
# Network type stats with formatted output matching controller expectations
# Returns hash with type keys containing label, networks, events, unique_ips, percentage
def network_type_stats(start_time)
service.with_connection do |conn|
with_events_from_parquet do |conn|
# Get total events for percentage calculation
total_result = conn.query("SELECT COUNT(*) as total FROM events WHERE timestamp >= ?", start_time)
total_events = total_result.first&.first || 0
@@ -328,7 +350,7 @@ class EventDdb
network_range_ids = Array(network_range_ids)
return nil if network_range_ids.empty?
service.with_connection do |conn|
with_events_from_parquet do |conn|
# Build IN clause with placeholders
placeholders = network_range_ids.map { "?" }.join(", ")
@@ -363,7 +385,7 @@ class EventDdb
network_range_ids = Array(network_range_ids)
return nil if network_range_ids.empty?
service.with_connection do |conn|
with_events_from_parquet do |conn|
# Build IN clause with placeholders
placeholders = network_range_ids.map { "?" }.join(", ")
@@ -391,7 +413,7 @@ class EventDdb
network_range_ids = Array(network_range_ids)
return nil if network_range_ids.empty?
service.with_connection do |conn|
with_events_from_parquet do |conn|
# Build IN clause with placeholders
placeholders = network_range_ids.map { "?" }.join(", ")
@@ -414,13 +436,36 @@ class EventDdb
nil
end
# Total number of events recorded for one or more network ranges.
# Accepts a single id or a list of ids.
# Returns the count (0 when the query yields no row), or nil when no ids
# were given or the query fails.
def network_event_count(network_range_ids)
  ids = Array(network_range_ids)
  return nil if ids.empty?

  with_events_from_parquet do |conn|
    # One "?" bind marker per id for the IN (...) list
    placeholders = Array.new(ids.size, "?").join(", ")

    sql = <<~SQL
      SELECT COUNT(*) as count
      FROM events
      WHERE network_range_id IN (#{placeholders})
    SQL

    rows = conn.query(sql, *ids)
    count = rows.first&.first
    count || 0
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in network_event_count: #{e.message}"
  nil
end
# Full user agent tally for network range(s)
# Returns hash of user_agent => count for all agents in the network
def network_agent_tally(network_range_ids)
network_range_ids = Array(network_range_ids)
return nil if network_range_ids.empty?
service.with_connection do |conn|
with_events_from_parquet do |conn|
# Build IN clause with placeholders
placeholders = network_range_ids.map { "?" }.join(", ")
@@ -445,7 +490,7 @@ class EventDdb
# Suspicious network activity patterns
# Detects high-volume networks, high deny rates, and distributed companies
def suspicious_patterns(start_time)
service.with_connection do |conn|
with_events_from_parquet do |conn|
# High volume networks (5x average)
avg_query = conn.query(<<~SQL, start_time)
SELECT
@@ -523,7 +568,7 @@ class EventDdb
# Bot traffic analysis - breakdown of bot vs human traffic
def bot_traffic_breakdown(start_time)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time)
SELECT
is_bot,
@@ -553,7 +598,7 @@ class EventDdb
# Count human traffic (non-bot) since timestamp
def human_traffic_count(start_time)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time)
SELECT COUNT(*) as count
FROM events
@@ -569,7 +614,7 @@ class EventDdb
# Count bot traffic since timestamp
def bot_traffic_count(start_time)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time)
SELECT COUNT(*) as count
FROM events
@@ -585,7 +630,7 @@ class EventDdb
# Top bot user agents
def top_bot_user_agents(start_time, limit = 20)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time, limit)
SELECT
user_agent,
@@ -614,7 +659,7 @@ class EventDdb
# Bot traffic timeline (hourly breakdown)
def bot_traffic_timeline(start_time, end_time)
service.with_connection do |conn|
with_events_from_parquet do |conn|
result = conn.query(<<~SQL, start_time, end_time)
SELECT
DATE_TRUNC('hour', timestamp) as hour,
@@ -648,7 +693,63 @@ class EventDdb
# Search events with filters and pagination, reading from Parquet files.
# Returns { total_count:, events:[], page:, per_page: }, or nil if the query raises.
# Supports filters: ip, waf_action, country, rule_id, company, asn, network_type, network_range_id, exclude_bots
#
# When the service reports no Parquet files for the last year, the call is
# delegated to search_duckdb and its result is returned as-is.
# NOTE(review): search_duckdb also reads through with_events_from_parquet,
# so it may not succeed either when no files exist; confirm intended behaviour.
def search(filters = {}, page: 1, per_page: 50)
  # Get list of Parquet files to query
  parquet_files = service.parquet_files_for_range(1.year.ago, Time.current)

  if parquet_files.empty?
    Rails.logger.warn "[EventDdb] No Parquet files found, falling back to DuckDB"
    # search_duckdb declares page/per_page as keyword arguments; passing them
    # positionally raises ArgumentError, which the rescue below would swallow
    # and turn into a nil result.
    return search_duckdb(filters, page: page, per_page: per_page)
  end

  # Query the Parquet files through the service's parquet connection
  service.with_parquet_connection do |conn|
    # Build WHERE clause
    where_clause, params = build_where_clause(filters)

    # Build file list for read_parquet; single quotes are doubled so every
    # path is a valid SQL string literal.
    file_list = parquet_files.map { |path| "'#{path.to_s.gsub("'", "''")}'" }.join(", ")
    source = "read_parquet([#{file_list}])"

    # Get total count
    count_sql = "SELECT COUNT(*) FROM #{source}#{where_clause}"
    count_result = conn.query(count_sql, *params)
    total_count = count_result.first&.first || 0

    # Get paginated results
    offset = (page - 1) * per_page
    data_sql = <<~SQL
      SELECT
        id, timestamp, ip_address, network_range_id, country, company,
        asn, asn_org, is_datacenter, is_vpn, is_proxy, is_bot,
        waf_action, request_method, response_status, rule_id,
        request_path, user_agent, tags
      FROM #{source}
      #{where_clause}
      ORDER BY timestamp DESC
      LIMIT ? OFFSET ?
    SQL
    result = conn.query(data_sql, *params, per_page, offset)

    # Convert rows to event-like objects
    events = result.to_a.map { |row| row_to_event(row) }

    {
      total_count: total_count,
      events: events,
      page: page,
      per_page: per_page
    }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in Parquet search: #{e.message}"
  Rails.logger.error e.backtrace.join("\n")
  nil
end
# Fallback search that queries the `events` view built by with_events_from_parquet (kept for backward compatibility)
def search_duckdb(filters = {}, page: 1, per_page: 50)
with_events_from_parquet do |conn|
# Build WHERE clause
where_clause, params = build_where_clause(filters)
@@ -685,7 +786,7 @@ class EventDdb
}
end
rescue StandardError => e
Rails.logger.error "[EventDdb] Error in search: #{e.message}"
Rails.logger.error "[EventDdb] Error in DuckDB search: #{e.message}"
Rails.logger.error e.backtrace.join("\n")
nil
end