Update DuckDB to v1.4.3; use more DuckDB

Dan Milne
2025-12-25 11:59:53 +11:00
parent 693851f664
commit a0ff0edb73
2 changed files with 20 additions and 36 deletions


@@ -27,7 +27,7 @@ RUN apt-get update -qq && \
     *) \
       echo "Unsupported platform: $TARGETPLATFORM" && exit 1 ;; \
     esac && \
-    wget "https://github.com/duckdb/duckdb/releases/download/v1.4.2/libduckdb-linux-${DUCKDB_ARCH}.zip" -O /tmp/libduckdb.zip && \
+    wget "https://github.com/duckdb/duckdb/releases/download/v1.4.3/libduckdb-linux-${DUCKDB_ARCH}.zip" -O /tmp/libduckdb.zip && \
     unzip /tmp/libduckdb.zip -d /tmp/duckdb && \
     cp /tmp/duckdb/duckdb.h /tmp/duckdb/duckdb.hpp /usr/local/include/ && \
     cp /tmp/duckdb/libduckdb.so /usr/local/lib/ && \
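The version bump only changes which prebuilt libduckdb the image ships; nothing in the diff checks it at runtime. A minimal boot-time sanity check, assuming the app reaches this library through the ruby duckdb gem (the initializer path and the expected version string are assumptions, not part of this commit):

# config/initializers/duckdb_version_check.rb (hypothetical file)
require "duckdb"

db  = DuckDB::Database.open   # an in-memory database is enough to read the version
con = db.connect
version = con.query("SELECT version()").first&.first  # e.g. "v1.4.3"
Rails.logger.info "[DuckDB] linked against libduckdb #{version}"
Rails.logger.warn "[DuckDB] expected v1.4.3, got #{version}" unless version == "v1.4.3"
con.close
db.close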


@@ -28,17 +28,15 @@ class AnalyticsController < ApplicationController
     # Core statistics - cached (uses DuckDB if available)
     stat_start = Time.current
-    @total_events = Rails.cache.fetch("#{cache_key_base}/total_events", expires_in: cache_ttl) do
-      with_duckdb_fallback { EventDdb.count_since(@start_time) } ||
-        Event.where("timestamp >= ?", @start_time).count
-    end
+    @total_events = BaffleDl.count_since(@start_time)
     Rails.logger.info "[Analytics Perf] Total events: #{((Time.current - stat_start) * 1000).round(1)}ms"

     @total_rules = Rails.cache.fetch("analytics/total_rules", expires_in: 5.minutes) do
       Rule.enabled.count
     end

-    @network_ranges_with_events = Rails.cache.fetch("analytics/network_ranges_with_events", expires_in: 5.minutes) do
+    @network_ranges_with_events = BaffleDl.count_network_ranges_with_events(@start_time) ||
+      Rails.cache.fetch("analytics/network_ranges_with_events", expires_in: 5.minutes) do
       NetworkRange.with_events.count
     end
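BaffleDl itself is not part of this diff, so its interface can only be inferred from the call sites. A rough sketch of what a count_since-style aggregate might look like on top of the duckdb gem; the class body, table name, column names, and connection handling are all assumptions:

class BaffleDl
  # Assumption: the real class attaches a DuckLake catalog / analytical store here.
  def self.with_connection
    db  = DuckDB::Database.open
    con = db.connect
    yield con
  ensure
    con&.close
    db&.close
  end

  # Mirrors Event.where("timestamp >= ?", start_time).count on the analytical side.
  def self.count_since(start_time)
    with_connection do |con|
      sql = "SELECT COUNT(*) FROM events WHERE timestamp >= CAST(? AS TIMESTAMP)"
      con.query(sql, start_time.utc.iso8601).first.first
    end
  end
end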
@@ -46,38 +44,24 @@ class AnalyticsController < ApplicationController
       NetworkRange.count
     end

-    # Event breakdown by action - cached (uses DuckDB if available)
+    # Event breakdown by action - use DuckDB directly for performance
     stat_start = Time.current
     @event_breakdown = Rails.cache.fetch("#{cache_key_base}/event_breakdown", expires_in: cache_ttl) do
-      with_duckdb_fallback { EventDdb.breakdown_by_action(@start_time) } ||
-        Event.where("timestamp >= ?", @start_time)
-          .group(:waf_action)
-          .count
+      BaffleDl.breakdown_by_action(@start_time) || {}
     end
     Rails.logger.info "[Analytics Perf] Event breakdown: #{((Time.current - stat_start) * 1000).round(1)}ms"

-    # Top countries by event count - cached (uses DuckDB if available)
+    # Top countries by event count - use DuckDB directly for performance
     stat_start = Time.current
     @top_countries = Rails.cache.fetch("#{cache_key_base}/top_countries", expires_in: cache_ttl) do
-      with_duckdb_fallback { EventDdb.top_countries(@start_time, 10) } ||
-        Event.where("timestamp >= ? AND country IS NOT NULL", @start_time)
-          .group(:country)
-          .count
-          .sort_by { |_, count| -count }
-          .first(10)
+      BaffleDl.top_countries(@start_time, 10) || []
     end
     Rails.logger.info "[Analytics Perf] Top countries: #{((Time.current - stat_start) * 1000).round(1)}ms"

-    # Top blocked IPs - cached (uses DuckDB if available)
+    # Top blocked IPs - use DuckDB directly for performance
     stat_start = Time.current
     @top_blocked_ips = Rails.cache.fetch("#{cache_key_base}/top_blocked_ips", expires_in: cache_ttl) do
-      with_duckdb_fallback { EventDdb.top_blocked_ips(@start_time, 10) } ||
-        Event.where("timestamp >= ?", @start_time)
-          .where(waf_action: 0) # deny action in enum
-          .group(:ip_address)
-          .count
-          .sort_by { |_, count| -count }
-          .first(10)
+      BaffleDl.top_blocked_ips(@start_time, 10) || []
     end
     Rails.logger.info "[Analytics Perf] Top blocked IPs: #{((Time.current - stat_start) * 1000).round(1)}ms"
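The `|| {}` and `|| []` guards suggest these aggregates return nil when the analytical path is unavailable, so the cached value keeps the shape the views expect. A hedged sketch of breakdown_by_action under that assumption, continuing the hypothetical BaffleDl class above (SQL and column names are guesses, not the committed implementation):

# Hypothetical: returns a Hash like Event.group(:waf_action).count, or nil on failure.
def self.breakdown_by_action(start_time)
  with_connection do |con|
    sql = <<~SQL
      SELECT waf_action, COUNT(*) AS event_count
      FROM events
      WHERE timestamp >= CAST(? AS TIMESTAMP)
      GROUP BY waf_action
      ORDER BY event_count DESC
    SQL
    con.query(sql, start_time.utc.iso8601).to_h { |action, count| [action, count] }
  end
rescue StandardError => e
  Rails.logger.warn "[BaffleDl] breakdown_by_action failed: #{e.message}"
  nil # the caller's `|| {}` keeps the cached value a Hash
end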
@@ -135,8 +119,8 @@ class AnalyticsController < ApplicationController
     @time_period = params[:period]&.to_sym || :day
     @start_time = calculate_start_time(@time_period)

-    # Top networks by request volume - use DuckDB if available
-    network_stats = with_duckdb_fallback { EventDdb.top_networks(@start_time, 50) }
+    # Top networks by request volume - use DuckLake if available
+    network_stats = with_duckdb_fallback { BaffleDl.top_networks(@start_time, 50) }
     if network_stats
       # DuckDB path: array format [network_range_id, event_count, unique_ips]
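with_duckdb_fallback is still used on the paths that keep a Postgres fallback, but its definition is not in this diff. A minimal sketch of the contract its call sites imply; only the method name and the nil-on-failure behaviour are inferred from the diff, the body is an assumption:

# Hypothetical: run the analytical query, return nil on any failure so that
# `nil || <ActiveRecord relation>` (or `if network_stats`) takes the Postgres path.
def with_duckdb_fallback
  yield
rescue StandardError => e
  Rails.logger.warn "[Analytics] DuckDB/DuckLake path failed: #{e.message}"
  nil
end

On success the block's rows (e.g. [network_range_id, event_count, unique_ips], per the comment above) are consumed directly; on failure the fallback branch runs instead.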
@@ -200,24 +184,24 @@ class AnalyticsController < ApplicationController
     # Network type breakdown with traffic stats
     @network_breakdown = calculate_network_type_stats(@start_time)

-    # Company breakdown for top traffic sources - use DuckDB if available
-    @top_companies = with_duckdb_fallback { EventDdb.top_companies(@start_time, 20) } ||
+    # Company breakdown for top traffic sources - use DuckLake if available
+    @top_companies = with_duckdb_fallback { BaffleDl.top_companies(@start_time, 20) } ||
       Event.where("timestamp >= ? AND company IS NOT NULL", @start_time)
         .group(:company)
         .select("company, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips, COUNT(DISTINCT network_range_id) as network_count")
         .order("event_count DESC")
         .limit(20)

-    # ASN breakdown - use DuckDB if available
-    @top_asns = with_duckdb_fallback { EventDdb.top_asns(@start_time, 15) } ||
+    # ASN breakdown - use DuckLake if available
+    @top_asns = with_duckdb_fallback { BaffleDl.top_asns(@start_time, 15) } ||
       Event.where("timestamp >= ? AND asn IS NOT NULL", @start_time)
         .group(:asn, :asn_org)
         .select("asn, asn_org, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips, COUNT(DISTINCT network_range_id) as network_count")
         .order("event_count DESC")
         .limit(15)

-    # Geographic breakdown - use DuckDB if available
-    @top_countries = with_duckdb_fallback { EventDdb.top_countries_with_stats(@start_time, 15) } ||
+    # Geographic breakdown - use DuckLake if available
+    @top_countries = with_duckdb_fallback { BaffleDl.top_countries_with_stats(@start_time, 15) } ||
       Event.where("timestamp >= ? AND country IS NOT NULL", @start_time)
         .group(:country)
         .select("country, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips")