From 225d9701236c6f58b855bb40a6ab842a3ed2c595 Mon Sep 17 00:00:00 2001 From: Dan Milne Date: Thu, 25 Dec 2025 12:03:25 +1100 Subject: [PATCH] Update duckdb. use more duckdb. Fix the display of stats --- app/controllers/analytics_controller.rb | 106 ++++++++++++------ app/controllers/events_controller.rb | 4 +- app/controllers/network_ranges_controller.rb | 8 +- .../controllers/timeline_controller.js | 45 ++++++-- app/models/event_ddb.rb | 98 ++++------------ app/services/analytics_duckdb_service.rb | 5 + app/views/analytics/index.html.erb | 23 +++- bin/docker-entrypoint | 1 + config/database.yml | 1 + config/initializers/version.rb | 2 +- config/recurring.yml | 36 +++--- 11 files changed, 186 insertions(+), 143 deletions(-) diff --git a/app/controllers/analytics_controller.rb b/app/controllers/analytics_controller.rb index 3953293..b144c82 100644 --- a/app/controllers/analytics_controller.rb +++ b/app/controllers/analytics_controller.rb @@ -241,43 +241,83 @@ class AnalyticsController < ApplicationController end def prepare_chart_data_with_split_cache(cache_key_base, cache_ttl) - # Split timeline into historical (completed hours) and current (incomplete hour) - # Historical hours are cached for full TTL, current hour cached briefly for freshness + # Generate timeline based on selected time period + case @time_period + when :hour + # Show last 60 minutes for hour view + timeline_data = Rails.cache.fetch("#{cache_key_base}/chart_hourly", expires_in: 1.minute) do + # For hour view, show minute-by-minute data for the last hour + (0..59).map do |minutes_ago| + time_point = minutes_ago.minutes.ago + count = Event.where("timestamp >= ? AND timestamp < ?", time_point, time_point + 1.minute).count + { + time_iso: time_point.iso8601, + total: count + } + end.reverse + end - # Cache historical hours (1-23 hours ago) - these are complete and won't change - # No expiration - will stick around until evicted by cache store (uses DuckDB if available) - historical_timeline = Rails.cache.fetch("#{cache_key_base}/chart_historical") do - historical_start = 23.hours.ago.beginning_of_hour - current_hour_start = Time.current.beginning_of_hour + when :day + # Show last 24 hours (existing logic) + # Split timeline into historical (completed hours) and current (incomplete hour) + # Historical hours are cached for full TTL, current hour cached briefly for freshness - events_by_hour = with_duckdb_fallback { EventDdb.hourly_timeline(historical_start, current_hour_start) } || - Event.where("timestamp >= ? 
AND timestamp < ?", historical_start, current_hour_start) - .group("DATE_TRUNC('hour', timestamp)") - .count + # Cache historical hours (1-23 hours ago) - these are complete and won't change + # Use DuckDB directly for performance, no PostgreSQL fallback + historical_timeline = Rails.cache.fetch("#{cache_key_base}/chart_historical", expires_in: 1.hour) do + historical_start = 23.hours.ago.beginning_of_hour + current_hour_start = Time.current.beginning_of_hour - (1..23).map do |hour_ago| - hour_time = hour_ago.hours.ago.beginning_of_hour - hour_key = hour_time.utc + # Use DuckDB directly - if it fails, we'll show empty data rather than slow PostgreSQL + events_by_hour = BaffleDl.hourly_timeline(historical_start, current_hour_start) || {} + + (1..23).map do |hour_ago| + hour_time = hour_ago.hours.ago.beginning_of_hour + hour_key = hour_time.utc + { + time_iso: hour_time.iso8601, + total: events_by_hour[hour_key] || 0 + } + end.reverse + end + + # Current hour (0 hours ago) - cache very briefly since it's actively accumulating + # ALWAYS use PostgreSQL for current hour to get real-time data (DuckDB syncs every minute) + current_hour_data = Rails.cache.fetch("#{cache_key_base}/chart_current_hour", expires_in: 1.minute) do + hour_time = Time.current.beginning_of_hour + count = Event.where("timestamp >= ?", hour_time).count { time_iso: hour_time.iso8601, - total: events_by_hour[hour_key] || 0 + total: count } end - end - # Current hour (0 hours ago) - cache very briefly since it's actively accumulating - # ALWAYS use PostgreSQL for current hour to get real-time data (DuckDB syncs every minute) - current_hour_data = Rails.cache.fetch("#{cache_key_base}/chart_current_hour", expires_in: 1.minute) do - hour_time = Time.current.beginning_of_hour - count = Event.where("timestamp >= ?", hour_time).count - { - time_iso: hour_time.iso8601, - total: count - } - end + # Combine current + historical for full 24-hour timeline + timeline_data = [current_hour_data] + historical_timeline - # Combine current + historical for full 24-hour timeline - timeline_data = [current_hour_data] + historical_timeline + when :week, :month + # Show daily data for week/month views + days_to_show = @time_period == :week ? 
7 : 30 + timeline_data = Rails.cache.fetch("#{cache_key_base}/chart_daily_#{days_to_show}", expires_in: cache_ttl) do + historical_start = days_to_show.days.ago.beginning_of_day + current_day_end = Time.current.end_of_day + + # Use DuckDB for all data including current day (max 1 minute delay) + daily_events = BaffleDl.daily_timeline(historical_start, current_day_end) || {} + + (0..days_to_show-1).map do |days_ago| + day_time = days_ago.days.ago.beginning_of_day + { + time_iso: day_time.iso8601, + total: daily_events[day_time] || 0 + } + end + end + + else + # Default to 24 hours + timeline_data = [] + end # Action distribution and other chart data (cached with main cache) other_chart_data = Rails.cache.fetch("#{cache_key_base}/chart_metadata", expires_in: cache_ttl) do @@ -323,7 +363,7 @@ class AnalyticsController < ApplicationController time_iso: hour_time.iso8601, total: events_by_hour[hour_key] || 0 } - end + end.reverse # Action distribution for pie chart action_distribution = @event_breakdown.map do |action, count| @@ -348,8 +388,8 @@ class AnalyticsController < ApplicationController end def calculate_network_type_stats(start_time) - # Try DuckDB first, fallback to PostgreSQL - duckdb_stats = with_duckdb_fallback { EventDdb.network_type_stats(start_time) } + # Try DuckLake first, fallback to PostgreSQL + duckdb_stats = with_duckdb_fallback { BaffleDl.network_type_stats(start_time) } return duckdb_stats if duckdb_stats @@ -397,8 +437,8 @@ class AnalyticsController < ApplicationController end def calculate_suspicious_patterns(start_time) - # Try DuckDB first, fallback to PostgreSQL - duckdb_patterns = with_duckdb_fallback { EventDdb.suspicious_patterns(start_time) } + # Try DuckLake first, fallback to PostgreSQL + duckdb_patterns = with_duckdb_fallback { BaffleDl.suspicious_patterns(start_time) } return duckdb_patterns if duckdb_patterns diff --git a/app/controllers/events_controller.rb b/app/controllers/events_controller.rb index 08ea8eb..8e99c2a 100644 --- a/app/controllers/events_controller.rb +++ b/app/controllers/events_controller.rb @@ -37,8 +37,8 @@ class EventsController < ApplicationController filters[:network_range_id] = range.id if range end - # Try DuckDB first, fallback to PostgreSQL if unavailable - result = EventDdb.search(filters, page: params[:page]&.to_i || 1, per_page: 50) + # Try DuckLake first, fallback to PostgreSQL if unavailable + result = BaffleDl.search(filters, page: params[:page]&.to_i || 1, per_page: 50) if result # DuckDB query succeeded diff --git a/app/controllers/network_ranges_controller.rb b/app/controllers/network_ranges_controller.rb index 2df997c..c3829e7 100644 --- a/app/controllers/network_ranges_controller.rb +++ b/app/controllers/network_ranges_controller.rb @@ -262,10 +262,10 @@ class NetworkRangesController < ApplicationController # Include child network ranges to capture all traffic within this network block network_ids = [network_range.id] + network_range.child_ranges.pluck(:id) - # Try DuckDB first for stats (much faster) - duckdb_stats = with_duckdb_fallback { EventDdb.network_traffic_stats(network_ids) } - duckdb_top_paths = with_duckdb_fallback { EventDdb.network_top_paths(network_ids, 10) } - duckdb_top_agents = with_duckdb_fallback { EventDdb.network_top_user_agents(network_ids, 5) } + # Try DuckLake first for stats (much faster) + duckdb_stats = with_duckdb_fallback { BaffleDl.network_traffic_stats(network_ids) } + duckdb_top_paths = with_duckdb_fallback { BaffleDl.network_top_paths(network_ids, 10) } + duckdb_top_agents = 
with_duckdb_fallback { BaffleDl.network_top_user_agents(network_ids, 5) } if duckdb_stats # DuckDB success - use fast aggregated stats diff --git a/app/javascript/controllers/timeline_controller.js b/app/javascript/controllers/timeline_controller.js index 4acdaa4..7369b7b 100644 --- a/app/javascript/controllers/timeline_controller.js +++ b/app/javascript/controllers/timeline_controller.js @@ -37,20 +37,49 @@ export default class extends Controller { // Convert ISO time to local time const date = new Date(timeIso) - const localTime = date.toLocaleTimeString(undefined, { - hour: '2-digit', - minute: '2-digit', - hour12: false - }) - timeElement.textContent = localTime + // Determine if we should show date based on time range + const now = new Date() + const timeDiff = now - date + const hoursDiff = timeDiff / (1000 * 60 * 60) + + let displayTime + if (hoursDiff > 25) { + // For periods longer than 25 hours, show date only (no time) + displayTime = date.toLocaleDateString(undefined, { + month: 'short', + day: 'numeric' + }) + } else { + // Check if this is midnight UTC data (daily timeline) vs actual time data (hourly timeline) + // Daily timeline: time is at UTC midnight (hours/minutes/seconds = 0) + // Hourly timeline: time has actual hours/minutes + const utcHours = date.getUTCHours() + const utcMinutes = date.getUTCMinutes() + const utcSeconds = date.getUTCSeconds() + + if (utcHours === 0 && utcMinutes === 0 && utcSeconds === 0) { + // This is midnight UTC - treat as daily data, show date only + displayTime = date.toLocaleDateString(undefined, { + month: 'short', + day: 'numeric' + }) + } else { + // This is actual time data - show time only + displayTime = date.toLocaleTimeString(undefined, { + hour: '2-digit', + minute: '2-digit', + hour12: false + }) + } + } + + timeElement.textContent = displayTime timeElement.title = date.toLocaleString(undefined, { weekday: 'short', year: 'numeric', month: 'short', day: 'numeric', - hour: '2-digit', - minute: '2-digit', timeZoneName: 'short' }) diff --git a/app/models/event_ddb.rb b/app/models/event_ddb.rb index 68cda06..71f4c47 100644 --- a/app/models/event_ddb.rb +++ b/app/models/event_ddb.rb @@ -2,9 +2,9 @@ require 'ostruct' -# EventDdb - DuckDB-backed analytics queries for events -# Provides an ActiveRecord-like interface for querying DuckDB events table -# Falls back to PostgreSQL Event model if DuckDB is unavailable +# EventDdb - DuckLake-backed analytics queries for events +# Provides an ActiveRecord-like interface for querying DuckLake events table +# Falls back to PostgreSQL Event model if DuckLake is unavailable class EventDdb # Enum mappings from integer to string (matching Event model) ACTION_MAP = { @@ -26,30 +26,24 @@ class EventDdb }.freeze class << self - # Get DuckDB service + # Get DuckLake service def service - AnalyticsDuckdbService.instance + AnalyticsDucklakeService.new end - # Helper to load parquet files into in-memory events view + # Helper to work with DuckLake events table # This allows all existing queries to work without modification - # Uses glob pattern to read all parquet files (excluding .temp files) def with_events_from_parquet(&block) service.with_connection do |conn| - # Create events view from all parquet files using glob pattern - # Pattern matches: minute/*.parquet, hours/*.parquet, days/*.parquet, weeks/*.parquet - # Excludes .temp files automatically (they don't match *.parquet) - parquet_pattern = "#{AnalyticsDuckdbService::PARQUET_BASE_PATH}/**/*.parquet" - - conn.execute(<<~SQL) - CREATE VIEW events AS - 
SELECT * FROM read_parquet('#{parquet_pattern}') - SQL + # Ensure schema exists + service.setup_schema(conn) + # Use the DuckLake events table directly + # DuckLake automatically manages the Parquet files underneath yield conn end rescue StandardError => e - Rails.logger.error "[EventDdb] Error loading parquet files: #{e.message}" + Rails.logger.error "[EventDdb] Error accessing DuckLake events: #{e.message}" nil end @@ -691,70 +685,14 @@ class EventDdb # Search events with filters and pagination # Returns { total_count:, events:[], page:, per_page: } - # Supports filters: ip, waf_action, country, rule_id, company, asn, network_type, network_range_id, exclude_bots + # Supports filters: ip, waf_action, country, rule_id, company, asn, network_type, network_range_id, exclude_bots, request_path def search(filters = {}, page: 1, per_page: 50) - # Get list of Parquet files to query - parquet_files = service.parquet_files_for_range(1.year.ago, Time.current) - - if parquet_files.empty? - Rails.logger.warn "[EventDdb] No Parquet files found, falling back to DuckDB" - return search_duckdb(filters, page, per_page) - end - - # Query Parquet files using in-memory DuckDB (no file locks!) - service.with_parquet_connection do |conn| - # Build WHERE clause - where_clause, params = build_where_clause(filters) - - # Build file list for read_parquet - file_list = parquet_files.map { |f| "'#{f}'" }.join(", ") - - # Get total count - count_sql = "SELECT COUNT(*) FROM read_parquet([#{file_list}])#{where_clause}" - count_result = conn.query(count_sql, *params) - total_count = count_result.first&.first || 0 - - # Get paginated results - offset = (page - 1) * per_page - - data_sql = <<~SQL - SELECT - id, timestamp, ip_address, network_range_id, country, company, - asn, asn_org, is_datacenter, is_vpn, is_proxy, is_bot, - waf_action, request_method, response_status, rule_id, - request_path, user_agent, tags - FROM read_parquet([#{file_list}]) - #{where_clause} - ORDER BY timestamp DESC - LIMIT ? OFFSET ? - SQL - - result = conn.query(data_sql, *params, per_page, offset) - - # Convert rows to event-like objects - events = result.to_a.map { |row| row_to_event(row) } - - { - total_count: total_count, - events: events, - page: page, - per_page: per_page - } - end - rescue StandardError => e - Rails.logger.error "[EventDdb] Error in Parquet search: #{e.message}" - Rails.logger.error e.backtrace.join("\n") - nil - end - - # Fallback to querying DuckDB directly (for backward compatibility) - def search_duckdb(filters = {}, page: 1, per_page: 50) with_events_from_parquet do |conn| # Build WHERE clause where_clause, params = build_where_clause(filters) # Get total count - count_sql = "SELECT COUNT(*) FROM events#{where_clause}" + count_sql = "SELECT COUNT(*) FROM baffle.events#{where_clause}" count_result = conn.query(count_sql, *params) total_count = count_result.first&.first || 0 @@ -767,7 +705,7 @@ class EventDdb asn, asn_org, is_datacenter, is_vpn, is_proxy, is_bot, waf_action, request_method, response_status, rule_id, request_path, user_agent, tags - FROM events + FROM baffle.events #{where_clause} ORDER BY timestamp DESC LIMIT ? OFFSET ? @@ -786,7 +724,7 @@ class EventDdb } end rescue StandardError => e - Rails.logger.error "[EventDdb] Error in DuckDB search: #{e.message}" + Rails.logger.error "[EventDdb] Error in DuckLake search: #{e.message}" Rails.logger.error e.backtrace.join("\n") nil end @@ -852,6 +790,12 @@ class EventDdb end end + # Path filtering + if filters[:request_path].present? 
+ conditions << "request_path = ?" + params << filters[:request_path] + end + # Bot filtering if filters[:exclude_bots] == true || filters[:exclude_bots] == "true" conditions << "is_bot = false" diff --git a/app/services/analytics_duckdb_service.rb b/app/services/analytics_duckdb_service.rb index 11a9638..9c78007 100644 --- a/app/services/analytics_duckdb_service.rb +++ b/app/services/analytics_duckdb_service.rb @@ -2,6 +2,11 @@ # Service for managing DuckDB analytics database # Provides fast analytical queries on events data using columnar storage + +# INSTALL ducklake; +# INSTALL sqlite; +# ATTACH 'ducklake:sqlite3:storage/ducklake.sqlite3' AS events (DATA_PATH 'storage/ducklake/events.ducklake'); + class AnalyticsDuckdbService include Singleton diff --git a/app/views/analytics/index.html.erb b/app/views/analytics/index.html.erb index b36fbd8..366e718 100644 --- a/app/views/analytics/index.html.erb +++ b/app/views/analytics/index.html.erb @@ -185,7 +185,13 @@
-        Events Timeline (Last 24 Hours)
+        Events Timeline (<%= case @time_period
+                             when :hour then "Last Hour"
+                             when :day then "Last 24 Hours"
+                             when :week then "Last 7 Days"
+                             when :month then "Last 30 Days"
+                             else "Last 24 Hours"
+                             end %>)
         Times shown in your local timezone
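Reviewer note: the controller hunks above wrap every analytics query in `with_duckdb_fallback`, but the helper itself does not appear in this patch. The call sites (`duckdb_stats = with_duckdb_fallback { ... }` followed by `return duckdb_stats if duckdb_stats`) imply it returns nil on failure; a minimal sketch of that assumed shape, in Ruby:

    # Assumed helper -- not defined in this diff. Runs the DuckLake-backed
    # block and returns nil on any error so the caller can fall back to
    # the PostgreSQL query path.
    def with_duckdb_fallback
      yield
    rescue StandardError => e
      Rails.logger.warn "[Analytics] DuckLake query failed, falling back to PostgreSQL: #{e.message}"
      nil
    end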
@@ -381,7 +387,7 @@

         Quick Actions
         [one container line changed in this hunk; markup lost in extraction]
<%= link_to new_rule_path, class: "flex items-center justify-center px-4 py-3 bg-blue-600 text-white rounded-md hover:bg-blue-700 transition-colors" do %> @@ -393,17 +399,24 @@ - Add Network Range + Add Network <% end %> - <%= link_to events_path, class: "flex items-center justify-center px-4 py-3 bg-purple-600 text-white rounded-md hover:bg-purple-700 transition-colors" do %> + <%= link_to analytics_networks_path, class: "flex items-center justify-center px-4 py-3 bg-purple-600 text-white rounded-md hover:bg-purple-700 transition-colors" do %> + + + + Network Analytics + <% end %> + + <%= link_to events_path, class: "flex items-center justify-center px-4 py-3 bg-orange-600 text-white rounded-md hover:bg-orange-700 transition-colors" do %> View Events <% end %> - <%= link_to rules_path, class: "flex items-center justify-center px-4 py-3 bg-orange-600 text-white rounded-md hover:bg-orange-700 transition-colors" do %> + <%= link_to rules_path, class: "flex items-center justify-center px-4 py-3 bg-gray-600 text-white rounded-md hover:bg-gray-700 transition-colors" do %> diff --git a/bin/docker-entrypoint b/bin/docker-entrypoint index ed31659..4981162 100755 --- a/bin/docker-entrypoint +++ b/bin/docker-entrypoint @@ -3,6 +3,7 @@ # If running the rails server then create or migrate existing database if [ "${@: -2:1}" == "./bin/rails" ] && [ "${@: -1:1}" == "server" ]; then ./bin/rails db:prepare + ./bin/rails ducklake:setup fi exec "${@}" diff --git a/config/database.yml b/config/database.yml index ee6a77f..23570d4 100644 --- a/config/database.yml +++ b/config/database.yml @@ -55,6 +55,7 @@ production: database: baffle_hub_production username: baffle_hub password: <%= ENV["BAFFLE_HUB_DATABASE_PASSWORD"] %> + pool: 80 cache: <<: *sqlite_default database: storage/production_cache.sqlite3 diff --git a/config/initializers/version.rb b/config/initializers/version.rb index 013ca4f..ac38b48 100644 --- a/config/initializers/version.rb +++ b/config/initializers/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true module BaffleHub - VERSION = "0.3.0" + VERSION = "0.4.0" end diff --git a/config/recurring.yml b/config/recurring.yml index d7d3c04..3505f69 100644 --- a/config/recurring.yml +++ b/config/recurring.yml @@ -30,20 +30,30 @@ cleanup_old_events: queue: background schedule: every hour -# Export events from PostgreSQL to Parquet files for fast analytics -export_events_to_parquet: - class: ExportEventsToParquetJob +# Export events from PostgreSQL to DuckLake for fast analytics +export_events_to_ducklake: + class: ExportEventsToDucklakeJob queue: default schedule: every 1 minutes -# Consolidate completed hours into day files -consolidate_parquet_hourly: - class: ConsolidateParquetHourlyJob - queue: default - schedule: "5 * * * *" # At 5 minutes past every hour +# Merge DuckLake files and clean up immediately after +merge_ducklake_files: + class: MergeDucklakeFilesJob + queue: background + schedule: every 15 minutes -# Consolidate completed week into archive (Monday 00:05) -consolidate_parquet_weekly: - class: ConsolidateParquetWeeklyJob - queue: default - schedule: "5 0 * * 1" # Monday at 00:05 +# OLD PARQUET SYSTEM (DISABLED - using DuckLake now) +# export_events_to_parquet: +# class: ExportEventsToParquetJob +# queue: default +# schedule: every 1 minutes +# +# consolidate_parquet_hourly: +# class: ConsolidateParquetHourlyJob +# queue: default +# schedule: "5 * * * *" # At 5 minutes past every hour +# +# consolidate_parquet_weekly: +# class: ConsolidateParquetWeeklyJob +# queue: default +# 
+#   schedule: "5 0 * * 1" # Monday at 00:05
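End-of-patch note: `bin/docker-entrypoint` now runs `./bin/rails ducklake:setup` before boot, but the task itself is not included in this diff. A minimal sketch of what it might do, assuming the ruby-duckdb gem and reusing the INSTALL/ATTACH statements from the comment added to analytics_duckdb_service.rb; the file path and task structure here are assumptions:

    # lib/tasks/ducklake.rake (hypothetical -- not part of this patch)
    namespace :ducklake do
      desc "Install DuckLake extensions and attach the events catalog"
      task setup: :environment do
        require "duckdb"

        DuckDB::Database.open do |db|
          db.connect do |conn|
            # Statements taken from the service comment above; ATTACH creates
            # the SQLite catalog and data directory on first run.
            conn.query("INSTALL ducklake")
            conn.query("INSTALL sqlite")
            conn.query("ATTACH 'ducklake:sqlite3:storage/ducklake.sqlite3' AS events " \
                       "(DATA_PATH 'storage/ducklake/events.ducklake')")
          end
        end
      end
    end

Running this at container start should be safe to repeat: INSTALL is a no-op once an extension is present, and re-ATTACHing a DuckLake catalog reopens the existing SQLite metadata rather than recreating it.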