Use only parquet files for events
app/jobs/bootstrap_parquet_export_job.rb (new file, +52)
@@ -0,0 +1,52 @@
# frozen_string_literal: true

# One-time job to bootstrap Parquet export system
# Exports all existing DuckDB data to weekly Parquet archives
# Run this once when setting up Parquet exports for the first time
#
# Usage:
#   BootstrapParquetExportJob.perform_now
#   # or via docker:
#   docker compose exec jobs bin/rails runner "BootstrapParquetExportJob.perform_now"
class BootstrapParquetExportJob < ApplicationJob
  queue_as :default

  def perform
    service = AnalyticsDuckdbService.instance

    # Check if DuckDB has any data
    event_count = service.event_count
    Rails.logger.info "[Parquet Bootstrap] DuckDB event count: #{event_count}"

    if event_count == 0
      Rails.logger.warn "[Parquet Bootstrap] No events in DuckDB. Run SyncEventsToDuckdbJob first."
      return
    end

    # Check if Parquet files already exist
    existing_weeks = Dir.glob(AnalyticsDuckdbService::PARQUET_WEEKS_PATH.join("*.parquet")).size
    if existing_weeks > 0
      Rails.logger.info "[Parquet Bootstrap] Found #{existing_weeks} existing week archives"
    end

    Rails.logger.info "[Parquet Bootstrap] Starting export of all DuckDB data to Parquet..."

    start_time = Time.current

    # Run the bootstrap export
    service.export_all_to_parquet

    duration = Time.current - start_time
    week_count = Dir.glob(AnalyticsDuckdbService::PARQUET_WEEKS_PATH.join("*.parquet")).size

    Rails.logger.info "[Parquet Bootstrap] Complete!"
    Rails.logger.info "[Parquet Bootstrap] - Time taken: #{duration.round(2)} seconds"
    Rails.logger.info "[Parquet Bootstrap] - Week archives: #{week_count}"
    Rails.logger.info "[Parquet Bootstrap] - Storage: #{AnalyticsDuckdbService::PARQUET_BASE_PATH}"
    Rails.logger.info "[Parquet Bootstrap] System is ready - jobs will maintain exports automatically"
  rescue StandardError => e
    Rails.logger.error "[Parquet Bootstrap] Job failed: #{e.message}"
    Rails.logger.error e.backtrace.join("\n")
    raise # Re-raise to mark job as failed
  end
end
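The export itself lives in AnalyticsDuckdbService#export_all_to_parquet, which is not part of this diff. As a rough illustration, here is a minimal sketch of a weekly export using the ruby-duckdb gem and DuckDB's native Parquet writer (COPY ... (FORMAT PARQUET)); the database path, directory layout, file naming, and the events table shape are assumptions, not the service's actual internals:

# Minimal sketch, not the real service: assumes an `events` table with a
# `timestamp` column and the paths below.
require "duckdb"

DB_PATH   = "storage/analytics.duckdb"   # assumed DuckDB file location
WEEKS_DIR = "storage/parquet/weeks"      # assumed weekly archive directory

db  = DuckDB::Database.open(DB_PATH)
con = db.connect

# One archive per ISO week present in the events table
con.query("SELECT DISTINCT date_trunc('week', timestamp) AS w FROM events ORDER BY w").each do |(week)|
  file = File.join(WEEKS_DIR, "#{week.strftime('%G-W%V')}.parquet")
  # DuckDB writes Parquet natively via COPY ... (FORMAT PARQUET)
  con.query(<<~SQL)
    COPY (SELECT * FROM events
          WHERE date_trunc('week', timestamp) = TIMESTAMP '#{week.strftime('%F %T')}')
    TO '#{file}' (FORMAT PARQUET)
  SQL
end

One file per completed week keeps the job's Dir.glob("*.parquet") week count meaningful, though the real archive naming is not shown in this commit.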
app/jobs/consolidate_parquet_hourly_job.rb (new file, +25)
@@ -0,0 +1,25 @@
# frozen_string_literal: true

# Background job to consolidate completed hour into day file
# Runs at :05 past each hour (e.g., 01:05, 02:05, etc.)
# Merges the previous hour's data into the day file and deletes the hour file
class ConsolidateParquetHourlyJob < ApplicationJob
  queue_as :default

  def perform
    service = AnalyticsDuckdbService.instance

    # Consolidate the previous hour (not current hour, which is still being written)
    previous_hour = 1.hour.ago

    Rails.logger.info "[Parquet Consolidate] Starting hourly consolidation for #{previous_hour.strftime('%Y-%m-%d %H:00')}"

    service.consolidate_hour_to_day(previous_hour)

    Rails.logger.info "[Parquet Consolidate] Hourly consolidation complete"
  rescue StandardError => e
    Rails.logger.error "[Parquet Consolidate] Hourly job failed: #{e.message}"
    Rails.logger.error e.backtrace.join("\n")
    raise # Re-raise to mark job as failed in Solid Queue
  end
end
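consolidate_hour_to_day is also defined in the service, outside this diff; the ":05 past each hour" cadence would presumably be a Solid Queue recurring task, given the rescue comment, though the schedule config is not in this commit either. A minimal sketch of the consolidation under assumed file naming (hour files like 2024-01-29_14.parquet, day files like 2024-01-29.parquet): DuckDB's read_parquet accepts a list of files, so the existing day file and the finished hour file can be merged in a single COPY.

# Minimal sketch under assumed layout; not the real service internals.
require "duckdb"

HOURS_DIR = "storage/parquet/hours"   # assumed
DAYS_DIR  = "storage/parquet/days"    # assumed

def consolidate_hour_to_day(hour, con)
  hour_file = File.join(HOURS_DIR, hour.strftime("%Y-%m-%d_%H.parquet"))
  day_file  = File.join(DAYS_DIR, hour.strftime("%Y-%m-%d.parquet"))
  return unless File.exist?(hour_file)

  sources = [day_file, hour_file].select { |f| File.exist?(f) }
  tmp = "#{day_file}.tmp"

  # Write to a temp file first: DuckDB cannot overwrite a file it is reading.
  con.query(<<~SQL)
    COPY (SELECT * FROM read_parquet([#{sources.map { |f| "'#{f}'" }.join(', ')}]))
    TO '#{tmp}' (FORMAT PARQUET)
  SQL
  File.rename(tmp, day_file)
  File.delete(hour_file)
end

con = DuckDB::Database.open.connect
consolidate_hour_to_day(Time.now - 3600, con)   # the previous hour, as the job passes 1.hour.ago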
app/jobs/consolidate_parquet_weekly_job.rb (new file, +25)
@@ -0,0 +1,25 @@
# frozen_string_literal: true

# Background job to consolidate completed week into archive
# Runs Monday at 00:05 (start of new week)
# Merges the previous week's day files into a week archive and deletes day files
class ConsolidateParquetWeeklyJob < ApplicationJob
  queue_as :default

  def perform
    service = AnalyticsDuckdbService.instance

    # Consolidate the previous week (Monday to Sunday)
    previous_week_start = 1.week.ago.beginning_of_week

    Rails.logger.info "[Parquet Consolidate] Starting weekly consolidation for week starting #{previous_week_start.strftime('%Y-%m-%d')}"

    service.consolidate_days_to_week(previous_week_start)

    Rails.logger.info "[Parquet Consolidate] Weekly consolidation complete"
  rescue StandardError => e
    Rails.logger.error "[Parquet Consolidate] Weekly job failed: #{e.message}"
    Rails.logger.error e.backtrace.join("\n")
    raise # Re-raise to mark job as failed in Solid Queue
  end
end
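A matching sketch for consolidate_days_to_week, reusing the assumed layout from the hourly sketch. It enumerates the seven day files explicitly rather than globbing, since a Monday-to-Sunday week can cross a month boundary:

# Minimal sketch under assumed layout; not the real service internals.
require "duckdb"

DAYS_DIR  = "storage/parquet/days"    # same assumed layout as the hourly sketch
WEEKS_DIR = "storage/parquet/weeks"

def consolidate_days_to_week(week_start, con)
  day_files = (0..6).map { |i| File.join(DAYS_DIR, (week_start + i * 86_400).strftime("%Y-%m-%d.parquet")) }
                    .select { |f| File.exist?(f) }
  return if day_files.empty?

  week_file = File.join(WEEKS_DIR, "#{week_start.strftime('%G-W%V')}.parquet")
  con.query(<<~SQL)
    COPY (SELECT * FROM read_parquet([#{day_files.map { |f| "'#{f}'" }.join(', ')}]))
    TO '#{week_file}' (FORMAT PARQUET)
  SQL
  # Only delete the day files once the week archive has been written
  day_files.each { |f| File.delete(f) }
end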
app/jobs/sync_events_to_duckdb_job.rb (deleted, -89)
@@ -1,89 +0,0 @@
# frozen_string_literal: true

# Background job to sync events from PostgreSQL to DuckDB
# Runs every 5 minutes to keep analytics database up-to-date
# Uses watermark tracking to only sync new events
class SyncEventsToDuckdbJob < ApplicationJob
  queue_as :default

  # Key for storing last sync timestamp in Rails cache
  WATERMARK_CACHE_KEY = "duckdb_last_sync_time"
  WATERMARK_TTL = 1.week

  # Overlap window to catch late-arriving events
  SYNC_OVERLAP = 1.minute

  def perform
    service = AnalyticsDuckdbService.instance

    # Determine where to start syncing
    from_timestamp = determine_sync_start_time(service)

    Rails.logger.info "[DuckDB Sync] Starting sync from #{from_timestamp}"

    # Sync new events using PostgreSQL cursor + DuckDB Appender
    # (setup_schema is called internally within sync_new_events)
    count = service.sync_new_events(from_timestamp)

    # Update watermark if we synced any events
    if count > 0
      update_last_sync_time
      Rails.logger.info "[DuckDB Sync] Successfully synced #{count} events"
    else
      Rails.logger.info "[DuckDB Sync] No new events to sync"
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB Sync] Job failed: #{e.message}"
    Rails.logger.error e.backtrace.join("\n")
    raise # Re-raise to mark job as failed in Solid Queue
  end

  private

  # Determine timestamp to start syncing from
  # Strategy:
  # 1. First run (DuckDB empty): sync from oldest PostgreSQL event
  # 2. Subsequent runs: sync from last watermark with overlap
  def determine_sync_start_time(service)
    oldest_duckdb = service.oldest_event_timestamp

    if oldest_duckdb.nil?
      # DuckDB is empty - this is the first sync
      # Start from oldest PostgreSQL event (or reasonable cutoff)
      oldest_pg = Event.minimum(:timestamp)

      if oldest_pg.nil?
        # No events in PostgreSQL at all
        Rails.logger.warn "[DuckDB Sync] No events found in PostgreSQL"
        1.day.ago # Default to recent window
      else
        Rails.logger.info "[DuckDB Sync] First sync - starting from oldest event: #{oldest_pg}"
        oldest_pg
      end
    else
      # DuckDB has data - sync from last watermark with overlap
      last_sync = Rails.cache.read(WATERMARK_CACHE_KEY)

      if last_sync.nil?
        # Watermark not in cache (maybe cache expired or restarted)
        # Fall back to newest event in DuckDB
        newest_duckdb = service.newest_event_timestamp
        start_time = newest_duckdb ? newest_duckdb - SYNC_OVERLAP : oldest_duckdb
        Rails.logger.info "[DuckDB Sync] Watermark not found, using newest DuckDB event: #{start_time}"
        start_time
      else
        # Normal case: use watermark with overlap to catch late arrivals
        start_time = last_sync - SYNC_OVERLAP
        Rails.logger.debug "[DuckDB Sync] Using watermark: #{last_sync} (with #{SYNC_OVERLAP}s overlap)"
        start_time
      end
    end
  end

  # Update last sync watermark in cache
  def update_last_sync_time
    now = Time.current
    Rails.cache.write(WATERMARK_CACHE_KEY, now, expires_in: WATERMARK_TTL)
    Rails.logger.debug "[DuckDB Sync] Updated watermark to #{now}"
  end
end
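With the PostgreSQL-to-DuckDB sync job removed and the commit titled "Use only parquet files for events", reads presumably scan the archives directly. For illustration (paths assumed as in the sketches above), DuckDB treats a glob of Parquet files as a single table:

# Hypothetical read path: scan every weekly archive as one table.
require "duckdb"

con = DuckDB::Database.open.connect   # an in-memory database is enough for reads
count = con.query("SELECT count(*) FROM read_parquet('storage/parquet/weeks/*.parquet')").first.first
puts "events in archives: #{count}"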