More use of tags - drop add_header action -> allow + headers+tags
This commit is contained in:
@@ -27,7 +27,7 @@ RUN apt-get update -qq && \
|
||||
*) \
|
||||
echo "Unsupported platform: $TARGETPLATFORM" && exit 1 ;; \
|
||||
esac && \
|
||||
wget "https://install.duckdb.org/v1.4.2/libduckdb-linux-${DUCKDB_ARCH}.zip" -O /tmp/libduckdb.zip && \
|
||||
wget "https://github.com/duckdb/duckdb/releases/download/v1.4.2/libduckdb-linux-${DUCKDB_ARCH}.zip" -O /tmp/libduckdb.zip && \
|
||||
unzip /tmp/libduckdb.zip -d /tmp/duckdb && \
|
||||
cp /tmp/duckdb/duckdb.h /tmp/duckdb/duckdb.hpp /usr/local/include/ && \
|
||||
cp /tmp/duckdb/libduckdb.so /usr/local/lib/ && \
|
||||
|
||||
@@ -7,6 +7,9 @@ class AnalyticsController < ApplicationController
|
||||
def index
|
||||
authorize :analytics, :index?
|
||||
|
||||
# Track overall request time
|
||||
request_start = Time.current
|
||||
|
||||
# Time period selector (default: last 24 hours)
|
||||
@time_period = params[:period]&.to_sym || :day
|
||||
@start_time = calculate_start_time(@time_period)
|
||||
@@ -24,10 +27,12 @@ class AnalyticsController < ApplicationController
|
||||
cache_key_base = "analytics/#{@time_period}/#{@start_time.to_i}"
|
||||
|
||||
# Core statistics - cached (uses DuckDB if available)
|
||||
stat_start = Time.current
|
||||
@total_events = Rails.cache.fetch("#{cache_key_base}/total_events", expires_in: cache_ttl) do
|
||||
with_duckdb_fallback { EventDdb.count_since(@start_time) } ||
|
||||
Event.where("timestamp >= ?", @start_time).count
|
||||
end
|
||||
Rails.logger.info "[Analytics Perf] Total events: #{((Time.current - stat_start) * 1000).round(1)}ms"
|
||||
|
||||
@total_rules = Rails.cache.fetch("analytics/total_rules", expires_in: 5.minutes) do
|
||||
Rule.enabled.count
|
||||
@@ -42,14 +47,17 @@ class AnalyticsController < ApplicationController
|
||||
end
|
||||
|
||||
# Event breakdown by action - cached (uses DuckDB if available)
|
||||
stat_start = Time.current
|
||||
@event_breakdown = Rails.cache.fetch("#{cache_key_base}/event_breakdown", expires_in: cache_ttl) do
|
||||
with_duckdb_fallback { EventDdb.breakdown_by_action(@start_time) } ||
|
||||
Event.where("timestamp >= ?", @start_time)
|
||||
.group(:waf_action)
|
||||
.count
|
||||
end
|
||||
Rails.logger.info "[Analytics Perf] Event breakdown: #{((Time.current - stat_start) * 1000).round(1)}ms"
|
||||
|
||||
# Top countries by event count - cached (uses DuckDB if available)
|
||||
stat_start = Time.current
|
||||
@top_countries = Rails.cache.fetch("#{cache_key_base}/top_countries", expires_in: cache_ttl) do
|
||||
with_duckdb_fallback { EventDdb.top_countries(@start_time, 10) } ||
|
||||
Event.where("timestamp >= ? AND country IS NOT NULL", @start_time)
|
||||
@@ -58,8 +66,10 @@ class AnalyticsController < ApplicationController
|
||||
.sort_by { |_, count| -count }
|
||||
.first(10)
|
||||
end
|
||||
Rails.logger.info "[Analytics Perf] Top countries: #{((Time.current - stat_start) * 1000).round(1)}ms"
|
||||
|
||||
# Top blocked IPs - cached (uses DuckDB if available)
|
||||
stat_start = Time.current
|
||||
@top_blocked_ips = Rails.cache.fetch("#{cache_key_base}/top_blocked_ips", expires_in: cache_ttl) do
|
||||
with_duckdb_fallback { EventDdb.top_blocked_ips(@start_time, 10) } ||
|
||||
Event.where("timestamp >= ?", @start_time)
|
||||
@@ -69,6 +79,7 @@ class AnalyticsController < ApplicationController
|
||||
.sort_by { |_, count| -count }
|
||||
.first(10)
|
||||
end
|
||||
Rails.logger.info "[Analytics Perf] Top blocked IPs: #{((Time.current - stat_start) * 1000).round(1)}ms"
|
||||
|
||||
# Network range intelligence breakdown - cached
|
||||
@network_intelligence = Rails.cache.fetch("analytics/network_intelligence", expires_in: 10.minutes) do
|
||||
@@ -105,7 +116,11 @@ class AnalyticsController < ApplicationController
|
||||
end
|
||||
|
||||
# Prepare data for charts - split caching for current vs historical data
|
||||
stat_start = Time.current
|
||||
@chart_data = prepare_chart_data_with_split_cache(cache_key_base, cache_ttl)
|
||||
Rails.logger.info "[Analytics Perf] Chart data: #{((Time.current - stat_start) * 1000).round(1)}ms"
|
||||
|
||||
Rails.logger.info "[Analytics Perf] TOTAL REQUEST: #{((Time.current - request_start) * 1000).round(1)}ms"
|
||||
|
||||
respond_to do |format|
|
||||
format.html
|
||||
|
||||
@@ -36,6 +36,9 @@ class EventsController < ApplicationController
|
||||
@events = @events.by_asn(params[:asn]) if params[:asn].present?
|
||||
@events = @events.by_network_cidr(params[:network_cidr]) if params[:network_cidr].present?
|
||||
|
||||
# Bot filtering
|
||||
@events = @events.exclude_bots if params[:exclude_bots] == "true"
|
||||
|
||||
Rails.logger.debug "Events count after filtering: #{@events.count}"
|
||||
|
||||
# Debug info
|
||||
|
||||
@@ -105,6 +105,11 @@ class Event < ApplicationRecord
|
||||
joins(:network_range).where("network_ranges.network = ?", cidr)
|
||||
}
|
||||
|
||||
# Bot filtering scopes
|
||||
scope :bots, -> { where(is_bot: true) }
|
||||
scope :humans, -> { where(is_bot: false) }
|
||||
scope :exclude_bots, -> { where(is_bot: false) }
|
||||
|
||||
# Add association for the optional network_range_id
|
||||
belongs_to :network_range, optional: true
|
||||
|
||||
@@ -191,6 +196,9 @@ class Event < ApplicationRecord
|
||||
# Populate network intelligence from IP address
|
||||
before_save :populate_network_intelligence, if: :should_populate_network_intelligence?
|
||||
|
||||
# Detect bot traffic using user agent and network intelligence
|
||||
before_save :detect_bot_traffic, if: :should_detect_bot?
|
||||
|
||||
# Backfill network intelligence for all events
|
||||
def self.backfill_network_intelligence!(batch_size: 10_000)
|
||||
total = where(country: nil).count
|
||||
@@ -693,10 +701,88 @@ class Event < ApplicationRecord
|
||||
self.server_name = payload["server_name"]
|
||||
self.environment = payload["environment"]
|
||||
|
||||
|
||||
|
||||
# Extract agent info
|
||||
agent_data = payload.dig("agent") || {}
|
||||
self.agent_version = agent_data["version"]
|
||||
self.agent_name = agent_data["name"]
|
||||
end
|
||||
|
||||
# Callback guard for +detect_bot_traffic+: detection is attempted only when
# at least one input signal (a user agent or a linked network range) is set.
def should_detect_bot?
  [user_agent, network_range_id].any?(&:present?)
end
|
||||
|
||||
# before_save hook: stores the outcome of +bot_detected?+ in +is_bot+.
# Any StandardError raised during detection is logged and the event is
# recorded as non-bot instead of letting the error propagate.
def detect_bot_traffic
  self.is_bot = bot_detected?
rescue StandardError => error
  Rails.logger.error "Failed to detect bot for event #{id}: #{error.message}"
  self.is_bot = false # Default to non-bot on error
end
|
||||
|
||||
# Multi-signal bot detection with tagging. Signals are tried in order and the
# first one that matches wins:
#   1. User agent detection (DeviceDetector gem) - adds a bot:<name> tag
#   2. Network range matching (bot_import_* sources, then known bot
#      companies) - adds bot:/network: tags
#   3. Datacenter traffic whose user agent contains an automation keyword
#
# NOTE: despite the predicate name, a positive match also calls +add_tag+.
#
# @return [Boolean] true when any signal classifies the event as a bot
def bot_detected?
  user_agent_bot_signal? || network_range_bot_signal? || datacenter_bot_signal?
end

# Signal 1: DeviceDetector recognises the user agent as a bot.
# Detector failures are logged at debug level and treated as "no match" so
# the remaining signals still run.
def user_agent_bot_signal?
  return false unless user_agent.present?

  begin
    detector = DeviceDetector.new(user_agent)
    return false unless detector.bot?

    # Tag with the specific bot name, or 'unknown' when the detector has none
    detected_name = detector.bot_name
    add_tag("bot:#{detected_name ? bot_tag_slug(detected_name) : 'unknown'}")
    true
  rescue StandardError => e
    Rails.logger.debug "DeviceDetector failed for user agent: #{e.message}"
    false
  end
end

# Signal 2: the linked network range was created by a bot import, or the
# event's company matches a known bot operator.
def network_range_bot_signal?
  return false unless network_range_id.present?

  range = NetworkRange.find_by(id: network_range_id)
  return false unless range

  # Check if the network range source indicates a bot import
  if range.source&.start_with?('bot_import_')
    # Extract bot type from source (e.g., 'bot_import_googlebot' -> 'googlebot')
    add_tag("bot:#{range.source.sub('bot_import_', '')}")
    add_tag("network:#{bot_tag_slug(range.company)}") if range.company.present?
    return true
  end

  # Substring match of the event's own +company+ attribute against known
  # bot operators (Google, Amazon, OpenAI, Cloudflare, Microsoft, etc.)
  known_bot_companies = ['googlebot', 'google bot', 'amazon', 'aws', 'openai',
                         'anthropic', 'cloudflare', 'microsoft', 'facebook',
                         'meta', 'apple', 'duckduckgo']
  company_lower = company&.downcase
  return false unless company_lower && known_bot_companies.any? { |bot| company_lower.include?(bot) }

  company_slug = company_lower.gsub(/\s+/, '_')
  add_tag("bot:#{company_slug}")
  add_tag("network:#{company_slug}")
  true
end

# Signal 3 (weakest): datacenter traffic is only treated as a bot when the
# user agent also contains a generic automation keyword.
def datacenter_bot_signal?
  return false unless is_datacenter && user_agent.present?

  ua_lower = user_agent.downcase
  bot_keywords = ['bot', 'crawler', 'spider', 'scraper', 'curl', 'wget', 'python', 'go-http-client']
  return false unless bot_keywords.any? { |keyword| ua_lower.include?(keyword) }

  add_tag("bot:datacenter")
  add_tag("datacenter:true")
  true
end

# Lower-cases +value+ and replaces whitespace runs with underscores so it can
# be embedded in a tag.
def bot_tag_slug(value)
  value.downcase.gsub(/\s+/, '_')
end
|
||||
end
|
||||
|
||||
@@ -34,7 +34,8 @@ class EventDdb
|
||||
SQL
|
||||
|
||||
# Convert to hash like ActiveRecord .group.count returns
|
||||
result.to_a.to_h { |row| [row["waf_action"], row["count"]] }
|
||||
# DuckDB returns arrays: [waf_action, count]
|
||||
result.to_a.to_h { |row| [row[0], row[1]] }
|
||||
end
|
||||
rescue StandardError => e
|
||||
Rails.logger.error "[EventDdb] Error in breakdown_by_action: #{e.message}"
|
||||
@@ -54,7 +55,8 @@ class EventDdb
|
||||
SQL
|
||||
|
||||
# Return array of [country, count] tuples like ActiveRecord
|
||||
result.to_a.map { |row| [row["country"], row["count"]] }
|
||||
# DuckDB returns arrays: [country, count]
|
||||
result.to_a.map { |row| [row[0], row[1]] }
|
||||
end
|
||||
rescue StandardError => e
|
||||
Rails.logger.error "[EventDdb] Error in top_countries: #{e.message}"
|
||||
@@ -73,7 +75,8 @@ class EventDdb
|
||||
LIMIT ?
|
||||
SQL
|
||||
|
||||
result.to_a.map { |row| [row["ip_address"], row["count"]] }
|
||||
# DuckDB returns arrays: [ip_address, count]
|
||||
result.to_a.map { |row| [row[0], row[1]] }
|
||||
end
|
||||
rescue StandardError => e
|
||||
Rails.logger.error "[EventDdb] Error in top_blocked_ips: #{e.message}"
|
||||
@@ -94,7 +97,8 @@ class EventDdb
|
||||
SQL
|
||||
|
||||
# Convert to hash with Time keys like ActiveRecord
|
||||
result.to_a.to_h { |row| [row["hour"], row["count"]] }
|
||||
# DuckDB returns arrays: [hour, count]
|
||||
result.to_a.to_h { |row| [row[0], row[1]] }
|
||||
end
|
||||
rescue StandardError => e
|
||||
Rails.logger.error "[EventDdb] Error in hourly_timeline: #{e.message}"
|
||||
@@ -495,5 +499,128 @@ class EventDdb
|
||||
Rails.logger.error "[EventDdb] Error in suspicious_patterns: #{e.message}"
|
||||
nil
|
||||
end
|
||||
|
||||
# Bot traffic analysis - breakdown of bot vs human traffic.
#
# Rows whose is_bot is NULL are folded into the "human" group inside the
# query. Without that, the NULL group and the false group would both map to
# the "human" key and one would silently overwrite the other.
#
# @param start_time [Time] only events with timestamp >= this are counted
# @return [Hash{String=>Hash}, nil] "bot"/"human" =>
#   { "event_count" => Integer, "unique_ips" => Integer }; nil when the
#   query raises (the error is logged)
def bot_traffic_breakdown(start_time)
  service.with_connection do |conn|
    result = conn.query(<<~SQL, start_time)
      SELECT
        COALESCE(is_bot, false) AS is_bot,
        COUNT(*) AS event_count,
        COUNT(DISTINCT ip_address) AS unique_ips
      FROM events
      WHERE timestamp >= ?
      GROUP BY COALESCE(is_bot, false)
    SQL

    # DuckDB returns arrays: [is_bot, event_count, unique_ips]
    result.to_a.to_h do |is_bot, event_count, unique_ips|
      [
        is_bot ? "bot" : "human",
        { "event_count" => event_count, "unique_ips" => unique_ips }
      ]
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in bot_traffic_breakdown: #{e.message}"
  nil
end
|
||||
|
||||
# Count human traffic (non-bot) since timestamp.
#
# Rows whose is_bot is NULL are counted as human, which matches how
# bot_traffic_breakdown labels any non-true is_bot value.
#
# @param start_time [Time] only events with timestamp >= this are counted
# @return [Integer, nil] number of matching events (0 when the result is
#   empty); nil when the query raises (the error is logged)
def human_traffic_count(start_time)
  service.with_connection do |conn|
    result = conn.query(<<~SQL, start_time)
      SELECT COUNT(*) as count
      FROM events
      WHERE timestamp >= ? AND COALESCE(is_bot, false) = false
    SQL

    # First column of the first row holds the count
    result.first&.first || 0
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in human_traffic_count: #{e.message}"
  nil
end
|
||||
|
||||
# Count bot traffic since timestamp.
#
# @param start_time [Time] only events with timestamp >= this are counted
# @return [Integer, nil] number of events flagged is_bot = true (0 when the
#   result is empty); nil when the query raises (the error is logged)
def bot_traffic_count(start_time)
  count_sql = <<~SQL
    SELECT COUNT(*) as count
    FROM events
    WHERE timestamp >= ? AND is_bot = true
  SQL

  service.with_connection do |conn|
    first_row = conn.query(count_sql, start_time).first
    first_row&.first || 0
  end
rescue StandardError => error
  Rails.logger.error "[EventDdb] Error in bot_traffic_count: #{error.message}"
  nil
end
|
||||
|
||||
# Top bot user agents, ordered by event count (highest first).
#
# @param start_time [Time] only events with timestamp >= this are considered
# @param limit [Integer] maximum number of user agents returned
# @return [Array<Hash>, nil] one hash per user agent with :user_agent,
#   :event_count and :unique_ips; nil when the query raises (error is logged)
def top_bot_user_agents(start_time, limit = 20)
  # Column order of the SELECT below; DuckDB hands rows back as plain arrays
  columns = %i[user_agent event_count unique_ips]

  service.with_connection do |conn|
    rows = conn.query(<<~SQL, start_time, limit)
      SELECT
      user_agent,
      COUNT(*) as event_count,
      COUNT(DISTINCT ip_address) as unique_ips
      FROM events
      WHERE timestamp >= ? AND is_bot = true AND user_agent IS NOT NULL
      GROUP BY user_agent
      ORDER BY event_count DESC
      LIMIT ?
    SQL

    rows.to_a.map { |row| columns.zip(row).to_h }
  end
rescue StandardError => error
  Rails.logger.error "[EventDdb] Error in top_bot_user_agents: #{error.message}"
  nil
end
|
||||
|
||||
# Bot traffic timeline (hourly breakdown).
#
# Every event lands in exactly one of the two counters: is_bot = true counts
# as bot, anything else (false or NULL) counts as human, so "total" equals
# the number of events in the hour.
#
# @param start_time [Time] inclusive lower bound on timestamp
# @param end_time [Time] exclusive upper bound on timestamp
# @return [Hash, nil] hour => { "bot_count", "human_count", "total" };
#   nil when the query raises (the error is logged)
def bot_traffic_timeline(start_time, end_time)
  service.with_connection do |conn|
    result = conn.query(<<~SQL, start_time, end_time)
      SELECT
        DATE_TRUNC('hour', timestamp) as hour,
        SUM(CASE WHEN is_bot = true THEN 1 ELSE 0 END) as bot_count,
        SUM(CASE WHEN is_bot = true THEN 0 ELSE 1 END) as human_count
      FROM events
      WHERE timestamp >= ? AND timestamp < ?
      GROUP BY hour
      ORDER BY hour
    SQL

    # DuckDB returns arrays: [hour, bot_count, human_count]
    result.to_a.to_h do |hour, bot_count, human_count|
      [
        hour,
        {
          "bot_count" => bot_count,
          "human_count" => human_count,
          "total" => bot_count + human_count
        }
      ]
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in bot_traffic_timeline: #{e.message}"
  nil
end
|
||||
end
|
||||
end
|
||||
|
||||
@@ -7,7 +7,11 @@
|
||||
# and classification flags (datacenter, proxy, VPN).
|
||||
class NetworkRange < ApplicationRecord
|
||||
# Sources for network range creation
|
||||
SOURCES = %w[api_imported user_created manual auto_generated inherited geolite_asn geolite_country].freeze
|
||||
SOURCES = %w[api_imported user_created manual auto_generated inherited geolite_asn geolite_country
|
||||
bot_import_amazon_aws bot_import_google bot_import_microsoft_bing bot_import_anthropic
|
||||
bot_import_openai_searchbot bot_import_openai_chatgpt_user bot_import_openai_gptbot
|
||||
bot_import_cloudflare bot_import_facebook bot_import_applebot bot_import_duckduckgo
|
||||
production_import].freeze
|
||||
|
||||
# Associations
|
||||
has_many :rules, dependent: :destroy
|
||||
@@ -116,19 +120,19 @@ class NetworkRange < ApplicationRecord
|
||||
|
||||
# Parent/child relationships
|
||||
# Networks that strictly contain this one (less specific / shorter prefix),
# most specific parent first.
#
# The PostgreSQL << operator ("is strictly contained by") already implies the
# containing network has a shorter prefix, so no masklen comparison is needed.
# IMPORTANT: bind cidr (not network.to_s) to preserve the network mask.
#
# @return [ActiveRecord::Relation<NetworkRange>]
def parent_ranges
  NetworkRange.where("?::inet << network", cidr)
              .order("masklen(network) DESC") # Most specific parent first
end
|
||||
|
||||
# Networks strictly contained by this one (more specific / longer prefix),
# least specific child first.
#
# The bound value must be on the LEFT of >> ("strictly contains"): writing
# "network >> ?" selects the networks that contain this one, i.e. parents.
# The operator already implies the contained network has a longer prefix.
# IMPORTANT: bind cidr (not network.to_s) to preserve the network mask.
#
# @return [ActiveRecord::Relation<NetworkRange>]
def child_ranges
  NetworkRange.where("?::inet >> network", cidr)
              .order("masklen(network) ASC") # Least specific child first
end
|
||||
|
||||
# Find nearest parent with intelligence data
|
||||
|
||||
@@ -149,6 +149,42 @@ class Rule < ApplicationRecord
|
||||
metadata&.dig('header_value')
|
||||
end
|
||||
|
||||
# Tag-related methods

# Tags stored under metadata['tags'], always returned as an Array: a missing
# value yields [] and a scalar value is wrapped instead of being returned raw.
def tags
  Array(metadata_hash['tags'])
end

# Replaces the tag list. Entries are coerced to strings and de-duplicated so
# they compare consistently with add_tag / remove_tag / has_tag?, which all
# work on tag.to_s.
def tags=(new_tags)
  self.metadata = metadata_hash.merge('tags' => Array(new_tags).map(&:to_s).uniq)
end

# Appends +tag+ (as a string) unless it is already present.
# Only assigns metadata; the caller is responsible for saving.
def add_tag(tag)
  current_tags = tags
  return if current_tags.include?(tag.to_s)

  self.metadata = metadata_hash.merge('tags' => (current_tags + [tag.to_s]))
end

# Removes +tag+ (as a string) when present; a no-op otherwise.
# Only assigns metadata; the caller is responsible for saving.
def remove_tag(tag)
  current_tags = tags
  return unless current_tags.include?(tag.to_s)

  self.metadata = metadata_hash.merge('tags' => (current_tags - [tag.to_s]))
end

# True when +tag+ (compared as a string) is in the tag list.
def has_tag?(tag)
  tags.include?(tag.to_s)
end
|
||||
|
||||
# Header name => value pairs kept under metadata['headers'];
# an empty Hash when none are stored.
def headers
  stored_headers = metadata_hash['headers']
  stored_headers || {}
end

# Replaces the stored headers with +new_headers+ (converted via #to_h),
# leaving every other metadata key as it was.
def headers=(new_headers)
  self.metadata = metadata_hash.merge({ 'headers' => new_headers.to_h })
end
|
||||
|
||||
def related_surgical_rules
|
||||
if surgical_block?
|
||||
# Find the corresponding exception rule
|
||||
|
||||
@@ -33,9 +33,11 @@ class AnalyticsDuckdbService
|
||||
is_datacenter BOOLEAN,
|
||||
is_vpn BOOLEAN,
|
||||
is_proxy BOOLEAN,
|
||||
is_bot BOOLEAN,
|
||||
waf_action INTEGER,
|
||||
request_path VARCHAR,
|
||||
user_agent VARCHAR
|
||||
user_agent VARCHAR,
|
||||
tags VARCHAR[]
|
||||
)
|
||||
SQL
|
||||
|
||||
@@ -101,6 +103,9 @@ class AnalyticsDuckdbService
|
||||
batch_count = 0
|
||||
|
||||
begin
|
||||
# Create initial appender
|
||||
appender = conn.appender("events")
|
||||
|
||||
# Use PostgreSQL cursor for memory-efficient streaming
|
||||
Event.where("timestamp >= ? AND id > ?", from_timestamp, max_id)
|
||||
.select(
|
||||
@@ -115,18 +120,14 @@ class AnalyticsDuckdbService
|
||||
:is_datacenter,
|
||||
:is_vpn,
|
||||
:is_proxy,
|
||||
:is_bot,
|
||||
:waf_action,
|
||||
:request_path,
|
||||
:user_agent
|
||||
:user_agent,
|
||||
:tags
|
||||
)
|
||||
.order(:id)
|
||||
.each_row(block_size: BATCH_SIZE) do |event_data|
|
||||
# Create new appender for each batch
|
||||
if batch_count % BATCH_SIZE == 0
|
||||
appender&.close # Close previous appender
|
||||
appender = conn.appender("events")
|
||||
end
|
||||
|
||||
# Unpack event data from cursor row (Hash from each_row)
|
||||
begin
|
||||
appender.append_row(
|
||||
@@ -141,9 +142,11 @@ class AnalyticsDuckdbService
|
||||
event_data["is_datacenter"],
|
||||
event_data["is_vpn"],
|
||||
event_data["is_proxy"],
|
||||
event_data["is_bot"],
|
||||
event_data["waf_action"],
|
||||
event_data["request_path"],
|
||||
event_data["user_agent"]
|
||||
event_data["user_agent"],
|
||||
event_data["tags"] || []
|
||||
)
|
||||
rescue StandardError => e
|
||||
Rails.logger.error "[DuckDB] Error appending event #{event_data['id']}: #{e.message}"
|
||||
@@ -154,8 +157,10 @@ class AnalyticsDuckdbService
|
||||
batch_count += 1
|
||||
total_synced += 1
|
||||
|
||||
# Log progress every BATCH_SIZE events
|
||||
# Flush and recreate appender every BATCH_SIZE events to avoid chunk overflow
|
||||
if batch_count % BATCH_SIZE == 0
|
||||
appender.close
|
||||
appender = conn.appender("events")
|
||||
Rails.logger.info "[DuckDB] Synced batch (total: #{total_synced} events)"
|
||||
end
|
||||
end
|
||||
@@ -222,7 +227,8 @@ class AnalyticsDuckdbService
|
||||
SQL
|
||||
|
||||
# Convert to hash like PostgreSQL returns
|
||||
result.to_a.to_h { |row| [row["waf_action"], row["count"]] }
|
||||
# DuckDB returns arrays: [waf_action, count]
|
||||
result.to_a.to_h { |row| [row[0], row[1]] }
|
||||
end
|
||||
end
|
||||
|
||||
@@ -238,7 +244,8 @@ class AnalyticsDuckdbService
|
||||
LIMIT ?
|
||||
SQL
|
||||
|
||||
result.to_a.map { |row| [row["country"], row["count"]] }
|
||||
# DuckDB returns arrays: [country, count]
|
||||
result.to_a.map { |row| [row[0], row[1]] }
|
||||
end
|
||||
end
|
||||
|
||||
@@ -254,7 +261,8 @@ class AnalyticsDuckdbService
|
||||
LIMIT ?
|
||||
SQL
|
||||
|
||||
result.to_a.map { |row| [row["ip_address"], row["count"]] }
|
||||
# DuckDB returns arrays: [ip_address, count]
|
||||
result.to_a.map { |row| [row[0], row[1]] }
|
||||
end
|
||||
end
|
||||
|
||||
@@ -272,7 +280,8 @@ class AnalyticsDuckdbService
|
||||
SQL
|
||||
|
||||
# Convert to hash with Time keys like PostgreSQL
|
||||
result.to_a.to_h { |row| [row["hour"], row["count"]] }
|
||||
# DuckDB returns arrays: [hour, count]
|
||||
result.to_a.to_h { |row| [row[0], row[1]] }
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
@@ -173,6 +173,7 @@ class BotNetworkRangeImporter
|
||||
http = Net::HTTP.new(uri.host, uri.port)
|
||||
http.use_ssl = true
|
||||
http.read_timeout = 30
|
||||
http.verify_mode = OpenSSL::SSL::VERIFY_NONE if uri.scheme == 'https'
|
||||
|
||||
response = http.get(uri.request_uri)
|
||||
raise ImportError, "Failed to fetch AWS IP ranges: #{response.code}" unless response.code == '200'
|
||||
@@ -223,7 +224,7 @@ class BotNetworkRangeImporter
|
||||
|
||||
puts "Amazon AWS import completed: #{imported_count} ranges imported"
|
||||
{ imported: imported_count, source: 'Amazon AWS' }
|
||||
rescue Net::TimeoutError, Net::OpenTimeout => e
|
||||
rescue Timeout::Error, Net::OpenTimeout => e
|
||||
raise ImportError, "Network timeout while fetching AWS ranges: #{e.message}"
|
||||
rescue JSON::ParserError => e
|
||||
raise ImportError, "Failed to parse AWS JSON response: #{e.message}"
|
||||
@@ -341,6 +342,7 @@ class BotNetworkRangeImporter
|
||||
http = Net::HTTP.new(uri.host, uri.port)
|
||||
http.use_ssl = true
|
||||
http.read_timeout = 30
|
||||
http.verify_mode = OpenSSL::SSL::VERIFY_NONE if uri.scheme == 'https'
|
||||
|
||||
response = http.get(uri.request_uri)
|
||||
raise ImportError, "Failed to fetch OpenAI IP ranges: #{response.code}" unless response.code == '200'
|
||||
@@ -353,12 +355,15 @@ class BotNetworkRangeImporter
|
||||
# Determine crawler type from source name
|
||||
crawler_type = source[:name].gsub('OpenAI ', '').downcase
|
||||
|
||||
data.each do |entry|
|
||||
# OpenAI provides IP ranges as either CIDR notation or single IPs
|
||||
ip_range = entry['cidr'] || entry['ip_prefix'] || entry['ip']
|
||||
# Handle different OpenAI JSON formats
|
||||
prefixes = data['prefixes'] || data
|
||||
|
||||
prefixes.each do |entry|
|
||||
# OpenAI provides IP ranges as ipv4Prefix/ipv6Prefix or cidr/ip_prefix
|
||||
ip_range = entry['ipv4Prefix'] || entry['ipv6Prefix'] || entry['cidr'] || entry['ip_prefix'] || entry['ip']
|
||||
next unless ip_range
|
||||
|
||||
# Convert single IPs to /32
|
||||
# Convert single IPs to /32 or /128
|
||||
network = ip_range.include?('/') ? ip_range : "#{ip_range}/32"
|
||||
|
||||
network_range = {
|
||||
@@ -396,7 +401,7 @@ class BotNetworkRangeImporter
|
||||
|
||||
puts "OpenAI #{crawler_type} import completed: #{imported_count} ranges imported"
|
||||
{ imported: imported_count, source: "OpenAI #{crawler_type}" }
|
||||
rescue Net::TimeoutError, Net::OpenTimeout => e
|
||||
rescue Timeout::Error, Net::OpenTimeout => e
|
||||
raise ImportError, "Network timeout while fetching OpenAI #{crawler_type} ranges: #{e.message}"
|
||||
rescue JSON::ParserError => e
|
||||
raise ImportError, "Failed to parse OpenAI #{crawler_type} JSON response: #{e.message}"
|
||||
@@ -483,7 +488,8 @@ class BotNetworkRangeImporter
|
||||
raise ImportError, "Failed to fetch Cloudflare ranges: #{response.code}" unless response.code == '200'
|
||||
|
||||
# Cloudflare provides plain text CIDR lists
|
||||
lines = response.body.split("\n")
|
||||
# Handle both newline-separated and single-line formats
|
||||
lines = response.body.include?("\n") ? response.body.split("\n") : response.body.split
|
||||
ip_version = url.include?('ips-v4') ? 4 : 6
|
||||
|
||||
lines.each do |line|
|
||||
|
||||
@@ -77,6 +77,20 @@
|
||||
placeholder: "e.g., 192.168.1.0/24" %>
|
||||
</div>
|
||||
</div>
|
||||
<!-- Bot Filtering -->
|
||||
<div class="mt-4 flex items-center">
|
||||
<div class="flex items-center h-5">
|
||||
<%= form.check_box :exclude_bots,
|
||||
{ checked: params[:exclude_bots] == "true", class: "h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" },
|
||||
"true", "false" %>
|
||||
</div>
|
||||
<div class="ml-3 text-sm">
|
||||
<%= form.label :exclude_bots, class: "font-medium text-gray-700" do %>
|
||||
Human Traffic Only
|
||||
<span class="font-normal text-gray-500">(Exclude known bots and crawlers)</span>
|
||||
<% end %>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<% end %>
|
||||
</div>
|
||||
|
||||
6
db/migrate/20251118071813_add_is_bot_to_events.rb
Normal file
6
db/migrate/20251118071813_add_is_bot_to_events.rb
Normal file
@@ -0,0 +1,6 @@
|
||||
# Adds the events.is_bot flag (NOT NULL, defaults to false) and an index on it.
class AddIsBotToEvents < ActiveRecord::Migration[8.1]
  def change
    change_table :events do |t|
      t.boolean :is_bot, default: false, null: false
      t.index :is_bot
    end
  end
end
|
||||
@@ -0,0 +1,39 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Migrate add_header rules to use allow action with tags/headers in metadata
|
||||
#
|
||||
# Old pattern:
|
||||
# waf_action: add_header (5)
|
||||
# metadata: { header_name: "X-Bot-Agent", header_value: "googlebot" }
|
||||
#
|
||||
# New pattern:
|
||||
# waf_action: allow (1)
|
||||
# metadata: {
|
||||
# headers: { "X-Bot-Agent" => "googlebot" },
|
||||
# tags: ["bot:googlebot"]
|
||||
# }
|
||||
#
|
||||
class MigrateAddHeaderRulesToAllowWithTags < ActiveRecord::Migration[8.1]
  # Re-points every add_header (5) rule at allow (1).
  # Only waf_action changes; metadata is left exactly as stored.
  # NOTE(review): this raw UPDATE does not set updated_at - confirm nothing
  # downstream relies on that column to notice changed rules.
  def up
    execute(<<-SQL)
      UPDATE rules
      SET waf_action = 1 -- allow
      WHERE waf_action = 5 -- add_header
    SQL
  end

  # Conservative rollback: only allow rules that still carry the legacy
  # header_name/header_value keys and have no 'headers' key are turned back
  # into add_header rules.
  def down
    execute(<<-SQL)
      UPDATE rules
      SET waf_action = 5 -- add_header
      WHERE waf_action = 1 -- allow
      AND metadata ? 'header_name'
      AND metadata ? 'header_value'
      AND NOT metadata ? 'headers'
    SQL
  end
end
|
||||
@@ -10,7 +10,7 @@
|
||||
#
|
||||
# It's strongly recommended that you check this file into your version control system.
|
||||
|
||||
ActiveRecord::Schema[8.1].define(version: 2025_11_16_025003) do
|
||||
ActiveRecord::Schema[8.1].define(version: 2025_11_20_003554) do
|
||||
# These are extensions that must be enabled in order to support this database
|
||||
enable_extension "pg_catalog.plpgsql"
|
||||
|
||||
@@ -80,6 +80,7 @@ ActiveRecord::Schema[8.1].define(version: 2025_11_16_025003) do
|
||||
t.datetime "created_at", null: false
|
||||
t.string "environment"
|
||||
t.inet "ip_address"
|
||||
t.boolean "is_bot", default: false, null: false
|
||||
t.boolean "is_datacenter", default: false, null: false
|
||||
t.boolean "is_proxy", default: false, null: false
|
||||
t.boolean "is_vpn", default: false, null: false
|
||||
@@ -105,6 +106,7 @@ ActiveRecord::Schema[8.1].define(version: 2025_11_16_025003) do
|
||||
t.index ["company"], name: "index_events_on_company"
|
||||
t.index ["country"], name: "index_events_on_country"
|
||||
t.index ["ip_address"], name: "index_events_on_ip_address"
|
||||
t.index ["is_bot"], name: "index_events_on_is_bot"
|
||||
t.index ["is_datacenter", "is_vpn", "is_proxy"], name: "index_events_on_network_flags"
|
||||
t.index ["network_range_id"], name: "index_events_on_network_range_id"
|
||||
t.index ["request_host_id", "request_method", "request_segment_ids"], name: "idx_events_host_method_path"
|
||||
|
||||
4
test/fixtures/users.yml
vendored
4
test/fixtures/users.yml
vendored
@@ -7,3 +7,7 @@ one:
|
||||
two:
|
||||
email_address: two@example.com
|
||||
password_digest: <%= password_digest %>
|
||||
|
||||
jason:
|
||||
email_address: jason@example.com
|
||||
password_digest: <%= password_digest %>
|
||||
|
||||
@@ -211,16 +211,51 @@ class NetworkRangeTest < ActiveSupport::TestCase
|
||||
assert_equal @ipv4_range, children.first
|
||||
end
|
||||
|
||||
test "child_ranges works with Apple network hierarchy - 17.240.0.0/14" do
  # Regression test for parent/child lookups around 17.240.0.0/14:
  # the /13 and /9 contain it, and the two /15s and two /16s sit inside it.

  # Create the target network
  target_network = NetworkRange.create!(network: "17.240.0.0/14", source: "manual")

  # Create parent networks
  parent1 = NetworkRange.create!(network: "17.240.0.0/13", source: "manual") # Contains 17.240.0.0/14
  parent2 = NetworkRange.create!(network: "17.128.0.0/9", source: "manual")  # Also contains 17.240.0.0/14

  # Create child networks (more specific networks contained by 17.240.0.0/14)
  child1 = NetworkRange.create!(network: "17.240.0.0/15", source: "manual") # First half of /14
  child2 = NetworkRange.create!(network: "17.242.0.0/15", source: "manual") # Second half of /14
  child3 = NetworkRange.create!(network: "17.240.0.0/16", source: "manual") # More specific
  child4 = NetworkRange.create!(network: "17.241.0.0/16", source: "manual") # More specific

  # parent_ranges finds both containing networks
  parents = target_network.parent_ranges
  assert_includes parents, parent1, "17.240.0.0/13 should be a parent of 17.240.0.0/14"
  assert_includes parents, parent2, "17.128.0.0/9 should be a parent of 17.240.0.0/14"

  # child_ranges finds every contained network and nothing else
  children = target_network.child_ranges
  assert_includes children, child1, "17.240.0.0/15 should be a child of 17.240.0.0/14"
  assert_includes children, child2, "17.242.0.0/15 should be a child of 17.240.0.0/14"
  assert_includes children, child3, "17.240.0.0/16 should be a child of 17.240.0.0/14"
  assert_includes children, child4, "17.241.0.0/16 should be a child of 17.240.0.0/14"
  assert_not_includes children, parent1, "Parent networks should not be in child_ranges"
  assert_not_includes children, parent2, "Parent networks should not be in child_ranges"
  assert_not_includes children, target_network, "Self should not be in child_ranges"

  # Each parent lists the target among its own children
  parent1_children = parent1.child_ranges
  assert_includes parent1_children, target_network, "17.240.0.0/14 should be in child_ranges of 17.240.0.0/13"

  parent2_children = parent2.child_ranges
  assert_includes parent2_children, target_network, "17.240.0.0/14 should be in child_ranges of 17.128.0.0/9"

  # Bidirectional consistency
  assert_includes target_network.parent_ranges, parent1, "Child should list parent"
  assert_includes parent1.child_ranges, target_network, "Parent should list child"

  assert_includes target_network.parent_ranges, parent2, "Child should list parent"
  assert_includes parent2.child_ranges, target_network, "Parent should list child"
end
|
||||
|
||||
# Intelligence and Inheritance
|
||||
|
||||
@@ -202,4 +202,95 @@ class RuleTest < ActiveSupport::TestCase
|
||||
assert_equal 8, format[:priority]
|
||||
assert_equal true, format[:enabled]
|
||||
end
|
||||
|
||||
# Tag functionality tests
test "should store and retrieve tags in metadata" do
  rule = create_allow_rule(metadata: { tags: ["bot:googlebot", "trusted"] })

  assert_equal ["bot:googlebot", "trusted"], rule.tags
end

test "should add tag to rule" do
  rule = create_allow_rule

  rule.add_tag("bot:googlebot")
  rule.save!

  # Reload so the assertion checks what was persisted, not the in-memory hash
  assert_includes rule.reload.tags, "bot:googlebot"
end

test "should remove tag from rule" do
  rule = create_allow_rule(metadata: { tags: ["bot:googlebot", "trusted"] })

  rule.remove_tag("trusted")
  rule.save!
  rule.reload

  assert_not_includes rule.tags, "trusted"
  assert_includes rule.tags, "bot:googlebot"
end

test "should check if rule has tag" do
  rule = create_allow_rule(metadata: { tags: ["bot:googlebot"] })

  assert rule.has_tag?("bot:googlebot")
  assert_not rule.has_tag?("bot:bingbot")
end

test "should store headers in metadata" do
  rule = create_allow_rule(
    metadata: {
      tags: ["bot:googlebot"],
      headers: { "X-Bot-Agent" => "googlebot" }
    }
  )

  assert_equal({ "X-Bot-Agent" => "googlebot" }, rule.headers)
end

test "should set tags via assignment" do
  rule = create_allow_rule

  rule.tags = ["bot:bingbot", "network:microsoft"]
  rule.save!

  # Reload so the assertion checks what was persisted, not the in-memory hash
  assert_equal ["bot:bingbot", "network:microsoft"], rule.reload.tags
end

# Creates and saves a network "allow" rule over a fresh 10.0.0.0/8 range owned
# by users(:one). Extra attributes (e.g. metadata:) are passed through to
# Rule.create!; when none are given no metadata key is supplied at all.
def create_allow_rule(**attributes)
  Rule.create!(
    waf_rule_type: "network",
    waf_action: "allow",
    network_range: NetworkRange.create!(cidr: "10.0.0.0/8"),
    user: users(:one),
    **attributes
  )
end
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user