From 88c7dc67f247ffbc4edf218c39656b05f26d572f Mon Sep 17 00:00:00 2001
From: Dan Milne
Date: Tue, 24 Dec 2024 10:43:17 +1100
Subject: [PATCH] Add code and tests to handle a Disallow clause without a value

---
 lib/probot.rb       | 11 +++++++++--
 probot.gemspec      |  1 +
 test/test_probot.rb | 14 ++++++++++++++
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/lib/probot.rb b/lib/probot.rb
index 4eb8597..54a1e98 100644
--- a/lib/probot.rb
+++ b/lib/probot.rb
@@ -31,6 +31,7 @@ class Probot
     @current_agents = ["*"]
     @current_agents.each { |agent| @rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
     @sitemaps = []
 
+    @site = URI(data) if data.start_with?("http")
     @doc = @site.nil? ? data : fetch_robots_txt(@site)
     parse(@doc)
@@ -92,7 +93,9 @@ class Probot
 
       # All Regex characters are escaped, then we unescape * and $ as they may used in robots.txt
       if data.allow? || data.disallow?
-        @current_agents.each { |agent| rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) }
+        @current_agents.each do |agent|
+          rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) unless data.value.nil?
+        end
 
         # When user-agent strings are found on consecutive lines, they are considered to be part of the same record. Google ignores crawl_delay.
         subsequent_agent = false
@@ -128,6 +131,8 @@
 
     def clean_value = raw_value.split("#").first&.strip
 
+    def clean_url = clean_value&.then { URI(_1).to_s }
+
     def agent? = key == "user-agent"
 
     def disallow? = key == "disallow"
@@ -140,11 +145,13 @@
 
     def value
      return clean_value.to_f if crawl_delay?
-      return URI(clean_value).to_s if disallow? || allow?
+      return clean_url if disallow? || allow?
 
      raw_value
    rescue URI::InvalidURIError
      raw_value
+    rescue ArgumentError
+      raw_value
    end
 
  end
diff --git a/probot.gemspec b/probot.gemspec
index b148774..adc0738 100644
--- a/probot.gemspec
+++ b/probot.gemspec
@@ -29,4 +29,5 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
+  spec.add_development_dependency "debug"
 end
diff --git a/test/test_probot.rb b/test/test_probot.rb
index c860338..f13584a 100644
--- a/test/test_probot.rb
+++ b/test/test_probot.rb
@@ -95,6 +95,19 @@ class TestProbot < Minitest::Test
          crawl_delay: 0
        }
      ]
+    },
+    {
+      txt: %(User-agent: *\nDisallow: /wp/wp-admin/\nAllow: /wp/wp-admin/admin-ajax.php\n\nUser-agent: *\nDisallow: /wp-content/uploads/wpo/wpo-plugins-tables-list.json\n\n# START YOAST BLOCK\n# ---------------------------\nUser-agent: *\nDisallow:\n\nSitemap: https://prhinternationalsales.com/sitemap_index.xml\n# ---------------------------\n# END YOAST BLOCK),
+      sitemaps: ["https://prhinternationalsales.com/sitemap_index.xml"],
+      found_agents: ["*"],
+      tests: [
+        {
+          agent: "*",
+          allowed: ["/wp/wp-admin/admin-ajax.php"],
+          disallowed: ["/wp/wp-admin/", "/wp-content/uploads/wpo/wpo-plugins-tables-list.json"],
+          crawl_delay: 0
+        }
+      ]
    }
  ].freeze
 
@@ -132,6 +145,7 @@
  def test_empty_allow_disallow
    assert Probot.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty?
    assert Probot.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty?
+    assert Probot.new(%(User-agent: *\nDisallow:\n\n)).rules.dig("*", "disallow").empty?
  end
 
  def test_consecutive_user_agents
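
Note: a minimal sketch of the behaviour this change is aiming for, based on the assertions added in test/test_probot.rb. The robots.txt text below is illustrative, not taken from the test data:

    require "probot"

    # "Disallow:" with no value now yields a nil value and is skipped, so no rule
    # is recorded; "Disallow: /private/" is still escaped and stored as a Regexp
    # under the "*" agent.
    robots = Probot.new(<<~ROBOTS)
      User-agent: *
      Disallow:
      Disallow: /private/
    ROBOTS

    p robots.rules.dig("*", "disallow")
    # expected: a single Regexp built from "/private/", e.g. [/\/private\//]

The same applies to an empty Allow: directive, mirroring the existing assertions in test_empty_allow_disallow.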