From 88c7dc67f247ffbc4edf218c39656b05f26d572f Mon Sep 17 00:00:00 2001
From: Dan Milne
Date: Tue, 24 Dec 2024 10:43:17 +1100
Subject: [PATCH] Add code and tests to handle a Disallow clause without a value

---
 lib/probot.rb       | 11 +++++++++--
 probot.gemspec      |  1 +
 test/test_probot.rb | 14 ++++++++++++++
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/lib/probot.rb b/lib/probot.rb
index 4eb8597..54a1e98 100644
--- a/lib/probot.rb
+++ b/lib/probot.rb
@@ -31,6 +31,7 @@ class Probot
     @current_agents = ["*"]
     @current_agents.each { |agent| @rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
     @sitemaps = []
 
+    @site = URI(data) if data.start_with?("http")
     @doc = @site.nil? ? data : fetch_robots_txt(@site)
     parse(@doc)
@@ -92,7 +93,9 @@ class Probot
 
       # All Regex characters are escaped, then we unescape * and $ as they may used in robots.txt
       if data.allow? || data.disallow?
-        @current_agents.each { |agent| rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) }
+        @current_agents.each do |agent|
+          rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) unless data.value.nil?
+        end
 
         # When user-agent strings are found on consecutive lines, they are considered to be part of the same record. Google ignores crawl_delay.
         subsequent_agent = false
@@ -128,6 +131,8 @@
 
     def clean_value = raw_value.split("#").first&.strip
 
+    def clean_url = clean_value&.then { URI(_1).to_s }
+
     def agent? = key == "user-agent"
 
     def disallow? = key == "disallow"
@@ -140,11 +145,13 @@
 
     def value
      return clean_value.to_f if crawl_delay?
-      return URI(clean_value).to_s if disallow? || allow?
+      return clean_url if disallow? || allow?
 
      raw_value
    rescue URI::InvalidURIError
      raw_value
+    rescue ArgumentError
+      raw_value
    end
 
  end
diff --git a/probot.gemspec b/probot.gemspec
index b148774..adc0738 100644
--- a/probot.gemspec
+++ b/probot.gemspec
@@ -29,4 +29,5 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
+  spec.add_development_dependency "debug"
 end
diff --git a/test/test_probot.rb b/test/test_probot.rb
index c860338..f13584a 100644
--- a/test/test_probot.rb
+++ b/test/test_probot.rb
@@ -95,6 +95,19 @@ class TestProbot < Minitest::Test
          crawl_delay: 0
        }
      ]
+    },
+    {
+      txt: %(User-agent: *\nDisallow: /wp/wp-admin/\nAllow: /wp/wp-admin/admin-ajax.php\n\nUser-agent: *\nDisallow: /wp-content/uploads/wpo/wpo-plugins-tables-list.json\n\n# START YOAST BLOCK\n# ---------------------------\nUser-agent: *\nDisallow:\n\nSitemap: https://prhinternationalsales.com/sitemap_index.xml\n# ---------------------------\n# END YOAST BLOCK),
+      sitemaps: ["https://prhinternationalsales.com/sitemap_index.xml"],
+      found_agents: ["*"],
+      tests: [
+        {
+          agent: "*",
+          allowed: ["/wp/wp-admin/admin-ajax.php"],
+          disallowed: ["/wp/wp-admin/", "/wp-content/uploads/wpo/wpo-plugins-tables-list.json"],
+          crawl_delay: 0
+        }
+      ]
    }
  ].freeze
 
@@ -132,6 +145,7 @@
  def test_empty_allow_disallow
    assert Probot.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty?
    assert Probot.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty?
+    assert Probot.new(%(User-agent: *\nDisallow:\n\n)).rules.dig("*", "disallow").empty?
  end
 
  def test_consecutive_user_agents
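
Note: a minimal sketch of the behaviour this change is aiming for, based on the assertions added in test/test_probot.rb. The robots.txt text below is illustrative, not taken from the test data:

    require "probot"

    # "Disallow:" with no value now yields a nil value and is skipped, so no rule
    # is recorded; "Disallow: /private/" is still escaped and stored as a Regexp
    # under the "*" agent.
    robots = Probot.new(<<~ROBOTS)
      User-agent: *
      Disallow:
      Disallow: /private/
    ROBOTS

    p robots.rules.dig("*", "disallow")
    # expected: a single Regexp built from "/private/", e.g. [/\/private\//]

The same applies to an empty Allow: directive, mirroring the existing assertions in test_empty_allow_disallow.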