diff --git a/lib/probot.rb b/lib/probot.rb
index fecf236..e1002e3 100644
--- a/lib/probot.rb
+++ b/lib/probot.rb
@@ -19,8 +19,8 @@ require "net/http"
 
 # Find the most specific rule for a given URL. We use the length of the regexp as a proxy for specificity.
 class Probot
-  attr_reader :rules, :sitemap, :doc
-  attr_accessor :agent
+  attr_reader :rules, :doc
+  attr_accessor :agent, :sitemaps, :site
 
   def initialize(data, agent: "*")
     raise ArgumentError, "The first argument must be a string" unless data.is_a?(String)
@@ -30,8 +30,8 @@ class Probot
     @current_agents = ["*"]
     @current_agents.each { |agent| @rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
     @sitemaps = []
-
-    @doc = data.start_with?("http") ? fetch_robots_txt(data) : data
+    @site = URI(data) if data.start_with?("http")
+    @doc = @site.nil? ? data : fetch_robots_txt(@site)
 
     parse(@doc)
   end
@@ -90,11 +90,11 @@ class Probot
       end
 
       # All Regex characters are escaped, then we unescape * and $ as they may used in robots.txt
-
       if data.allow? || data.disallow?
         @current_agents.each { |agent| rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) }
 
-        subsequent_agent = false # When user-agent strings are found on consecutive lines, they are considered to be part of the same record. Google ignores crawl_delay.
+        # When user-agent strings are found on consecutive lines, they are considered to be part of the same record. Google ignores crawl_delay.
+        subsequent_agent = false
         next
       end
 
@@ -103,8 +103,11 @@ class Probot
         next
       end
 
+      # Ensure we have an absolute URL
      if data.sitemap?
-        @sitemap = URI(data.value).path
+        sitemap_uri = URI(data.value)
+        sitemap_uri = sitemap_uri.host.nil? ? URI.join(*[site, sitemap_uri].compact) : sitemap_uri
+        @sitemaps << sitemap_uri.to_s
         next
       end
 
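The sitemap handling above is the core of the change: every Sitemap directive is collected into @sitemaps, and a relative value is resolved against @site (the URL the robots.txt was fetched from) when one is known. The snippet below is not part of the patch, just a minimal sketch of that resolution step using plain URI, with a made-up example.com origin:

    require "uri"

    site = URI("https://example.com/robots.txt")  # assumed origin, for illustration only

    ["/sitemap.xml", "https://cdn.example.com/sitemap.xml"].each do |value|
      sitemap_uri = URI(value)
      # Relative entries (no host) are joined onto the origin; absolute entries pass through unchanged.
      sitemap_uri = sitemap_uri.host.nil? ? URI.join(*[site, sitemap_uri].compact) : sitemap_uri
      puts sitemap_uri
    end
    # => https://example.com/sitemap.xml
    # => https://cdn.example.com/sitemap.xml

When @site is nil (the robots.txt was passed in as a string), compact drops it and URI.join returns the relative path unchanged, which is exactly what the third test case below expects.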
diff --git a/probot.gemspec b/probot.gemspec
index a716caf..b148774 100644
--- a/probot.gemspec
+++ b/probot.gemspec
@@ -29,10 +29,4 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
-
-  # Uncomment to register a new dependency of your gem
-  # spec.add_dependency "example-gem", "~> 1.0"
-
-  # For more information and examples about making a new gem, check out our
-  # guide at: https://bundler.io/guides/creating_gem.html
 end
diff --git a/test/test_probot.rb b/test/test_probot.rb
index 4930fcc..7333342 100644
--- a/test/test_probot.rb
+++ b/test/test_probot.rb
@@ -25,7 +25,7 @@ class TestProbot < Minitest::Test
       Disallow: /noblah/
       Allow: /cart/
     ),
-    sitemap: "/sitemap.xml",
+    sitemaps: ["http://www.allenandunwin.com/sitemap.xml"],
     found_agents: ["*", "FooBot", "BlahBot", "YadaBot"],
     tests: [
       {
@@ -51,7 +51,7 @@
       Allow: / # comment
       Sitemap: http://example.com/sitemap.xml
     ),
-    sitemap: "/sitemap.xml",
+    sitemaps: ["http://example.com/sitemap.xml"],
     found_agents: ["*"],
     tests: [
       {
@@ -85,7 +85,7 @@
      sitemap: /sitemapxml.xml
 
     ),
-    sitemap: "/sitemapxml.xml",
+    sitemaps: ["/sitemapxml.xml"],
     found_agents: ["*", "rubytest"],
     tests: [
       {
@@ -103,7 +103,7 @@
       r = Probot.new(test_case[:txt])
 
       assert_equal test_case[:found_agents], r.found_agents, "found_agents for test #{ind}"
-      assert_equal test_case[:sitemap], r.sitemap, "sitemap for test #{ind}"
+      assert_equal test_case[:sitemaps], r.sitemaps, "sitemaps for test #{ind}"
 
       test_case[:tests].each do |tst|
         r = Probot.new(test_case[:txt], agent: tst[:agent])
@@ -167,4 +167,26 @@ class TestProbot < Minitest::Test
     assert r.allowed?("/dir/page") == true
     assert r.allowed?("/dir/page?var") == false
   end
+
+  def test_multiple_sitemaps
+    txt = %(User-agent: *\nSitemap: https://example.com/sitemapxml.xml\nSitemap: https://example.com/sitemapxml2.xml\n\n)
+    r = Probot.new(txt)
+    assert_equal 2, r.sitemaps.length
+    assert r.sitemaps.include?("https://example.com/sitemapxml.xml")
+    assert r.sitemaps.include?("https://example.com/sitemapxml2.xml")
+  end
+
+  def test_absolute_sitemaps
+    txt = %(User-agent: *\nSitemap: /sitemapxml.xml\nSitemap: /sitemapxml2.xml\n\n)
+
+    r = Probot.new(txt)
+    # We have to set the site manually, as we're not parsing from a URL; then reset the sitemaps array and re-parse the doc.
+    r.site = URI("https://example.com")
+    r.sitemaps = []
+    r.parse(r.doc)
+
+    assert_equal 2, r.sitemaps.length
+    assert r.sitemaps.include?("https://example.com/sitemapxml.xml"), "expected https://example.com/sitemapxml.xml, got #{r.sitemaps}"
+    assert r.sitemaps.include?("https://example.com/sitemapxml2.xml"), "expected https://example.com/sitemapxml2.xml, got #{r.sitemaps}"
+  end
 end
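For reference, a quick usage sketch of the new accessors, mirroring test_absolute_sitemaps above. The robots.txt body and the example.com / cdn.example.com URLs are invented for illustration; the calls themselves (Probot.new, #sitemaps, #site=, #parse, #doc) are the ones exercised by the tests in this diff.

    require "uri"
    require "probot"

    txt = %(User-agent: *\nSitemap: /sitemapxml.xml\nSitemap: https://cdn.example.com/other.xml\n)

    r = Probot.new(txt)
    r.sitemaps  # => ["/sitemapxml.xml", "https://cdn.example.com/other.xml"] (no site known, so the relative path is kept)

    # Supplying a site and re-parsing resolves relative entries to absolute URLs.
    r.site = URI("https://example.com")
    r.sitemaps = []
    r.parse(r.doc)
    r.sitemaps  # => ["https://example.com/sitemapxml.xml", "https://cdn.example.com/other.xml"]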