Support for multiple sitemaps. Sitemaps should be absolute, so we'll try to make them absolute - but if we're passed the robots.txt as text, we can't determine the host.

This commit is contained in:
Dan Milne
2023-09-10 13:17:53 +10:00
parent dd4a874f3f
commit 71bbd4d1ad
3 changed files with 36 additions and 17 deletions

View File

@@ -19,8 +19,8 @@ require "net/http"
# Find the most specific rule for a given URL. We use the length of the regexp as a proxy for specificity. # Find the most specific rule for a given URL. We use the length of the regexp as a proxy for specificity.
class Probot class Probot
attr_reader :rules, :sitemap, :doc attr_reader :rules, :doc
attr_accessor :agent attr_accessor :agent, :sitemaps, :site
def initialize(data, agent: "*") def initialize(data, agent: "*")
raise ArgumentError, "The first argument must be a string" unless data.is_a?(String) raise ArgumentError, "The first argument must be a string" unless data.is_a?(String)
@@ -30,8 +30,8 @@ class Probot
@current_agents = ["*"] @current_agents = ["*"]
@current_agents.each { |agent| @rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} } @current_agents.each { |agent| @rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
@sitemaps = [] @sitemaps = []
@site = URI(data) if data.start_with?("http")
@doc = data.start_with?("http") ? fetch_robots_txt(data) : data @doc = @site.nil? ? data : fetch_robots_txt(@site)
parse(@doc) parse(@doc)
end end
@@ -90,11 +90,11 @@ class Probot
end end
# All Regex characters are escaped, then we unescape * and $ as they may used in robots.txt # All Regex characters are escaped, then we unescape * and $ as they may used in robots.txt
if data.allow? || data.disallow? if data.allow? || data.disallow?
@current_agents.each { |agent| rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) } @current_agents.each { |agent| rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) }
subsequent_agent = false # When user-agent strings are found on consecutive lines, they are considered to be part of the same record. Google ignores crawl_delay. # When user-agent strings are found on consecutive lines, they are considered to be part of the same record. Google ignores crawl_delay.
subsequent_agent = false
next next
end end
@@ -103,8 +103,11 @@ class Probot
next next
end end
# Ensure we have an absolute URL
if data.sitemap? if data.sitemap?
@sitemap = URI(data.value).path sitemap_uri = URI(data.value)
sitemap_uri = sitemap_uri.host.nil? ? URI.join(*[site, sitemap_uri].compact) : sitemap_uri
@sitemaps << sitemap_uri.to_s
next next
end end

View File

@@ -29,10 +29,4 @@ Gem::Specification.new do |spec|
spec.bindir = "exe" spec.bindir = "exe"
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) } spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
spec.require_paths = ["lib"] spec.require_paths = ["lib"]
# Uncomment to register a new dependency of your gem
# spec.add_dependency "example-gem", "~> 1.0"
# For more information and examples about making a new gem, check out our
# guide at: https://bundler.io/guides/creating_gem.html
end end

View File

@@ -25,7 +25,7 @@ class TestProbot < Minitest::Test
Disallow: /noblah/ Disallow: /noblah/
Allow: /cart/ Allow: /cart/
), ),
sitemap: "/sitemap.xml", sitemaps: ["http://www.allenandunwin.com/sitemap.xml"],
found_agents: ["*", "FooBot", "BlahBot", "YadaBot"], found_agents: ["*", "FooBot", "BlahBot", "YadaBot"],
tests: [ tests: [
{ {
@@ -51,7 +51,7 @@ class TestProbot < Minitest::Test
Allow: / # comment Allow: / # comment
Sitemap: http://example.com/sitemap.xml Sitemap: http://example.com/sitemap.xml
), ),
sitemap: "/sitemap.xml", sitemaps: ["http://example.com/sitemap.xml"],
found_agents: ["*"], found_agents: ["*"],
tests: [ tests: [
{ {
@@ -85,7 +85,7 @@ class TestProbot < Minitest::Test
sitemap: /sitemapxml.xml sitemap: /sitemapxml.xml
), ),
sitemap: "/sitemapxml.xml", sitemaps: ["/sitemapxml.xml"],
found_agents: ["*", "rubytest"], found_agents: ["*", "rubytest"],
tests: [ tests: [
{ {
@@ -103,7 +103,7 @@ class TestProbot < Minitest::Test
r = Probot.new(test_case[:txt]) r = Probot.new(test_case[:txt])
assert_equal test_case[:found_agents], r.found_agents, "found_agents for test #{ind}" assert_equal test_case[:found_agents], r.found_agents, "found_agents for test #{ind}"
assert_equal test_case[:sitemap], r.sitemap, "sitemap for test #{ind}" assert_equal test_case[:sitemaps], r.sitemaps, "sitemap for test #{ind}"
test_case[:tests].each do |tst| test_case[:tests].each do |tst|
r = Probot.new(test_case[:txt], agent: tst[:agent]) r = Probot.new(test_case[:txt], agent: tst[:agent])
@@ -167,4 +167,26 @@ class TestProbot < Minitest::Test
assert r.allowed?("/dir/page") == true assert r.allowed?("/dir/page") == true
assert r.allowed?("/dir/page?var") == false assert r.allowed?("/dir/page?var") == false
end end
# Both Sitemap lines in the robots.txt text must show up in #sitemaps.
def test_multiple_sitemaps
  robots_txt = %(User-agent: *\nSitemap: https://example.com/sitemapxml.xml\nSitemap: https://example.com/sitemapxml2.xml\n\n)
  found = Probot.new(robots_txt).sitemaps

  assert_equal 2, found.length
  assert found.include?("https://example.com/sitemapxml.xml")
  assert found.include?("https://example.com/sitemapxml2.xml")
end
# Relative Sitemap entries are expected to come back absolute once a site is set.
def test_absolute_sitemaps
  robots_txt = %(User-agent: *\nSitemap: /sitemapxml.xml\nSitemap: /sitemapxml2.xml\n\n)
  probot = Probot.new(robots_txt)

  # The instance was built from raw text rather than a URL, so the site has to
  # be assigned by hand; the collected sitemaps are then cleared and the stored
  # doc is parsed again.
  probot.site = URI("https://example.com")
  probot.sitemaps = []
  probot.parse(probot.doc)

  assert_equal 2, probot.sitemaps.length
  %w[https://example.com/sitemapxml.xml https://example.com/sitemapxml2.xml].each do |expected|
    assert probot.sitemaps.include?(expected), "expected #{expected}, got #{probot.sitemaps}"
  end
end
end end