Mirror of https://github.com/dkam/probot.git (synced 2025-12-28 09:14:53 +00:00)

Compare commits: 12 commits
| Author | SHA1 | Date |
|---|---|---|
| | 45c1b001cd | |
| | f1a0b74a97 | |
| | 2e91518fd6 | |
| | c4e1b876ce | |
| | a7291bdfc3 | |
| | 88c7dc67f2 | |
| | 36b6a29039 | |
| | 89432b2dac | |
| | ad48a4e335 | |
| | fea1e2009a | |
| | c700c09021 | |
| | 71bbd4d1ad | |
CHANGELOG.md (15 additions)
@@ -1,5 +1,20 @@
 ## [Unreleased]
 
+## [0.5.0] - 2024-12-24
+
+- Fix bug with Disallow rule containing empty line
+
+## [0.4.0] - 2024-10-31
+
+- Ensure VERISON is available
+
+## [0.3.0] - 2023-09-18
+
+- Only return unique sitemaps.
+
+## [0.2.0] - 2023-09-10
+
+- Correctly handle multiple sitemaps + tests.
 ## [0.1.0] - 2023-09-09
 
 - Initial release
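The 0.2.0 through 0.5.0 entries above all concern sitemap handling and empty allow/disallow rules. A minimal usage sketch of the resulting behaviour, using a made-up robots.txt body (the example.com URLs are illustrative):

```ruby
require "probot"

# Hypothetical robots.txt body with two sitemaps and an empty Disallow rule.
txt = <<~ROBOTS
  User-agent: *
  Disallow:

  Sitemap: https://example.com/sitemap.xml
  Sitemap: https://example.com/sitemap-news.xml
ROBOTS

r = Probot.new(txt)
r.sitemaps                    # => ["https://example.com/sitemap.xml", "https://example.com/sitemap-news.xml"]
r.rules.dig("*", "disallow")  # => [] - an empty Disallow followed by a blank line adds no rule (the 0.5.0 fix)
```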
lib/probot.rb

@@ -2,6 +2,7 @@
 
 require "uri"
 require "net/http"
+require_relative "probot/version"
 
 # https://moz.com/learn/seo/robotstxt
 # https://stackoverflow.com/questions/45293419/order-of-directives-in-robots-txt-do-they-overwrite-each-other-or-complement-ea
@@ -19,8 +20,8 @@ require "net/http"
 # Find the most specific rule for a given URL. We use the length of the regexp as a proxy for specificity.
 
 class Probot
-  attr_reader :rules, :sitemap, :doc
-  attr_accessor :agent
+  attr_reader :rules, :doc
+  attr_accessor :agent, :sitemaps, :site
 
   def initialize(data, agent: "*")
     raise ArgumentError, "The first argument must be a string" unless data.is_a?(String)
@@ -31,7 +32,8 @@ class Probot
     @current_agents.each { |agent| @rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
     @sitemaps = []
 
-    @doc = data.start_with?("http") ? fetch_robots_txt(data) : data
+    @site = URI(data) if data.start_with?("http")
+    @doc = @site.nil? ? data : fetch_robots_txt(@site)
     parse(@doc)
   end
 
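A sketch of the two construction paths after this change: the raw-string form is run as-is, while the URL form (which performs a real HTTP fetch through fetch_robots_txt) is left commented out and uses an illustrative host.

```ruby
require "probot"

# Raw robots.txt text: @site stays nil and the text is parsed as-is.
local = Probot.new("User-agent: *\nDisallow: /private/\nSitemap: /sitemap.xml")
local.site      # => nil
local.sitemaps  # => ["/sitemap.xml"] - kept relative, since no site is known

# A URL: @site is recorded and the document is fetched before parsing.
# remote = Probot.new("https://example.com/robots.txt")
# remote.site      # => #<URI::HTTPS https://example.com/robots.txt>
# remote.sitemaps  # relative Sitemap entries are joined against the site
```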
@@ -90,11 +92,13 @@ class Probot
       end
 
       # All Regex characters are escaped, then we unescape * and $ as they may used in robots.txt
 
       if data.allow? || data.disallow?
-        @current_agents.each { |agent| rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) }
-
-        subsequent_agent = false # When user-agent strings are found on consecutive lines, they are considered to be part of the same record. Google ignores crawl_delay.
+        @current_agents.each do |agent|
+          rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) unless data.value.nil?
+        end
+        # When user-agent strings are found on consecutive lines, they are considered to be part of the same record. Google ignores crawl_delay.
+        subsequent_agent = false
         next
       end
 
@@ -103,8 +107,12 @@ class Probot
         next
       end
 
+      # Ensure we have an absolute URL
       if data.sitemap?
-        @sitemap = URI(data.value).path
+        sitemap_uri = URI(data.value)
+        sitemap_uri = sitemap_uri.host.nil? ? URI.join(*[site, sitemap_uri].compact) : sitemap_uri
+        @sitemaps << sitemap_uri.to_s
+        @sitemaps.uniq!
         next
       end
 
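To make the URI.join line above concrete, a small sketch of what it computes for relative and absolute sitemap URLs; the hosts are illustrative, and the last case mirrors the string-input path where site is nil.

```ruby
require "uri"

site     = URI("https://example.com/robots.txt")
relative = URI("/sitemap.xml")
absolute = URI("https://cdn.example.com/sitemap.xml")

# A relative sitemap URL is joined against the site the robots.txt came from.
URI.join(*[site, relative].compact).to_s
# => "https://example.com/sitemap.xml"

# An absolute sitemap URL already has a host, so it passes through untouched.
(absolute.host.nil? ? URI.join(*[site, absolute].compact) : absolute).to_s
# => "https://cdn.example.com/sitemap.xml"

# With no known site (robots.txt supplied as a string), the relative URL is kept as-is.
URI.join(*[nil, relative].compact).to_s
# => "/sitemap.xml"
```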
@@ -123,6 +131,8 @@ class Probot
|
|||||||
|
|
||||||
def clean_value = raw_value.split("#").first&.strip
|
def clean_value = raw_value.split("#").first&.strip
|
||||||
|
|
||||||
|
def clean_url = clean_value&.then { URI(_1).to_s }
|
||||||
|
|
||||||
def agent? = key == "user-agent"
|
def agent? = key == "user-agent"
|
||||||
|
|
||||||
def disallow? = key == "disallow"
|
def disallow? = key == "disallow"
|
||||||
@@ -135,11 +145,13 @@ class Probot
 
     def value
       return clean_value.to_f if crawl_delay?
-      return URI(clean_value).to_s if disallow? || allow?
+      return clean_url if disallow? || allow?
 
       raw_value
     rescue URI::InvalidURIError
       raw_value
+    rescue ArgumentError
+      raw_value
     end
   end
 
lib/probot/version.rb

@@ -1,3 +1,3 @@
 class Probot
-  VERSION = "0.1.0"
+  VERSION = "0.5.0"
 end
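With lib/probot.rb now requiring probot/version (the require_relative added above), the constant is available as soon as the gem is loaded; a quick check:

```ruby
require "probot"

Probot::VERSION  # => "0.5.0"
```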
probot.gemspec

@@ -13,6 +13,7 @@ Gem::Specification.new do |spec|
   spec.homepage = "http://github.com/dkam/probot"
   spec.license = "MIT"
   spec.required_ruby_version = ">= 3.0"
+  spec.platform = Gem::Platform::RUBY
 
   spec.metadata["homepage_uri"] = spec.homepage
   spec.metadata["source_code_uri"] = "http://github.com/dkam/probot"

@@ -29,10 +30,5 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
-
-  # Uncomment to register a new dependency of your gem
-  # spec.add_dependency "example-gem", "~> 1.0"
-
-  # For more information and examples about making a new gem, check out our
-  # guide at: https://bundler.io/guides/creating_gem.html
+  spec.add_development_dependency "debug"
 end
test/test_helper.rb

@@ -2,6 +2,5 @@
 
 $LOAD_PATH.unshift File.expand_path("../lib", __dir__)
 require "probot"
-require "probot/version" # for testing the version number - otherwise the gemspec does it.
 
 require "minitest/autorun"
test/test_probot.rb

@@ -25,7 +25,7 @@ class TestProbot < Minitest::Test
       Disallow: /noblah/
       Allow: /cart/
     ),
-    sitemap: "/sitemap.xml",
+    sitemaps: ["http://www.allenandunwin.com/sitemap.xml"],
     found_agents: ["*", "FooBot", "BlahBot", "YadaBot"],
     tests: [
       {
@@ -51,7 +51,7 @@ class TestProbot < Minitest::Test
       Allow: / # comment
       Sitemap: http://example.com/sitemap.xml
     ),
-    sitemap: "/sitemap.xml",
+    sitemaps: ["http://example.com/sitemap.xml"],
     found_agents: ["*"],
     tests: [
       {
@@ -85,7 +85,7 @@ class TestProbot < Minitest::Test
       sitemap: /sitemapxml.xml
 
     ),
-    sitemap: "/sitemapxml.xml",
+    sitemaps: ["/sitemapxml.xml"],
     found_agents: ["*", "rubytest"],
     tests: [
       {
@@ -95,6 +95,19 @@ class TestProbot < Minitest::Test
         crawl_delay: 0
       }
     ]
+  },
+  {
+    txt: %("User-agent: *\nDisallow: /wp/wp-admin/\nAllow: /wp/wp-admin/admin-ajax.php\n\nUser-agent: *\nDisallow: /wp-content/uploads/wpo/wpo-plugins-tables-list.json\n\n# START YOAST BLOCK\n# ---------------------------\nUser-agent: *\nDisallow:\n\nSitemap: https://prhinternationalsales.com/sitemap_index.xml\n# ---------------------------\n# END YOAST BLOCK"),
+    sitemaps: ["https://prhinternationalsales.com/sitemap_index.xml"],
+    found_agents: ["*"],
+    tests: [
+      {
+        agent: "*",
+        allowed: ["/wp/wp-admin/admin-ajax.php"],
+        disallowed: ["/wp/wp-admin/", "/wp-content/uploads/wpo/wpo-plugins-tables-list.json"],
+        crawl_delay: 0
+      }
+    ]
   }
 ].freeze
 
@@ -103,7 +116,7 @@ class TestProbot < Minitest::Test
       r = Probot.new(test_case[:txt])
 
       assert_equal test_case[:found_agents], r.found_agents, "found_agents for test #{ind}"
-      assert_equal test_case[:sitemap], r.sitemap, "sitemap for test #{ind}"
+      assert_equal test_case[:sitemaps], r.sitemaps, "sitemap for test #{ind}"
 
       test_case[:tests].each do |tst|
         r = Probot.new(test_case[:txt], agent: tst[:agent])
@@ -131,7 +144,9 @@ class TestProbot < Minitest::Test
 
   def test_empty_allow_disallow
     assert Probot.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty?
+    assert Probot.new(%(User-agent: *\nAllow:\n\n)).rules.dig("*", "allow").empty?
     assert Probot.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty?
+    assert Probot.new(%(User-agent: *\nDisallow:\n\n)).rules.dig("*", "disallow").empty?
   end
 
   def test_consecutive_user_agents
@@ -167,4 +182,34 @@ class TestProbot < Minitest::Test
     assert r.allowed?("/dir/page") == true
     assert r.allowed?("/dir/page?var") == false
   end
+
+  def test_multiple_sitemaps
+    txt = %(User-agent: *\nSitemap: https://example.com/sitemapxml.xml\nSitemap: https://example.com/sitemapxml2.xml\n\n)
+    r = Probot.new(txt)
+    assert_equal 2, r.sitemaps.length
+    assert r.sitemaps.include?("https://example.com/sitemapxml.xml")
+    assert r.sitemaps.include?("https://example.com/sitemapxml2.xml")
+  end
+
+  # Sitemaps should be absolute URLs, but we'll accept relative URLs and make them absolute.
+  # However, we need to test both scenarios - when we know the site, and when we don't because we're parsing a robots.txt file.
+  # This test is a little gross, reaching into the guts of the class, but it's the easiest way to test this.
+  def test_absolute_sitemaps
+    txt = %(User-agent: *\nSitemap: /sitemapxml.xml\nSitemap: /sitemapxml2.xml\n\n)
+
+    r = Probot.new(txt)
+    assert_equal 2, r.sitemaps.length
+    assert r.sitemaps.include?("/sitemapxml.xml"), "expected /sitemapxml.xml, got #{r.sitemaps}"
+    assert r.sitemaps.include?("/sitemapxml2.xml"), "expected /sitemapxml2.xml, got #{r.sitemaps}"
+
+    # We have to manually set the site, as we're not parsing a URL - then we need to reset the sitemaps array and reparse the doc. Gross.
+    r = Probot.new(txt)
+    r.site = URI("https://example.com")
+    r.sitemaps = []
+    r.parse(r.doc)
+
+    assert_equal 2, r.sitemaps.length
+    assert r.sitemaps.include?("https://example.com/sitemapxml.xml"), "expected https://example.com/sitemapxml.xml, got #{r.sitemaps}"
+    assert r.sitemaps.include?("https://example.com/sitemapxml2.xml"), "expected https://example.com/sitemapxml2.xml, got #{r.sitemaps}"
+  end
 end