diff --git a/README.md b/README.md index b12ab49..7dfd1b5 100644 --- a/README.md +++ b/README.md @@ -4,18 +4,19 @@ OMG another Ruby Robot.txt parser? It was an accident, I didn't mean to make it Does this even deserve a gem? Feel free to just copy and paste the single file which implements this - one less dependency eh? -On the plus side, it has some nice features I don't think the others have. +On the plus side of this yak shaving, there are some nice features I don't think the others have. -1. Supports consecutive user agents making up a single record: +1. Support for consecutive user agents making up a single record: ```txt -# Block both first-agent and second-agent from the site. User-agent: first-agent User-agent: second-agent Disallow: / ``` -2. It can select the most specific allow / disallow rule, using rule length as a proxy for specificity. You can also ask it to show you the matching rules and their scores. +This record blocks both first-agent and second-agent from the site. + +2. It selects the most specific allow / disallow rule, using rule length as a proxy for specificity. You can also ask it to show you the matching rules and their scores. ```ruby txt = %Q{ @@ -30,6 +31,8 @@ Probot.new(txt).matches("/dir1/dir2/dir3") In this case, we can see the Disallow rule with length 15 would be followed. +3. It sets the User-Agent string when fetching robots.txt + ## Installation Install the gem and add to the application's Gemfile by executing: @@ -45,16 +48,16 @@ If bundler is not being used to manage dependencies, install the gem by executin It's straightforward to use. Instantiate it if you'll make a few requests: ```ruby -> r = Probot.new('https://booko.info', agent: 'MyAgent') +> r = Probot.new('https://booko.info', agent: 'BookScraper') > r.rules => {"*"=>{"disallow"=>[/\/search/, /\/products\/search/, /\/.*\/refresh_prices/, /\/.*\/add_to_cart/, /\/.*\/get_prices/, /\/lists\/add/, /\/.*\/add$/, /\/api\//, /\/users\/bits/, /\/users\/create/, /\/prices\//, /\/widgets\/issue/], "allow"=>[], "crawl_delay"=>0, "crawl-delay"=>0.1}, "YandexBot"=>{"disallow"=>[], "allow"=>[], "crawl_delay"=>0, "crawl-delay"=>300.0}} -> r.allowed?("/abc/add_to_cart") +> r.allowed?("/abc/refresh_prices") => false > r.allowed?("https://booko.info/9780765397522/All-Systems-Red") => true -> r.allowed?("https://booko.info/9780765397522/add_to_cart") +> r.allowed?("https://booko.info/9780765397522/refresh_prices") => false ``` diff --git a/lib/probot.rb b/lib/probot.rb index 08db6ce..fecf236 100644 --- a/lib/probot.rb +++ b/lib/probot.rb @@ -1,7 +1,5 @@ # frozen_string_literal: true -require_relative "Probot/version" - require "uri" require "net/http" @@ -20,7 +18,7 @@ require "net/http" # Parse a robots.txt file # Find the most specific rule for a given URL. We use the length of the regexp as a proxy for specificity. 
-class Robots +class Probot attr_reader :rules, :sitemap, :doc attr_accessor :agent @@ -145,15 +143,15 @@ class Robots end end - def self.allowed?(url, agent: "*") = Robots.new(url, agent: agent).allowed?(url) + def self.allowed?(url, agent: "*") = Probot.new(url, agent: agent).allowed?(url) end -# Robots.allowed?("https://booko.info/9780765397522/All-Systems-Red") +# Probot.allowed?("https://booko.info/9780765397522/All-Systems-Red") # => true -# r = Robots.new('https://booko.info', agent: 'YandexBot') -# r = Robots.new('https://www.allenandunwin.com') -# $ Robots.new('https://www.amazon.com/').matches("/gp/wishlist/ipad-install/gcrnsts") +# r = Probot.new('https://booko.info', agent: 'YandexBot') +# r = Probot.new('https://www.allenandunwin.com') +# $ Probot.new('https://www.amazon.com/').matches("/gp/wishlist/ipad-install/gcrnsts") # => {:disallowed=>{/\/wishlist\//=>10, /\/gp\/wishlist\//=>13, /.*\/gcrnsts/=>10}, :allowed=>{/\/gp\/wishlist\/ipad\-install.*/=>28}} # # Test with -# assert Robots.new(nil, doc: %Q{allow: /$\ndisallow: /}).matching_rule('https://example.com/page.htm') == {disallow: /\//} +# assert Probot.new(nil, doc: %Q{allow: /$\ndisallow: /}).matching_rule('https://example.com/page.htm') == {disallow: /\//} diff --git a/probot.gemspec b/probot.gemspec index fc72399..f9a4cd8 100644 --- a/probot.gemspec +++ b/probot.gemspec @@ -1,6 +1,6 @@ # frozen_string_literal: true -require_relative "lib/Probot/version" +require_relative "lib/probot/version" Gem::Specification.new do |spec| spec.name = "Probot" @@ -8,14 +8,12 @@ Gem::Specification.new do |spec| spec.authors = ["Dan Milne"] spec.email = ["d@nmilne.com"] - spec.summary = "A Robots.txt parser." - spec.description = "A more fully featured Robotos.txt parser." + spec.summary = "A robots.txt parser." + spec.description = "A fully featured robots.txt parser." spec.homepage = "http://github.com/dkam/probot" spec.license = "MIT" spec.required_ruby_version = ">= 3.0" - spec.metadata["allowed_push_host"] = "TODO: Set to your gem server 'https://example.com'" - spec.metadata["homepage_uri"] = spec.homepage spec.metadata["source_code_uri"] = "http://github.com/dkam/probot" spec.metadata["changelog_uri"] = "http://github.com/dkam/probot/CHANGELOG.md" diff --git a/test/test_helper.rb b/test/test_helper.rb index 2abc2ba..c5c180d 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -2,5 +2,6 @@ $LOAD_PATH.unshift File.expand_path("../lib", __dir__) require "probot" +require "probot/version" # for testing the version number - otherwise the gemspec does it. 
require "minitest/autorun" diff --git a/test/test_probot.rb b/test/test_probot.rb index 5c8900d..4930fcc 100644 --- a/test/test_probot.rb +++ b/test/test_probot.rb @@ -100,13 +100,13 @@ class TestProbot < Minitest::Test def test_some_tests TEST_CASES.each_with_index do |test_case, ind| - r = Robots.new(test_case[:txt]) + r = Probot.new(test_case[:txt]) assert_equal test_case[:found_agents], r.found_agents, "found_agents for test #{ind}" assert_equal test_case[:sitemap], r.sitemap, "sitemap for test #{ind}" test_case[:tests].each do |tst| - r = Robots.new(test_case[:txt], agent: tst[:agent]) + r = Probot.new(test_case[:txt], agent: tst[:agent]) tst[:allowed].each do |url| assert r.allowed?(url), "expected #{url} to be allowed, for agent #{tst[:agent]} | test #{ind}" @@ -121,24 +121,24 @@ class TestProbot < Minitest::Test # https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt#url-matching-based-on-path-values def test_googles_tests - assert Robots.new(%(allow: /p\ndisallow: /)).matching_rule("https://example.com/page") == {allow: /\/p/} - assert Robots.new(%(allow: /folder\ndisallow: /folder)).matching_rule("https://example.com/folder/page") == {allow: /\/folder/} - assert Robots.new(%(allow: /page\ndisallow: /*.htm)).matching_rule("https://example.com/page.htm") == {disallow: /\/.*\.htm/} - assert Robots.new(%(allow: /page\ndisallow: /*.ph)).matching_rule("https://example.com/page.php5") == {disallow: /\/.*\.ph/} # FAIL - assert Robots.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/") == {allow: /\/$/} - assert Robots.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/page.htm") == {disallow: /\//} + assert Probot.new(%(allow: /p\ndisallow: /)).matching_rule("https://example.com/page") == {allow: /\/p/} + assert Probot.new(%(allow: /folder\ndisallow: /folder)).matching_rule("https://example.com/folder/page") == {allow: /\/folder/} + assert Probot.new(%(allow: /page\ndisallow: /*.htm)).matching_rule("https://example.com/page.htm") == {disallow: /\/.*\.htm/} + assert Probot.new(%(allow: /page\ndisallow: /*.ph)).matching_rule("https://example.com/page.php5") == {disallow: /\/.*\.ph/} # FAIL + assert Probot.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/") == {allow: /\/$/} + assert Probot.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/page.htm") == {disallow: /\//} end def test_empty_allow_disallow - assert Robots.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty? - assert Robots.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty? + assert Probot.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty? + assert Probot.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty? 
end def test_consecutive_user_agents txt = %(User-agent: Curl User-agent: Wget Disallow: /url) - r = Robots.new(txt) + r = Probot.new(txt) assert r.allowed?("/url") == true r.agent = "Curl" @@ -152,7 +152,7 @@ class TestProbot < Minitest::Test end def test_unfound_robots - r = Robots.new("") + r = Probot.new("") assert r.allowed?("/url") == true r.agent = "Curl" assert r.allowed?("/url") == true @@ -161,7 +161,7 @@ class TestProbot < Minitest::Test def test_more_other_tests txt = %(User-agent: rubytest\nDisallow: /no-dir/\nDisallow: /no-page.php\nDisallow: /*-no-dir/\nDisallow: /dir/*.php\nDisallow: *?var\nDisallow: /dir/*?var\n\n# this is a test\nuseragent: *\ndisalow: /test/\n\nsitemap: /sitemapxml.xml\n\n ) - r = Robots.new(txt, agent: "rubytest") + r = Probot.new(txt, agent: "rubytest") assert r.allowed?("/dir/page.php") == false assert r.allowed?("/dir/home.php") == false assert r.allowed?("/dir/page") == true
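For anyone reviewing the rename, here is a minimal usage sketch of the behaviours the README and tests above describe, assuming the gem is installed and required as `probot`. The robots.txt body, the `/shop` rules, and the paths are illustrative placeholders, not content from the repository.

```ruby
require "probot"

# Rule specificity: the longest matching rule wins, per feature 2 in the README.
robots = <<~TXT
  User-agent: *
  Disallow: /shop
  Allow: /shop/specials
TXT

r = Probot.new(robots)
r.allowed?("/shop/checkout")                               # => false, only the Disallow rule matches
r.allowed?("/shop/specials/today")                         # => true, the longer Allow rule outscores Disallow
r.matching_rule("https://example.com/shop/specials/today") # => {allow: /\/shop\/specials/}
r.matches("/shop/specials/today")                          # matching allow/disallow rules with their scores

# Consecutive user agents form a single record, per feature 1 in the README.
r2 = Probot.new(%(User-agent: Curl\nUser-agent: Wget\nDisallow: /tmp))
r2.allowed?("/tmp")   # => true, the default "*" agent is not covered by the Curl/Wget record
r2.agent = "Curl"
r2.allowed?("/tmp")   # => false, the record now applies
```

The same calls work when a URL is passed to `Probot.new` instead of a document, in which case the robots.txt is fetched using the configured `agent` string.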