Gemspec adds the version, but tests also need it.

Author: Dan Milne
Date: 2023-09-10 10:31:46 +10:00
parent 5774ca2836
commit 27a47700e1
5 changed files with 34 additions and 34 deletions


@@ -4,18 +4,19 @@ OMG another Ruby Robot.txt parser? It was an accident, I didn't mean to make it
 Does this even deserve a gem? Feel free to just copy and paste the single file which implements this - one less dependency eh?
-On the plus side, it has some nice features I don't think the others have.
+On the plus side of this yak shaving, there are some nice features I don't think the others have.
-1. Supports consecutive user agents making up a single record:
+1. Support for consecutive user agents making up a single record:
 ```txt
+# Block both first-agent and second-agent from the site.
 User-agent: first-agent
 User-agent: second-agent
 Disallow: /
 ```
-2. It can select the most specific allow / disallow rule, using rule length as a proxy for specificity. You can also ask it to show you the matching rules and their scores.
+This record blocks both first-agent and second-agent from the site.
+2. It selects the most specific allow / disallow rule, using rule length as a proxy for specificity. You can also ask it to show you the matching rules and their scores.
 ```ruby
 txt = %Q{
@@ -30,6 +31,8 @@ Probot.new(txt).matches("/dir1/dir2/dir3")
 In this case, we can see the Disallow rule with length 15 would be followed.
+3. It sets the User-Agent string when fetching robots.txt
 ## Installation
 Install the gem and add to the application's Gemfile by executing:
@@ -45,16 +48,16 @@ If bundler is not being used to manage dependencies, install the gem by executin
 It's straightforward to use. Instantiate it if you'll make a few requests:
 ```ruby
-> r = Probot.new('https://booko.info', agent: 'MyAgent')
+> r = Probot.new('https://booko.info', agent: 'BookScraper')
 > r.rules
 => {"*"=>{"disallow"=>[/\/search/, /\/products\/search/, /\/.*\/refresh_prices/, /\/.*\/add_to_cart/, /\/.*\/get_prices/, /\/lists\/add/, /\/.*\/add$/, /\/api\//, /\/users\/bits/, /\/users\/create/, /\/prices\//, /\/widgets\/issue/], "allow"=>[], "crawl_delay"=>0, "crawl-delay"=>0.1},
 "YandexBot"=>{"disallow"=>[], "allow"=>[], "crawl_delay"=>0, "crawl-delay"=>300.0}}
-> r.allowed?("/abc/add_to_cart")
+> r.allowed?("/abc/refresh_prices")
 => false
 > r.allowed?("https://booko.info/9780765397522/All-Systems-Red")
 => true
-> r.allowed?("https://booko.info/9780765397522/add_to_cart")
+> r.allowed?("https://booko.info/9780765397522/refresh_prices")
 => false
 ```
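Alongside the instance-based usage in the README hunk above, the library also defines a class-level helper for one-off checks (see the `self.allowed?` definition further down in this diff). The following is a minimal sketch, not part of the commit; the URLs, agent name, and expected outputs are taken from the README example and the comments in lib shown elsewhere in this diff:

```ruby
# Sketch only: one-off lookup via the class-level helper, and inspecting
# matching rules with their length-based scores via #matches.
require "probot"

Probot.allowed?("https://booko.info/9780765397522/All-Systems-Red", agent: "BookScraper")
# => true (matches the instance example above)

Probot.new("https://www.amazon.com/").matches("/gp/wishlist/ipad-install/gcrnsts")
# => {:disallowed=>{/\/wishlist\//=>10, /\/gp\/wishlist\//=>13, /.*\/gcrnsts/=>10},
#     :allowed=>{/\/gp\/wishlist\/ipad\-install.*/=>28}}
```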


@@ -1,7 +1,5 @@
 # frozen_string_literal: true
-require_relative "Probot/version"
 require "uri"
 require "net/http"
@@ -20,7 +18,7 @@ require "net/http"
 # Parse a robots.txt file
 # Find the most specific rule for a given URL. We use the length of the regexp as a proxy for specificity.
-class Robots
+class Probot
 attr_reader :rules, :sitemap, :doc
 attr_accessor :agent
@@ -145,15 +143,15 @@ class Robots
 end
 end
-def self.allowed?(url, agent: "*") = Robots.new(url, agent: agent).allowed?(url)
+def self.allowed?(url, agent: "*") = Probot.new(url, agent: agent).allowed?(url)
 end
-# Robots.allowed?("https://booko.info/9780765397522/All-Systems-Red")
+# Probot.allowed?("https://booko.info/9780765397522/All-Systems-Red")
 # => true
-# r = Robots.new('https://booko.info', agent: 'YandexBot')
+# r = Probot.new('https://booko.info', agent: 'YandexBot')
-# r = Robots.new('https://www.allenandunwin.com')
+# r = Probot.new('https://www.allenandunwin.com')
-# $ Robots.new('https://www.amazon.com/').matches("/gp/wishlist/ipad-install/gcrnsts")
+# $ Probot.new('https://www.amazon.com/').matches("/gp/wishlist/ipad-install/gcrnsts")
 # => {:disallowed=>{/\/wishlist\//=>10, /\/gp\/wishlist\//=>13, /.*\/gcrnsts/=>10}, :allowed=>{/\/gp\/wishlist\/ipad\-install.*/=>28}}
 #
 # Test with
-# assert Robots.new(nil, doc: %Q{allow: /$\ndisallow: /}).matching_rule('https://example.com/page.htm') == {disallow: /\//}
+# assert Probot.new(nil, doc: %Q{allow: /$\ndisallow: /}).matching_rule('https://example.com/page.htm') == {disallow: /\//}


@@ -1,6 +1,6 @@
 # frozen_string_literal: true
-require_relative "lib/Probot/version"
+require_relative "lib/probot/version"
 Gem::Specification.new do |spec|
 spec.name = "Probot"
@@ -8,14 +8,12 @@ Gem::Specification.new do |spec|
 spec.authors = ["Dan Milne"]
 spec.email = ["d@nmilne.com"]
-spec.summary = "A Robots.txt parser."
+spec.summary = "A robots.txt parser."
-spec.description = "A more fully featured Robotos.txt parser."
+spec.description = "A fully featured robots.txt parser."
 spec.homepage = "http://github.com/dkam/probot"
 spec.license = "MIT"
 spec.required_ruby_version = ">= 3.0"
-spec.metadata["allowed_push_host"] = "TODO: Set to your gem server 'https://example.com'"
 spec.metadata["homepage_uri"] = spec.homepage
 spec.metadata["source_code_uri"] = "http://github.com/dkam/probot"
 spec.metadata["changelog_uri"] = "http://github.com/dkam/probot/CHANGELOG.md"


@@ -2,5 +2,6 @@
 $LOAD_PATH.unshift File.expand_path("../lib", __dir__)
 require "probot"
+require "probot/version" # for testing the version number - otherwise the gemspec does it.
 require "minitest/autorun"


@@ -100,13 +100,13 @@ class TestProbot < Minitest::Test
 def test_some_tests
 TEST_CASES.each_with_index do |test_case, ind|
-r = Robots.new(test_case[:txt])
+r = Probot.new(test_case[:txt])
 assert_equal test_case[:found_agents], r.found_agents, "found_agents for test #{ind}"
 assert_equal test_case[:sitemap], r.sitemap, "sitemap for test #{ind}"
 test_case[:tests].each do |tst|
-r = Robots.new(test_case[:txt], agent: tst[:agent])
+r = Probot.new(test_case[:txt], agent: tst[:agent])
 tst[:allowed].each do |url|
 assert r.allowed?(url), "expected #{url} to be allowed, for agent #{tst[:agent]} | test #{ind}"
@@ -121,24 +121,24 @@ class TestProbot < Minitest::Test
 # https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt#url-matching-based-on-path-values
 def test_googles_tests
-assert Robots.new(%(allow: /p\ndisallow: /)).matching_rule("https://example.com/page") == {allow: /\/p/}
+assert Probot.new(%(allow: /p\ndisallow: /)).matching_rule("https://example.com/page") == {allow: /\/p/}
-assert Robots.new(%(allow: /folder\ndisallow: /folder)).matching_rule("https://example.com/folder/page") == {allow: /\/folder/}
+assert Probot.new(%(allow: /folder\ndisallow: /folder)).matching_rule("https://example.com/folder/page") == {allow: /\/folder/}
-assert Robots.new(%(allow: /page\ndisallow: /*.htm)).matching_rule("https://example.com/page.htm") == {disallow: /\/.*\.htm/}
+assert Probot.new(%(allow: /page\ndisallow: /*.htm)).matching_rule("https://example.com/page.htm") == {disallow: /\/.*\.htm/}
-assert Robots.new(%(allow: /page\ndisallow: /*.ph)).matching_rule("https://example.com/page.php5") == {disallow: /\/.*\.ph/} # FAIL
+assert Probot.new(%(allow: /page\ndisallow: /*.ph)).matching_rule("https://example.com/page.php5") == {disallow: /\/.*\.ph/} # FAIL
-assert Robots.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/") == {allow: /\/$/}
+assert Probot.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/") == {allow: /\/$/}
-assert Robots.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/page.htm") == {disallow: /\//}
+assert Probot.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/page.htm") == {disallow: /\//}
 end
 def test_empty_allow_disallow
-assert Robots.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty?
+assert Probot.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty?
-assert Robots.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty?
+assert Probot.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty?
 end
 def test_consecutive_user_agents
 txt = %(User-agent: Curl
 User-agent: Wget
 Disallow: /url)
-r = Robots.new(txt)
+r = Probot.new(txt)
 assert r.allowed?("/url") == true
 r.agent = "Curl"
@@ -152,7 +152,7 @@ class TestProbot < Minitest::Test
 end
 def test_unfound_robots
-r = Robots.new("")
+r = Probot.new("")
 assert r.allowed?("/url") == true
 r.agent = "Curl"
 assert r.allowed?("/url") == true
@@ -161,7 +161,7 @@ class TestProbot < Minitest::Test
 def test_more_other_tests
 txt = %(User-agent: rubytest\nDisallow: /no-dir/\nDisallow: /no-page.php\nDisallow: /*-no-dir/\nDisallow: /dir/*.php\nDisallow: *?var\nDisallow: /dir/*?var\n\n# this is a test\nuseragent: *\ndisalow: /test/\n\nsitemap: /sitemapxml.xml\n\n )
-r = Robots.new(txt, agent: "rubytest")
+r = Probot.new(txt, agent: "rubytest")
 assert r.allowed?("/dir/page.php") == false
 assert r.allowed?("/dir/home.php") == false
 assert r.allowed?("/dir/page") == true