The gemspec requires the version file, but the tests also need it.
README.md (17 lines changed)

@@ -4,18 +4,19 @@ OMG another Ruby Robot.txt parser? It was an accident, I didn't mean to make it
 Does this even deserve a gem? Feel free to just copy and paste the single file which implements this - one less dependency eh?
 
-On the plus side, it has some nice features I don't think the others have.
+On the plus side of this yak shaving, there are some nice features I don't think the others have.
 
-1. Supports consecutive user agents making up a single record:
+1. Support for consecutive user agents making up a single record:
 
 ```txt
-# Block both first-agent and second-agent from the site.
 User-agent: first-agent
 User-agent: second-agent
 Disallow: /
 ```
 
-2. It can select the most specific allow / disallow rule, using rule length as a proxy for specificity. You can also ask it to show you the matching rules and their scores.
+This record blocks both first-agent and second-agent from the site.
+
+2. It selects the most specific allow / disallow rule, using rule length as a proxy for specificity. You can also ask it to show you the matching rules and their scores.
 
 ```ruby
 txt = %Q{
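An aside on the record shown above: the snippet below sketches how Probot treats a shared record once parsed. The agent names are the placeholders from the README snippet, and the expected results are inferred from test_consecutive_user_agents later in this commit, so treat them as illustrative.

```ruby
require "probot"

txt = %Q{User-agent: first-agent
User-agent: second-agent
Disallow: /
}

r = Probot.new(txt)        # parse the document directly; nothing is fetched
r.agent = "first-agent"
r.allowed?("/any/page")    # => false, the shared record applies
r.agent = "second-agent"
r.allowed?("/any/page")    # => false, the same record covers this agent too
r.agent = "some-other-bot"
r.allowed?("/any/page")    # => true, no record names this agent
```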
@@ -30,6 +31,8 @@ Probot.new(txt).matches("/dir1/dir2/dir3")
 
 In this case, we can see the Disallow rule with length 15 would be followed.
 
+3. It sets the User-Agent string when fetching robots.txt
+
 ## Installation
 
 Install the gem and add to the application's Gemfile by executing:
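The README's `txt` sample is cut off in this hunk, but point 2 can be sketched with a made-up document. The rules, path, and scores below are illustrative; the output shape follows the `matches` example in the library comments further down in this diff.

```ruby
require "probot"

# Hypothetical robots.txt where an Allow and a Disallow rule both match the path.
txt = %Q{User-agent: *
Allow: /dir1/
Disallow: /dir1/dir2/
}

r = Probot.new(txt)

# Every rule matching the path, keyed by its regexp and scored by length.
r.matches("/dir1/dir2/dir3")
# => something like {:disallowed=>{/\/dir1\/dir2\//=>11}, :allowed=>{/\/dir1\//=>6}}

# The longest (most specific) rule wins, so the Disallow is followed here.
r.matching_rule("/dir1/dir2/dir3")
# => something like {disallow: /\/dir1\/dir2\//}

r.allowed?("/dir1/dir2/dir3")   # => false
r.allowed?("/dir1/other")       # => true
```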
@@ -45,16 +48,16 @@ If bundler is not being used to manage dependencies, install the gem by executin
 It's straightforward to use. Instantiate it if you'll make a few requests:
 
 ```ruby
-> r = Probot.new('https://booko.info', agent: 'MyAgent')
+> r = Probot.new('https://booko.info', agent: 'BookScraper')
 > r.rules
 => {"*"=>{"disallow"=>[/\/search/, /\/products\/search/, /\/.*\/refresh_prices/, /\/.*\/add_to_cart/, /\/.*\/get_prices/, /\/lists\/add/, /\/.*\/add$/, /\/api\//, /\/users\/bits/, /\/users\/create/, /\/prices\//, /\/widgets\/issue/], "allow"=>[], "crawl_delay"=>0, "crawl-delay"=>0.1},
  "YandexBot"=>{"disallow"=>[], "allow"=>[], "crawl_delay"=>0, "crawl-delay"=>300.0}}
 
-> r.allowed?("/abc/add_to_cart")
+> r.allowed?("/abc/refresh_prices")
 => false
 > r.allowed?("https://booko.info/9780765397522/All-Systems-Red")
 => true
-> r.allowed?("https://booko.info/9780765397522/add_to_cart")
+> r.allowed?("https://booko.info/9780765397522/refresh_prices")
 => false
 ```
 

@@ -1,7 +1,5 @@
 # frozen_string_literal: true
 
-require_relative "Probot/version"
-
 require "uri"
 require "net/http"
 
@@ -20,7 +18,7 @@ require "net/http"
 # Parse a robots.txt file
 # Find the most specific rule for a given URL. We use the length of the regexp as a proxy for specificity.
 
-class Robots
+class Probot
   attr_reader :rules, :sitemap, :doc
   attr_accessor :agent
 
@@ -145,15 +143,15 @@ class Robots
     end
   end
 
-  def self.allowed?(url, agent: "*") = Robots.new(url, agent: agent).allowed?(url)
+  def self.allowed?(url, agent: "*") = Probot.new(url, agent: agent).allowed?(url)
 end
 
-# Robots.allowed?("https://booko.info/9780765397522/All-Systems-Red")
+# Probot.allowed?("https://booko.info/9780765397522/All-Systems-Red")
 # => true
-# r = Robots.new('https://booko.info', agent: 'YandexBot')
-# r = Robots.new('https://www.allenandunwin.com')
-# $ Robots.new('https://www.amazon.com/').matches("/gp/wishlist/ipad-install/gcrnsts")
+# r = Probot.new('https://booko.info', agent: 'YandexBot')
+# r = Probot.new('https://www.allenandunwin.com')
+# $ Probot.new('https://www.amazon.com/').matches("/gp/wishlist/ipad-install/gcrnsts")
 # => {:disallowed=>{/\/wishlist\//=>10, /\/gp\/wishlist\//=>13, /.*\/gcrnsts/=>10}, :allowed=>{/\/gp\/wishlist\/ipad\-install.*/=>28}}
 #
 # Test with
-# assert Robots.new(nil, doc: %Q{allow: /$\ndisallow: /}).matching_rule('https://example.com/page.htm') == {disallow: /\//}
+# assert Probot.new(nil, doc: %Q{allow: /$\ndisallow: /}).matching_rule('https://example.com/page.htm') == {disallow: /\//}
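The class-level helper renamed in the hunk above gives a one-shot check that fetches and parses robots.txt on every call; the README's instance form is the cheaper choice when you will check several URLs on one site. A small usage sketch, with example.com and the agent name as placeholders:

```ruby
require "probot"

# One-shot: fetches https://example.com/robots.txt, parses it, and checks the URL.
Probot.allowed?("https://example.com/some/page")                     # default agent "*"
Probot.allowed?("https://example.com/some/page", agent: "MyCrawler") # rules for a named agent

# For repeated checks against the same site, parse once and reuse the instance.
r = Probot.new("https://example.com", agent: "MyCrawler")
r.allowed?("/some/page")
r.allowed?("/another/page")
```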

@@ -1,6 +1,6 @@
 # frozen_string_literal: true
 
-require_relative "lib/Probot/version"
+require_relative "lib/probot/version"
 
 Gem::Specification.new do |spec|
   spec.name = "Probot"
@@ -8,14 +8,12 @@ Gem::Specification.new do |spec|
   spec.authors = ["Dan Milne"]
   spec.email = ["d@nmilne.com"]
 
-  spec.summary = "A Robots.txt parser."
-  spec.description = "A more fully featured Robotos.txt parser."
+  spec.summary = "A robots.txt parser."
+  spec.description = "A fully featured robots.txt parser."
   spec.homepage = "http://github.com/dkam/probot"
   spec.license = "MIT"
   spec.required_ruby_version = ">= 3.0"
 
-  spec.metadata["allowed_push_host"] = "TODO: Set to your gem server 'https://example.com'"
-
   spec.metadata["homepage_uri"] = spec.homepage
   spec.metadata["source_code_uri"] = "http://github.com/dkam/probot"
   spec.metadata["changelog_uri"] = "http://github.com/dkam/probot/CHANGELOG.md"

@@ -2,5 +2,6 @@
 
 $LOAD_PATH.unshift File.expand_path("../lib", __dir__)
 require "probot"
+require "probot/version" # for testing the version number - otherwise the gemspec does it.
 
 require "minitest/autorun"
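The commit title says the tests need the version as well, and the helper above now requires probot/version directly. A minimal sketch of the kind of test that enables; the test class, the test name, and the assumption that the constant is Probot::VERSION are mine, not part of this commit:

```ruby
# Assumes lib/probot/version.rb (required by the helper above) defines Probot::VERSION.
class TestVersion < Minitest::Test
  def test_that_it_has_a_version_number
    refute_nil Probot::VERSION
    assert_kind_of String, Probot::VERSION
  end
end
```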

@@ -100,13 +100,13 @@ class TestProbot < Minitest::Test
 
   def test_some_tests
     TEST_CASES.each_with_index do |test_case, ind|
-      r = Robots.new(test_case[:txt])
+      r = Probot.new(test_case[:txt])
 
       assert_equal test_case[:found_agents], r.found_agents, "found_agents for test #{ind}"
       assert_equal test_case[:sitemap], r.sitemap, "sitemap for test #{ind}"
 
       test_case[:tests].each do |tst|
-        r = Robots.new(test_case[:txt], agent: tst[:agent])
+        r = Probot.new(test_case[:txt], agent: tst[:agent])
 
         tst[:allowed].each do |url|
           assert r.allowed?(url), "expected #{url} to be allowed, for agent #{tst[:agent]} | test #{ind}"
@@ -121,24 +121,24 @@ class TestProbot < Minitest::Test
 
   # https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt#url-matching-based-on-path-values
   def test_googles_tests
-    assert Robots.new(%(allow: /p\ndisallow: /)).matching_rule("https://example.com/page") == {allow: /\/p/}
-    assert Robots.new(%(allow: /folder\ndisallow: /folder)).matching_rule("https://example.com/folder/page") == {allow: /\/folder/}
-    assert Robots.new(%(allow: /page\ndisallow: /*.htm)).matching_rule("https://example.com/page.htm") == {disallow: /\/.*\.htm/}
-    assert Robots.new(%(allow: /page\ndisallow: /*.ph)).matching_rule("https://example.com/page.php5") == {disallow: /\/.*\.ph/} # FAIL
-    assert Robots.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/") == {allow: /\/$/}
-    assert Robots.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/page.htm") == {disallow: /\//}
+    assert Probot.new(%(allow: /p\ndisallow: /)).matching_rule("https://example.com/page") == {allow: /\/p/}
+    assert Probot.new(%(allow: /folder\ndisallow: /folder)).matching_rule("https://example.com/folder/page") == {allow: /\/folder/}
+    assert Probot.new(%(allow: /page\ndisallow: /*.htm)).matching_rule("https://example.com/page.htm") == {disallow: /\/.*\.htm/}
+    assert Probot.new(%(allow: /page\ndisallow: /*.ph)).matching_rule("https://example.com/page.php5") == {disallow: /\/.*\.ph/} # FAIL
+    assert Probot.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/") == {allow: /\/$/}
+    assert Probot.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/page.htm") == {disallow: /\//}
   end
 
   def test_empty_allow_disallow
-    assert Robots.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty?
-    assert Robots.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty?
+    assert Probot.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty?
+    assert Probot.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty?
   end
 
   def test_consecutive_user_agents
     txt = %(User-agent: Curl
 User-agent: Wget
 Disallow: /url)
-    r = Robots.new(txt)
+    r = Probot.new(txt)
     assert r.allowed?("/url") == true
 
     r.agent = "Curl"
@@ -152,7 +152,7 @@ class TestProbot < Minitest::Test
   end
 
   def test_unfound_robots
-    r = Robots.new("")
+    r = Probot.new("")
     assert r.allowed?("/url") == true
     r.agent = "Curl"
     assert r.allowed?("/url") == true
@@ -161,7 +161,7 @@ class TestProbot < Minitest::Test
   def test_more_other_tests
     txt = %(User-agent: rubytest\nDisallow: /no-dir/\nDisallow: /no-page.php\nDisallow: /*-no-dir/\nDisallow: /dir/*.php\nDisallow: *?var\nDisallow: /dir/*?var\n\n# this is a test\nuseragent: *\ndisalow: /test/\n\nsitemap: /sitemapxml.xml\n\n )
 
-    r = Robots.new(txt, agent: "rubytest")
+    r = Probot.new(txt, agent: "rubytest")
     assert r.allowed?("/dir/page.php") == false
     assert r.allowed?("/dir/home.php") == false
     assert r.allowed?("/dir/page") == true