From 5774ca2836c5a92bd60ef5933cf69ab3a538384f Mon Sep 17 00:00:00 2001
From: Dan Milne
Date: Sun, 10 Sep 2023 10:02:52 +1000
Subject: [PATCH] Switch to Net::HTTP and set the request header to match the
 query header

---
 .gitignore            |   8 ++
 .standard.yml         |   3 +
 CHANGELOG.md          |   5 ++
 Gemfile               |  11 +++
 LICENSE.txt           |  21 ++++++
 README.md             |  90 ++++++++++++++++++++++
 Rakefile              |  14 ++++
 bin/console           |  11 +++
 bin/setup             |   8 ++
 lib/probot.rb         | 159 +++++++++++++++++++++++++++++++++++++++
 lib/probot/version.rb |   3 +
 probot.gemspec        |  40 ++++++++++
 sig/probot.rbs        |   4 +
 test/test_helper.rb   |   6 ++
 test/test_probot.rb   | 170 ++++++++++++++++++++++++++++++++++++++++++
 15 files changed, 553 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .standard.yml
 create mode 100644 CHANGELOG.md
 create mode 100644 Gemfile
 create mode 100644 LICENSE.txt
 create mode 100644 README.md
 create mode 100644 Rakefile
 create mode 100755 bin/console
 create mode 100755 bin/setup
 create mode 100644 lib/probot.rb
 create mode 100644 lib/probot/version.rb
 create mode 100644 probot.gemspec
 create mode 100644 sig/probot.rbs
 create mode 100644 test/test_helper.rb
 create mode 100644 test/test_probot.rb

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9106b2a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+/.bundle/
+/.yardoc
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
diff --git a/.standard.yml b/.standard.yml
new file mode 100644
index 0000000..08d0e90
--- /dev/null
+++ b/.standard.yml
@@ -0,0 +1,3 @@
+# For available configuration options, see:
+# https://github.com/testdouble/standard
+ruby_version: 3.0.0
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..5ba7c3c
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,5 @@
+## [Unreleased]
+
+## [0.1.0] - 2023-09-09
+
+- Initial release
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..52d485e
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+source "https://rubygems.org"
+
+# Specify your gem's dependencies in probot.gemspec
+gemspec
+
+gem "rake", "~> 13.0"
+gem "minitest", "~> 5.0"
+gem "rubocop", "~> 1.21"
+gem "standard", "~> 1.31" # Adjust the version as needed
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..d0ccb6b
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2023 Dan Milne
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b12ab49
--- /dev/null
+++ b/README.md
@@ -0,0 +1,90 @@
+# Probot
+
+OMG, another Ruby robots.txt parser? It was an accident - I didn't mean to write it, and I probably shouldn't have, but here we are. It started out tiny and grew. Yes, I should have used one of the other gems.
+
+Does this even deserve a gem? Feel free to just copy and paste the single file that implements it - one less dependency, eh?
+
+On the plus side, it has some nice features I don't think the others have.
+
+1. It supports consecutive user agents making up a single record (there's a usage example in the "Consecutive user agents" section below):
+
+```txt
+# Block both first-agent and second-agent from the site.
+User-agent: first-agent
+User-agent: second-agent
+Disallow: /
+```
+
+2. It can select the most specific allow / disallow rule, using rule length as a proxy for specificity. You can also ask it to show you the matching rules and their scores.
+
+```ruby
+txt = %Q{
+User-agent: *
+Disallow: /dir1
+Allow: /dir1/dir2
+Disallow: /dir1/dir2/dir3
+}
+Probot.new(txt).matches("/dir1/dir2/dir3")
+=> {:disallowed=>{/\/dir1/=>5, /\/dir1\/dir2\/dir3/=>15}, :allowed=>{/\/dir1\/dir2/=>10}}
+```
+
+In this case, the most specific match is the Disallow rule with length 15, so the URL is disallowed.
+
+## Installation
+
+Install the gem and add it to the application's Gemfile by executing:
+
+    $ bundle add probot
+
+If bundler is not being used to manage dependencies, install the gem by executing:
+
+    $ gem install probot
+
+## Usage
+
+It's straightforward to use. Instantiate it if you'll be making several queries:
+
+```ruby
+> r = Probot.new('https://booko.info', agent: 'MyAgent')
+> r.rules
+=> {"*"=>{"disallow"=>[/\/search/, /\/products\/search/, /\/.*\/refresh_prices/, /\/.*\/add_to_cart/, /\/.*\/get_prices/, /\/lists\/add/, /\/.*\/add$/, /\/api\//, /\/users\/bits/, /\/users\/create/, /\/prices\//, /\/widgets\/issue/], "allow"=>[], "crawl_delay"=>0.1},
+ "YandexBot"=>{"disallow"=>[], "allow"=>[], "crawl_delay"=>300.0}}
+
+> r.allowed?("/abc/add_to_cart")
+=> false
+> r.allowed?("https://booko.info/9780765397522/All-Systems-Red")
+=> true
+> r.allowed?("https://booko.info/9780765397522/add_to_cart")
+=> false
+```
+
+Or just one-shot it for one-off checks:
+
+```ruby
+Probot.allowed?("https://booko.info/9780765397522/All-Systems-Red", agent: "BookScraper")
+```
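+
+## Consecutive user agents
+
+A rough sketch of how the consecutive-user-agent handling plays out in practice - the agent names here are made up for illustration:
+
+```ruby
+txt = %Q{
+User-agent: first-agent
+User-agent: second-agent
+Disallow: /private
+}
+
+r = Probot.new(txt, agent: "first-agent")
+r.allowed?("/private")   # => false - first-agent shares the record's Disallow
+
+r.agent = "second-agent"
+r.allowed?("/private")   # => false - so does second-agent
+
+r.agent = "some-other-agent"
+r.allowed?("/private")   # => true - unlisted agents fall back to the (empty) * record
+```
+
+Both agents named in the record get the same rules; any other agent falls through to `*`.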
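+
+## Crawl delay and sitemaps
+
+A quick, illustrative look at the other readers - `crawl_delay`, `sitemap` and `found_agents` (the robots.txt content here is made up):
+
+```ruby
+txt = %Q{
+User-agent: *
+Crawl-delay: 2
+Disallow: /private
+Sitemap: https://example.com/sitemap.xml
+}
+
+r = Probot.new(txt)
+r.crawl_delay    # => 2.0 - parsed as a float, for the current agent
+r.sitemap        # => "/sitemap.xml" - just the path of the sitemap URL
+r.found_agents   # => ["*"]
+```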
+
+## Development
+
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+## Contributing
+
+Bug reports and pull requests are welcome on GitHub at https://github.com/dkam/probot.
+
+## Further Reading
+
+* https://moz.com/learn/seo/robotstxt
+* https://stackoverflow.com/questions/45293419/order-of-directives-in-robots-txt-do-they-overwrite-each-other-or-complement-ea
+* https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt
+* https://developers.google.com/search/docs/crawling-indexing/robots/create-robots-txt
+* https://github.com/google/robotstxt - Google's official parser
+
+## License
+
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
diff --git a/Rakefile b/Rakefile
new file mode 100644
index 0000000..5bb6087
--- /dev/null
+++ b/Rakefile
@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+
+require "bundler/gem_tasks"
+require "rake/testtask"
+
+Rake::TestTask.new(:test) do |t|
+  t.libs << "test"
+  t.libs << "lib"
+  t.test_files = FileList["test/**/test_*.rb"]
+end
+
+require "standard/rake"
+
+task default: %i[test standard]
diff --git a/bin/console b/bin/console
new file mode 100755
index 0000000..e76ba34
--- /dev/null
+++ b/bin/console
@@ -0,0 +1,11 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require "bundler/setup"
+require "probot"
+
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+
+require "irb"
+IRB.start(__FILE__)
diff --git a/bin/setup b/bin/setup
new file mode 100755
index 0000000..dce67d8
--- /dev/null
+++ b/bin/setup
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+
+bundle install
+
+# Do any other automated setup that you need to do here
diff --git a/lib/probot.rb b/lib/probot.rb
new file mode 100644
index 0000000..08db6ce
--- /dev/null
+++ b/lib/probot.rb
@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+
+require_relative "probot/version"
+
+require "uri"
+require "net/http"
+
+# https://moz.com/learn/seo/robotstxt
+# https://stackoverflow.com/questions/45293419/order-of-directives-in-robots-txt-do-they-overwrite-each-other-or-complement-ea
+# https://developers.google.com/search/docs/crawling-indexing/robots/create-robots-txt
+# https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt
+#
+# https://github.com/google/robotstxt - Google's official parser
+
+# Note: Consecutive User-agent lines are considered to be part of the same record.
+# Note: Google ignores crawl_delay.
+# Note: Google does not consider crawl_delay or sitemap to be part of the per-agent records.
+
+# Two main parts of this class:
+#   1. Parse a robots.txt file.
+#   2. Find the most specific rule for a given URL, using the length of the regexp as a proxy for specificity.
+
+class Probot
+  attr_reader :rules, :sitemap, :doc
+  attr_accessor :agent
+
+  def initialize(data, agent: "*")
+    raise ArgumentError, "The first argument must be a string" unless data.is_a?(String)
+    @agent = agent
+
+    @rules = {}
+    @current_agents = ["*"]
+    @current_agents.each { |agent| @rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
+    @sitemaps = []
+
+    @doc = data.start_with?("http") ? fetch_robots_txt(data) : data
+    parse(@doc)
+  end
+
+  def request_headers = (agent == "*") ? {} : {"User-Agent" => @agent}
+
+  def fetch_robots_txt(url)
+    Net::HTTP.get(URI(url).tap { |u| u.path = "/robots.txt" }, request_headers)
+  rescue
+    ""
+  end
+
+  def crawl_delay = rules.dig(@agent, "crawl_delay")
+
+  def found_agents = rules.keys
+
+  def disallowed = rules.dig(@agent, "disallow") || rules.dig("*", "disallow")
+
+  def allowed = rules.dig(@agent, "allow") || rules.dig("*", "allow")
+
+  def disallowed_matches(url) = disallowed.select { |disallowed_url| url.match?(disallowed_url) }.to_h { |rule| [rule, pattern_length(rule)] }
+
+  def allowed_matches(url) = allowed.select { |allowed_url| url.match?(allowed_url) }.to_h { |rule| [rule, pattern_length(rule)] }
+
+  def matches(url) = {disallowed: disallowed_matches(url), allowed: allowed_matches(url)}
+
+  def disallowed_best(url) = disallowed_matches(url).max_by { |k, v| v }
+
+  def allowed_best(url) = allowed_matches(url).max_by { |k, v| v }
+
+  def matching_rule(url) = (disallowed_best(url)&.last.to_i > allowed_best(url)&.last.to_i) ? {disallow: disallowed_best(url)&.first} : {allow: allowed_best(url)&.first}
+
+  # If a URL is not disallowed, it is allowed - so we check whether it is explicitly disallowed, and if not, it's allowed.
+  def allowed?(url) = !disallowed?(url)
+
+  def disallowed?(url) = matching_rule(url)&.keys&.first == :disallow
+
+  def parse(doc)
+    # We need to handle consecutive user-agent lines, which are considered to be part of the same record.
+    subsequent_agent = false
+
+    doc.lines.each do |line|
+      next if line.start_with?("#") || !line.include?(":") || line.split(":").length < 2
+
+      data = ParsedLine.new(line)
+
+      if data.agent?
+        if subsequent_agent
+          @current_agents << data.value
+        else
+          @current_agents = [data.value]
+          subsequent_agent = true
+        end
+
+        @current_agents.each { |agent| rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
+        next
+      end
+
+      # All regexp characters are escaped, then * and $ are unescaped, as they may be used as wildcards/anchors in robots.txt.
+
+      if data.allow? || data.disallow?
+        @current_agents.each { |agent| rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) }
+
+        subsequent_agent = false # A rule line ends the current run of consecutive user-agent lines.
+        next
+      end
+
+      if data.crawl_delay?
+        @current_agents.each { |agent| rules[agent]["crawl_delay"] = data.value }
+        next
+      end
+
+      if data.sitemap?
+        @sitemap = URI(data.value).path
+        next
+      end
+
+      @current_agents.each { |agent| rules[agent][data.key] = data.value }
+    end
+  end
+
+  def pattern_length(regexp) = regexp.source.gsub(/(\\[\*\$\.])/, "*").length
+
+  # ParsedLine note: in the case of 'Sitemap: https://example.com/sitemap.xml', raw_value needs to rejoin the URL after splitting on ":".
+
+  ParsedLine = Struct.new(:input_string) do
+    def key = input_string.split(":").first&.strip&.downcase
+
+    def raw_value = input_string.split(":").slice(1..)&.join(":")&.strip
+
+    def clean_value = raw_value.split("#").first&.strip
+
+    def agent? = key == "user-agent"
+
+    def disallow? = key == "disallow"
+
+    def allow? = key == "allow"
+
+    def crawl_delay? = key == "crawl-delay"
+
+    def sitemap? = key == "sitemap"
+
+    def value
+      return clean_value.to_f if crawl_delay?
+      return URI(clean_value).to_s if disallow? || allow?
+
+      raw_value
+    rescue URI::InvalidURIError
+      raw_value
+    end
+  end
+
+  def self.allowed?(url, agent: "*") = Probot.new(url, agent: agent).allowed?(url)
+end
+
+# Probot.allowed?("https://booko.info/9780765397522/All-Systems-Red")
+# => true
+# r = Probot.new('https://booko.info', agent: 'YandexBot')
+# r = Probot.new('https://www.allenandunwin.com')
+# Probot.new('https://www.amazon.com/').matches("/gp/wishlist/ipad-install/gcrnsts")
+# => {:disallowed=>{/\/wishlist\//=>10, /\/gp\/wishlist\//=>13, /.*\/gcrnsts/=>10}, :allowed=>{/\/gp\/wishlist\/ipad\-install.*/=>28}}
+#
+# Test with
+# assert Probot.new(%Q{allow: /$\ndisallow: /}).matching_rule('https://example.com/page.htm') == {disallow: /\//}
diff --git a/lib/probot/version.rb b/lib/probot/version.rb
new file mode 100644
index 0000000..19d16d3
--- /dev/null
+++ b/lib/probot/version.rb
@@ -0,0 +1,3 @@
+class Probot
+  VERSION = "0.1.0"
+end
diff --git a/probot.gemspec b/probot.gemspec
new file mode 100644
index 0000000..fc72399
--- /dev/null
+++ b/probot.gemspec
@@ -0,0 +1,40 @@
+# frozen_string_literal: true
+
+require_relative "lib/probot/version"
+
+Gem::Specification.new do |spec|
+  spec.name = "probot"
+  spec.version = Probot::VERSION
+  spec.authors = ["Dan Milne"]
+  spec.email = ["d@nmilne.com"]
+
+  spec.summary = "A robots.txt parser."
+  spec.description = "A more fully featured robots.txt parser."
+  spec.homepage = "https://github.com/dkam/probot"
+  spec.license = "MIT"
+  spec.required_ruby_version = ">= 3.0"
+
+  spec.metadata["allowed_push_host"] = "https://rubygems.org"
+
+  spec.metadata["homepage_uri"] = spec.homepage
+  spec.metadata["source_code_uri"] = "https://github.com/dkam/probot"
+  spec.metadata["changelog_uri"] = "https://github.com/dkam/probot/CHANGELOG.md"
+
+  # Specify which files should be added to the gem when it is released.
+  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
+  spec.files = Dir.chdir(__dir__) do
+    `git ls-files -z`.split("\x0").reject do |f|
+      (File.expand_path(f) == __FILE__) ||
+        f.start_with?(*%w[bin/ test/ spec/ features/ .git .circleci appveyor Gemfile])
+    end
+  end
+  spec.bindir = "exe"
+  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+
+  # Uncomment to register a new dependency of your gem
+  # spec.add_dependency "example-gem", "~> 1.0"
+
+  # For more information and examples about making a new gem, check out our
+  # guide at: https://bundler.io/guides/creating_gem.html
+end
diff --git a/sig/probot.rbs b/sig/probot.rbs
new file mode 100644
index 0000000..fdb8173
--- /dev/null
+++ b/sig/probot.rbs
@@ -0,0 +1,4 @@
+class Probot
+  VERSION: String
+  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
+end
diff --git a/test/test_helper.rb b/test/test_helper.rb
new file mode 100644
index 0000000..2abc2ba
--- /dev/null
+++ b/test/test_helper.rb
@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+
+$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
+require "probot"
+
+require "minitest/autorun"
diff --git a/test/test_probot.rb b/test/test_probot.rb
new file mode 100644
index 0000000..5c8900d
--- /dev/null
+++ b/test/test_probot.rb
@@ -0,0 +1,170 @@
+# frozen_string_literal: true
+
+require "test_helper"
+
+class TestProbot < Minitest::Test
+  def test_that_it_has_a_version_number
+    refute_nil ::Probot::VERSION
+  end
+
+  TEST_CASES = [
+    {
+      txt: %(
+        User-Agent: *
+        Disallow : /admin/
+        Disallow : /cart/
+        Disallow : /client/
+        Sitemap: http://www.allenandunwin.com/sitemap.xml
+
+        User-Agent: FooBot
+        Disallow: /private/
+        Allow: /cart/
+
+        User-Agent: BlahBot
+        User-Agent: YadaBot
+        Disallow: /noblah/
+        Allow: /cart/
+      ),
+      sitemap: "/sitemap.xml",
+      found_agents: ["*", "FooBot", "BlahBot", "YadaBot"],
+      tests: [
+        {
+          agent: "*",
+          allowed: ["/books/9781760878854", "/books/9781760878861", "/books/9781760878878"],
+          disallowed: ["/admin/", "/cart/", "/client/"],
+          crawl_delay: 0
+        }
+      ]
+    }, {
+      txt: %(
+        User-agent: *
+        Disallow: /?*\t\t\t#comment
+        Disallow: /home/
+        Disallow: /dashboard
+        Disallow: /terms-conditions
+        Disallow: /privacy-policy
+        Disallow: /index.php
+        Disallow: /chargify_system
+        Disallow: /test*
+        Disallow: /team* # comment
+        Disallow: /index
+        Allow: / # comment
+        Sitemap: http://example.com/sitemap.xml
+      ),
+      sitemap: "/sitemap.xml",
+      found_agents: ["*"],
+      tests: [
+        {
+          agent: "*",
+          allowed: ["/home", "/books/9781760878878", "/client/"],
+          disallowed: ["/home/", "/dashboard", "/test/hello", "/team/", "/team/1", "/teamtest"],
+          crawl_delay: 0
+        },
+        {
+          agent: "UnfoundAgent",
+          allowed: ["/home", "/books/9781760878878", "/client/"],
+          disallowed: ["/home/", "/dashboard", "/test/hello", "/team/", "/team/1", "/teamtest"],
+          crawl_delay: 0
+        }
+      ]
+    },
+    # These tests are from https://github.com/rinzi/robotstxt
+    {
+      txt: %(User-agent: rubytest
+        Disallow: /no-dir/
+        Disallow: /no-page.php
+        Disallow: /*-no-dir/
+        Disallow: /dir/*.php
+        Disallow: *?var
+        Disallow: /dir/*?var
+
+        # this is a test
+        useragent: *
+        disalow: /test/
+
+        sitemap: /sitemapxml.xml
+
+      ),
+      sitemap: "/sitemapxml.xml",
+      found_agents: ["*", "rubytest"],
+      tests: [
+        {
+          agent: "rubytest",
+          allowed: ["/", "/blog/", "/blog/page.php"],
+          disallowed: ["/no-dir/", "/foo-no-dir/", "/foo-no-dir/page.html", "/dir/page.php", "/page.php?var=0", "/dir/page.php?var=0", "/blog/page.php?var=0"],
+          crawl_delay: 0
+        }
+      ]
+    }
+  ].freeze
+
+  def test_some_tests
+    TEST_CASES.each_with_index do |test_case, ind|
+      r = Probot.new(test_case[:txt])
+
+      assert_equal test_case[:found_agents], r.found_agents, "found_agents for test #{ind}"
+      assert_equal test_case[:sitemap], r.sitemap, "sitemap for test #{ind}"
+
+      test_case[:tests].each do |tst|
+        r = Probot.new(test_case[:txt], agent: tst[:agent])
+
+        tst[:allowed].each do |url|
+          assert r.allowed?(url), "expected #{url} to be allowed, for agent #{tst[:agent]} | test #{ind}"
+        end
+
+        tst[:disallowed].each do |url|
+          assert r.disallowed?(url), "expected #{url} to be disallowed, for agent #{tst[:agent]} | test #{ind}"
+        end
+      end
+    end
+  end
+
+  # https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt#url-matching-based-on-path-values
+  def test_googles_tests
+    assert Probot.new(%(allow: /p\ndisallow: /)).matching_rule("https://example.com/page") == {allow: /\/p/}
+    assert Probot.new(%(allow: /folder\ndisallow: /folder)).matching_rule("https://example.com/folder/page") == {allow: /\/folder/}
+    assert Probot.new(%(allow: /page\ndisallow: /*.htm)).matching_rule("https://example.com/page.htm") == {disallow: /\/.*\.htm/}
+    assert Probot.new(%(allow: /page\ndisallow: /*.ph)).matching_rule("https://example.com/page.php5") == {disallow: /\/.*\.ph/} # FAIL: Google's table says allow here; rule-length scoring picks the longer disallow
+    assert Probot.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/") == {allow: /\/$/}
+    assert Probot.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/page.htm") == {disallow: /\//}
+  end
+
+  def test_empty_allow_disallow
+    assert Probot.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty?
+    assert Probot.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty?
+  end
+
+  def test_consecutive_user_agents
+    txt = %(User-agent: Curl
+      User-agent: Wget
+      Disallow: /url)
+    r = Probot.new(txt)
+    assert r.allowed?("/url") == true
+
+    r.agent = "Curl"
+    assert r.allowed?("/url") == false
+
+    r.agent = "Wget"
+    assert r.allowed?("/url") == false
+
+    r.agent = "Other"
+    assert r.allowed?("/url") == true
+  end
+
+  def test_unfound_robots
+    r = Probot.new("")
+    assert r.allowed?("/url") == true
+    r.agent = "Curl"
+    assert r.allowed?("/url") == true
+  end
+
+  def test_more_other_tests
+    txt = %(User-agent: rubytest\nDisallow: /no-dir/\nDisallow: /no-page.php\nDisallow: /*-no-dir/\nDisallow: /dir/*.php\nDisallow: *?var\nDisallow: /dir/*?var\n\n# this is a test\nuseragent: *\ndisalow: /test/\n\nsitemap: /sitemapxml.xml\n\n )
+
+    r = Probot.new(txt, agent: "rubytest")
+    assert r.allowed?("/dir/page.php") == false
+    assert r.allowed?("/dir/home.php") == false
+    assert r.allowed?("/dir/page") == true
+    assert r.allowed?("/dir/page?var") == false
+  end
+end