# frozen_string_literal: true

require "test_helper"

class TestProbot < Minitest::Test
  def test_that_it_has_a_version_number
    refute_nil ::Probot::VERSION
  end

  TEST_CASES = [
    {
      txt: %(
        User-Agent: *
        Disallow : /admin/
        Disallow : /cart/
        Disallow : /client/
        Sitemap: http://www.allenandunwin.com/sitemap.xml

        User-Agent: FooBot
        Disallow: /private/
        Allow: /cart/

        User-Agent: BlahBot
        User-Agent: YadaBot
        Disallow: /noblah/
        Allow: /cart/
      ),
      sitemaps: ["http://www.allenandunwin.com/sitemap.xml"],
      found_agents: ["*", "FooBot", "BlahBot", "YadaBot"],
      tests: [
        {
          agent: "*",
          allowed: ["/books/9781760878854", "/books/9781760878861", "/books/9781760878878"],
          disallowed: ["/admin/", "/cart/", "/client/"],
          crawl_delay: 0
        }
      ]
    },
    {
      txt: %(
        User-agent: *
        Disallow: /?*\t\t\t#comment
        Disallow: /home/
        Disallow: /dashboard
        Disallow: /terms-conditions
        Disallow: /privacy-policy
        Disallow: /index.php
        Disallow: /chargify_system
        Disallow: /test*
        Disallow: /team* # comment
        Disallow: /index
        Allow: / # comment
        Sitemap: http://example.com/sitemap.xml
      ),
      sitemaps: ["http://example.com/sitemap.xml"],
      found_agents: ["*"],
      tests: [
        {
          agent: "*",
          allowed: ["/home", "/books/9781760878878", "/client/"],
          disallowed: ["/home/", "/dashboard", "/test/hello", "/team/", "/team/1", "/teamtest"],
          crawl_delay: 0
        },
        {
          agent: "UnfoundAgent",
          allowed: ["/home", "/books/9781760878878", "/client/"],
          disallowed: ["/home/", "/dashboard", "/test/hello", "/team/", "/team/1", "/teamtest"],
          crawl_delay: 0
        }
      ]
    },
    # These tests from https://github.com/rinzi/robotstxt
    {
      txt: %(User-agent: rubytest
        Disallow: /no-dir/
        Disallow: /no-page.php
        Disallow: /*-no-dir/
        Disallow: /dir/*.php
        Disallow: *?var
        Disallow: /dir/*?var

        # this is a test
        useragent: *
        disalow: /test/

        sitemap: /sitemapxml.xml
      ),
      sitemaps: ["/sitemapxml.xml"],
      found_agents: ["*", "rubytest"],
      tests: [
        {
          agent: "rubytest",
          allowed: ["/", "/blog/", "/blog/page.php"],
          disallowed: ["/no-dir/", "/foo-no-dir/", "/foo-no-dir/page.html", "/dir/page.php", "/page.php?var=0", "/dir/page.php?var=0", "/blog/page.php?var=0"],
          crawl_delay: 0
        }
      ]
    }
  ].freeze

  def test_some_tests
    TEST_CASES.each_with_index do |test_case, ind|
      r = Probot.new(test_case[:txt])
      assert_equal test_case[:found_agents], r.found_agents, "found_agents for test #{ind}"
      assert_equal test_case[:sitemaps], r.sitemaps, "sitemap for test #{ind}"

      test_case[:tests].each do |tst|
        r = Probot.new(test_case[:txt], agent: tst[:agent])

        tst[:allowed].each do |url|
          assert r.allowed?(url), "expected #{url} to be allowed, for agent #{tst[:agent]} | test #{ind}"
        end

        tst[:disallowed].each do |url|
          assert r.disallowed?(url), "expected #{url} to be disallowed, for agent #{tst[:agent]} | test #{ind}"
        end
      end
    end
  end

  # https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt#url-matching-based-on-path-values
  def test_googles_tests
    assert Probot.new(%(allow: /p\ndisallow: /)).matching_rule("https://example.com/page") == {allow: /\/p/}
    assert Probot.new(%(allow: /folder\ndisallow: /folder)).matching_rule("https://example.com/folder/page") == {allow: /\/folder/}
    assert Probot.new(%(allow: /page\ndisallow: /*.htm)).matching_rule("https://example.com/page.htm") == {disallow: /\/.*\.htm/}
    assert Probot.new(%(allow: /page\ndisallow: /*.ph)).matching_rule("https://example.com/page.php5") == {disallow: /\/.*\.ph/} # FAIL
    assert Probot.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/") == {allow: /\/$/}
    assert Probot.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/page.htm") == {disallow: /\//}
  end
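
  # The fixtures in TEST_CASES carry a :crawl_delay expectation that test_some_tests
  # never asserts. Below is a minimal sketch of a direct Crawl-delay check. It assumes
  # Probot parses the Crawl-delay directive and exposes the current agent's value as
  # #crawl_delay; that method name is an assumption, not something this file confirms.
  def test_crawl_delay_sketch
    txt = %(User-agent: SlowBot\nCrawl-delay: 10\nDisallow: /private/)
    r = Probot.new(txt, agent: "SlowBot")

    # 10 == 10.0 in Ruby, so this holds whether the delay is stored as Integer or Float.
    assert_equal 10, r.crawl_delay
  end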

  def test_empty_allow_disallow
    assert_empty Probot.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow")
    assert_empty Probot.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow")
  end

  def test_consecutive_user_agents
    txt = %(User-agent: Curl
      User-agent: Wget
      Disallow: /url)

    r = Probot.new(txt)
    assert r.allowed?("/url")

    r.agent = "Curl"
    refute r.allowed?("/url")

    r.agent = "Wget"
    refute r.allowed?("/url")

    r.agent = "Other"
    assert r.allowed?("/url")
  end

  def test_unfound_robots
    r = Probot.new("")
    assert r.allowed?("/url")

    r.agent = "Curl"
    assert r.allowed?("/url")
  end

  def test_more_other_tests
    txt = %(User-agent: rubytest\nDisallow: /no-dir/\nDisallow: /no-page.php\nDisallow: /*-no-dir/\nDisallow: /dir/*.php\nDisallow: *?var\nDisallow: /dir/*?var\n\n# this is a test\nuseragent: *\ndisalow: /test/\n\nsitemap: /sitemapxml.xml\n\n)
    r = Probot.new(txt, agent: "rubytest")

    refute r.allowed?("/dir/page.php")
    refute r.allowed?("/dir/home.php")
    assert r.allowed?("/dir/page")
    refute r.allowed?("/dir/page?var")
  end

  def test_multiple_sitemaps
    txt = %(User-agent: *\nSitemap: https://example.com/sitemapxml.xml\nSitemap: https://example.com/sitemapxml2.xml\n\n)
    r = Probot.new(txt)

    assert_equal 2, r.sitemaps.length
    assert r.sitemaps.include?("https://example.com/sitemapxml.xml")
    assert r.sitemaps.include?("https://example.com/sitemapxml2.xml")
  end

  def test_absolute_sitemaps
    txt = %(User-agent: *\nSitemap: /sitemapxml.xml\nSitemap: /sitemapxml2.xml\n\n)
    r = Probot.new(txt)

    # We parsed raw text rather than a URL, so the site has to be set manually; then
    # reset the sitemaps array and re-parse the doc so the relative sitemap URLs
    # resolve against it.
    r.site = URI("https://example.com")
    r.sitemaps = []
    r.parse(r.doc)

    assert_equal 2, r.sitemaps.length
    assert r.sitemaps.include?("https://example.com/sitemapxml.xml"), "expected https://example.com/sitemapxml.xml, got #{r.sitemaps}"
    assert r.sitemaps.include?("https://example.com/sitemapxml2.xml"), "expected https://example.com/sitemapxml2.xml, got #{r.sitemaps}"
  end
end
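
# Usage sketch (comments only, not executed): the tests above always feed Probot raw
# robots.txt text. Pointing it at a live site, assuming the constructor also accepts
# a URL and fetches /robots.txt itself (an assumption, not exercised in this file),
# might look like:
#
#   r = Probot.new("https://example.com", agent: "MyCrawler")
#   r.allowed?("/some/path")  # consults the fetched rules, as in the tests above
#   r.sitemaps                # sitemap URLs, absolutized as in test_absolute_sitemaps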