├── .gitignore
├── VERSION
├── test
│   ├── fixtures
│   │   ├── emptyish.txt
│   │   ├── reddit.txt
│   │   ├── yelp.txt
│   │   ├── google.txt
│   │   └── eventbrite.txt
│   └── test_robots.rb
├── CHANGELOG
├── README
├── Rakefile
├── robots.gemspec
└── lib
    └── robots.rb

/.gitignore:
--------------------------------------------------------------------------------
1 | *.gem
--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.10.1
2 | 
--------------------------------------------------------------------------------
/test/fixtures/emptyish.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Disallow:
--------------------------------------------------------------------------------
/test/fixtures/reddit.txt:
--------------------------------------------------------------------------------
1 | # 80legs
2 | User-agent: 008
3 | Disallow: /
4 | 
5 | User-Agent: *
6 | Disallow: /goto
7 | Disallow: /*after=
8 | Disallow: /*before=
9 | Disallow: /domain/*t=
10 | Disallow: /login
11 | Disallow: /reddits/search
12 | Disallow: /search
13 | Disallow: /r/*/search
14 | Allow: /
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
1 | 0.10.0
2 | - Make sure the robots.txt fetch happens with a user agent (via rb2k)
3 | 0.9.0
4 | - Fix http://github.com/fizx/robots/issues#issue/1
5 | - Tests no longer rely on the network.
6 | 0.8.0
7 | - Return multiple values from robots.txt (via joost)
8 | 0.7.3
9 | - Move to jeweler, gemcutter
10 | 0.7.2
11 | - Add Ruby 1.9 compatibility
12 | 0.5-0.7.1
13 | - Lost the changelog information :/
14 | 0.4.0
15 | - Fixed other_values bug
16 | - Added Crawl-delay support
17 | 0.3.2
18 | - Fixed breaking on reddit.com
19 | 0.3.1
20 | - Fixed bug in disallows handling
21 | - Partially mocked out open-uri
22 | 0.3.0
23 | - Added loggable dependency
24 | 0.2.0
25 | - If robots.txt 404s, assume allowed.
26 | - Added CHANGELOG
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | A simple Ruby library to parse robots.txt.
2 | 
3 | Usage:
4 | 
5 |   robots = Robots.new "Some User Agent"
6 |   assert robots.allowed?("http://www.yelp.com/foo")
7 |   assert !robots.allowed?("http://www.yelp.com/mail?foo=bar")
8 |   robots.other_values("http://foo.com") # gets misc. key/values (e.g. Sitemap entries)
9 | 
10 | If you want caching, you're on your own. I suggest marshalling an instance of the parser; a sketch follows below.
11 | 
12 | Copyright (c) 2008 Kyle Maxwell, contributors
13 | 
14 | Permission is hereby granted, free of charge, to any person
15 | obtaining a copy of this software and associated documentation
16 | files (the "Software"), to deal in the Software without
17 | restriction, including without limitation the rights to use,
18 | copy, modify, merge, publish, distribute, sublicense, and/or sell
19 | copies of the Software, and to permit persons to whom the
20 | Software is furnished to do so, subject to the following
21 | conditions:
22 | 
23 | The above copyright notice and this permission notice shall be
24 | included in all copies or substantial portions of the Software.
25 | 
26 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
28 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
30 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
31 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
32 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
33 | OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
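The README's caching advice can be made concrete: once a Robots instance has parsed a site, it holds only hashes, regexes, strings, and timestamps, so Marshal round-trips it cleanly. A minimal sketch, assuming the public API shown in lib/robots.rb further down; the cache path, user agent, and 5-second timeout are illustrative choices, not part of the gem:

  require "robots"

  CACHE = "robots.cache"   # hypothetical cache path

  robots =
    if File.exist?(CACHE)
      # Reuse the rules parsed on a previous run.
      File.open(CACHE, "rb") { |f| Marshal.load(f) }
    else
      Robots.new("Some User Agent")
    end

  Robots.timeout = 5   # optional: give up on robots.txt fetches after 5 seconds

  puts robots.allowed?("http://www.yelp.com/foo")
  puts robots.other_values("http://www.yelp.com/").inspect

  # Persist whatever was fetched and parsed during this run.
  File.open(CACHE, "wb") { |f| f.write(Marshal.dump(robots)) }

Marshal output is version-sensitive across Ruby releases, so treat the cache file as disposable.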
/test/fixtures/yelp.txt:
--------------------------------------------------------------------------------
1 | #
2 | # 1. A robot may not injure a human being or, through inaction, allow a
3 | # human being to come to harm.
4 | #
5 | # 2. A robot must obey orders given it by human beings except where such
6 | # orders would conflict with the First Law.
7 | #
8 | # 3. A robot must protect its own existence as long as such protection
9 | # does not conflict with the First or Second Law.
10 | 
11 | User-agent: *
12 | Disallow: /advertise?
13 | Disallow: /biz_share?
14 | Disallow: /biz_attribute
15 | Disallow: /biz_link
16 | Disallow: /biz_update
17 | Disallow: /bookmark?
18 | Disallow: /flag_content?
19 | Disallow: /invite_friends_service?
20 | Disallow: /login?
21 | Disallow: /mail?
22 | Disallow: /map?
23 | Disallow: /redir?
24 | Disallow: /writeareview
25 | Disallow: /signup?
26 | Disallow: /talk/new_topic
27 | Disallow: /thanx?
28 | Disallow: /user_favorites?
29 | Disallow: /weekly/signup
30 | Disallow: /elite?
31 | Disallow: /member_search_results
32 | Disallow: /advertise?
33 | Disallow: /syndicate/
34 | Disallow: /filtered_reviews
35 | Disallow: /language/update?
36 | 
37 | User-agent: Fasterfox
38 | Disallow: /
39 | 
40 | User-agent: Nutch
41 | Disallow: /
42 | 
43 | User-agent: spock
44 | Disallow: /
45 | 
46 | User-agent: OmniExplorer_Bot
47 | Disallow: /
48 | 
49 | User-agent: MJ12bot
50 | Disallow: /
51 | 
52 | User-agent: TurnitinBot
53 | Disallow: /
54 | 
55 | User-agent: BecomeBot
56 | Disallow: /
57 | 
58 | User-agent: genieBot
59 | Disallow: /
60 | 
61 | User-agent: dotbot
62 | Disallow: /
63 | 
64 | User-agent: MLBot
65 | Disallow: /
66 | 
67 | User-agent: 80bot
68 | Disallow: /
69 | 
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require 'rubygems'
2 | require 'rake'
3 | 
4 | begin
5 |   require 'jeweler'
6 |   Jeweler::Tasks.new do |gem|
7 |     gem.name = "robots"
8 |     gem.summary = "Simple robots.txt parser"
9 |     gem.description = "Parses robots.txt files"
10 |     gem.email = "kyle@kylemaxwell.com"
11 |     gem.homepage = "http://github.com/fizx/robots"
12 |     gem.authors = ["Kyle Maxwell"]
13 |     gem.add_development_dependency "thoughtbot-shoulda"
14 |     # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15 |   end
16 |   Jeweler::GemcutterTasks.new
17 | rescue LoadError
18 |   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
19 | end
20 | 
21 | require 'rake/testtask'
22 | Rake::TestTask.new(:test) do |test|
23 |   test.libs << 'lib' << 'test'
24 |   test.pattern = 'test/**/test_*.rb'
25 |   test.verbose = true
26 | end
27 | 
28 | begin
29 |   require 'rcov/rcovtask'
30 |   Rcov::RcovTask.new do |test|
31 |     test.libs << 'test'
32 |     test.pattern = 'test/**/test_*.rb'
33 |     test.verbose = true
34 |   end
35 | rescue LoadError
36 |   task :rcov do
37 |     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
38 |   end
39 | end
40 | 
41 | task :default => :test
42 | 
43 | require 'rake/rdoctask'
44 | Rake::RDocTask.new do |rdoc|
45 |   if File.exist?('VERSION')
46 |     version = File.read('VERSION')
47 |   else
48 |     version = ""
49 |   end
50 | 
51 |   rdoc.rdoc_dir = 'rdoc'
52 |   rdoc.title = "robots #{version}"
53 |   rdoc.rdoc_files.include('README*')
54 |   rdoc.rdoc_files.include('lib/**/*.rb')
55 | end
56 | 
--------------------------------------------------------------------------------
/robots.gemspec:
--------------------------------------------------------------------------------
1 | # Generated by jeweler
2 | # DO NOT EDIT THIS FILE DIRECTLY
3 | # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4 | # -*- encoding: utf-8 -*-
5 | 
6 | Gem::Specification.new do |s|
7 |   s.name = %q{robots}
8 |   s.version = "0.10.1"
9 | 
10 |   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11 |   s.authors = ["Kyle Maxwell"]
12 |   s.date = %q{2011-04-12}
13 |   s.description = %q{Parses robots.txt files}
14 |   s.email = %q{kyle@kylemaxwell.com}
15 |   s.extra_rdoc_files = [
16 |     "README"
17 |   ]
18 |   s.files = [
19 |     ".gitignore",
20 |     "CHANGELOG",
21 |     "README",
22 |     "Rakefile",
23 |     "VERSION",
24 |     "lib/robots.rb",
25 |     "robots.gemspec",
26 |     "test/fixtures/emptyish.txt",
27 |     "test/fixtures/eventbrite.txt",
28 |     "test/fixtures/google.txt",
29 |     "test/fixtures/reddit.txt",
30 |     "test/fixtures/yelp.txt",
31 |     "test/test_robots.rb"
32 |   ]
33 |   s.homepage = %q{http://github.com/fizx/robots}
34 |   s.rdoc_options = ["--charset=UTF-8"]
35 |   s.require_paths = ["lib"]
36 |   s.rubygems_version = %q{1.3.6}
37 |   s.summary = %q{Simple robots.txt parser}
38 |   s.test_files = [
39 |     "test/test_robots.rb"
40 |   ]
41 | 
42 |   if s.respond_to? :specification_version then
43 |     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
44 |     s.specification_version = 3
45 | 
46 |     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
47 |       s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
48 |     else
49 |       s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
50 |     end
51 |   else
52 |     s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
53 |   end
54 | end
55 | 
56 | 
--------------------------------------------------------------------------------
/test/test_robots.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | require "test/unit"
3 | require File.dirname(__FILE__) + "/../lib/robots"
4 | 
5 | module FakeHttp
6 |   def content_type
7 |     "text/plain"
8 |   end
9 | 
10 |   def status
11 |     ["200", "OK"]
12 |   end
13 | end
14 | 
15 | class TestRobots < Test::Unit::TestCase
16 |   def setup
17 |     def Robots.get_robots_txt(uri, user_agent)
18 |       fixture_file = File.dirname(__FILE__) + "/fixtures/" + uri.host.split(".")[-2] + ".txt"
19 |       File.open(fixture_file).extend(FakeHttp)
20 |     end
21 | 
22 |     @robots = Robots.new "Ruby-Robot.txt Parser Test Script"
23 |   end
24 | 
25 |   def test_allowed_if_no_robots
26 |     def Robots.get_robots_txt(uri, user_agent)
27 |       return nil
28 |     end
29 | 
30 |     assert_allowed("somesite", "/")
31 |   end
32 | 
33 |   def test_disallow_nothing
34 |     assert_allowed("emptyish", "/")
35 |     assert_allowed("emptyish", "/foo")
36 |   end
37 | 
38 |   def test_reddit
39 |     assert_allowed("reddit", "/")
40 |   end
41 | 
42 |   def test_other
43 |     assert_allowed("yelp", "/foo")
44 |     assert_disallowed("yelp", "/mail?foo=bar")
45 |   end
46 | 
47 |   def test_site_with_disallowed
48 |     assert_allowed("google", "/")
49 |   end
50 | 
51 |   def test_other_values
52 |     sitemap = {"Sitemap" => ["http://www.eventbrite.com/sitemap_index.xml", "http://www.eventbrite.com/sitemap_index.xml"]}
53 |     assert_other_equals("eventbrite", sitemap)
54 |   end
55 | 
56 |   def assert_other_equals(name, value)
57 |     assert_equal(value, @robots.other_values(uri_for_name(name, "/")))
58 |   end
59 | 
60 |   def assert_allowed(name, path)
61 |     assert_allowed_equals(name, path, true)
62 |   end
63 | 
64 |   def assert_disallowed(name, path)
65 |     assert_allowed_equals(name, path, false)
66 |   end
67 | 
68 |   def assert_allowed_equals(name, path, value)
69 |     assert_equal(value, @robots.allowed?(uri_for_name(name, path)), @robots.inspect)
70 |   end
71 | 
72 |   def uri_for_name(name, path)
73 |     name.nil? ? nil : "http://www.#{name}.com#{path}"
74 |   end
75 | 
76 | end
--------------------------------------------------------------------------------
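test_robots.rb above stays off the network by redefining Robots.get_robots_txt to read fixture files dressed up with FakeHttp. The same trick is handy outside the suite, e.g. for replaying robots.txt files captured earlier. A minimal sketch; the saved_robots/<host>.txt layout and the user agent are hypothetical, not part of the gem:

  require "robots"
  require "stringio"

  # Shim so a StringIO satisfies the parser's content-type/status checks,
  # mirroring the FakeHttp module in test_robots.rb.
  module FakeHttp
    def content_type; "text/plain"; end
    def status; ["200", "OK"]; end
  end

  def Robots.get_robots_txt(uri, user_agent)
    path = "saved_robots/#{uri.host}.txt"   # hypothetical capture layout
    return nil unless File.exist?(path)     # nil means "no robots.txt" => allow all
    StringIO.new(File.read(path)).extend(FakeHttp)
  end

  robots = Robots.new("Offline Tester")
  puts robots.allowed?("http://www.example.com/search")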
/lib/robots.rb:
--------------------------------------------------------------------------------
1 | require "open-uri"
2 | require "uri"
3 | require "rubygems"
4 | require "timeout"
5 | 
6 | class Robots
7 | 
8 |   DEFAULT_TIMEOUT = 3
9 | 
10 |   class ParsedRobots
11 | 
12 |     def initialize(uri, user_agent)
13 |       @last_accessed = Time.at(1)
14 | 
15 |       io = Robots.get_robots_txt(uri, user_agent)
16 | 
17 |       if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
18 |         io = StringIO.new("User-agent: *\nAllow: /\n")
19 |       end
20 | 
21 |       @other = {}
22 |       @disallows = {}
23 |       @allows = {}
24 |       @delays = {} # per-agent Crawl-delay values
25 |       agent = /.*/
26 |       io.each do |line|
27 |         next if line =~ /^\s*(#.*|$)/
28 |         arr = line.split(":")
29 |         key = arr.shift
30 |         value = arr.join(":").strip
31 |         case key.downcase # directive names are matched case-insensitively
32 |         when "user-agent"
33 |           agent = to_regex(value)
34 |         when "allow"
35 |           @allows[agent] ||= []
36 |           @allows[agent] << to_regex(value)
37 |         when "disallow"
38 |           @disallows[agent] ||= []
39 |           @disallows[agent] << to_regex(value)
40 |         when "crawl-delay"
41 |           @delays[agent] = value.to_i
42 |         else
43 |           @other[key] ||= []
44 |           @other[key] << value
45 |         end
46 |       end
47 | 
48 |       @parsed = true
49 |     end
50 | 
51 |     def allowed?(uri, user_agent)
52 |       return true unless @parsed
53 |       allowed = true
54 |       path = uri.request_uri
55 | 
56 |       @disallows.each do |key, value|
57 |         if user_agent =~ key
58 |           value.each do |rule|
59 |             if path =~ rule
60 |               allowed = false
61 |             end
62 |           end
63 |         end
64 |       end
65 | 
66 |       @allows.each do |key, value|
67 |         unless allowed
68 |           if user_agent =~ key
69 |             value.each do |rule|
70 |               if path =~ rule
71 |                 allowed = true
72 |               end
73 |             end
74 |           end
75 |         end
76 |       end
77 | 
78 |       if allowed && (delay = delay_for(user_agent))
79 |         pause = delay - (Time.now - @last_accessed)
80 |         sleep(pause) if pause > 0
81 |         @last_accessed = Time.now
82 |       end
83 | 
84 |       return allowed
85 |     end
86 | 
87 |     def other_values
88 |       @other
89 |     end
90 | 
91 |     protected
92 | 
93 |     # Crawl-delay rules are keyed by user-agent pattern, so look the agent
94 |     # up the same way allow/disallow rules are matched.
95 |     def delay_for(user_agent)
96 |       @delays.each do |agent, delay|
97 |         return delay if user_agent =~ agent
98 |       end
99 |       nil
100 |     end
101 | 
102 |     def to_regex(pattern)
103 |       return /should-not-match-anything-123456789/ if pattern.strip.empty?
104 |       pattern = Regexp.escape(pattern)
105 |       pattern.gsub!(Regexp.escape("*"), ".*")
106 |       Regexp.compile("^#{pattern}")
107 |     end
108 |   end
109 | 
110 |   def self.get_robots_txt(uri, user_agent)
111 |     begin
112 |       Timeout::timeout(Robots.timeout) do
113 |         URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
114 |       end
115 |     rescue Timeout::Error
116 |       STDERR.puts "robots.txt request timed out"
117 |     end
118 |   end
119 | 
120 |   def self.timeout=(t)
121 |     @timeout = t
122 |   end
123 | 
124 |   def self.timeout
125 |     @timeout || DEFAULT_TIMEOUT
126 |   end
127 | 
128 |   def initialize(user_agent)
129 |     @user_agent = user_agent
130 |     @parsed = {}
131 |   end
132 | 
133 |   def allowed?(uri)
134 |     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
135 |     host = uri.host
136 |     @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
137 |     @parsed[host].allowed?(uri, @user_agent)
138 |   end
139 | 
140 |   def other_values(uri)
141 |     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
142 |     host = uri.host
143 |     @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
144 |     @parsed[host].other_values
145 |   end
146 | end
147 | 
--------------------------------------------------------------------------------
/test/fixtures/google.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /search 3 | Disallow: /groups 4 | Disallow: /images 5 | Disallow: /catalogs 6 | Disallow: /catalogues 7 | Disallow: /news 8 | Allow: /news/directory 9 | Disallow: /nwshp 10 | Disallow: /setnewsprefs? 11 | Disallow: /index.html? 12 | Disallow: /? 13 | Disallow: /addurl/image? 14 | Disallow: /pagead/ 15 | Disallow: /relpage/ 16 | Disallow: /relcontent 17 | Disallow: /imgres 18 | Disallow: /imglanding 19 | Disallow: /keyword/ 20 | Disallow: /u/ 21 | Disallow: /univ/ 22 | Disallow: /cobrand 23 | Disallow: /custom 24 | Disallow: /advanced_group_search 25 | Disallow: /googlesite 26 | Disallow: /preferences 27 | Disallow: /setprefs 28 | Disallow: /swr 29 | Disallow: /url 30 | Disallow: /default 31 | Disallow: /m? 32 | Disallow: /m/? 33 | Disallow: /m/blogs? 34 | Disallow: /m/ig 35 | Disallow: /m/images? 36 | Disallow: /m/local? 37 | Disallow: /m/movies? 38 | Disallow: /m/news? 39 | Disallow: /m/news/i? 40 | Disallow: /m/place?
41 | Disallow: /m/setnewsprefs? 42 | Disallow: /m/search? 43 | Disallow: /m/swmloptin? 44 | Disallow: /m/trends 45 | Disallow: /wml? 46 | Disallow: /wml/? 47 | Disallow: /wml/search? 48 | Disallow: /xhtml? 49 | Disallow: /xhtml/? 50 | Disallow: /xhtml/search? 51 | Disallow: /xml? 52 | Disallow: /imode? 53 | Disallow: /imode/? 54 | Disallow: /imode/search? 55 | Disallow: /jsky? 56 | Disallow: /jsky/? 57 | Disallow: /jsky/search? 58 | Disallow: /pda? 59 | Disallow: /pda/? 60 | Disallow: /pda/search? 61 | Disallow: /sprint_xhtml 62 | Disallow: /sprint_wml 63 | Disallow: /pqa 64 | Disallow: /palm 65 | Disallow: /gwt/ 66 | Disallow: /purchases 67 | Disallow: /hws 68 | Disallow: /bsd? 69 | Disallow: /linux? 70 | Disallow: /mac? 71 | Disallow: /microsoft? 72 | Disallow: /unclesam? 73 | Disallow: /answers/search?q= 74 | Disallow: /local? 75 | Disallow: /local_url 76 | Disallow: /froogle? 77 | Disallow: /products? 78 | Disallow: /products/ 79 | Disallow: /froogle_ 80 | Disallow: /product_ 81 | Disallow: /products_ 82 | Disallow: /print 83 | Disallow: /books 84 | Disallow: /bkshp?q= 85 | Allow: /booksrightsholders 86 | Disallow: /patents? 87 | Disallow: /patents/ 88 | Allow: /patents/about 89 | Disallow: /scholar 90 | Disallow: /complete 91 | Disallow: /sponsoredlinks 92 | Disallow: /videosearch? 93 | Disallow: /videopreview? 94 | Disallow: /videoprograminfo? 95 | Disallow: /maps? 96 | Disallow: /mapstt? 97 | Disallow: /mapslt? 98 | Disallow: /maps/stk/ 99 | Disallow: /maps/br? 100 | Disallow: /mapabcpoi? 101 | Disallow: /maphp? 102 | Disallow: /places/ 103 | Disallow: /maps/place 104 | Disallow: /help/maps/streetview/partners/welcome/ 105 | Disallow: /lochp? 106 | Disallow: /center 107 | Disallow: /ie? 108 | Disallow: /sms/demo? 109 | Disallow: /katrina? 110 | Disallow: /blogsearch? 111 | Disallow: /blogsearch/ 112 | Disallow: /blogsearch_feeds 113 | Disallow: /advanced_blog_search 114 | Disallow: /reader/ 115 | Allow: /reader/play 116 | Disallow: /uds/ 117 | Disallow: /chart? 118 | Disallow: /transit? 119 | Disallow: /mbd? 120 | Disallow: /extern_js/ 121 | Disallow: /calendar/feeds/ 122 | Disallow: /calendar/ical/ 123 | Disallow: /cl2/feeds/ 124 | Disallow: /cl2/ical/ 125 | Disallow: /coop/directory 126 | Disallow: /coop/manage 127 | Disallow: /trends? 128 | Disallow: /trends/music? 129 | Disallow: /trends/hottrends? 130 | Disallow: /trends/viz? 131 | Disallow: /notebook/search? 132 | Disallow: /musica 133 | Disallow: /musicad 134 | Disallow: /musicas 135 | Disallow: /musicl 136 | Disallow: /musics 137 | Disallow: /musicsearch 138 | Disallow: /musicsp 139 | Disallow: /musiclp 140 | Disallow: /browsersync 141 | Disallow: /call 142 | Disallow: /archivesearch? 143 | Disallow: /archivesearch/url 144 | Disallow: /archivesearch/advanced_search 145 | Disallow: /base/reportbadoffer 146 | Disallow: /urchin_test/ 147 | Disallow: /movies? 148 | Disallow: /codesearch? 149 | Disallow: /codesearch/feeds/search? 150 | Disallow: /wapsearch? 151 | Disallow: /safebrowsing 152 | Allow: /safebrowsing/diagnostic 153 | Allow: /safebrowsing/report_error/ 154 | Allow: /safebrowsing/report_phish/ 155 | Disallow: /reviews/search? 156 | Disallow: /orkut/albums 157 | Disallow: /jsapi 158 | Disallow: /views? 
159 | Disallow: /c/ 160 | Disallow: /cbk 161 | Disallow: /recharge/dashboard/car 162 | Disallow: /recharge/dashboard/static/ 163 | Disallow: /translate_a/ 164 | Disallow: /translate_c 165 | Disallow: /translate_f 166 | Disallow: /translate_static/ 167 | Disallow: /translate_suggestion 168 | Disallow: /profiles/me 169 | Allow: /profiles 170 | Disallow: /s2/profiles/me 171 | Allow: /s2/profiles 172 | Allow: /s2/photos 173 | Allow: /s2/static 174 | Disallow: /s2 175 | Disallow: /transconsole/portal/ 176 | Disallow: /gcc/ 177 | Disallow: /aclk 178 | Disallow: /cse? 179 | Disallow: /cse/home 180 | Disallow: /cse/panel 181 | Disallow: /cse/manage 182 | Disallow: /tbproxy/ 183 | Disallow: /comparisonads/ 184 | Disallow: /imesync/ 185 | Disallow: /shenghuo/search? 186 | Disallow: /support/forum/search? 187 | Disallow: /reviews/polls/ 188 | Disallow: /hosted/images/ 189 | Disallow: /ppob/? 190 | Disallow: /ppob? 191 | Disallow: /ig/add? 192 | Disallow: /adwordsresellers 193 | Disallow: /accounts/o8 194 | Allow: /accounts/o8/id 195 | Disallow: /topicsearch?q= 196 | Disallow: /xfx7/ 197 | Disallow: /squared/api 198 | Disallow: /squared/search 199 | Disallow: /squared/table 200 | Disallow: /toolkit/ 201 | Allow: /toolkit/*.html 202 | Disallow: /qnasearch? 203 | Disallow: /errors/ 204 | Disallow: /app/updates 205 | Disallow: /sidewiki/entry/ 206 | Disallow: /quality_form? 207 | Disallow: /labs/popgadget/search 208 | Disallow: /buzz/post 209 | Disallow: /compressiontest/ 210 | Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml 211 | Sitemap: http://www.google.com/hostednews/sitemap_index.xml 212 | Sitemap: http://www.google.com/ventures/sitemap_ventures.xml 213 | Sitemap: http://www.google.com/sitemaps_webmasters.xml 214 | Sitemap: http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml 215 | Sitemap: http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml 216 | -------------------------------------------------------------------------------- /test/fixtures/eventbrite.txt: -------------------------------------------------------------------------------- 1 | # These entries assist in minimizing bandwidth usage caused 2 | # by questionable robots spidering your site. Some of these 3 | # robots or agents are used by web-stripping sofware. 4 | # Please do not remove these entries, but feel free to add 5 | # your own at the end of the list. 6 | # If you have any questions regarding this file, please 7 | # contact support@thinkhost.com 8 | 9 | User-agent: * 10 | Disallow: /rest/ 11 | Disallow: /xml/ 12 | Disallow: /json/ 13 | Disallow: /atom/ 14 | Disallow: /opml/ 15 | Disallow: /widget/ 16 | Disallow: /register 17 | Disallow: /review 18 | Disallow: /orderconfirmation 19 | Disallow: /venues/ 20 | Disallow: /*? 
21 | 22 | Sitemap: http://www.eventbrite.com/sitemap_index.xml 23 | 24 | User-agent: msnbot 25 | Crawl-delay: 4 26 | 27 | User-agent: Slurp 28 | Crawl-delay: 4 29 | 30 | User-agent: Balihoo 31 | Disallow: / 32 | 33 | User-agent: BotRightHere 34 | Disallow: / 35 | 36 | User-agent: WebZip 37 | Disallow: / 38 | 39 | User-agent: larbin 40 | Disallow: / 41 | 42 | User-agent: b2w/0.1 43 | Disallow: / 44 | 45 | User-agent: Copernic 46 | Disallow: / 47 | 48 | User-agent: psbot 49 | Disallow: / 50 | 51 | User-agent: Python-urllib 52 | Disallow: / 53 | 54 | User-agent: NetMechanic 55 | Disallow: / 56 | 57 | User-agent: URL_Spider_Pro 58 | Disallow: / 59 | 60 | User-agent: CherryPicker 61 | Disallow: / 62 | 63 | User-agent: EmailCollector 64 | Disallow: / 65 | 66 | User-agent: EmailSiphon 67 | Disallow: / 68 | 69 | User-agent: WebBandit 70 | Disallow: / 71 | 72 | User-agent: EmailWolf 73 | Disallow: / 74 | 75 | User-agent: ExtractorPro 76 | Disallow: / 77 | 78 | User-agent: CopyRightCheck 79 | Disallow: / 80 | 81 | User-agent: Crescent 82 | Disallow: / 83 | 84 | User-agent: SiteSnagger 85 | Disallow: / 86 | 87 | User-agent: ProWebWalker 88 | Disallow: / 89 | 90 | User-agent: CheeseBot 91 | Disallow: / 92 | 93 | User-agent: LNSpiderguy 94 | Disallow: / 95 | 96 | User-agent: Alexibot 97 | Disallow: / 98 | 99 | User-agent: Teleport 100 | Disallow: / 101 | 102 | User-agent: TeleportPro 103 | Disallow: / 104 | 105 | User-agent: MIIxpc 106 | Disallow: / 107 | 108 | User-agent: Telesoft 109 | Disallow: / 110 | 111 | User-agent: Website Quester 112 | Disallow: / 113 | 114 | User-agent: WebZip 115 | Disallow: / 116 | 117 | User-agent: moget/2.1 118 | Disallow: / 119 | 120 | User-agent: WebZip/4.0 121 | Disallow: / 122 | 123 | User-agent: WebStripper 124 | Disallow: / 125 | 126 | User-agent: WebSauger 127 | Disallow: / 128 | 129 | User-agent: WebCopier 130 | Disallow: / 131 | 132 | User-agent: NetAnts 133 | Disallow: / 134 | 135 | User-agent: Mister PiX 136 | Disallow: / 137 | 138 | User-agent: WebAuto 139 | Disallow: / 140 | 141 | User-agent: TheNomad 142 | Disallow: / 143 | 144 | User-agent: WWW-Collector-E 145 | Disallow: / 146 | 147 | User-agent: RMA 148 | Disallow: / 149 | 150 | User-agent: libWeb/clsHTTP 151 | Disallow: / 152 | 153 | User-agent: asterias 154 | Disallow: / 155 | 156 | User-agent: httplib 157 | Disallow: / 158 | 159 | User-agent: turingos 160 | Disallow: / 161 | 162 | User-agent: spanner 163 | Disallow: / 164 | 165 | User-agent: InfoNaviRobot 166 | Disallow: / 167 | 168 | User-agent: Harvest/1.5 169 | Disallow: / 170 | 171 | User-agent: Bullseye/1.0 172 | Disallow: / 173 | 174 | User-agent: Mozilla/4.0 (compatible; BullsEye; Windows 95) 175 | Disallow: / 176 | 177 | User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0 178 | Disallow: / 179 | 180 | User-agent: CherryPickerSE/1.0 181 | Disallow: / 182 | 183 | User-agent: CherryPickerElite/1.0 184 | Disallow: / 185 | 186 | User-agent: WebBandit/3.50 187 | Disallow: / 188 | 189 | User-agent: NICErsPRO 190 | Disallow: / 191 | 192 | User-agent: Microsoft URL Control - 5.01.4511 193 | Disallow: / 194 | 195 | User-agent: DittoSpyder 196 | Disallow: / 197 | 198 | User-agent: Foobot 199 | Disallow: / 200 | 201 | User-agent: SpankBot 202 | Disallow: / 203 | 204 | User-agent: BotALot 205 | Disallow: / 206 | 207 | User-agent: lwp-trivial/1.34 208 | Disallow: / 209 | 210 | User-agent: lwp-trivial 211 | Disallow: / 212 | 213 | User-agent: BunnySlippers 214 | Disallow: / 215 | 216 | User-agent: Microsoft URL Control - 6.00.8169 217 | Disallow: / 
218 | 219 | User-agent: URLy Warning 220 | Disallow: / 221 | 222 | User-agent: Wget/1.6 223 | Disallow: / 224 | 225 | User-agent: Wget/1.5.3 226 | Disallow: / 227 | 228 | User-agent: Wget 229 | Disallow: / 230 | 231 | User-agent: LinkWalker 232 | Disallow: / 233 | 234 | User-agent: cosmos 235 | Disallow: / 236 | 237 | User-agent: moget 238 | Disallow: / 239 | 240 | User-agent: hloader 241 | Disallow: / 242 | 243 | User-agent: humanlinks 244 | Disallow: / 245 | 246 | User-agent: LinkextractorPro 247 | Disallow: / 248 | 249 | User-agent: Offline Explorer 250 | Disallow: / 251 | 252 | User-agent: Mata Hari 253 | Disallow: / 254 | 255 | User-agent: LexiBot 256 | Disallow: / 257 | 258 | User-agent: Web Image Collector 259 | Disallow: / 260 | 261 | User-agent: The Intraformant 262 | Disallow: / 263 | 264 | User-agent: True_Robot/1.0 265 | Disallow: / 266 | 267 | User-agent: True_Robot 268 | Disallow: / 269 | 270 | User-agent: BlowFish/1.0 271 | Disallow: / 272 | 273 | User-agent: JennyBot 274 | Disallow: / 275 | 276 | User-agent: MIIxpc/4.2 277 | Disallow: / 278 | 279 | User-agent: BuiltBotTough 280 | Disallow: / 281 | 282 | User-agent: ProPowerBot/2.14 283 | Disallow: / 284 | 285 | User-agent: BackDoorBot/1.0 286 | Disallow: / 287 | 288 | User-agent: toCrawl/UrlDispatcher 289 | Disallow: / 290 | 291 | User-agent: WebEnhancer 292 | Disallow: / 293 | 294 | User-agent: suzuran 295 | Disallow: / 296 | 297 | User-agent: TightTwatBot 298 | Disallow: / 299 | 300 | User-agent: VCI WebViewer VCI WebViewer Win32 301 | Disallow: / 302 | 303 | User-agent: VCI 304 | Disallow: / 305 | 306 | User-agent: Szukacz/1.4 307 | Disallow: / 308 | 309 | User-agent: QueryN Metasearch 310 | Disallow: / 311 | 312 | User-agent: Openfind data gatherer 313 | Disallow: / 314 | 315 | User-agent: Openfind 316 | Disallow: / 317 | 318 | User-agent: Xenu's Link Sleuth 1.1c 319 | Disallow: / 320 | 321 | User-agent: Xenu's 322 | Disallow: / 323 | 324 | User-agent: Zeus 325 | Disallow: / 326 | 327 | User-agent: RepoMonkey Bait & Tackle/v1.01 328 | Disallow: / 329 | 330 | User-agent: RepoMonkey 331 | Disallow: / 332 | 333 | User-agent: Microsoft URL Control 334 | Disallow: / 335 | 336 | User-agent: Openbot 337 | Disallow: / 338 | 339 | User-agent: URL Control 340 | Disallow: / 341 | 342 | User-agent: Zeus Link Scout 343 | Disallow: / 344 | 345 | User-agent: Zeus 32297 Webster Pro V2.9 Win32 346 | Disallow: / 347 | 348 | User-agent: Webster Pro 349 | Disallow: / 350 | 351 | User-agent: EroCrawler 352 | Disallow: / 353 | 354 | User-agent: LinkScan/8.1a Unix 355 | Disallow: / 356 | 357 | User-agent: Keyword Density/0.9 358 | Disallow: / 359 | 360 | User-agent: Kenjin Spider 361 | Disallow: / 362 | 363 | User-agent: Iron33/1.0.2 364 | Disallow: / 365 | 366 | User-agent: Bookmark search tool 367 | Disallow: / 368 | 369 | User-agent: GetRight/4.2 370 | Disallow: / 371 | 372 | User-agent: FairAd Client 373 | Disallow: / 374 | 375 | User-agent: Gaisbot 376 | Disallow: / 377 | 378 | User-agent: Aqua_Products 379 | Disallow: / 380 | 381 | User-agent: Radiation Retriever 1.1 382 | Disallow: / 383 | 384 | User-agent: Flaming AttackBot 385 | Disallow: / 386 | 387 | User-agent: Oracle Ultra Search 388 | Disallow: / 389 | 390 | User-agent: MSIECrawler 391 | Disallow: / 392 | 393 | User-agent: PerMan 394 | Disallow: / 395 | 396 | User-agent: searchpreview 397 | Disallow: / 398 | 399 | User-agent: TurnitinBot 400 | Disallow: / 401 | 402 | User-agent: wget 403 | Disallow: / 404 | 405 | User-agent: ExtractorPro 406 | Disallow: / 407 | 408 | User-agent: 
WebZIP/4.21 409 | Disallow: / 410 | 411 | User-agent: WebZIP/5.0 412 | Disallow: / 413 | 414 | User-agent: HTTrack 3.0 415 | Disallow: / 416 | 417 | User-agent: TurnitinBot/1.5 418 | Disallow: / 419 | 420 | User-agent: WebCopier v3.2a 421 | Disallow: / 422 | 423 | User-agent: WebCapture 2.0 424 | Disallow: / 425 | 426 | User-agent: WebCopier v.2.2 427 | Disallow: / 428 | 429 | User-agent: Spinn3r 430 | Disallow: / 431 | 432 | User-agent: Tailrank 433 | Disallow: / 434 | 435 | Sitemap: http://www.eventbrite.com/sitemap_index.xml --------------------------------------------------------------------------------
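The fixtures above exercise the two robots.txt features that need the most care from this parser: `*` wildcards (reddit's /*after=, eventbrite's /*?) and Crawl-delay (eventbrite throttles msnbot and Slurp to 4 seconds). A quick sketch of how they behave through lib/robots.rb, reusing the FakeHttp stubbing idea from the sketch before that file; the condensed rules, host, and agent name are illustrative:

  require "robots"
  require "stringio"

  module FakeHttp
    def content_type; "text/plain"; end
    def status; ["200", "OK"]; end
  end

  # Condensed eventbrite-style rules, served for every host (illustrative).
  RULES = "User-agent: *\nDisallow: /*?\n\nUser-agent: Slurp\nCrawl-delay: 4\n"

  def Robots.get_robots_txt(uri, user_agent)
    StringIO.new(RULES).extend(FakeHttp)
  end

  robots = Robots.new("Slurp")

  # "Disallow: /*?" is compiled to the anchored regex /^\/.*\?/, so any
  # URL carrying a query string is refused while the bare path is fine:
  robots.allowed?("http://www.example.com/events?page=2")   # => false
  robots.allowed?("http://www.example.com/events")          # => true

  # Because the agent "Slurp" matches the Crawl-delay group, subsequent
  # allowed? calls for this host pause to keep requests ~4 seconds apart.

Note that the parser anchors agent patterns at the start of the string, so a browser-style agent such as "Mozilla/5.0 (compatible; Slurp)" would not pick up the Slurp group's Crawl-delay.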