├── .coveragerc ├── .gitignore ├── .gitmodules ├── .python-version ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── Vagrantfile ├── bench.py ├── dev-requirements.txt ├── reppy ├── __init__.py ├── cache │ ├── __init__.py │ └── policy.py ├── exceptions.py ├── robots.cpp ├── robots.pxd ├── robots.pyx ├── ttl.py └── util.py ├── requirements.txt ├── scripts └── vagrant │ └── provision.sh ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── asis ├── test_after_parse_hook │ └── robots.txt ├── test_after_response_hook │ └── robots.txt ├── test_agent_allowed │ └── robots.txt ├── test_caches_agent │ └── robots.txt ├── test_caches_robots │ └── robots.txt ├── test_content_too_big │ └── robots.txt ├── test_excessive_redirects │ └── robots.txt ├── test_fetch_status_200 │ └── robots.txt ├── test_fetch_status_401 │ └── robots.txt ├── test_fetch_status_403 │ └── robots.txt ├── test_fetch_status_4XX │ └── robots.txt ├── test_fetch_status_5XX │ └── robots.txt ├── test_returns_a_robots_object │ └── robots.txt ├── test_returns_an_agent_object │ └── robots.txt ├── test_robots_allowed │ └── robots.txt └── test_ssl_exception │ └── robots.txt ├── test_agent.py ├── test_cache ├── __init__.py ├── test_cache.py └── test_policy.py ├── test_robots.py ├── test_ttl.py ├── test_util.py └── util.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | plugins = Cython.Coverage 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries and build artifacts 2 | *.pyc 3 | *.o 4 | *.so 5 | build/ 6 | dist/ 7 | reppy.egg-info/ 8 | MANIFEST 9 | 10 | # Dev artifacts 11 | venv* 12 | 13 | # Coverage 14 | .coverage 15 | 16 | # Vagrant 17 | .vagrant/ 18 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "reppy/rep-cpp"] 2 | path = reppy/rep-cpp 3 | url = https://github.com/seomoz/rep-cpp 4 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 2.7.15 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | matrix: 4 | include: 5 | - python: "2.7" 6 | dist: xenial 7 | sudo: false 8 | - python: "3.3" 9 | dist: trusty 10 | sudo: false 11 | - python: "3.4" 12 | dist: trusty 13 | sudo: false 14 | - python: "3.5" 15 | dist: xenial 16 | sudo: false 17 | - python: "3.6" 18 | dist: xenial 19 | sudo: false 20 | - python: "3.7" 21 | dist: xenial 22 | sudo: true 23 | 24 | install: pip install -r dev-requirements.txt 25 | 26 | script: make test 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 SEOmoz 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit 
persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include reppy/rep-cpp/deps/url-cpp/src/* 2 | include reppy/rep-cpp/deps/url-cpp/include/* 3 | include reppy/rep-cpp/src/* 4 | include reppy/rep-cpp/include/* 5 | include reppy/robots.pyx 6 | include reppy/robots.pxd 7 | include reppy/robots.cpp 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test 2 | test: reppy/robots.so 3 | nosetests --with-coverage tests 4 | 5 | reppy/%.so: reppy/%.py* reppy/rep-cpp/src/* reppy/rep-cpp/include/* reppy/rep-cpp/deps/url-cpp/include/* reppy/rep-cpp/deps/url-cpp/src/* 6 | python setup.py build_ext --inplace 7 | 8 | install: 9 | python setup.py install 10 | 11 | dev-requirements: 12 | pip freeze | grep -v -e reppy > dev-requirements.txt 13 | 14 | clean: 15 | rm -rf build dist *.egg-info reppy/*.so 16 | find . -name '*.pyc' | xargs --no-run-if-empty rm 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Robots Exclusion Protocol Parser for Python 2 | =========================================== 3 | 4 | [![Build Status](https://travis-ci.org/seomoz/reppy.svg?branch=master)](https://travis-ci.org/seomoz/reppy) 5 | 6 | `Robots.txt` parsing in Python. 
7 | 8 | Goals 9 | ===== 10 | 11 | - __Fetching__ -- helper utilities for fetching and parsing `robots.txt`s, including 12 | checking `cache-control` and `expires` headers 13 | - __Support for newer features__ -- like `Crawl-Delay` and `Sitemaps` 14 | - __Wildcard matching__ -- without using regexes, no less 15 | - __Performance__ -- with >100k parses per second, >1M URL checks per second once parsed 16 | - __Caching__ -- utilities to help with the caching of `robots.txt` responses 17 | 18 | Installation 19 | ============ 20 | `reppy` is available on `pypi`: 21 | 22 | ```bash 23 | pip install reppy 24 | ``` 25 | 26 | When installing from source, there are submodule dependencies that must also be fetched: 27 | 28 | ```bash 29 | git submodule update --init --recursive 30 | make install 31 | ``` 32 | 33 | Usage 34 | ===== 35 | 36 | Checking when pages are allowed 37 | ------------------------------- 38 | Two classes answer questions about whether a URL is allowed: `Robots` and 39 | `Agent`: 40 | 41 | ```python 42 | from reppy.robots import Robots 43 | 44 | # This utility uses `requests` to fetch the content 45 | robots = Robots.fetch('http://example.com/robots.txt') 46 | robots.allowed('http://example.com/some/path/', 'my-user-agent') 47 | 48 | # Get the rules for a specific agent 49 | agent = robots.agent('my-user-agent') 50 | agent.allowed('http://example.com/some/path/') 51 | ``` 52 | 53 | The `Robots` class also exposes properties `expired` and `ttl` to describe how 54 | long the response should be considered valid. A `reppy.ttl` policy is used to 55 | determine what that should be: 56 | 57 | ```python 58 | from reppy.ttl import HeaderWithDefaultPolicy 59 | 60 | # Use the `cache-control` or `expires` headers, defaulting to 30 minutes and 61 | # ensuring it's at least 10 minutes 62 | policy = HeaderWithDefaultPolicy(default=1800, minimum=600) 63 | 64 | robots = Robots.fetch('http://example.com/robots.txt', ttl_policy=policy) 65 | ``` 66 | 67 | Customizing fetch 68 | ----------------- 69 | The `fetch` method accepts `*args` and `**kwargs` that are passed on to `requests.get`, 70 | allowing you to customize the way the `fetch` is executed: 71 | 72 | ```python 73 | robots = Robots.fetch('http://example.com/robots.txt', headers={...}) 74 | ``` 75 | 76 | Matching Rules and Wildcards 77 | ---------------------------- 78 | Both `*` and `$` are supported for wildcard matching. 79 | 80 | This library follows the matching behavior that the [1996 RFC](http://www.robotstxt.org/norobots-rfc.txt) 81 | describes. In the case where multiple rules match a query, the longest rule wins, as 82 | it is presumed to be the most specific. 83 | 84 | Checking sitemaps 85 | ----------------- 86 | The `Robots` class also exposes the sitemaps listed in a `robots.txt`: 87 | 88 | ```python 89 | # This property holds a list of URL strings of all the sitemaps listed 90 | robots.sitemaps 91 | ``` 92 | 93 | Delay 94 | ----- 95 | The `Crawl-Delay` directive is per agent and can be accessed through that class. If 96 | none was specified, it's `None`: 97 | 98 | ```python 99 | # What's the delay my-user-agent should use 100 | robots.agent('my-user-agent').delay 101 | ``` 102 | 103 | Determining the `robots.txt` URL 104 | -------------------------------- 105 | Given a URL, there's a utility to determine the URL of the corresponding `robots.txt`. 106 | It preserves the scheme and hostname, as well as the port (if it's not the default port for the 107 | scheme).
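For instance, a hedged sketch of what that implies (the URLs are placeholders and the expected outputs are inferred from the description above, not quoted from the library's documentation):

```python
from reppy.robots import Robots

# The scheme and hostname are preserved; a default port (80 for http) is not
# included in the robots.txt URL. Expected output inferred from the prose above.
Robots.robots_url('http://example.com:80/some/path')
# expected: 'http://example.com/robots.txt'

# A non-default port should be kept (see also the example that follows).
Robots.robots_url('https://example.com:8443/some/path')
# expected: 'https://example.com:8443/robots.txt'
```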
108 | 109 | ```python 110 | # Get robots.txt URL for http://userinfo@example.com:8080/path;params?query#fragment 111 | # It's http://example.com:8080/robots.txt 112 | Robots.robots_url('http://userinfo@example.com:8080/path;params?query#fragment') 113 | ``` 114 | 115 | Caching 116 | ======= 117 | There are two cache classes provided -- `RobotsCache`, which caches entire `reppy.Robots` 118 | objects, and `AgentCache`, which only caches the `reppy.Agent` relevant to a client. These 119 | caches duck-type the class that they cache for the purposes of checking if a URL is 120 | allowed: 121 | 122 | ```python 123 | from reppy.cache import RobotsCache 124 | cache = RobotsCache(capacity=100) 125 | cache.allowed('http://example.com/foo/bar', 'my-user-agent') 126 | 127 | from reppy.cache import AgentCache 128 | cache = AgentCache(agent='my-user-agent', capacity=100) 129 | cache.allowed('http://example.com/foo/bar') 130 | ``` 131 | 132 | Like `reppy.Robots.fetch`, the cache constructors accept a `ttl_policy` to inform the 133 | expiration of the fetched `Robots` objects, as well as `*args` and `**kwargs` to be passed 134 | to `reppy.Robots.fetch`. 135 | 136 | Caching Failures 137 | ---------------- 138 | There's a piece of classic caching advice: "don't cache failures." However, this is not 139 | always appropriate. For example, if the failure is a timeout, 140 | clients may want to cache this result so that every check doesn't take a very long time. 141 | 142 | To this end, the `cache` module provides a notion of a cache policy. It determines what 143 | to do in the case of an exception. The default is to cache a form of a disallowed response 144 | for 10 minutes, but you can configure it as you see fit: 145 | 146 | ```python 147 | # Do not cache failures (note the `ttl=0`): 148 | from reppy.cache.policy import ReraiseExceptionPolicy 149 | cache = AgentCache('my-user-agent', cache_policy=ReraiseExceptionPolicy(ttl=0)) 150 | 151 | # Cache and reraise failures for 10 minutes (note the `ttl=600`): 152 | cache = AgentCache('my-user-agent', cache_policy=ReraiseExceptionPolicy(ttl=600)) 153 | 154 | # Treat failures as being disallowed (DefaultObjectPolicy lives in reppy.cache.policy, Agent in reppy.robots) 155 | cache = AgentCache( 156 | 'my-user-agent', 157 | cache_policy=DefaultObjectPolicy(ttl=600, factory=lambda _: Agent().disallow('/'))) 158 | ``` 159 | 160 | Development 161 | =========== 162 | A `Vagrantfile` is provided to bootstrap a development environment: 163 | 164 | ```bash 165 | vagrant up 166 | ``` 167 | 168 | Alternatively, development can be conducted using a `virtualenv`: 169 | 170 | ```bash 171 | virtualenv venv 172 | source venv/bin/activate 173 | pip install -r requirements.txt 174 | ``` 175 | 176 | Tests 177 | ===== 178 | Tests may be run in `vagrant`: 179 | 180 | ```bash 181 | make test 182 | ``` 183 | 184 | Development 185 | =========== 186 | 187 | Environment 188 | ----------- 189 | To launch the `vagrant` image, we only need to 190 | `vagrant up` (though you may have to provide a `--provider` flag): 191 | 192 | ```bash 193 | vagrant up 194 | ``` 195 | 196 | With a running `vagrant` instance, you can log in and run tests: 197 | 198 | ```bash 199 | vagrant ssh 200 | make test 201 | ``` 202 | 203 | Running Tests 204 | ------------- 205 | Tests are run with the top-level `Makefile`: 206 | 207 | ```bash 208 | make test 209 | ``` 210 | 211 | PRs 212 | === 213 | These are not all hard-and-fast rules, but in general PRs have the following expectations: 214 | 215 | - __pass Travis__ -- or more generally, whatever CI is used for the particular
project 216 | - __be a complete unit__ -- whether a bug fix or feature, it should appear as a complete 217 | unit before consideration. 218 | - __maintain code coverage__ -- some projects may include code coverage requirements as 219 | part of the build as well 220 | - __maintain the established style__ -- this means the existing style of established 221 | projects, the established conventions of the team for a given language on new 222 | projects, and the guidelines of the community of the relevant languages and 223 | frameworks. 224 | - __include failing tests__ -- in the case of bugs, failing tests demonstrating the bug 225 | should be included as one commit, followed by a commit making the test succeed. This 226 | allows us to jump to a world with a bug included, and prove that our test in fact 227 | exercises the bug. 228 | - __be reviewed by one or more developers__ -- not all feedback has to be accepted, but 229 | it should all be considered. 230 | - __avoid 'addressed PR feedback' commits__ -- in general, PR feedback should be rebased 231 | back into the appropriate commits that introduced the change. In cases where this 232 | is burdensome, PR feedback commits may be used but should still describe the changes 233 | contained therein. 234 | 235 | PR reviews consider the design, organization, and functionality of the submitted code. 236 | 237 | Commits 238 | ======= 239 | Certain types of changes should be made in their own commits to improve readability. When 240 | too many different types of changes happen simultaneously in a single commit, the purpose of 241 | each change is muddled. By giving each commit a single logical purpose, it is implicitly 242 | clear why changes in that commit took place. 243 | 244 | - __updating / upgrading dependencies__ -- this is especially true for invocations like 245 | `bundle update` or `berks update`. 246 | - __introducing a new dependency__ -- often preceded by a commit updating existing 247 | dependencies, this should only include the changes for the new dependency. 248 | - __refactoring__ -- these commits should preserve all the existing functionality and 249 | merely update how it's done. 250 | - __utility components to be used by a new feature__ -- if introducing an auxiliary class 251 | in support of a subsequent commit, add this new class (and its tests) in its own 252 | commit. 253 | - __config changes__ -- when adjusting configuration in isolation. 254 | - __formatting / whitespace commits__ -- when adjusting code only for stylistic purposes. 255 | 256 | New Features 257 | ------------ 258 | Small new features (where small refers to the size and complexity of the change, not the 259 | impact) are often introduced in a single commit. Larger features or components might be 260 | built up piecewise, with each commit containing a single part of it (and its corresponding 261 | tests). 262 | 263 | Bug Fixes 264 | --------- 265 | In general, bug fixes should come in two-commit pairs: a commit adding a failing test 266 | demonstrating the bug, and a commit making that failing test pass. 267 | 268 | Tagging and Versioning 269 | ====================== 270 | Whenever the version included in `setup.py` is changed (and it should be changed when 271 | appropriate using [http://semver.org/](http://semver.org/)), a corresponding tag should 272 | be created with the same version number (formatted `v<version>`, e.g. `v0.1.0`).
273 | 274 | ```bash 275 | git tag -a v0.1.0 -m 'Version 0.1.0 276 | 277 | This release contains an initial working version of the `crawl` and `parse` 278 | utilities.' 279 | 280 | git push --tags origin 281 | ``` 282 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # Encoding: utf-8 2 | # -*- mode: ruby -*- 3 | # vi: set ft=ruby : 4 | 5 | ENV['VAGRANT_DEFAULT_PROVIDER'] = 'virtualbox' 6 | 7 | # http://docs.vagrantup.com/v2/ 8 | Vagrant.configure('2') do |config| 9 | config.vm.box = 'ubuntu/trusty64' 10 | config.vm.hostname = 'reppy' 11 | config.ssh.forward_agent = true 12 | 13 | config.vm.provider :virtualbox do |vb| 14 | vb.customize ["modifyvm", :id, "--memory", "1024"] 15 | vb.customize ["modifyvm", :id, "--cpus", "2"] 16 | vb.customize ["modifyvm", :id, "--natdnshostresolver1", "on"] 17 | end 18 | 19 | config.vm.provision :shell, path: 'scripts/vagrant/provision.sh', privileged: false 20 | end 21 | -------------------------------------------------------------------------------- /bench.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | from __future__ import print_function 4 | 5 | from contextlib import contextmanager 6 | import sys 7 | import time 8 | 9 | from reppy.robots import Robots 10 | content = ''' 11 | # /robots.txt for http://www.fict.org/ 12 | # comments to webmaster@fict.org 13 | 14 | User-agent: unhipbot 15 | Disallow: / 16 | 17 | User-agent: webcrawler 18 | User-agent: excite 19 | Disallow: 20 | 21 | User-agent: * 22 | Disallow: /org/plans.html 23 | Allow: /org/ 24 | Allow: /serv 25 | Allow: /~mak 26 | Disallow: / 27 | ''' 28 | 29 | @contextmanager 30 | def timer(name, count): 31 | '''Time this block.''' 32 | start = time.time() 33 | try: 34 | yield count 35 | finally: 36 | duration = time.time() - start 37 | print(name) 38 | print('=' * 10) 39 | print('Total: %s' % duration) 40 | print(' Avg: %s' % (duration / count)) 41 | print(' Rate: %s' % (count / duration)) 42 | print('') 43 | 44 | 45 | with timer('Parse', 100000) as count: 46 | for _ in xrange(count): 47 | Robots.parse('http://example.com/robots.txt', content) 48 | 49 | 50 | parsed = Robots.parse('http://example.com/robots.txt', content) 51 | with timer('Evaluate', 100000) as count: 52 | for _ in xrange(count): 53 | parsed.allowed('/org/example.html', 'other-bot') 54 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | cachetools==2.0.0 2 | colorama==0.3.7 3 | coverage==4.0.3 4 | Cython==0.27.3 5 | funcsigs==1.0.2 6 | mock==2.0.0 7 | nose==1.3.7 8 | pbr==1.10.0 9 | python-dateutil==2.5.3 10 | python-termstyle==0.1.10 11 | rednose==1.1.1 12 | requests==2.10.0 13 | requests-mock==1.1.0 14 | six==1.10.0 15 | -------------------------------------------------------------------------------- /reppy/__init__.py: -------------------------------------------------------------------------------- 1 | '''Robots.txt parsing.''' 2 | 3 | import logging 4 | 5 | logger = logging.getLogger('reppy') 6 | handler = logging.StreamHandler() 7 | formatter = logging.Formatter( 8 | ' | '.join([ 9 | '[%(asctime)s]', 10 | 'PID %(process)d', 11 | '%(levelname)s in %(module)s:%(funcName)s@%(lineno)s => %(message)s' 12 | ])) 13 | handler.setLevel(logging.DEBUG) 14 | logger.addHandler(handler) 15 | 
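# Note: the handler accepts DEBUG-level records, but the logger itself is set to ERROR
# just below, so only errors are emitted by default; an application can raise verbosity
# with logging.getLogger('reppy').setLevel(logging.DEBUG) to surface fetch/cache detail.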
logger.setLevel(logging.ERROR) 16 | 17 | from .robots import Robots, Agent 18 | -------------------------------------------------------------------------------- /reppy/cache/__init__.py: -------------------------------------------------------------------------------- 1 | '''A robots.txt cache.''' 2 | 3 | from functools import partial 4 | import threading 5 | import time 6 | 7 | from cachetools import LRUCache 8 | 9 | from .policy import DefaultObjectPolicy, ReraiseExceptionPolicy 10 | from ..robots import Robots, AllowNone, Agent 11 | from .. import logger 12 | 13 | 14 | class ExpiringObject(object): 15 | '''Wrap an object that expires over time.''' 16 | 17 | def __init__(self, factory): 18 | self.factory = factory 19 | self.lock = threading.Lock() 20 | self.obj = None 21 | self.expires = 0 22 | self.exception = None 23 | 24 | def get(self): 25 | '''Get the wrapped object.''' 26 | if (self.obj is None) or (time.time() >= self.expires): 27 | with self.lock: 28 | self.expires, self.obj = self.factory() 29 | if isinstance(self.obj, BaseException): 30 | self.exception = self.obj 31 | else: 32 | self.exception = None 33 | 34 | if self.exception: 35 | raise self.exception 36 | else: 37 | return self.obj 38 | 39 | 40 | class BaseCache(object): 41 | '''A base cache class.''' 42 | 43 | DEFAULT_CACHE_POLICY = ReraiseExceptionPolicy(ttl=600) 44 | DEFAULT_TTL_POLICY = Robots.DEFAULT_TTL_POLICY 45 | 46 | def __init__(self, capacity, cache_policy=None, ttl_policy=None, *args, **kwargs): 47 | self.cache_policy = cache_policy or self.DEFAULT_CACHE_POLICY 48 | self.ttl_policy = ttl_policy or self.DEFAULT_TTL_POLICY 49 | self.cache = LRUCache(maxsize=capacity) 50 | self.args = args 51 | self.kwargs = kwargs 52 | 53 | def get(self, url): 54 | '''Get the entity that corresponds to URL.''' 55 | robots_url = Robots.robots_url(url) 56 | if robots_url not in self.cache: 57 | self.cache[robots_url] = ExpiringObject(partial(self.factory, robots_url)) 58 | return self.cache[robots_url].get() 59 | 60 | def factory(self, url): 61 | ''' 62 | Return (expiration, obj) corresponding to provided url, exercising the 63 | cache_policy as necessary. 
64 | ''' 65 | try: 66 | return self.fetch(url) 67 | except BaseException as exc: 68 | logger.exception('Reppy cache fetch error on %s' % url) 69 | return self.cache_policy.exception(url, exc) 70 | 71 | def fetch(self, url): 72 | '''Return (expiration, obj) corresponding to provided url.''' 73 | raise NotImplementedError('BaseCache does not implement fetch.') 74 | 75 | 76 | class RobotsCache(BaseCache): 77 | '''A cache of Robots objects.''' 78 | 79 | DEFAULT_CACHE_POLICY = DefaultObjectPolicy(ttl=600, factory=AllowNone) 80 | 81 | def allowed(self, url, agent): 82 | '''Return true if the provided URL is allowed to agent.''' 83 | return self.get(url).allowed(url, agent) 84 | 85 | def fetch(self, url): 86 | '''Return (expiration, Robots) for the robots.txt at the provided URL.''' 87 | robots = Robots.fetch( 88 | url, ttl_policy=self.ttl_policy, *self.args, **self.kwargs) 89 | return (robots.expires, robots) 90 | 91 | 92 | class AgentCache(BaseCache): 93 | '''A cache of Agent objects.''' 94 | 95 | DEFAULT_CACHE_POLICY = DefaultObjectPolicy( 96 | ttl=600, factory=lambda url: Agent().disallow('/')) 97 | 98 | def __init__(self, agent, *args, **kwargs): 99 | BaseCache.__init__(self, *args, **kwargs) 100 | self.agent = agent 101 | 102 | def allowed(self, url): 103 | '''Return true if the provided URL is allowed to self.agent.''' 104 | return self.get(url).allowed(url) 105 | 106 | def fetch(self, url): 107 | '''Return (expiration, Agent) for the robots.txt at the provided URL.''' 108 | robots = Robots.fetch( 109 | url, ttl_policy=self.ttl_policy, *self.args, **self.kwargs) 110 | return (robots.expires, robots.agent(self.agent)) 111 | -------------------------------------------------------------------------------- /reppy/cache/policy.py: -------------------------------------------------------------------------------- 1 | '''Policies for caching.''' 2 | 3 | import time 4 | 5 | 6 | class CachePolicyBase(object): 7 | '''Policy for caching.''' 8 | 9 | def exception(self, url, exception): 10 | '''What to return when there's an exception.''' 11 | raise NotImplementedError('CachePolicyBase does not implement exception.') 12 | 13 | 14 | class DefaultObjectPolicy(object): 15 | '''Return a default object on exception.''' 16 | 17 | def __init__(self, ttl, factory): 18 | self.ttl = ttl 19 | self.factory = factory 20 | 21 | def exception(self, url, exception): 22 | '''What to return when there's an exception.''' 23 | return (time.time() + self.ttl, self.factory(url)) 24 | 25 | 26 | class ReraiseExceptionPolicy(object): 27 | '''Reraise the exception.''' 28 | 29 | def __init__(self, ttl): 30 | self.ttl = ttl 31 | 32 | def exception(self, url, exception): 33 | '''What to return when there's an exception.''' 34 | return (time.time() + self.ttl, exception) 35 | -------------------------------------------------------------------------------- /reppy/exceptions.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # 3 | # Copyright (c) 2011 SEOmoz 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | '''All of our exceptions''' 25 | 26 | 27 | class ReppyException(Exception): 28 | '''Any internal exception''' 29 | pass 30 | 31 | class ContentTooLong(ReppyException): 32 | '''Robots.txt content is too long.''' 33 | pass 34 | 35 | class ServerError(ReppyException): 36 | '''When the remote server returns an error''' 37 | pass 38 | 39 | class SSLException(ReppyException): 40 | '''An SSL error.''' 41 | pass 42 | 43 | class ConnectionException(ReppyException): 44 | '''A connection error exception.''' 45 | pass 46 | 47 | class MalformedUrl(ReppyException): 48 | '''An exception for a missing or invalid url or schema.''' 49 | pass 50 | 51 | class ExcessiveRedirects(ReppyException): 52 | '''A TooManyRedirects error.''' 53 | pass 54 | 55 | class ReadTimeout(ReppyException): 56 | '''A ReadTimeout error from the HTTP library.''' 57 | pass 58 | 59 | class BadStatusCode(ReppyException): 60 | '''An exception for 5xx status codes.''' 61 | pass 62 | -------------------------------------------------------------------------------- /reppy/robots.pxd: -------------------------------------------------------------------------------- 1 | # Cython declarations 2 | 3 | from libcpp.string cimport string 4 | from libcpp.vector cimport vector 5 | from libcpp cimport bool 6 | 7 | cdef extern from "rep-cpp/include/directive.h" namespace "Rep": 8 | cpdef cppclass CppDirective "Rep::Directive": 9 | ctypedef size_t priority_t 10 | 11 | CppDirective(const string& line, bool allowed) 12 | CppDirective(const CppDirective& rhs) 13 | priority_t priority() const 14 | bool match(const string& path) const 15 | bool allowed() const 16 | string str() const 17 | 18 | cdef extern from "rep-cpp/include/agent.h" namespace "Rep": 19 | cpdef cppclass CppAgent "Rep::Agent": 20 | ctypedef float delay_t 21 | 22 | CppAgent() 23 | CppAgent(const string& host) 24 | CppAgent& allow(const string& query) 25 | CppAgent& disallow(const string& query) 26 | CppAgent& delay(delay_t delay) 27 | delay_t delay() const 28 | const vector[CppDirective]& directives() const 29 | bool allowed(const string& path) const 30 | string str() const 31 | 32 | cdef extern from "rep-cpp/include/robots.h" namespace "Rep": 33 | cpdef cppclass CppRobots "Rep::Robots": 34 | CppRobots(const string& content) except +ValueError 35 | CppRobots(const string& content, const string& 
base_url) except +ValueError 36 | const vector[string]& sitemaps() const 37 | CppAgent& agent(const string& name) const 38 | bool allowed(const string& path, const string& name) const 39 | string str() const 40 | @staticmethod 41 | string robotsUrl(const string& url) except +ValueError 42 | -------------------------------------------------------------------------------- /reppy/robots.pyx: -------------------------------------------------------------------------------- 1 | # cython: linetrace=True 2 | # distutils: define_macros=CYTHON_TRACE=1 3 | 4 | from contextlib import closing 5 | import time 6 | 7 | import requests 8 | from requests.exceptions import ( 9 | SSLError, 10 | ConnectionError, 11 | URLRequired, 12 | MissingSchema, 13 | InvalidSchema, 14 | InvalidURL, 15 | TooManyRedirects, 16 | ReadTimeout) 17 | import six 18 | 19 | from .ttl import HeaderWithDefaultPolicy 20 | from . import util, logger, exceptions 21 | 22 | cdef as_bytes(value): 23 | if isinstance(value, bytes): 24 | return value 25 | return value.encode('utf-8') 26 | 27 | # For contexts which require a 'str' type, convert bytes to unicode if needed 28 | # (i.e., Python 3). Note: could raise UnicodeDecodeError in Python 3 if input 29 | # is invalid UTF-8 30 | cdef as_string(value): 31 | if six.PY3: 32 | if isinstance(value, bytes): 33 | return value.decode('utf-8') 34 | return value 35 | 36 | 37 | def FromRobotsMethod(cls, Robots robots, const string& name): 38 | '''Construct an Agent from a CppAgent.''' 39 | agent = Agent() 40 | # This is somewhat inefficient due to the copying, but it is 41 | # required to be copied because we often toss the containing 42 | # Robots object as a temporary thus we'd leave the underlying 43 | # Agent object dangling without a full copy. 44 | agent.agent = robots.robots.agent(name) 45 | return agent 46 | 47 | cdef class Agent: 48 | '''Wrapper around rep-cpp's Rep::Agent class.''' 49 | 50 | cdef CppAgent agent 51 | 52 | from_robots = classmethod(FromRobotsMethod) 53 | 54 | def __str__(self): 55 | return as_string(self.agent.str()) 56 | 57 | def __len__(self): 58 | return self.agent.directives().size() 59 | 60 | @property 61 | def delay(self): 62 | '''The delay associated with this agent.''' 63 | cdef float value = self.agent.delay() 64 | if value > 0: 65 | return value 66 | return None 67 | 68 | def allow(self, path): 69 | '''Allow the provided path.''' 70 | self.agent.allow(as_bytes(path)) 71 | return self 72 | 73 | def disallow(self, path): 74 | '''Disallow the provided path.''' 75 | self.agent.disallow(as_bytes(path)) 76 | return self 77 | 78 | def allowed(self, path): 79 | '''Is the provided URL allowed?''' 80 | return self.agent.allowed(as_bytes(path)) 81 | 82 | 83 | def ParseMethod(cls, url, content, expires=None): 84 | '''Parse a robots.txt file.''' 85 | return cls(url, as_bytes(content), expires) 86 | 87 | def FetchMethod(cls, url, ttl_policy=None, max_size=1048576, *args, **kwargs): 88 | '''Get the robots.txt at the provided URL.''' 89 | after_response_hook = kwargs.pop('after_response_hook', None) 90 | after_parse_hook = kwargs.pop('after_parse_hook', None) 91 | def wrap_exception(etype, cause): 92 | wrapped = etype(cause) 93 | wrapped.url = url 94 | if after_response_hook is not None: 95 | after_response_hook(wrapped) 96 | raise wrapped 97 | try: 98 | # Limit the size of the request 99 | kwargs['stream'] = True 100 | with closing(requests.get(url, *args, **kwargs)) as res: 101 | content = res.raw.read(amt=max_size, decode_content=True) 102 | # Try to read an additional byte, to see 
if the response is too big 103 | if res.raw.read(amt=1, decode_content=True): 104 | raise exceptions.ContentTooLong( 105 | 'Content larger than %s bytes' % max_size) 106 | 107 | if after_response_hook is not None: 108 | after_response_hook(res) 109 | 110 | # Get the TTL policy's ruling on the ttl 111 | expires = (ttl_policy or cls.DEFAULT_TTL_POLICY).expires(res) 112 | 113 | if res.status_code == 200: 114 | robots = cls.parse(url, content, expires) 115 | if after_parse_hook is not None: 116 | after_parse_hook(robots) 117 | return robots 118 | elif res.status_code in (401, 403): 119 | return AllowNone(url, expires) 120 | elif res.status_code >= 400 and res.status_code < 500: 121 | return AllowAll(url, expires) 122 | else: 123 | raise exceptions.BadStatusCode( 124 | 'Got %i for %s' % (res.status_code, url), res.status_code) 125 | except SSLError as exc: 126 | wrap_exception(exceptions.SSLException, exc) 127 | except ConnectionError as exc: 128 | wrap_exception(exceptions.ConnectionException, exc) 129 | except (URLRequired, MissingSchema, InvalidSchema, InvalidURL) as exc: 130 | wrap_exception(exceptions.MalformedUrl, exc) 131 | except TooManyRedirects as exc: 132 | wrap_exception(exceptions.ExcessiveRedirects, exc) 133 | except ReadTimeout as exc: 134 | wrap_exception(exceptions.ReadTimeout, exc) 135 | 136 | def RobotsUrlMethod(cls, url): 137 | '''Get the robots.txt URL that corresponds to the provided one.''' 138 | return as_string(CppRobots.robotsUrl(as_bytes(url))) 139 | 140 | cdef class Robots: 141 | '''Wrapper around rep-cpp's Rep::Robots class.''' 142 | 143 | # The default TTL policy is to cache for 3600 seconds or what's provided in the 144 | # headers, and a minimum of 600 seconds 145 | DEFAULT_TTL_POLICY = HeaderWithDefaultPolicy(default=3600, minimum=600) 146 | 147 | # Class methods 148 | parse = classmethod(ParseMethod) 149 | fetch = classmethod(FetchMethod) 150 | robots_url = classmethod(RobotsUrlMethod) 151 | 152 | # Data members 153 | cdef CppRobots* robots 154 | cdef object expires 155 | 156 | def __init__(self, url, const string& content, expires=None): 157 | self.robots = new CppRobots(content, as_bytes(url)) 158 | self.expires = expires 159 | 160 | def __str__(self): 161 | # Note: this could raise a UnicodeDecodeError in Python 3 if the 162 | # robots.txt had invalid UTF-8 163 | return as_string(self.robots.str()) 164 | 165 | def __dealloc__(self): 166 | del self.robots 167 | 168 | @property 169 | def sitemaps(self): 170 | '''Get all the sitemaps in this robots.txt.''' 171 | return list(map(as_string, self.robots.sitemaps())) 172 | 173 | def allowed(self, path, name): 174 | '''Is the provided path allowed for the provided agent?''' 175 | return self.robots.allowed(as_bytes(path), as_bytes(name)) 176 | 177 | def agent(self, name): 178 | '''Return the Agent that corresponds to name. 179 | 180 | Note modifications to the returned Agent will not be reflected 181 | in this Robots object because it is a *copy*, not the original 182 | Agent object. 
183 | ''' 184 | return Agent.from_robots(self, as_bytes(name)) 185 | 186 | @property 187 | def expired(self): 188 | '''True if the current time is past its expiration.''' 189 | return time.time() > self.expires 190 | 191 | @property 192 | def expires(self): 193 | '''The expiration of this robots.txt.''' 194 | return self.expires 195 | 196 | @property 197 | def ttl(self): 198 | '''Remaining time for this response to be considered valid.''' 199 | return max(self.expires - time.time(), 0) 200 | 201 | 202 | cdef class AllowNone(Robots): 203 | '''No requests are allowed.''' 204 | 205 | def __init__(self, url, expires=None): 206 | Robots.__init__(self, url, b'User-agent: *\nDisallow: /', expires) 207 | 208 | 209 | cdef class AllowAll(Robots): 210 | '''All requests are allowed.''' 211 | 212 | def __init__(self, url, expires=None): 213 | Robots.__init__(self, url, b'', expires) 214 | -------------------------------------------------------------------------------- /reppy/ttl.py: -------------------------------------------------------------------------------- 1 | '''Policies for setting the TTL on Robots objects.''' 2 | 3 | import time 4 | 5 | from . import logger 6 | from .util import parse_date 7 | 8 | 9 | class TTLPolicyBase(object): 10 | '''Policy for setting the TTL on Robots objects.''' 11 | 12 | def ttl(self, response): 13 | '''Get the caching TTL for a response.''' 14 | raise NotImplementedError('TTLPolicyBase does not implement ttl.') 15 | 16 | def expires(self, response): 17 | '''Determine when a response should expire.''' 18 | return time.time() + self.ttl(response) 19 | 20 | 21 | class HeaderWithDefaultPolicy(TTLPolicyBase): 22 | '''TTL is based on headers, but falls back to a default, clamps to a minimum.''' 23 | 24 | def __init__(self, default, minimum): 25 | self.default = default 26 | self.minimum = minimum 27 | 28 | def ttl(self, response): 29 | '''Get the ttl from headers.''' 30 | # If max-age is specified in Cache-Control, use it and ignore any 31 | # Expires header, as per RFC2616 Sec. 13.2.4. 
32 | cache_control = response.headers.get('cache-control') 33 | if cache_control is not None: 34 | for directive in cache_control.split(','): 35 | name, _, value = directive.lower().partition('=') 36 | name = name.strip() 37 | if name in ('no-store', 'must-revalidate', 'no-cache'): 38 | return max(self.minimum, 0) 39 | elif name in ('s-maxage', 'max-age'): 40 | try: 41 | return max(self.minimum, int(value.strip())) 42 | except ValueError: 43 | logger.warn( 44 | 'Could not parse %s=%s', name, value, exc_info=1) 45 | 46 | # Check the Expires header 47 | expires = response.headers.get('expires') 48 | if expires is not None: 49 | # Evaluate the expiration relative to the server-provided date 50 | date = response.headers.get('date') 51 | if date is not None: 52 | try: 53 | date = parse_date(date) 54 | except ValueError: 55 | logger.warn( 56 | 'Could not parse date string %s', date, exc_info=1) 57 | date = time.time() 58 | else: 59 | date = time.time() 60 | 61 | try: 62 | return max(self.minimum, parse_date(expires) - date) 63 | except ValueError: 64 | logger.warn( 65 | 'Could not parse date string %s', expires, exc_info=1) 66 | 67 | return self.default 68 | -------------------------------------------------------------------------------- /reppy/util.py: -------------------------------------------------------------------------------- 1 | '''Utility functions.''' 2 | 3 | import email 4 | 5 | 6 | def parse_date(string): 7 | '''Return a timestamp for the provided datestring, described by RFC 7231.''' 8 | parsed = email.utils.parsedate_tz(string) 9 | if parsed is None: 10 | raise ValueError("Invalid time.") 11 | parsed = list(parsed) 12 | # Default time zone is GMT/UTC 13 | parsed[9] = 0 if parsed[9] is None else parsed[9] 14 | return email.utils.mktime_tz(parsed) 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cachetools==3.0.0 2 | requests==2.10.0 3 | six==1.10.0 4 | python-dateutil==2.5.3 5 | Cython==0.29.14 6 | mock==4.0.1 7 | requests_mock==1.7.0 8 | nose==1.3.7 9 | colorama==0.4.3 10 | python-termstyle==0.1.10 11 | rednose==1.2.1 12 | -------------------------------------------------------------------------------- /scripts/vagrant/provision.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | set -e 4 | 5 | sudo apt-get update 6 | sudo apt-get install -y tar curl git 7 | 8 | # Libraries required to build a complete python with pyenv: 9 | # https://github.com/yyuu/pyenv/wiki 10 | sudo apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ 11 | libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev 12 | 13 | # Install pyenv 14 | if [ ! -d ~/.pyenv ]; then 15 | git clone https://github.com/yyuu/pyenv.git ~/.pyenv 16 | echo ' 17 | # Pyenv 18 | export PYENV_ROOT="$HOME/.pyenv" 19 | export PATH="$PYENV_ROOT/bin:$PATH" 20 | eval "$(pyenv init -)" 21 | ' >> ~/.bash_profile 22 | source ~/.bash_profile 23 | hash 24 | fi 25 | 26 | pushd /vagrant 27 | 28 | # Submodules 29 | git submodule update --init --recursive 30 | 31 | # Install our python version 32 | pyenv install --skip-existing 33 | pyenv rehash 34 | 35 | # Install a virtualenv 36 | pip install virtualenv 37 | if [ ! 
-d venv ]; then 38 | virtualenv venv 39 | fi 40 | source venv/bin/activate 41 | 42 | # Lastly, our dependencies 43 | pip install -r requirements.txt 44 | pip install -r dev-requirements.txt 45 | 46 | echo ' 47 | cd /vagrant 48 | # Activate virtualenv 49 | . /vagrant/venv/bin/activate 50 | ' >> ~/.bash_profile 51 | popd 52 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [nosetests] 5 | verbosity=2 6 | rednose=1 7 | exe=1 8 | cover-package=reppy 9 | cover-branches=1 10 | cover-min-percentage=100 11 | cover-inclusive=1 12 | cover-erase=1 13 | logging-clear-handlers=1 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2011-2017 SEOmoz, Inc. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | from setuptools import setup 25 | from setuptools.extension import Extension 26 | 27 | ext_files = [ 28 | 'reppy/rep-cpp/src/agent.cpp', 29 | 'reppy/rep-cpp/src/directive.cpp', 30 | 'reppy/rep-cpp/src/robots.cpp', 31 | 'reppy/rep-cpp/deps/url-cpp/src/url.cpp', 32 | 'reppy/rep-cpp/deps/url-cpp/src/utf8.cpp', 33 | 'reppy/rep-cpp/deps/url-cpp/src/punycode.cpp', 34 | 'reppy/rep-cpp/deps/url-cpp/src/psl.cpp' 35 | ] 36 | 37 | kwargs = {} 38 | 39 | try: 40 | from Cython.Distutils import build_ext 41 | print('Building from Cython') 42 | ext_files.append('reppy/robots.pyx') 43 | kwargs['cmdclass'] = {'build_ext': build_ext} 44 | except ImportError: 45 | print('Building from C++') 46 | ext_files.append('reppy/robots.cpp') 47 | 48 | ext_modules = [ 49 | Extension( 50 | 'reppy.robots', ext_files, 51 | language='c++', 52 | extra_compile_args=['-std=c++11'], 53 | include_dirs=[ 54 | 'reppy/rep-cpp/include', 55 | 'reppy/rep-cpp/deps/url-cpp/include']) 56 | ] 57 | 58 | setup( 59 | name='reppy', 60 | version='0.4.16', 61 | description='Replacement robots.txt Parser', 62 | long_description='''Replaces the built-in robotsparser with a 63 | RFC-conformant implementation that supports modern robots.txt constructs like 64 | Sitemaps, Allow, and Crawl-delay. 
Main features: 65 | 66 | - Memoization of fetched robots.txt 67 | - Expiration taken from the `Expires` header 68 | - Batch queries 69 | - Configurable user agent for fetching robots.txt 70 | - Automatic refetching based on expiration 71 | ''', 72 | maintainer='Moz, Inc.', 73 | maintainer_email='turbo@moz.com', 74 | url='http://github.com/seomoz/reppy', 75 | license='MIT', 76 | platforms='Posix; MacOS X', 77 | ext_modules=ext_modules, 78 | packages=[ 79 | 'reppy', 80 | 'reppy.cache' 81 | ], 82 | package_dir={ 83 | 'reppy': 'reppy', 84 | 'reppy.cache': 'reppy/cache' 85 | }, 86 | install_requires=[ 87 | 'cachetools', 88 | 'python-dateutil>=1.5, !=2.0', 89 | 'requests', 90 | 'six' 91 | ], 92 | classifiers=[ 93 | 'License :: OSI Approved :: MIT License', 94 | 'Development Status :: 5 - Production/Stable', 95 | 'Environment :: Web Environment', 96 | 'Intended Audience :: Developers', 97 | 'Topic :: Internet :: WWW/HTTP', 98 | 'Programming Language :: Python :: 2.7', 99 | 'Programming Language :: Python :: 3', 100 | 'Programming Language :: Python :: 3.3', 101 | 'Programming Language :: Python :: 3.4', 102 | 'Programming Language :: Python :: 3.5', 103 | 'Programming Language :: Python :: 3.6', 104 | 'Programming Language :: Python :: 3.7' 105 | ], 106 | **kwargs 107 | ) 108 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seomoz/reppy/e92f22e0b6153772bb66cd41cde06a5a7ec04d66/tests/__init__.py -------------------------------------------------------------------------------- /tests/asis/test_after_parse_hook/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: /disallowed 6 | Allow: /allowed 7 | -------------------------------------------------------------------------------- /tests/asis/test_after_response_hook/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: /disallowed 6 | Allow: /allowed 7 | -------------------------------------------------------------------------------- /tests/asis/test_agent_allowed/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: /disallowed 6 | Allow: /allowed 7 | -------------------------------------------------------------------------------- /tests/asis/test_caches_agent/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: /disallowed 6 | Allow: /allowed 7 | -------------------------------------------------------------------------------- /tests/asis/test_caches_robots/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: /disallowed 6 | Allow: /allowed 7 | -------------------------------------------------------------------------------- /tests/asis/test_content_too_big/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: / 6 | 
-------------------------------------------------------------------------------- /tests/asis/test_excessive_redirects/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.1 301 Moved Permanently 2 | Location: http://localhost:8080/robots.txt 3 | Content-Type: text/plain 4 | -------------------------------------------------------------------------------- /tests/asis/test_fetch_status_200/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: / 6 | -------------------------------------------------------------------------------- /tests/asis/test_fetch_status_401/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 401 Unauthorized 2 | Content-Type: text/plain 3 | -------------------------------------------------------------------------------- /tests/asis/test_fetch_status_403/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 403 Forbidden 2 | Content-Type: text/plain 3 | -------------------------------------------------------------------------------- /tests/asis/test_fetch_status_4XX/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 404 Not Found 2 | Content-Type: text/plain 3 | -------------------------------------------------------------------------------- /tests/asis/test_fetch_status_5XX/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 500 Internal Server Error 2 | Content-Type: text/plain 3 | -------------------------------------------------------------------------------- /tests/asis/test_returns_a_robots_object/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: / 6 | -------------------------------------------------------------------------------- /tests/asis/test_returns_an_agent_object/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: / 6 | -------------------------------------------------------------------------------- /tests/asis/test_robots_allowed/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: /disallowed 6 | Allow: /allowed 7 | -------------------------------------------------------------------------------- /tests/asis/test_ssl_exception/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Length: 23 3 | Content-Type: text/plain 4 | 5 | User-Agent: * 6 | Allow: / 7 | -------------------------------------------------------------------------------- /tests/test_agent.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from reppy.robots import Agent, Robots 4 | 5 | 6 | class AgentTest(unittest.TestCase): 7 | '''Tests about the Agent.''' 8 | 9 | def parse(self, content, name): 10 | '''Parse the robots.txt in content and return the agent of the provided name.''' 11 | return Robots.parse('http://example.com', content).agent(name) 12 | 13 | def test_length(self): 14 | '''An agent knows how 
many directives it has.''' 15 | agent = Agent().disallow('/path').allow('/path/') 16 | self.assertEqual(len(agent), 2) 17 | 18 | def test_make_allowed(self): 19 | '''Make an agent that allows a path.''' 20 | agent = Agent().disallow('/path').allow('/path/') 21 | self.assertTrue(agent.allowed('/path/')) 22 | self.assertFalse(agent.allowed('/path')) 23 | 24 | def test_make_disallowed(self): 25 | '''Make an agent that disallows a path.''' 26 | agent = Agent().disallow('/path') 27 | self.assertFalse(agent.allowed('/path')) 28 | 29 | def test_checks_allowed(self): 30 | '''Answers the allowed question.''' 31 | agent = self.parse(''' 32 | User-agent: agent 33 | Allow: /path 34 | ''', 'agent') 35 | self.assertTrue(agent.allowed('/path')) 36 | self.assertTrue(agent.allowed('/elsewhere')) 37 | 38 | def test_honors_longest_first_priority(self): 39 | '''The longest matching rule takes priority.''' 40 | agent = self.parse(''' 41 | User-agent: agent 42 | Disallow: /path 43 | Allow: /path/exception 44 | ''', 'agent') 45 | self.assertTrue(agent.allowed('/path/exception')) 46 | self.assertFalse(agent.allowed('/path')) 47 | 48 | def test_robots_txt_allowed(self): 49 | '''Robots.txt is always allowed.''' 50 | agent = self.parse(''' 51 | User-agent: agent 52 | Disallow: /robots.txt 53 | ''', 'agent') 54 | self.assertTrue(agent.allowed('/robots.txt')) 55 | 56 | def test_disallow_none(self): 57 | '''Recognizes the "Disallow:" form of "Allow: /"''' 58 | agent = self.parse(''' 59 | User-agent: agent 60 | Disallow: 61 | ''', 'agent') 62 | self.assertTrue(agent.allowed('/anything')) 63 | 64 | def test_escaped_rule(self): 65 | '''Handles an escaped rule.''' 66 | agent = self.parse(''' 67 | User-agent: agent 68 | Disallow: /a%3cd.html 69 | ''', 'agent') 70 | self.assertFalse(agent.allowed('/a