├── .coveragerc ├── .gitignore ├── .gitmodules ├── .python-version ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── Vagrantfile ├── bench.py ├── dev-requirements.txt ├── reppy ├── __init__.py ├── cache │ ├── __init__.py │ └── policy.py ├── exceptions.py ├── robots.cpp ├── robots.pxd ├── robots.pyx ├── ttl.py └── util.py ├── requirements.txt ├── scripts └── vagrant │ └── provision.sh ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── asis ├── test_after_parse_hook │ └── robots.txt ├── test_after_response_hook │ └── robots.txt ├── test_agent_allowed │ └── robots.txt ├── test_caches_agent │ └── robots.txt ├── test_caches_robots │ └── robots.txt ├── test_content_too_big │ └── robots.txt ├── test_excessive_redirects │ └── robots.txt ├── test_fetch_status_200 │ └── robots.txt ├── test_fetch_status_401 │ └── robots.txt ├── test_fetch_status_403 │ └── robots.txt ├── test_fetch_status_4XX │ └── robots.txt ├── test_fetch_status_5XX │ └── robots.txt ├── test_returns_a_robots_object │ └── robots.txt ├── test_returns_an_agent_object │ └── robots.txt ├── test_robots_allowed │ └── robots.txt └── test_ssl_exception │ └── robots.txt ├── test_agent.py ├── test_cache ├── __init__.py ├── test_cache.py └── test_policy.py ├── test_robots.py ├── test_ttl.py ├── test_util.py └── util.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | plugins = Cython.Coverage 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries and build artifacts 2 | *.pyc 3 | *.o 4 | *.so 5 | build/ 6 | dist/ 7 | reppy.egg-info/ 8 | MANIFEST 9 | 10 | # Dev artifacts 11 | venv* 12 | 13 | # Coverage 14 | .coverage 15 | 16 | # Vagrant 17 | .vagrant/ 18 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "reppy/rep-cpp"] 2 | path = reppy/rep-cpp 3 | url = https://github.com/seomoz/rep-cpp 4 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 2.7.15 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | matrix: 4 | include: 5 | - python: "2.7" 6 | dist: xenial 7 | sudo: false 8 | - python: "3.3" 9 | dist: trusty 10 | sudo: false 11 | - python: "3.4" 12 | dist: trusty 13 | sudo: false 14 | - python: "3.5" 15 | dist: xenial 16 | sudo: false 17 | - python: "3.6" 18 | dist: xenial 19 | sudo: false 20 | - python: "3.7" 21 | dist: xenial 22 | sudo: true 23 | 24 | install: pip install -r dev-requirements.txt 25 | 26 | script: make test 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 SEOmoz 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit 
persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include reppy/rep-cpp/deps/url-cpp/src/* 2 | include reppy/rep-cpp/deps/url-cpp/include/* 3 | include reppy/rep-cpp/src/* 4 | include reppy/rep-cpp/include/* 5 | include reppy/robots.pyx 6 | include reppy/robots.pxd 7 | include reppy/robots.cpp 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test 2 | test: reppy/robots.so 3 | nosetests --with-coverage tests 4 | 5 | reppy/%.so: reppy/%.py* reppy/rep-cpp/src/* reppy/rep-cpp/include/* reppy/rep-cpp/deps/url-cpp/include/* reppy/rep-cpp/deps/url-cpp/src/* 6 | python setup.py build_ext --inplace 7 | 8 | install: 9 | python setup.py install 10 | 11 | dev-requirements: 12 | pip freeze | grep -v -e reppy > dev-requirements.txt 13 | 14 | clean: 15 | rm -rf build dist *.egg-info reppy/*.so 16 | find . -name '*.pyc' | xargs --no-run-if-empty rm 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Robots Exclusion Protocol Parser for Python 2 | =========================================== 3 | 4 | [![Build Status](https://travis-ci.org/seomoz/reppy.svg?branch=master)](https://travis-ci.org/seomoz/reppy) 5 | 6 | `Robots.txt` parsing in Python. 
7 | 8 | Goals 9 | ===== 10 | 11 | - __Fetching__ -- helper utilities for fetching and parsing `robots.txt`s, including 12 | checking `cache-control` and `expires` headers 13 | - __Support for newer features__ -- like `Crawl-Delay` and `Sitemaps` 14 | - __Wildcard matching__ -- without using regexes, no less 15 | - __Performance__ -- with >100k parses per second, >1M URL checks per second once parsed 16 | - __Caching__ -- utilities to help with the caching of `robots.txt` responses 17 | 18 | Installation 19 | ============ 20 | `reppy` is available on `pypi`: 21 | 22 | ```bash 23 | pip install reppy 24 | ``` 25 | 26 | When installing from source, there are submodule dependencies that must also be fetched: 27 | 28 | ```bash 29 | git submodule update --init --recursive 30 | make install 31 | ``` 32 | 33 | Usage 34 | ===== 35 | 36 | Checking when pages are allowed 37 | ------------------------------- 38 | Two classes answer questions about whether a URL is allowed: `Robots` and 39 | `Agent`: 40 | 41 | ```python 42 | from reppy.robots import Robots 43 | 44 | # This utility uses `requests` to fetch the content 45 | robots = Robots.fetch('http://example.com/robots.txt') 46 | robots.allowed('http://example.com/some/path/', 'my-user-agent') 47 | 48 | # Get the rules for a specific agent 49 | agent = robots.agent('my-user-agent') 50 | agent.allowed('http://example.com/some/path/') 51 | ``` 52 | 53 | The `Robots` class also exposes properties `expired` and `ttl` to describe how 54 | long the response should be considered valid. A `reppy.ttl` policy is used to 55 | determine what that should be: 56 | 57 | ```python 58 | from reppy.ttl import HeaderWithDefaultPolicy 59 | 60 | # Use the `cache-control` or `expires` headers, defaulting to 30 minutes and 61 | # ensuring it's at least 10 minutes 62 | policy = HeaderWithDefaultPolicy(default=1800, minimum=600) 63 | 64 | robots = Robots.fetch('http://example.com/robots.txt', ttl_policy=policy) 65 | ``` 66 | 67 | Customizing fetch 68 | ----------------- 69 | The `fetch` method accepts `*args` and `**kwargs` that are passed on to `requests.get`, 70 | allowing you to customize the way the `fetch` is executed: 71 | 72 | ```python 73 | robots = Robots.fetch('http://example.com/robots.txt', headers={...}) 74 | ``` 75 | 76 | Matching Rules and Wildcards 77 | ---------------------------- 78 | Both `*` and `$` are supported for wildcard matching. 79 | 80 | This library follows the matching behavior that the [1996 RFC](http://www.robotstxt.org/norobots-rfc.txt) 81 | describes. In the case where multiple rules match a query, the longest rule wins, as 82 | it is presumed to be the most specific. 83 | 84 | Checking sitemaps 85 | ----------------- 86 | The `Robots` class also exposes the sitemaps listed in a `robots.txt`: 87 | 88 | ```python 89 | # This property holds a list of URL strings of all the sitemaps listed 90 | robots.sitemaps 91 | ``` 92 | 93 | Delay 94 | ----- 95 | The `Crawl-Delay` directive is per agent and can be accessed through that class. If 96 | none was specified, it's `None`: 97 | 98 | ```python 99 | # What's the delay my-user-agent should use 100 | robots.agent('my-user-agent').delay 101 | ``` 102 | 103 | Determining the `robots.txt` URL 104 | -------------------------------- 105 | Given a URL, there's a utility to determine the URL of the corresponding `robots.txt`. 106 | It preserves the scheme and hostname, as well as the port (if it's not the default port for the 107 | scheme).
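For instance, a hedged sketch of what that implies (the URLs are placeholders and the expected outputs are inferred from the description above, not quoted from the library's documentation):

```python
from reppy.robots import Robots

# The scheme and hostname are preserved; a default port (80 for http) is not
# included in the robots.txt URL. Expected output inferred from the prose above.
Robots.robots_url('http://example.com:80/some/path')
# expected: 'http://example.com/robots.txt'

# A non-default port should be kept (see also the example that follows).
Robots.robots_url('https://example.com:8443/some/path')
# expected: 'https://example.com:8443/robots.txt'
```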
108 | 109 | ```python 110 | # Get robots.txt URL for http://userinfo@example.com:8080/path;params?query#fragment 111 | # It's http://example.com:8080/robots.txt 112 | Robots.robots_url('http://userinfo@example.com:8080/path;params?query#fragment') 113 | ``` 114 | 115 | Caching 116 | ======= 117 | There are two cache classes provided -- `RobotsCache`, which caches entire `reppy.Robots` 118 | objects, and `AgentCache`, which only caches the `reppy.Agent` relevant to a client. These 119 | caches duck-type the class that they cache for the purposes of checking if a URL is 120 | allowed: 121 | 122 | ```python 123 | from reppy.cache import RobotsCache 124 | cache = RobotsCache(capacity=100) 125 | cache.allowed('http://example.com/foo/bar', 'my-user-agent') 126 | 127 | from reppy.cache import AgentCache 128 | cache = AgentCache(agent='my-user-agent', capacity=100) 129 | cache.allowed('http://example.com/foo/bar') 130 | ``` 131 | 132 | Like `reppy.Robots.fetch`, the cache constructors accept a `ttl_policy` to inform the 133 | expiration of the fetched `Robots` objects, as well as `*args` and `**kwargs` to be passed 134 | to `reppy.Robots.fetch`. 135 | 136 | Caching Failures 137 | ---------------- 138 | There's a piece of classic caching advice: "don't cache failures." However, this is not 139 | always appropriate. For example, if the failure is a timeout, 140 | clients may want to cache this result so that every check doesn't take a very long time. 141 | 142 | To this end, the `cache` module provides a notion of a cache policy. It determines what 143 | to do in the case of an exception. The default is to cache a form of a disallowed response 144 | for 10 minutes, but you can configure it as you see fit: 145 | 146 | ```python 147 | # Do not cache failures (note the `ttl=0`): 148 | from reppy.cache.policy import ReraiseExceptionPolicy 149 | cache = AgentCache('my-user-agent', cache_policy=ReraiseExceptionPolicy(ttl=0)) 150 | 151 | # Cache and reraise failures for 10 minutes (note the `ttl=600`): 152 | cache = AgentCache('my-user-agent', cache_policy=ReraiseExceptionPolicy(ttl=600)) 153 | 154 | # Treat failures as being disallowed (DefaultObjectPolicy lives in reppy.cache.policy, Agent in reppy.robots) 155 | cache = AgentCache( 156 | 'my-user-agent', 157 | cache_policy=DefaultObjectPolicy(ttl=600, factory=lambda _: Agent().disallow('/'))) 158 | ``` 159 | 160 | Development 161 | =========== 162 | A `Vagrantfile` is provided to bootstrap a development environment: 163 | 164 | ```bash 165 | vagrant up 166 | ``` 167 | 168 | Alternatively, development can be conducted using a `virtualenv`: 169 | 170 | ```bash 171 | virtualenv venv 172 | source venv/bin/activate 173 | pip install -r requirements.txt 174 | ``` 175 | 176 | Tests 177 | ===== 178 | Tests may be run in `vagrant`: 179 | 180 | ```bash 181 | make test 182 | ``` 183 | 184 | Development 185 | =========== 186 | 187 | Environment 188 | ----------- 189 | To launch the `vagrant` image, we only need to 190 | `vagrant up` (though you may have to provide a `--provider` flag): 191 | 192 | ```bash 193 | vagrant up 194 | ``` 195 | 196 | With a running `vagrant` instance, you can log in and run tests: 197 | 198 | ```bash 199 | vagrant ssh 200 | make test 201 | ``` 202 | 203 | Running Tests 204 | ------------- 205 | Tests are run with the top-level `Makefile`: 206 | 207 | ```bash 208 | make test 209 | ``` 210 | 211 | PRs 212 | === 213 | These are not all hard-and-fast rules, but in general PRs have the following expectations: 214 | 215 | - __pass Travis__ -- or more generally, whatever CI is used for the particular
project 216 | - __be a complete unit__ -- whether a bug fix or feature, it should appear as a complete 217 | unit before consideration. 218 | - __maintain code coverage__ -- some projects may include code coverage requirements as 219 | part of the build as well 220 | - __maintain the established style__ -- this means the existing style of established 221 | projects, the established conventions of the team for a given language on new 222 | projects, and the guidelines of the community of the relevant languages and 223 | frameworks. 224 | - __include failing tests__ -- in the case of bugs, failing tests demonstrating the bug 225 | should be included as one commit, followed by a commit making the test succeed. This 226 | allows us to jump to a world with a bug included, and prove that our test in fact 227 | exercises the bug. 228 | - __be reviewed by one or more developers__ -- not all feedback has to be accepted, but 229 | it should all be considered. 230 | - __avoid 'addressed PR feedback' commits__ -- in general, PR feedback should be rebased 231 | back into the appropriate commits that introduced the change. In cases where this 232 | is burdensome, PR feedback commits may be used but should still describe the changes 233 | contained therein. 234 | 235 | PR reviews consider the design, organization, and functionality of the submitted code. 236 | 237 | Commits 238 | ======= 239 | Certain types of changes should be made in their own commits to improve readability. When 240 | too many different types of changes happen simultaneously in a single commit, the purpose of 241 | each change is muddled. By giving each commit a single logical purpose, it is implicitly 242 | clear why changes in that commit took place. 243 | 244 | - __updating / upgrading dependencies__ -- this is especially true for invocations like 245 | `bundle update` or `berks update`. 246 | - __introducing a new dependency__ -- often preceded by a commit updating existing 247 | dependencies, this should only include the changes for the new dependency. 248 | - __refactoring__ -- these commits should preserve all the existing functionality and 249 | merely update how it's done. 250 | - __utility components to be used by a new feature__ -- if introducing an auxiliary class 251 | in support of a subsequent commit, add this new class (and its tests) in its own 252 | commit. 253 | - __config changes__ -- when adjusting configuration in isolation. 254 | - __formatting / whitespace commits__ -- when adjusting code only for stylistic purposes. 255 | 256 | New Features 257 | ------------ 258 | Small new features (where small refers to the size and complexity of the change, not the 259 | impact) are often introduced in a single commit. Larger features or components might be 260 | built up piecewise, with each commit containing a single part of it (and its corresponding 261 | tests). 262 | 263 | Bug Fixes 264 | --------- 265 | In general, bug fixes should come in two-commit pairs: a commit adding a failing test 266 | demonstrating the bug, and a commit making that failing test pass. 267 | 268 | Tagging and Versioning 269 | ====================== 270 | Whenever the version included in `setup.py` is changed (and it should be changed when 271 | appropriate using [http://semver.org/](http://semver.org/)), a corresponding tag should 272 | be created with the same version number (formatted `v<version>`, e.g. `v0.1.0`).
273 | 274 | ```bash 275 | git tag -a v0.1.0 -m 'Version 0.1.0 276 | 277 | This release contains an initial working version of the `crawl` and `parse` 278 | utilities.' 279 | 280 | git push --tags origin 281 | ``` 282 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # Encoding: utf-8 2 | # -*- mode: ruby -*- 3 | # vi: set ft=ruby : 4 | 5 | ENV['VAGRANT_DEFAULT_PROVIDER'] = 'virtualbox' 6 | 7 | # http://docs.vagrantup.com/v2/ 8 | Vagrant.configure('2') do |config| 9 | config.vm.box = 'ubuntu/trusty64' 10 | config.vm.hostname = 'reppy' 11 | config.ssh.forward_agent = true 12 | 13 | config.vm.provider :virtualbox do |vb| 14 | vb.customize ["modifyvm", :id, "--memory", "1024"] 15 | vb.customize ["modifyvm", :id, "--cpus", "2"] 16 | vb.customize ["modifyvm", :id, "--natdnshostresolver1", "on"] 17 | end 18 | 19 | config.vm.provision :shell, path: 'scripts/vagrant/provision.sh', privileged: false 20 | end 21 | -------------------------------------------------------------------------------- /bench.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | from __future__ import print_function 4 | 5 | from contextlib import contextmanager 6 | import sys 7 | import time 8 | 9 | from reppy.robots import Robots 10 | content = ''' 11 | # /robots.txt for http://www.fict.org/ 12 | # comments to webmaster@fict.org 13 | 14 | User-agent: unhipbot 15 | Disallow: / 16 | 17 | User-agent: webcrawler 18 | User-agent: excite 19 | Disallow: 20 | 21 | User-agent: * 22 | Disallow: /org/plans.html 23 | Allow: /org/ 24 | Allow: /serv 25 | Allow: /~mak 26 | Disallow: / 27 | ''' 28 | 29 | @contextmanager 30 | def timer(name, count): 31 | '''Time this block.''' 32 | start = time.time() 33 | try: 34 | yield count 35 | finally: 36 | duration = time.time() - start 37 | print(name) 38 | print('=' * 10) 39 | print('Total: %s' % duration) 40 | print(' Avg: %s' % (duration / count)) 41 | print(' Rate: %s' % (count / duration)) 42 | print('') 43 | 44 | 45 | with timer('Parse', 100000) as count: 46 | for _ in xrange(count): 47 | Robots.parse('http://example.com/robots.txt', content) 48 | 49 | 50 | parsed = Robots.parse('http://example.com/robots.txt', content) 51 | with timer('Evaluate', 100000) as count: 52 | for _ in xrange(count): 53 | parsed.allowed('/org/example.html', 'other-bot') 54 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | cachetools==2.0.0 2 | colorama==0.3.7 3 | coverage==4.0.3 4 | Cython==0.27.3 5 | funcsigs==1.0.2 6 | mock==2.0.0 7 | nose==1.3.7 8 | pbr==1.10.0 9 | python-dateutil==2.5.3 10 | python-termstyle==0.1.10 11 | rednose==1.1.1 12 | requests==2.10.0 13 | requests-mock==1.1.0 14 | six==1.10.0 15 | -------------------------------------------------------------------------------- /reppy/__init__.py: -------------------------------------------------------------------------------- 1 | '''Robots.txt parsing.''' 2 | 3 | import logging 4 | 5 | logger = logging.getLogger('reppy') 6 | handler = logging.StreamHandler() 7 | formatter = logging.Formatter( 8 | ' | '.join([ 9 | '[%(asctime)s]', 10 | 'PID %(process)d', 11 | '%(levelname)s in %(module)s:%(funcName)s@%(lineno)s => %(message)s' 12 | ])) 13 | handler.setLevel(logging.DEBUG) 14 | logger.addHandler(handler) 15 | 
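# Note: the handler accepts DEBUG-level records, but the logger itself is set to ERROR
# just below, so only errors are emitted by default; an application can raise verbosity
# with logging.getLogger('reppy').setLevel(logging.DEBUG) to surface fetch/cache detail.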
logger.setLevel(logging.ERROR) 16 | 17 | from .robots import Robots, Agent 18 | -------------------------------------------------------------------------------- /reppy/cache/__init__.py: -------------------------------------------------------------------------------- 1 | '''A robots.txt cache.''' 2 | 3 | from functools import partial 4 | import threading 5 | import time 6 | 7 | from cachetools import LRUCache 8 | 9 | from .policy import DefaultObjectPolicy, ReraiseExceptionPolicy 10 | from ..robots import Robots, AllowNone, Agent 11 | from .. import logger 12 | 13 | 14 | class ExpiringObject(object): 15 | '''Wrap an object that expires over time.''' 16 | 17 | def __init__(self, factory): 18 | self.factory = factory 19 | self.lock = threading.Lock() 20 | self.obj = None 21 | self.expires = 0 22 | self.exception = None 23 | 24 | def get(self): 25 | '''Get the wrapped object.''' 26 | if (self.obj is None) or (time.time() >= self.expires): 27 | with self.lock: 28 | self.expires, self.obj = self.factory() 29 | if isinstance(self.obj, BaseException): 30 | self.exception = self.obj 31 | else: 32 | self.exception = None 33 | 34 | if self.exception: 35 | raise self.exception 36 | else: 37 | return self.obj 38 | 39 | 40 | class BaseCache(object): 41 | '''A base cache class.''' 42 | 43 | DEFAULT_CACHE_POLICY = ReraiseExceptionPolicy(ttl=600) 44 | DEFAULT_TTL_POLICY = Robots.DEFAULT_TTL_POLICY 45 | 46 | def __init__(self, capacity, cache_policy=None, ttl_policy=None, *args, **kwargs): 47 | self.cache_policy = cache_policy or self.DEFAULT_CACHE_POLICY 48 | self.ttl_policy = ttl_policy or self.DEFAULT_TTL_POLICY 49 | self.cache = LRUCache(maxsize=capacity) 50 | self.args = args 51 | self.kwargs = kwargs 52 | 53 | def get(self, url): 54 | '''Get the entity that corresponds to URL.''' 55 | robots_url = Robots.robots_url(url) 56 | if robots_url not in self.cache: 57 | self.cache[robots_url] = ExpiringObject(partial(self.factory, robots_url)) 58 | return self.cache[robots_url].get() 59 | 60 | def factory(self, url): 61 | ''' 62 | Return (expiration, obj) corresponding to provided url, exercising the 63 | cache_policy as necessary. 
64 | ''' 65 | try: 66 | return self.fetch(url) 67 | except BaseException as exc: 68 | logger.exception('Reppy cache fetch error on %s' % url) 69 | return self.cache_policy.exception(url, exc) 70 | 71 | def fetch(self, url): 72 | '''Return (expiration, obj) corresponding to provided url.''' 73 | raise NotImplementedError('BaseCache does not implement fetch.') 74 | 75 | 76 | class RobotsCache(BaseCache): 77 | '''A cache of Robots objects.''' 78 | 79 | DEFAULT_CACHE_POLICY = DefaultObjectPolicy(ttl=600, factory=AllowNone) 80 | 81 | def allowed(self, url, agent): 82 | '''Return true if the provided URL is allowed to agent.''' 83 | return self.get(url).allowed(url, agent) 84 | 85 | def fetch(self, url): 86 | '''Return (expiration, Robots) for the robots.txt at the provided URL.''' 87 | robots = Robots.fetch( 88 | url, ttl_policy=self.ttl_policy, *self.args, **self.kwargs) 89 | return (robots.expires, robots) 90 | 91 | 92 | class AgentCache(BaseCache): 93 | '''A cache of Agent objects.''' 94 | 95 | DEFAULT_CACHE_POLICY = DefaultObjectPolicy( 96 | ttl=600, factory=lambda url: Agent().disallow('/')) 97 | 98 | def __init__(self, agent, *args, **kwargs): 99 | BaseCache.__init__(self, *args, **kwargs) 100 | self.agent = agent 101 | 102 | def allowed(self, url): 103 | '''Return true if the provided URL is allowed to self.agent.''' 104 | return self.get(url).allowed(url) 105 | 106 | def fetch(self, url): 107 | '''Return (expiration, Agent) for the robots.txt at the provided URL.''' 108 | robots = Robots.fetch( 109 | url, ttl_policy=self.ttl_policy, *self.args, **self.kwargs) 110 | return (robots.expires, robots.agent(self.agent)) 111 | -------------------------------------------------------------------------------- /reppy/cache/policy.py: -------------------------------------------------------------------------------- 1 | '''Policies for caching.''' 2 | 3 | import time 4 | 5 | 6 | class CachePolicyBase(object): 7 | '''Policy for caching.''' 8 | 9 | def exception(self, url, exception): 10 | '''What to return when there's an exception.''' 11 | raise NotImplementedError('CachePolicyBase does not implement exception.') 12 | 13 | 14 | class DefaultObjectPolicy(object): 15 | '''Return a default object on exception.''' 16 | 17 | def __init__(self, ttl, factory): 18 | self.ttl = ttl 19 | self.factory = factory 20 | 21 | def exception(self, url, exception): 22 | '''What to return when there's an exception.''' 23 | return (time.time() + self.ttl, self.factory(url)) 24 | 25 | 26 | class ReraiseExceptionPolicy(object): 27 | '''Reraise the exception.''' 28 | 29 | def __init__(self, ttl): 30 | self.ttl = ttl 31 | 32 | def exception(self, url, exception): 33 | '''What to return when there's an exception.''' 34 | return (time.time() + self.ttl, exception) 35 | -------------------------------------------------------------------------------- /reppy/exceptions.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # 3 | # Copyright (c) 2011 SEOmoz 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | '''All of our exceptions''' 25 | 26 | 27 | class ReppyException(Exception): 28 | '''Any internal exception''' 29 | pass 30 | 31 | class ContentTooLong(ReppyException): 32 | '''Robots.txt content is too long.''' 33 | pass 34 | 35 | class ServerError(ReppyException): 36 | '''When the remote server returns an error''' 37 | pass 38 | 39 | class SSLException(ReppyException): 40 | '''An SSL error.''' 41 | pass 42 | 43 | class ConnectionException(ReppyException): 44 | '''A connection error exception.''' 45 | pass 46 | 47 | class MalformedUrl(ReppyException): 48 | '''An exception for a missing or invalid url or schema.''' 49 | pass 50 | 51 | class ExcessiveRedirects(ReppyException): 52 | '''A TooManyRedirects error.''' 53 | pass 54 | 55 | class ReadTimeout(ReppyException): 56 | '''A ReadTimeout error from the HTTP library.''' 57 | pass 58 | 59 | class BadStatusCode(ReppyException): 60 | '''An exception for 5xx status codes.''' 61 | pass 62 | -------------------------------------------------------------------------------- /reppy/robots.pxd: -------------------------------------------------------------------------------- 1 | # Cython declarations 2 | 3 | from libcpp.string cimport string 4 | from libcpp.vector cimport vector 5 | from libcpp cimport bool 6 | 7 | cdef extern from "rep-cpp/include/directive.h" namespace "Rep": 8 | cpdef cppclass CppDirective "Rep::Directive": 9 | ctypedef size_t priority_t 10 | 11 | CppDirective(const string& line, bool allowed) 12 | CppDirective(const CppDirective& rhs) 13 | priority_t priority() const 14 | bool match(const string& path) const 15 | bool allowed() const 16 | string str() const 17 | 18 | cdef extern from "rep-cpp/include/agent.h" namespace "Rep": 19 | cpdef cppclass CppAgent "Rep::Agent": 20 | ctypedef float delay_t 21 | 22 | CppAgent() 23 | CppAgent(const string& host) 24 | CppAgent& allow(const string& query) 25 | CppAgent& disallow(const string& query) 26 | CppAgent& delay(delay_t delay) 27 | delay_t delay() const 28 | const vector[CppDirective]& directives() const 29 | bool allowed(const string& path) const 30 | string str() const 31 | 32 | cdef extern from "rep-cpp/include/robots.h" namespace "Rep": 33 | cpdef cppclass CppRobots "Rep::Robots": 34 | CppRobots(const string& content) except +ValueError 35 | CppRobots(const string& content, const string& 
base_url) except +ValueError 36 | const vector[string]& sitemaps() const 37 | CppAgent& agent(const string& name) const 38 | bool allowed(const string& path, const string& name) const 39 | string str() const 40 | @staticmethod 41 | string robotsUrl(const string& url) except +ValueError 42 | -------------------------------------------------------------------------------- /reppy/robots.pyx: -------------------------------------------------------------------------------- 1 | # cython: linetrace=True 2 | # distutils: define_macros=CYTHON_TRACE=1 3 | 4 | from contextlib import closing 5 | import time 6 | 7 | import requests 8 | from requests.exceptions import ( 9 | SSLError, 10 | ConnectionError, 11 | URLRequired, 12 | MissingSchema, 13 | InvalidSchema, 14 | InvalidURL, 15 | TooManyRedirects, 16 | ReadTimeout) 17 | import six 18 | 19 | from .ttl import HeaderWithDefaultPolicy 20 | from . import util, logger, exceptions 21 | 22 | cdef as_bytes(value): 23 | if isinstance(value, bytes): 24 | return value 25 | return value.encode('utf-8') 26 | 27 | # For contexts which require a 'str' type, convert bytes to unicode if needed 28 | # (i.e., Python 3). Note: could raise UnicodeDecodeError in Python 3 if input 29 | # is invalid UTF-8 30 | cdef as_string(value): 31 | if six.PY3: 32 | if isinstance(value, bytes): 33 | return value.decode('utf-8') 34 | return value 35 | 36 | 37 | def FromRobotsMethod(cls, Robots robots, const string& name): 38 | '''Construct an Agent from a CppAgent.''' 39 | agent = Agent() 40 | # This is somewhat inefficient due to the copying, but it is 41 | # required to be copied because we often toss the containing 42 | # Robots object as a temporary thus we'd leave the underlying 43 | # Agent object dangling without a full copy. 44 | agent.agent = robots.robots.agent(name) 45 | return agent 46 | 47 | cdef class Agent: 48 | '''Wrapper around rep-cpp's Rep::Agent class.''' 49 | 50 | cdef CppAgent agent 51 | 52 | from_robots = classmethod(FromRobotsMethod) 53 | 54 | def __str__(self): 55 | return as_string(self.agent.str()) 56 | 57 | def __len__(self): 58 | return self.agent.directives().size() 59 | 60 | @property 61 | def delay(self): 62 | '''The delay associated with this agent.''' 63 | cdef float value = self.agent.delay() 64 | if value > 0: 65 | return value 66 | return None 67 | 68 | def allow(self, path): 69 | '''Allow the provided path.''' 70 | self.agent.allow(as_bytes(path)) 71 | return self 72 | 73 | def disallow(self, path): 74 | '''Disallow the provided path.''' 75 | self.agent.disallow(as_bytes(path)) 76 | return self 77 | 78 | def allowed(self, path): 79 | '''Is the provided URL allowed?''' 80 | return self.agent.allowed(as_bytes(path)) 81 | 82 | 83 | def ParseMethod(cls, url, content, expires=None): 84 | '''Parse a robots.txt file.''' 85 | return cls(url, as_bytes(content), expires) 86 | 87 | def FetchMethod(cls, url, ttl_policy=None, max_size=1048576, *args, **kwargs): 88 | '''Get the robots.txt at the provided URL.''' 89 | after_response_hook = kwargs.pop('after_response_hook', None) 90 | after_parse_hook = kwargs.pop('after_parse_hook', None) 91 | def wrap_exception(etype, cause): 92 | wrapped = etype(cause) 93 | wrapped.url = url 94 | if after_response_hook is not None: 95 | after_response_hook(wrapped) 96 | raise wrapped 97 | try: 98 | # Limit the size of the request 99 | kwargs['stream'] = True 100 | with closing(requests.get(url, *args, **kwargs)) as res: 101 | content = res.raw.read(amt=max_size, decode_content=True) 102 | # Try to read an additional byte, to see 
if the response is too big 103 | if res.raw.read(amt=1, decode_content=True): 104 | raise exceptions.ContentTooLong( 105 | 'Content larger than %s bytes' % max_size) 106 | 107 | if after_response_hook is not None: 108 | after_response_hook(res) 109 | 110 | # Get the TTL policy's ruling on the ttl 111 | expires = (ttl_policy or cls.DEFAULT_TTL_POLICY).expires(res) 112 | 113 | if res.status_code == 200: 114 | robots = cls.parse(url, content, expires) 115 | if after_parse_hook is not None: 116 | after_parse_hook(robots) 117 | return robots 118 | elif res.status_code in (401, 403): 119 | return AllowNone(url, expires) 120 | elif res.status_code >= 400 and res.status_code < 500: 121 | return AllowAll(url, expires) 122 | else: 123 | raise exceptions.BadStatusCode( 124 | 'Got %i for %s' % (res.status_code, url), res.status_code) 125 | except SSLError as exc: 126 | wrap_exception(exceptions.SSLException, exc) 127 | except ConnectionError as exc: 128 | wrap_exception(exceptions.ConnectionException, exc) 129 | except (URLRequired, MissingSchema, InvalidSchema, InvalidURL) as exc: 130 | wrap_exception(exceptions.MalformedUrl, exc) 131 | except TooManyRedirects as exc: 132 | wrap_exception(exceptions.ExcessiveRedirects, exc) 133 | except ReadTimeout as exc: 134 | wrap_exception(exceptions.ReadTimeout, exc) 135 | 136 | def RobotsUrlMethod(cls, url): 137 | '''Get the robots.txt URL that corresponds to the provided one.''' 138 | return as_string(CppRobots.robotsUrl(as_bytes(url))) 139 | 140 | cdef class Robots: 141 | '''Wrapper around rep-cpp's Rep::Robots class.''' 142 | 143 | # The default TTL policy is to cache for 3600 seconds or what's provided in the 144 | # headers, and a minimum of 600 seconds 145 | DEFAULT_TTL_POLICY = HeaderWithDefaultPolicy(default=3600, minimum=600) 146 | 147 | # Class methods 148 | parse = classmethod(ParseMethod) 149 | fetch = classmethod(FetchMethod) 150 | robots_url = classmethod(RobotsUrlMethod) 151 | 152 | # Data members 153 | cdef CppRobots* robots 154 | cdef object expires 155 | 156 | def __init__(self, url, const string& content, expires=None): 157 | self.robots = new CppRobots(content, as_bytes(url)) 158 | self.expires = expires 159 | 160 | def __str__(self): 161 | # Note: this could raise a UnicodeDecodeError in Python 3 if the 162 | # robots.txt had invalid UTF-8 163 | return as_string(self.robots.str()) 164 | 165 | def __dealloc__(self): 166 | del self.robots 167 | 168 | @property 169 | def sitemaps(self): 170 | '''Get all the sitemaps in this robots.txt.''' 171 | return list(map(as_string, self.robots.sitemaps())) 172 | 173 | def allowed(self, path, name): 174 | '''Is the provided path allowed for the provided agent?''' 175 | return self.robots.allowed(as_bytes(path), as_bytes(name)) 176 | 177 | def agent(self, name): 178 | '''Return the Agent that corresponds to name. 179 | 180 | Note modifications to the returned Agent will not be reflected 181 | in this Robots object because it is a *copy*, not the original 182 | Agent object. 
183 | ''' 184 | return Agent.from_robots(self, as_bytes(name)) 185 | 186 | @property 187 | def expired(self): 188 | '''True if the current time is past its expiration.''' 189 | return time.time() > self.expires 190 | 191 | @property 192 | def expires(self): 193 | '''The expiration of this robots.txt.''' 194 | return self.expires 195 | 196 | @property 197 | def ttl(self): 198 | '''Remaining time for this response to be considered valid.''' 199 | return max(self.expires - time.time(), 0) 200 | 201 | 202 | cdef class AllowNone(Robots): 203 | '''No requests are allowed.''' 204 | 205 | def __init__(self, url, expires=None): 206 | Robots.__init__(self, url, b'User-agent: *\nDisallow: /', expires) 207 | 208 | 209 | cdef class AllowAll(Robots): 210 | '''All requests are allowed.''' 211 | 212 | def __init__(self, url, expires=None): 213 | Robots.__init__(self, url, b'', expires) 214 | -------------------------------------------------------------------------------- /reppy/ttl.py: -------------------------------------------------------------------------------- 1 | '''Policies for setting the TTL on Robots objects.''' 2 | 3 | import time 4 | 5 | from . import logger 6 | from .util import parse_date 7 | 8 | 9 | class TTLPolicyBase(object): 10 | '''Policy for setting the TTL on Robots objects.''' 11 | 12 | def ttl(self, response): 13 | '''Get the caching TTL for a response.''' 14 | raise NotImplementedError('TTLPolicyBase does not implement ttl.') 15 | 16 | def expires(self, response): 17 | '''Determine when a response should expire.''' 18 | return time.time() + self.ttl(response) 19 | 20 | 21 | class HeaderWithDefaultPolicy(TTLPolicyBase): 22 | '''TTL is based on headers, but falls back to a default, clamps to a minimum.''' 23 | 24 | def __init__(self, default, minimum): 25 | self.default = default 26 | self.minimum = minimum 27 | 28 | def ttl(self, response): 29 | '''Get the ttl from headers.''' 30 | # If max-age is specified in Cache-Control, use it and ignore any 31 | # Expires header, as per RFC2616 Sec. 13.2.4. 
32 | cache_control = response.headers.get('cache-control') 33 | if cache_control is not None: 34 | for directive in cache_control.split(','): 35 | name, _, value = directive.lower().partition('=') 36 | name = name.strip() 37 | if name in ('no-store', 'must-revalidate', 'no-cache'): 38 | return max(self.minimum, 0) 39 | elif name in ('s-maxage', 'max-age'): 40 | try: 41 | return max(self.minimum, int(value.strip())) 42 | except ValueError: 43 | logger.warn( 44 | 'Could not parse %s=%s', name, value, exc_info=1) 45 | 46 | # Check the Expires header 47 | expires = response.headers.get('expires') 48 | if expires is not None: 49 | # Evaluate the expiration relative to the server-provided date 50 | date = response.headers.get('date') 51 | if date is not None: 52 | try: 53 | date = parse_date(date) 54 | except ValueError: 55 | logger.warn( 56 | 'Could not parse date string %s', date, exc_info=1) 57 | date = time.time() 58 | else: 59 | date = time.time() 60 | 61 | try: 62 | return max(self.minimum, parse_date(expires) - date) 63 | except ValueError: 64 | logger.warn( 65 | 'Could not parse date string %s', expires, exc_info=1) 66 | 67 | return self.default 68 | -------------------------------------------------------------------------------- /reppy/util.py: -------------------------------------------------------------------------------- 1 | '''Utility functions.''' 2 | 3 | import email 4 | 5 | 6 | def parse_date(string): 7 | '''Return a timestamp for the provided datestring, described by RFC 7231.''' 8 | parsed = email.utils.parsedate_tz(string) 9 | if parsed is None: 10 | raise ValueError("Invalid time.") 11 | parsed = list(parsed) 12 | # Default time zone is GMT/UTC 13 | parsed[9] = 0 if parsed[9] is None else parsed[9] 14 | return email.utils.mktime_tz(parsed) 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cachetools==3.0.0 2 | requests==2.10.0 3 | six==1.10.0 4 | python-dateutil==2.5.3 5 | Cython==0.29.14 6 | mock==4.0.1 7 | requests_mock==1.7.0 8 | nose==1.3.7 9 | colorama==0.4.3 10 | python-termstyle==0.1.10 11 | rednose==1.2.1 12 | -------------------------------------------------------------------------------- /scripts/vagrant/provision.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | set -e 4 | 5 | sudo apt-get update 6 | sudo apt-get install -y tar curl git 7 | 8 | # Libraries required to build a complete python with pyenv: 9 | # https://github.com/yyuu/pyenv/wiki 10 | sudo apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ 11 | libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev 12 | 13 | # Install pyenv 14 | if [ ! -d ~/.pyenv ]; then 15 | git clone https://github.com/yyuu/pyenv.git ~/.pyenv 16 | echo ' 17 | # Pyenv 18 | export PYENV_ROOT="$HOME/.pyenv" 19 | export PATH="$PYENV_ROOT/bin:$PATH" 20 | eval "$(pyenv init -)" 21 | ' >> ~/.bash_profile 22 | source ~/.bash_profile 23 | hash 24 | fi 25 | 26 | pushd /vagrant 27 | 28 | # Submodules 29 | git submodule update --init --recursive 30 | 31 | # Install our python version 32 | pyenv install --skip-existing 33 | pyenv rehash 34 | 35 | # Install a virtualenv 36 | pip install virtualenv 37 | if [ ! 
-d venv ]; then 38 | virtualenv venv 39 | fi 40 | source venv/bin/activate 41 | 42 | # Lastly, our dependencies 43 | pip install -r requirements.txt 44 | pip install -r dev-requirements.txt 45 | 46 | echo ' 47 | cd /vagrant 48 | # Activate virtualenv 49 | . /vagrant/venv/bin/activate 50 | ' >> ~/.bash_profile 51 | popd 52 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [nosetests] 5 | verbosity=2 6 | rednose=1 7 | exe=1 8 | cover-package=reppy 9 | cover-branches=1 10 | cover-min-percentage=100 11 | cover-inclusive=1 12 | cover-erase=1 13 | logging-clear-handlers=1 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2011-2017 SEOmoz, Inc. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | from setuptools import setup 25 | from setuptools.extension import Extension 26 | 27 | ext_files = [ 28 | 'reppy/rep-cpp/src/agent.cpp', 29 | 'reppy/rep-cpp/src/directive.cpp', 30 | 'reppy/rep-cpp/src/robots.cpp', 31 | 'reppy/rep-cpp/deps/url-cpp/src/url.cpp', 32 | 'reppy/rep-cpp/deps/url-cpp/src/utf8.cpp', 33 | 'reppy/rep-cpp/deps/url-cpp/src/punycode.cpp', 34 | 'reppy/rep-cpp/deps/url-cpp/src/psl.cpp' 35 | ] 36 | 37 | kwargs = {} 38 | 39 | try: 40 | from Cython.Distutils import build_ext 41 | print('Building from Cython') 42 | ext_files.append('reppy/robots.pyx') 43 | kwargs['cmdclass'] = {'build_ext': build_ext} 44 | except ImportError: 45 | print('Building from C++') 46 | ext_files.append('reppy/robots.cpp') 47 | 48 | ext_modules = [ 49 | Extension( 50 | 'reppy.robots', ext_files, 51 | language='c++', 52 | extra_compile_args=['-std=c++11'], 53 | include_dirs=[ 54 | 'reppy/rep-cpp/include', 55 | 'reppy/rep-cpp/deps/url-cpp/include']) 56 | ] 57 | 58 | setup( 59 | name='reppy', 60 | version='0.4.16', 61 | description='Replacement robots.txt Parser', 62 | long_description='''Replaces the built-in robotsparser with a 63 | RFC-conformant implementation that supports modern robots.txt constructs like 64 | Sitemaps, Allow, and Crawl-delay. 
Main features: 65 | 66 | - Memoization of fetched robots.txt 67 | - Expiration taken from the `Expires` header 68 | - Batch queries 69 | - Configurable user agent for fetching robots.txt 70 | - Automatic refetching based on expiration 71 | ''', 72 | maintainer='Moz, Inc.', 73 | maintainer_email='turbo@moz.com', 74 | url='http://github.com/seomoz/reppy', 75 | license='MIT', 76 | platforms='Posix; MacOS X', 77 | ext_modules=ext_modules, 78 | packages=[ 79 | 'reppy', 80 | 'reppy.cache' 81 | ], 82 | package_dir={ 83 | 'reppy': 'reppy', 84 | 'reppy.cache': 'reppy/cache' 85 | }, 86 | install_requires=[ 87 | 'cachetools', 88 | 'python-dateutil>=1.5, !=2.0', 89 | 'requests', 90 | 'six' 91 | ], 92 | classifiers=[ 93 | 'License :: OSI Approved :: MIT License', 94 | 'Development Status :: 5 - Production/Stable', 95 | 'Environment :: Web Environment', 96 | 'Intended Audience :: Developers', 97 | 'Topic :: Internet :: WWW/HTTP', 98 | 'Programming Language :: Python :: 2.7', 99 | 'Programming Language :: Python :: 3', 100 | 'Programming Language :: Python :: 3.3', 101 | 'Programming Language :: Python :: 3.4', 102 | 'Programming Language :: Python :: 3.5', 103 | 'Programming Language :: Python :: 3.6', 104 | 'Programming Language :: Python :: 3.7' 105 | ], 106 | **kwargs 107 | ) 108 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seomoz/reppy/e92f22e0b6153772bb66cd41cde06a5a7ec04d66/tests/__init__.py -------------------------------------------------------------------------------- /tests/asis/test_after_parse_hook/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: /disallowed 6 | Allow: /allowed 7 | -------------------------------------------------------------------------------- /tests/asis/test_after_response_hook/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: /disallowed 6 | Allow: /allowed 7 | -------------------------------------------------------------------------------- /tests/asis/test_agent_allowed/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: /disallowed 6 | Allow: /allowed 7 | -------------------------------------------------------------------------------- /tests/asis/test_caches_agent/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: /disallowed 6 | Allow: /allowed 7 | -------------------------------------------------------------------------------- /tests/asis/test_caches_robots/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: /disallowed 6 | Allow: /allowed 7 | -------------------------------------------------------------------------------- /tests/asis/test_content_too_big/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: / 6 | 
-------------------------------------------------------------------------------- /tests/asis/test_excessive_redirects/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.1 301 Moved Permanently 2 | Location: http://localhost:8080/robots.txt 3 | Content-Type: text/plain 4 | -------------------------------------------------------------------------------- /tests/asis/test_fetch_status_200/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: / 6 | -------------------------------------------------------------------------------- /tests/asis/test_fetch_status_401/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 401 Unauthorized 2 | Content-Type: text/plain 3 | -------------------------------------------------------------------------------- /tests/asis/test_fetch_status_403/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 403 Forbidden 2 | Content-Type: text/plain 3 | -------------------------------------------------------------------------------- /tests/asis/test_fetch_status_4XX/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 404 Not Found 2 | Content-Type: text/plain 3 | -------------------------------------------------------------------------------- /tests/asis/test_fetch_status_5XX/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 500 Internal Server Error 2 | Content-Type: text/plain 3 | -------------------------------------------------------------------------------- /tests/asis/test_returns_a_robots_object/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: / 6 | -------------------------------------------------------------------------------- /tests/asis/test_returns_an_agent_object/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: / 6 | -------------------------------------------------------------------------------- /tests/asis/test_robots_allowed/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Type: text/plain 3 | 4 | User-Agent: * 5 | Disallow: /disallowed 6 | Allow: /allowed 7 | -------------------------------------------------------------------------------- /tests/asis/test_ssl_exception/robots.txt: -------------------------------------------------------------------------------- 1 | HTTP/1.0 200 OK 2 | Content-Length: 23 3 | Content-Type: text/plain 4 | 5 | User-Agent: * 6 | Allow: / 7 | -------------------------------------------------------------------------------- /tests/test_agent.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from reppy.robots import Agent, Robots 4 | 5 | 6 | class AgentTest(unittest.TestCase): 7 | '''Tests about the Agent.''' 8 | 9 | def parse(self, content, name): 10 | '''Parse the robots.txt in content and return the agent of the provided name.''' 11 | return Robots.parse('http://example.com', content).agent(name) 12 | 13 | def test_length(self): 14 | '''An agent knows how 
many directives it has.''' 15 | agent = Agent().disallow('/path').allow('/path/') 16 | self.assertEqual(len(agent), 2) 17 | 18 | def test_make_allowed(self): 19 | '''Make an agent that allows a path.''' 20 | agent = Agent().disallow('/path').allow('/path/') 21 | self.assertTrue(agent.allowed('/path/')) 22 | self.assertFalse(agent.allowed('/path')) 23 | 24 | def test_make_disallowed(self): 25 | '''Make an agent that disallows a path.''' 26 | agent = Agent().disallow('/path') 27 | self.assertFalse(agent.allowed('/path')) 28 | 29 | def test_checks_allowed(self): 30 | '''Answers the allowed question.''' 31 | agent = self.parse(''' 32 | User-agent: agent 33 | Allow: /path 34 | ''', 'agent') 35 | self.assertTrue(agent.allowed('/path')) 36 | self.assertTrue(agent.allowed('/elsewhere')) 37 | 38 | def test_honors_longest_first_priority(self): 39 | '''The longest matching rule takes priority.''' 40 | agent = self.parse(''' 41 | User-agent: agent 42 | Disallow: /path 43 | Allow: /path/exception 44 | ''', 'agent') 45 | self.assertTrue(agent.allowed('/path/exception')) 46 | self.assertFalse(agent.allowed('/path')) 47 | 48 | def test_robots_txt_allowed(self): 49 | '''Robots.txt is always allowed.''' 50 | agent = self.parse(''' 51 | User-agent: agent 52 | Disallow: /robots.txt 53 | ''', 'agent') 54 | self.assertTrue(agent.allowed('/robots.txt')) 55 | 56 | def test_disallow_none(self): 57 | '''Recognizes the "Disallow:" form of "Allow: /"''' 58 | agent = self.parse(''' 59 | User-agent: agent 60 | Disallow: 61 | ''', 'agent') 62 | self.assertTrue(agent.allowed('/anything')) 63 | 64 | def test_escaped_rule(self): 65 | '''Handles an escaped rule.''' 66 | agent = self.parse(''' 67 | User-agent: agent 68 | Disallow: /a%3cd.html 69 | ''', 'agent') 70 | self.assertFalse(agent.allowed('/a