├── .dockerignore ├── .github ├── dependabot.yml └── workflows │ └── tests.yml ├── .gitignore ├── .pylintrc ├── Dockerfile ├── LICENSE.md ├── Makefile ├── README.md ├── examples ├── issue_209.py ├── merge_group.py ├── merge_group.txt ├── mistake2.txt ├── robots.txt ├── robots_308278.txt ├── robots_541230.txt ├── robots_file.py ├── robots_file_large.py ├── robots_file_large.txt ├── robots_multiple_agents.py ├── robots_multiple_agents.txt ├── robots_string.py └── robots_url.py ├── make.bat ├── requirements.txt ├── robots ├── __init__.py ├── __main__.py ├── parser.py └── robotparser.py ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── core.py ├── test_google.py ├── test_google_correctness.py ├── test_google_stress.py ├── test_network.py ├── test_parser.py ├── test_robotparser.py └── test_robots.py /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !README.md 3 | !LICENSE.md -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 10 8 | ignore: 9 | - dependency-name: twine 10 | versions: 11 | - 3.4.0 12 | - 3.4.1 13 | - dependency-name: tqdm 14 | versions: 15 | - 4.56.1 16 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Test RobotsPy with Pytest 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Display Python version 20 | run: python -c "import sys; print(sys.version)" 21 | - name: Install Dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install -r requirements.txt 25 | - name: Execute pytest 26 | run: pytest tests -vv 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv* 2 | .idea/ 3 | .vscode/ 4 | tmp/ 5 | dist/ 6 | build/ 7 | __pycache__/ 8 | .mypy_cache/ 9 | .pytest_cache/ 10 | *.py[cod] 11 | robotspy.egg-info 12 | *.bak -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [DESIGN] 2 | max-attributes=12 3 | good-names=f,m,T -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | FROM python:3.12.5-alpine3.20 4 | 5 | ENV ROBOTSPY_VERSION=0.9.0 \ 6 | maintainer="andre.burgaud@gmail.com" 7 | 8 | LABEL robotspy.version=$ROBOTSPY_VERSION 9 | LABEL python.version=$PYTHON_VERSION 10 | 11 | RUN pip install --no-cache-dir --upgrade pip && \ 12 | pip install --no-cache-dir robotspy==$ROBOTSPY_VERSION 13 | 14 | ENTRYPOINT ["robots"] 15 | 16 | CMD ["--help"] -------------------------------------------------------------------------------- /LICENSE.md: 
-------------------------------------------------------------------------------- 1 | Copyright 2020 Andre Burgaud 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := help 2 | PROJECT := robotspy 3 | VERSION := $(shell echo `grep __version__ robots/__init__.py | cut -d '"' -f 2`) 4 | 5 | # twine installed globally 6 | check: 7 | twine check dist/* 8 | 9 | clean: 10 | find . -name '*.pyc' -delete || true 11 | find . -name '__pycache__' -type d | xargs rm -rf || true 12 | find . -name '.pytest_cache' -type d | xargs rm -rf || true 13 | rm *.bak || true 14 | rm -rf .cache build dist robotspy.egg-info || true 15 | 16 | # twine installed globally 17 | deploy: 18 | twine upload dist/* 19 | 20 | difflib: SHELL:=/bin/bash 21 | difflib: 22 | diff -w <(pip freeze) <(cat requirements.txt) 23 | 24 | 25 | dist: version test clean wheel check 26 | 27 | docker: 28 | docker build -t 'andreburgaud/${PROJECT}:${VERSION}' . 
29 | 30 | docker-scout: docker 31 | docker scout cves 'andreburgaud/${PROJECT}:${VERSION}' 32 | 33 | docker-deploy: docker-scout 34 | docker push 'docker.io/andreburgaud/${PROJECT}:${VERSION}' 35 | docker tag 'andreburgaud/${PROJECT}:${VERSION}' 'docker.io/andreburgaud/${PROJECT}:latest' 36 | docker push 'docker.io/andreburgaud/${PROJECT}:latest' 37 | 38 | # black installed globally 39 | fmt: 40 | black robots 41 | 42 | help: 43 | @echo 'Makefile for RobotsPy (Python robots.txt parser)' 44 | @echo 45 | @echo 'Usage:' 46 | @echo ' make check Check the wheel' 47 | @echo ' make clean Delete temp files (*.pyc), caches (__pycache__)' 48 | @echo ' make deploy Deploy package to the Cheese Shop (PyPI)' 49 | @echo ' make difflib Identify differences between the installed libraries and the requirements.txt file' 50 | @echo ' make dist Clean, generate the distribution and check' 51 | @echo ' make docker Build a Docker image using the Dockerfile at the root of the repo' 52 | @echo ' make docker-scout Validate the image against CVEs (requires docker scout to be installed on the build system)' 53 | @echo ' make docker-deploy Push the Docker image to Docker Hub (requires a Docker Hub account)' 54 | @echo ' make fmt Format Python files using Black (installed globally)' 55 | @echo ' make freeze Update the requirements.txt excluding the local package (robotspy)' 56 | @echo ' make help Display this help message' 57 | @echo ' make lint Lint Python files using Pylint (installed globally)' 58 | @echo ' make test Execute tests' 59 | @echo ' make type Type checking using Mypy (installed globally)' 60 | @echo ' make version Display current package version' 61 | @echo ' make wheel Build the wheel' 62 | 63 | # pylint installed globally 64 | lint: 65 | pylint robots 66 | 67 | tag: 68 | git push 69 | git tag -a ${VERSION} -m 'Version ${VERSION}' 70 | git push --tags 71 | 72 | test: 73 | pytest tests -vv 74 | 75 | # mypy installed globally 76 | type: 77 | mypy --check-untyped-defs robots 78 | 79 | version: 80 | @echo 'robots version: ${VERSION}' 81 | @perl -pi.bak -e 's/version="(\d+\.\d+\.\d+.*)"/version="${VERSION}"/' setup.py 82 | 83 | wheel: 84 | python setup.py sdist bdist_wheel 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Robots Exclusion Standard Parser for Python 2 | 3 | The `robotspy` Python module implements a parser for `robots.txt` files. The recommended class to use is 4 | `robots.RobotsParser`. 5 | 6 | A thin facade `robots.RobotFileParser` can also be used as a substitute for [`urllib.robotparser.RobotFileParser`](https://docs.python.org/3/library/urllib.robotparser.html), 7 | available in the Python standard library. The class `robots.RobotFileParser` exposes an API that is mostly compatible 8 | with `urllib.robotparser.RobotFileParser`. 9 | 10 | The main reasons for this rewrite are the following: 11 | 12 | 1. It was initially intended to experiment with parsing `robots.txt` files for a link checker project (not implemented yet). 13 | 1. It (mostly) follows the specs from the [RFC 9309 - Robots Exclusion Protocol](https://www.rfc-editor.org/rfc/rfc9309). 14 | 1. It does not try to be compliant with commonly accepted directives that are not in the current specs, such as `request-rate` 15 | and `crawl-delay`, but it currently supports `sitemaps`. 16 | 1.
It satisfies the same tests as the [Google Robots.txt Parser](https://github.com/google/robotstxt), except for some custom behaviors specific to Google Robots. 17 | 18 | To use the `robots` command line tool (CLI) in a Docker container, read the following section, **Docker Image**. 19 | 20 | To install `robotspy` globally as a tool on your system with `pipx`, skip to the **Global Installation** section. 21 | 22 | If you are interested in using `robotspy` in a local Python environment or as a library, skip to the **Module Installation** section. 23 | 24 | ## Docker Image 25 | 26 | The Robotspy CLI, `robots`, is available as a [Docker](https://www.docker.com/) automated build image at https://hub.docker.com/r/andreburgaud/robotspy. 27 | 28 | If you already have [Docker](https://docs.docker.com/get-docker/) installed on your machine, first pull the image from Docker Hub: 29 | 30 | ``` 31 | $ docker pull andreburgaud/robotspy 32 | ``` 33 | 34 | Then, you can exercise the tool against the following remote Python `robots.txt` test file located at http://www.pythontest.net/elsewhere/robots.txt: 35 | 36 | ``` 37 | # Used by NetworkTestCase in Lib/test/test_robotparser.py 38 | 39 | User-agent: Nutch 40 | Disallow: / 41 | Allow: /brian/ 42 | 43 | User-agent: * 44 | Disallow: /webstats/ 45 | ``` 46 | 47 | The following examples demonstrate how to use the `robots` command line with the Docker container: 48 | 49 | ``` 50 | $ # Example 1: User agent "Johnny" is allowed to access path "/" 51 | $ docker run --rm andreburgaud/robotspy http://www.pythontest.net/elsewhere/robots.txt Johnny / 52 | user-agent 'Johnny' with path '/': ALLOWED 53 | ``` 54 | 55 | ``` 56 | $ # Example 2: User agent "Nutch" is not allowed to access path "/brian" 57 | $ docker run --rm andreburgaud/robotspy http://www.pythontest.net/elsewhere/robots.txt Nutch /brian 58 | user-agent 'Nutch' with path '/brian': DISALLOWED 59 | ``` 60 | 61 | ``` 62 | $ # Example 3: User agent "Johnny" is not allowed to access path "/webstats/" 63 | $ docker run --rm andreburgaud/robotspy http://www.pythontest.net/elsewhere/robots.txt Johnny /webstats/ 64 | user-agent 'Johnny' with path '/webstats/': DISALLOWED 65 | ``` 66 | 67 | The arguments are the following: 68 | 69 | 1. Location of the robots.txt file (`http://www.pythontest.net/elsewhere/robots.txt`) 70 | 1. User agent name (`Johnny`) 71 | 1. Path or URL (`/`) 72 | 73 | Without any argument, `robots` displays the help: 74 | 75 | ``` 76 | $ docker run --rm andreburgaud/robotspy 77 | usage: robots 78 | 79 | Shows whether the given user agent and path combination are allowed or disallowed by the given robots.txt file. 80 | 81 | positional arguments: 82 | robotstxt robots.txt file path or URL 83 | useragent User agent name 84 | path Path or URI 85 | 86 | optional arguments: 87 | -h, --help show this help message and exit 88 | -v, --version show program's version number and exit 89 | ``` 90 | 91 | To use the CLI `robots` as a global tool, continue to the following section. If you want to use `robotspy` as a Python module, skip to **Module Installation**. 92 | 93 | ## Global Installation with pipx 94 | 95 | If you only want to use the command line tool `robots`, you may want to use [pipx](https://pipxproject.github.io/pipx/installation/) to install it as a global tool on your system.
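If `pipx` is not already available on your system, one common way to install it (this is generic pipx setup, not specific to `robotspy`; see the pipx documentation linked above for the recommended method on your platform) is:

```bash
$ python -m pip install --user pipx
$ python -m pipx ensurepath
```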
96 | 97 | To install `robotspy` using `pipx`, execute the following command: 98 | 99 | ```bash 100 | $ pipx install robotspy 101 | ``` 102 | 103 | When `robotspy` is installed globally on your system, you can invoke it from any folder. For example, you can execute: 104 | 105 | ```bash 106 | $ robots --version 107 | robots 0.8.0 108 | ``` 109 | 110 | You can find more detailed usage examples in the **Usage** section. 111 | 112 | ## Module Installation 113 | 114 | **Note**: Python 3.8 or newer required 115 | 116 | Preferably, install the `robotspy` package in a Python virtual environment created 117 | in a new directory, as follows: 118 | 119 | ``` 120 | $ mkdir project && cd project 121 | $ python -m venv .venv 122 | $ . .venv/bin/activate 123 | (.venv) $ python -m pip install --upgrade pip 124 | (.venv) $ python -m pip install --upgrade setuptools 125 | (.venv) $ python -m pip install robotspy 126 | (.venv) $ python -m robots --help 127 | ... 128 | ``` 129 | 130 | On Windows: 131 | 132 | ``` 133 | C:/> mkdir project && cd project 134 | C:/> python -m venv .venv 135 | C:/> .venv\scripts\activate 136 | (.venv) c:\> python -m pip install --upgrade pip 137 | (.venv) c:\> python -m pip install --upgrade setuptools 138 | (.venv) c:\> python -m pip install robotspy 139 | (.venv) c:\> python -m robots --help 140 | ... 141 | ``` 142 | 143 | ## Usage 144 | 145 | The `robotspy` package can be imported as a module and also exposes an executable, `robots`, invocable with 146 | `python -m`. If installed globally with `pipx`, the command `robots` can be invoked from any folder. The usage examples in the following section use the command `robots`, but you can also substitute it with `python -m robots` in a virtual environment. 147 | 148 | ### Execute the Tool 149 | 150 | After installing `robotspy`, you can validate the installation by running the following command: 151 | 152 | ``` 153 | $ robots --help 154 | usage: robots 155 | 156 | Shows whether the given user agent and path combination are allowed or disallowed by the given robots.txt file. 157 | 158 | positional arguments: 159 | robotstxt robots.txt file path or URL 160 | useragent User agent name 161 | path Path or URI 162 | 163 | optional arguments: 164 | -h, --help show this help message and exit 165 | -v, --version show program's version number and exit 166 | ``` 167 | 168 | ### Examples 169 | 170 | The content of http://www.pythontest.net/elsewhere/robots.txt is the following: 171 | 172 | ``` 173 | # Used by NetworkTestCase in Lib/test/test_robotparser.py 174 | 175 | User-agent: Nutch 176 | Disallow: / 177 | Allow: /brian/ 178 | 179 | User-agent: * 180 | Disallow: /webstats/ 181 | ``` 182 | 183 | To check if the user agent `Nutch` can fetch the path `/brian/`, you can execute: 184 | 185 | ``` 186 | $ robots http://www.pythontest.net/elsewhere/robots.txt Nutch /brian/ 187 | user-agent 'Nutch' with path '/brian/': ALLOWED 188 | ``` 189 | 190 | Or, you can pass the full URL, http://www.pythontest.net/brian/: 191 | 192 | ``` 193 | $ robots http://www.pythontest.net/elsewhere/robots.txt Nutch http://www.pythontest.net/brian/ 194 | user-agent 'Nutch' with url 'http://www.pythontest.net/brian/': ALLOWED 195 | ``` 196 | 197 | Can user agent `Nutch` fetch the path `/brian`? 198 | 199 | ``` 200 | $ robots http://www.pythontest.net/elsewhere/robots.txt Nutch /brian 201 | user-agent 'Nutch' with path '/brian': DISALLOWED 202 | ``` 203 | 204 | Or, `/`?
205 | 206 | ``` 207 | $ robots http://www.pythontest.net/elsewhere/robots.txt Nutch / 208 | user-agent 'Nutch' with path '/': DISALLOWED 209 | ``` 210 | 211 | How about user agent `Johnny`? 212 | 213 | ``` 214 | $ robots http://www.pythontest.net/elsewhere/robots.txt Johnny / 215 | user-agent 'Johnny' with path '/': ALLOWED 216 | ``` 217 | 218 | ### Use the Module in a Project 219 | 220 | If you have a virtual environment with the `robotspy` package installed, you can use the `robots` module from the Python shell: 221 | 222 | ``` 223 | (.venv) $ python 224 | >>> import robots 225 | >>> parser = robots.RobotsParser.from_uri('http://www.pythontest.net/elsewhere/robots.txt') 226 | >>> useragent = 'Nutch' 227 | >>> path = '/brian/' 228 | >>> result = parser.can_fetch(useragent, path) 229 | >>> print(f'Can {useragent} fetch {path}? {result}') 230 | Can Nutch fetch /brian/? True 231 | >>> 232 | ``` 233 | 234 | ### Bug in the Python standard library 235 | 236 | There is a bug in [`urllib.robotparser`](https://docs.python.org/3/library/urllib.robotparser.html) 237 | from the Python standard library that causes the following test to differ from the example above with `robotspy`. 238 | 239 | The example with `urllib.robotparser` is the following: 240 | 241 | ``` 242 | $ python 243 | >>> import urllib.robotparser 244 | >>> rp = urllib.robotparser.RobotFileParser() 245 | >>> rp.set_url('http://www.pythontest.net/elsewhere/robots.txt') 246 | >>> rp.read() 247 | >>> rp.can_fetch('Nutch', '/brian/') 248 | False 249 | ``` 250 | 251 | Notice that the result is `False` whereas `robotspy` returns `True`. 252 | 253 | Bug [bpo-39187](https://bugs.python.org/issue39187) was open to raise awareness on this issue and PR 254 | https://github.com/python/cpython/pull/17794 was submitted as a possible fix. `robotspy` does not 255 | exhibit this problem. 256 | 257 | ## Development 258 | 259 | The main development dependency is `pytest` for executing the tests. It is automatically 260 | installed if you perform the following steps: 261 | 262 | ``` 263 | $ git clone https://github.com/andreburgaud/robotspy 264 | $ cd robotspy 265 | $ python -m venv .venv --prompt robots 266 | $ . .venv/bin/activate 267 | (robots) $ python -m pip install -r requirements.txt 268 | (robots) $ python -m pip install -e . 269 | (robots) $ make test 270 | (robots) $ deactivate 271 | $ 272 | ``` 273 | 274 | On Windows: 275 | 276 | ``` 277 | C:/> git clone https://github.com/andreburgaud/robotspy 278 | C:/> cd robotspy 279 | C:/> python -m venv .venv --prompt robotspy 280 | C:/> .venv\scripts\activate 281 | (robots) c:\> python -m pip install -r requirements.txt 282 | (robots) c:\> python -m pip install -e . 283 | (robots) c:\> make test 284 | (robots) c:\> deactivate 285 | ``` 286 | 287 | ## Global Tools 288 | 289 | The following tools were used during the development of `robotspy`: 290 | 291 | * [Black](https://github.com/psf/black) 292 | * [Mypy](http://mypy-lang.org/) 293 | * [Pylint](https://www.pylint.org/) 294 | * [twine](https://pypi.org/project/twine/) 295 | 296 | See the build file, `Makefile` or `make.bat` on Windows, for the commands and parameters. 297 | 298 | ## Release History 299 | 300 | * 0.10.0: 301 | * Fixed bugs in the URL path pattern matching ('?' is now handled correctly as the character '?' 
instead of matching any one character) 302 | * Added tests 308278 and 541230 from the Google project https://github.com/google/robotstxt-spec-test 303 | * Contribution from https://github.com/kox-solid 304 | * 0.9.0: 305 | * Updated the parser to behave like the Google robots parser. It now handles the product token in the user-agent line up to the last correct character instead of discarding it. See [issue #209](https://github.com/andreburgaud/robotspy/issues/209) for more details. 306 | * Contribution from https://github.com/kox-solid 307 | * 0.8.0: 308 | * Addressed an issue raised when a robots.txt file is not UTF-8 encoded 309 | * Added a user agent to fetch the robots.txt, as some websites, such as pages hosted on Cloudflare, may return a 403 error 310 | * Updated the documentation to link to RFC 9309, Robots Exclusion Protocol (REP) 311 | * Added a GitHub action job to execute the tests against Python versions 3.8 to 3.12 312 | * Contribution from https://github.com/tumma72 313 | * 0.7.0: 314 | * Fixed bug with the argument path when using the CLI 315 | * Print 'url' when the argument is a URL, 'path' otherwise 316 | * 0.6.0: 317 | * Simplified dependencies by keeping only `pytest` in `requirements.txt` 318 | * 0.5.0: 319 | * Updated all libraries. Tested with Python 3.9. 320 | * 0.4.0: 321 | * Fixed issue with robots.txt files pointed to by relative paths 322 | * Integration of [Mypy](http://mypy-lang.org/), [Black](https://github.com/psf/black) and [Pylint](https://www.pylint.org/) as dependencies to ease cross-platform development 323 | * Added a limited `make.bat` build file for Windows 324 | * Git-ignored VS Code files, the `tmp` directory, and multiple virtual envs (`.venv*`) 325 | * Fixed case-insensitivity issues on Windows 326 | * Tests successful on Windows 327 | * Added an ATTRIBUTIONS file and a build task to generate it 328 | * Upgraded `pyparsing` and `certifi` 329 | * 0.3.3: 330 | * Upgraded `tqdm` and `cryptography` packages 331 | * 0.3.2: 332 | * Upgraded `bleach`, `tqdm`, and `setuptools` packages 333 | * 0.3.1: 334 | * Updated `idna` and `wcwidth` packages 335 | * Added `pipdeptree` package to provide visibility on dependencies 336 | * Fixed `mypy` errors 337 | * Explicitly ignored `pylint` errors related to commonly used names like `f`, `m`, or `T` 338 | * 0.3.0: Updated `bleach` package to address CVE-2020-6802 339 | * 0.2.0: Updated the documentation 340 | * 0.1.0: Initial release 341 | 342 | ## License 343 | 344 | [MIT License](LICENSE.md) -------------------------------------------------------------------------------- /examples/issue_209.py: -------------------------------------------------------------------------------- 1 | import robots 2 | 3 | content = """ 4 | User-agent: mozilla/5 5 | Disallow: / 6 | """ 7 | 8 | check_url = "https://example.com" 9 | user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36" 10 | 11 | parser = robots.RobotsParser.from_string(content) 12 | 13 | print(parser.can_fetch(user_agent, check_url)) 14 | print(parser.is_agent_valid(user_agent)) 15 | 16 | 17 | content = """ 18 | User-agent: mozilla 19 | Disallow: / 20 | """ 21 | 22 | check_url = "https://example.com" 23 | user_agent = "Mozilla" 24 | 25 | parser = robots.RobotsParser.from_string(content) 26 | 27 | print(parser.can_fetch(user_agent, check_url)) 28 | print(parser.is_agent_valid(user_agent)) -------------------------------------------------------------------------------- /examples/merge_group.py:
-------------------------------------------------------------------------------- 1 | import robots 2 | 3 | parser = robots.RobotsParser.from_file("merge_group.txt") 4 | 5 | assert parser.can_fetch("ExampleBot", "/") 6 | assert not parser.can_fetch("ExampleBot", "/foo") 7 | assert not parser.can_fetch("ExampleBot", "/bar") 8 | assert not parser.can_fetch("ExampleBot", "/baz") 9 | -------------------------------------------------------------------------------- /examples/merge_group.txt: -------------------------------------------------------------------------------- 1 | user-agent: ExampleBot 2 | disallow: /foo 3 | disallow: /bar 4 | 5 | user-agent: ExampleBot 6 | disallow: /baz -------------------------------------------------------------------------------- /examples/mistake2.txt: -------------------------------------------------------------------------------- 1 | user-agent FooBot 2 | disallow / 3 | -------------------------------------------------------------------------------- /examples/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /tmp/ 3 | Disallow: /a%3Cd.html 4 | Disallow: /a/b.html 5 | Disallow: /%7ejoe/index.html 6 | -------------------------------------------------------------------------------- /examples/robots_308278.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /asdf-login 3 | Disallow: /asdf-admin 4 | Disallow: /databack/ 5 | Disallow: /data/* 6 | Disallow: /?*/ 7 | Disallow: /author/ 8 | Disallow: /id/*/page/ 9 | Disallow: /id/*/data/ 10 | Sitemap: http://example.com/page-sitemap.xml 11 | -------------------------------------------------------------------------------- /examples/robots_541230.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: /*.js 3 | Allow: /*.css 4 | Allow: /*.jpg 5 | Allow: /*.png 6 | Allow: /*.gif 7 | Allow: /*?page 8 | Allow: /*?ref= 9 | Disallow: /*? 
10 | Disallow: /stat/ 11 | Disallow: /id/1 12 | Disallow: /id/3 13 | Disallow: /register 14 | Disallow: /id/5 15 | Disallow: /id/7 16 | Disallow: /id/8 17 | Disallow: /id/9 18 | Disallow: /id/sub 19 | Disallow: /panel/ 20 | Disallow: /admin/ 21 | Disallow: /informer/ 22 | Disallow: /secure/ 23 | Disallow: /poll/ 24 | Disallow: /search/ 25 | Disallow: /abnl/ 26 | Disallow: /*_escaped_pattern_= 27 | Disallow: /*-*-*-*-321$ 28 | Disallow: /baz/order/ 29 | Disallow: /baz/printorder/ 30 | Disallow: /baz/checkout/ 31 | Disallow: /baz/user/ 32 | Disallow: /baz/search 33 | Disallow: /*0-*-0-03$ 34 | Disallow: /*-0-0- 35 | 36 | Sitemap: http://example.com/sitemap.xml 37 | Sitemap: http://example.com/sitemap-forum.xml -------------------------------------------------------------------------------- /examples/robots_file.py: -------------------------------------------------------------------------------- 1 | import robots 2 | 3 | AGENT = "test_robotparser" 4 | 5 | parser = robots.RobotsParser.from_file("robots.txt") 6 | 7 | if parser.errors: 8 | print("ERRORS:") 9 | print(parser.errors) 10 | 11 | if parser.errors: 12 | print("WARNINGS:") 13 | print(parser.errors) 14 | 15 | assert parser.can_fetch(AGENT, "/tmp") 16 | assert not parser.can_fetch(AGENT, "/tmp/") 17 | assert not parser.can_fetch(AGENT, "/tmp/a.html") 18 | assert not parser.can_fetch(AGENT, "/a%3cd.html") 19 | assert not parser.can_fetch(AGENT, "/a%3Cd.html") 20 | assert not parser.can_fetch(AGENT, "/a/b.html") 21 | assert not parser.can_fetch(AGENT, "/%7Ejoe/index.html") 22 | -------------------------------------------------------------------------------- /examples/robots_file_large.py: -------------------------------------------------------------------------------- 1 | import robots 2 | 3 | parser = robots.RobotsParser.from_file("robots_file_large.txt") 4 | 5 | if parser.errors: 6 | print("ERRORS:") 7 | print(parser.errors) 8 | 9 | if parser.errors: 10 | print("WARNINGS:") 11 | print(parser.errors) 12 | 13 | assert parser.can_fetch("Googlebot", "/") 14 | assert not parser.can_fetch("Exabot", "/") -------------------------------------------------------------------------------- /examples/robots_file_large.txt: -------------------------------------------------------------------------------- 1 | User-agent: Mediapartners-Google 2 | Disallow: 3 | 4 | User-agent: Mediapartners-Google* 5 | Disallow: 6 | 7 | User-agent: * 8 | Disallow: /abuse 9 | Disallow: /admgt/ 10 | Disallow: /donate 11 | Disallow: /go/ 12 | Disallow: /modcp 13 | Disallow: /post 14 | Disallow: /privmsg 15 | Disallow: /spa/ 16 | Disallow: /sta/ 17 | Disallow: /bw 18 | Disallow: /dx 19 | Disallow: /topicit/index.php/connect 20 | Disallow: /calendar_scheduler.forum 21 | Noindex: /login 22 | 23 | User-agent: 008 24 | User-agent: Accoona 25 | User-agent: aipbot 26 | User-agent: aipbot* 27 | User-agent: aipbot/1.0 28 | User-agent: Alexa 29 | User-agent: Alexa Bitlybot 30 | User-agent: Alexibot 31 | User-agent: AltaVista Intranet V2.0 AVS EVAL search@freeit.com 32 | User-agent: AltaVista Intranet V2.0 Compaq Altavista Eval sveand@altavista.net 33 | User-agent: AltaVista Intranet V2.0 evreka.com crawler@evreka.com 34 | User-agent: AltaVista V2.0B crawler@evreka.com 35 | User-agent: Anonymous 36 | User-agent: ApocalXExplorerBot 37 | User-agent: appie 38 | User-agent: Aqua_Products 39 | User-agent: Argus/1.1 40 | User-agent: Artabus 41 | User-agent: Ask Jeeves 42 | User-agent: asterias 43 | User-agent: atSpider 44 | User-agent: attentio 45 | User-agent: AV Fetch 1.0 46 | User-agent: 
AVSearch-3.0(AltaVista/AVC) 47 | User-agent: AWS Cloud Based 48 | User-agent: b2w 49 | User-agent: b2w/0.1 50 | User-agent: BackDoorBot 51 | User-agent: BackDoorBot/1.0 52 | User-agent: BacklinkCrawler 53 | User-agent: becomebot 54 | User-agent: BecomeBot 55 | User-agent: BigBrother 56 | User-agent: BIGLOTRON (BETA 2;GNU/Linux) 57 | User-agent: BizInformation 58 | User-agent: Black Hole 59 | User-agent: Black.Hole 60 | User-agent: BlackWidow 61 | User-agent: BlowFish 62 | User-agent: BlowFish/1.0 63 | User-agent: BoardPulse 64 | User-agent: boitho.com-dc 65 | User-agent: Bookmark search tool 66 | User-agent: bot/1.0 67 | User-agent: BotALot 68 | User-agent: Bot mailto:craftbot@yahoo.com 69 | User-agent: BotRightHere 70 | User-agent: BrandProtect 71 | User-agent: BuiltBotTough 72 | User-agent: Bullseye 73 | User-agent: Bullseye/1.0 74 | User-agent: BunnySlippers 75 | User-agent: CazoodleBot 76 | User-agent: Cegbfeieh 77 | User-agent: cfetch 78 | User-agent: cfetch/1.0 79 | User-agent: CheeseBot 80 | User-agent: CherryPicker 81 | User-agent: CherryPicker /1.0 82 | User-agent: CherryPickerElite/1.0 83 | User-agent: CherryPickerSE/1.0 84 | User-agent: ChinaClaw 85 | User-agent: Collage 86 | User-agent: cometrics-bot 87 | User-agent: complex_network_group 88 | User-agent: convera 89 | User-agent: ConveraCrawler 90 | User-agent: ConveraCrawler/0.2 91 | User-agent: ConveraCrawler/0.9d 92 | User-agent: Convera Internet Spider V6.x 93 | User-agent: ConveraMultiMediaCrawler/0.1 94 | User-agent: Copernic 95 | User-agent: CopyRightCheck 96 | User-agent: cosmos 97 | User-agent: Crescent 98 | User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0 99 | User-agent: Crescent Internet ToolPak HTTPOLE Control v.1.0 100 | User-agent: Curl 101 | User-agent: Custo 102 | User-agent: CydralSpider 103 | User-agent: Deepnet Explorer 104 | User-agent: default.ida 105 | User-agent: DigExt 106 | User-agent: DISCo 107 | User-agent: discobot 108 | User-agent: DISCoFinder 109 | User-agent: DISCo Pump 110 | User-agent: DISCo Pump 3.0 111 | User-agent: DISCo Pump 3.1 112 | User-agent: DISCo Pump 3.2 113 | User-agent: DittoSpyder 114 | User-agent: DOC 115 | User-agent: dotbot 116 | User-agent: DotBot 117 | User-agent: DotBot/1.1 118 | User-agent: Download Demon 119 | User-agent: Download Demon/3.2.0.8 120 | User-agent: Download Demon/3.5.0.11 121 | User-agent: Download Ninja 122 | User-agent: Download Wonder 123 | User-agent: DSurf 124 | User-agent: Dulance bot 125 | User-agent: dumbot 126 | User-agent: eCatch 127 | User-agent: eCatch/3.0 128 | User-agent: echo! 
129 | User-agent: EchO!/2.0 130 | User-agent: EirGrabber 131 | User-agent: EliteSys Entry 132 | User-agent: EmailCollector 133 | User-agent: Email Extractor 134 | User-agent: EmailSiphon 135 | User-agent: EmailSmartz 136 | User-agent: EmailWolf 137 | User-agent: Enterprise_Search 138 | User-agent: Enterprise_Search/1.0 139 | User-agent: EroCrawler 140 | User-agent: es 141 | User-agent: ESIRover 142 | User-agent: e-SocietyRobot 143 | User-agent: Exabot 144 | User-agent: Exabot/2.0 145 | User-agent: Exabot-Images 146 | User-agent: Express WebPictures 147 | User-agent: Express WebPictures (www.express-soft.com) 148 | User-agent: ExtractorPro 149 | User-agent: EyeNetIE 150 | User-agent: FairAd Client 151 | User-agent: Fairshare 152 | User-agent: Fasterfox 153 | User-agent: Fetch 154 | User-agent: findlinks 155 | User-agent: Flaming AttackBot 156 | User-agent: Flamingo_SearchEngine 157 | User-agent: FlashGet 158 | User-agent: FlashGet WebWasher 3.2 159 | User-agent: Foobot 160 | User-agent: FreeFind 161 | User-agent: FreeWebMonitoring SiteChecker/0.1 162 | User-agent: FrontPage 163 | User-agent: FrontPage [NC,OR] 164 | User-agent: FurlBot 165 | User-agent: Gaisbot 166 | User-agent: Gaisbot/3.0 167 | User-agent: GetBot 168 | User-agent: GetRight 169 | User-agent: GetRight/2.11 170 | User-agent: GetRight/3.1 171 | User-agent: GetRight/3.2 172 | User-agent: GetRight/3.3 173 | User-agent: GetRight/3.3.3 174 | User-agent: GetRight/3.3.4 175 | User-agent: GetRight/4.0.0 176 | User-agent: GetRight/4.1.0 177 | User-agent: GetRight/4.1.1 178 | User-agent: GetRight/4.1.2 179 | User-agent: GetRight/4.2 180 | User-agent: GetRight/4.2b (Portuguxeas) 181 | User-agent: GetRight/4.2c 182 | User-agent: GetRight/4.3 183 | User-agent: GetRight/4.5 184 | User-agent: GetRight/4.5a 185 | User-agent: GetRight/4.5b 186 | User-agent: GetRight/4.5b1 187 | User-agent: GetRight/4.5b2 188 | User-agent: GetRight/4.5b3 189 | User-agent: GetRight/4.5b6 190 | User-agent: GetRight/4.5b7 191 | User-agent: GetRight/4.5c 192 | User-agent: GetRight/4.5d 193 | User-agent: GetRight/4.5e 194 | User-agent: GetRight/5.0beta1 195 | User-agent: GetRight/5.0beta2 196 | User-agent: GetUrl 197 | User-agent: GetWeb! 
198 | User-agent: Gigabot 199 | User-agent: Gigabot/3.0 200 | User-agent: Go-Ahead-Got-It 201 | User-agent: Go!Zilla 202 | User-agent: Go!Zilla 3.3 (www.gozilla.com) 203 | User-agent: Go!Zilla 3.5 (www.gozilla.com) 204 | User-agent: Go!Zilla (www.gozilla.com) 205 | User-agent: GrabNet 206 | User-agent: Grafula 207 | User-agent: grub 208 | User-agent: grub-client 209 | User-agent: Hackertarget.com 210 | User-agent: Harvest 211 | User-agent: Harvest/1.5 212 | User-agent: Hatena Antenna 213 | User-agent: HavIndex 214 | User-agent: heritrix 215 | User-agent: hloader 216 | User-agent: HMView 217 | User-agent: httplib 218 | User-agent: httrack 219 | User-agent: HTTrack 220 | User-agent: HTTrack 3.0 221 | User-agent: HTTrack 3.0x 222 | User-agent: HTTrack [NC,OR] 223 | User-agent: humanlinks 224 | User-agent: ichiro 225 | User-agent: IconSurf 226 | User-agent: Igentia 227 | User-agent: Image Collector 228 | User-agent: Image Stripper 229 | User-agent: Image Sucker 230 | User-agent: Indy Library 231 | User-agent: Indy Library [NC,OR] 232 | User-agent: InfoNaviRobot 233 | User-agent: InfoSpiders 234 | User-agent: InterGET 235 | User-agent: Internet Explore 236 | User-agent: Internet Ninja 237 | User-agent: Internet Ninja 4.0 238 | User-agent: Internet Ninja 5.0 239 | User-agent: Internet Ninja 6.0 240 | User-agent: InternetSupervision 241 | User-agent: IRLbot 242 | User-agent: Iron 243 | User-agent: Iron33/1.0.2 244 | User-agent: Jeeves 245 | User-agent: JennyBot 246 | User-agent: Jetbot 247 | User-agent: Jetbot/1.0 248 | User-agent: JetCar 249 | User-agent: Jobo 250 | User-agent: JOC Web Spider 251 | User-agent: kalooga 252 | User-agent: KDD Exploror 253 | User-agent: Kenjin Spider 254 | User-agent: Kenjin.Spider 255 | User-agent: Keyword Density 256 | User-agent: Keyword.Density 257 | User-agent: Keyword Density/0.9 258 | User-agent: larbin 259 | User-agent: Larbin 260 | User-agent: larbin_2.6.2 (kabura@sushi.com) 261 | User-agent: larbin_2.6.2 kabura@sushi.com 262 | User-agent: larbin_2.6.2 (larbin2.6.2@unspecified.mail) 263 | User-agent: larbin_2.6.2 larbin2.6.2@unspecified.mail 264 | User-agent: larbin_2.6.2 larbin@correa.org 265 | User-agent: larbin_2.6.2 listonATccDOTgatechDOTedu 266 | User-agent: larbin_2.6.2 (listonATccDOTgatechDOTedu) 267 | User-agent: larbin_2.6.2 (vitalbox1@hotmail.com) 268 | User-agent: larbin_2.6.2 vitalbox1@hotmail.com 269 | User-agent: larbin (samualt9@bigfoot.com) 270 | User-agent: larbin samualt9@bigfoot.com 271 | User-agent: LBot 272 | User-agent: LeechFTP 273 | User-agent: LexiBot 274 | User-agent: libWeb/clsHTTP 275 | User-agent: libWeb/clsHTTPDisallow: / 276 | User-agent: libwww 277 | User-agent: LightningDownload 278 | User-agent: Linguee 279 | User-agent: LinkedIn 280 | User-agent: LinkextractorPro 281 | User-agent: Linknzbot 282 | User-agent: Linknzbot* 283 | User-agent: Linknzbot 2004 284 | User-agent: LinkScan 285 | User-agent: LinkScan/8.1a Unix 286 | User-agent: LinkScan/8.1a.Unix 287 | User-agent: LinkScan/8.1a Unix Disallow: / 288 | User-agent: linksmanager 289 | User-agent: LinksManager 290 | User-agent: LinksManager.com_bot 291 | User-agent: LinkWalker 292 | User-agent: LjSEEK 293 | User-agent: LNSpiderguy 294 | User-agent: looksmart 295 | User-agent: LWP 296 | User-agent: LWP* 297 | User-agent: lwp-trivial 298 | User-agent: lwp-trivial/1.34 299 | User-agent: magpie-crawler 300 | User-agent: Mail Sweeper 301 | User-agent: Marketwirebot 302 | User-agent: Mass Downloader 303 | User-agent: Mass Downloader/2.2 304 | User-agent: Mata Hari 305 | 
User-agent: Mata.Hari 306 | User-agent: MegaIndex.ru 307 | User-agent: MegaIndex.ru/2.0 308 | User-agent: MetagerBot 309 | User-agent: MetaURI 310 | User-agent: Microsoft.URL 311 | User-agent: Microsoft URL Control 312 | User-agent: Microsoft URL Control* 313 | User-agent: Microsoft.URL.Control 314 | User-agent: Microsoft URL Control - 5.01.4511 315 | User-agent: Microsoft URL Control - 6.00.8169 316 | User-agent: Microsoft URL Control - 6.01.9782 317 | User-agent: MIDown tool 318 | User-agent: MIIxpc 319 | User-agent: MIIxpc/4.2 320 | User-agent: Missigua Locator 321 | User-agent: Mister PiX 322 | User-agent: Mister.PiX 323 | User-agent: Mister Pix II 2.01 324 | User-agent: Mister Pix II 2.02a 325 | User-agent: Mister PiX version.dll 326 | User-agent: MLBot 327 | User-agent: moget 328 | User-agent: moget/2.1 329 | User-agent: mozilla 330 | User-agent: Mozilla 331 | User-agent: Mozilla/2.0 (compatible; Ask Jeeves) 332 | User-agent: mozilla/3 333 | User-agent: mozilla/4 334 | User-agent: Mozilla/4.0 (compatible; BullsEye; Windows 95) 335 | User-agent: Mozilla/4.0 (compatible; MSIE 4.0; Windows 2000) 336 | User-agent: Mozilla/4.0 (compatible; MSIE 4.0; Windows 95) 337 | User-agent: Mozilla/4.0 (compatible; MSIE 4.0; Windows 98) 338 | User-agent: Mozilla/4.0 (compatible; MSIE 4.0; Windows ME) 339 | User-agent: Mozilla/4.0 (compatible; MSIE 4.0; Windows NT) 340 | User-agent: Mozilla/4.0 (compatible; MSIE 4.0; Windows XP) 341 | User-agent: mozilla/5 342 | User-agent: MRSPUTNIK 343 | User-agent: MSIECrawler 344 | User-agent: MSRBOT 345 | User-agent: MS Search 4.0 Robot 346 | User-agent: MS Search 5.0 Robot 347 | User-agent: munky 348 | User-agent: naver 349 | User-agent: Naverbot 350 | User-agent: NaverBot 351 | User-agent: NaverBot-1.0 352 | User-agent: Navroad 353 | User-agent: NearSite 354 | User-agent: NetAnts 355 | User-agent: NetAnts/1.10 356 | User-agent: NetAnts/1.23 357 | User-agent: NetAnts/1.24 358 | User-agent: NetAnts/1.25 359 | User-agent: NetAttache 360 | User-agent: NetAttache Light 1.1 361 | User-agent: Netcraft Web Server Survey 362 | User-agent: NetMechanic 363 | User-agent: NetSpider 364 | User-agent: Net Vampire 365 | User-agent: Net Vampire/3.0 366 | User-agent: NetZIP 367 | User-agent: NetZip-Downloader 368 | User-agent: NetZip-Downloader/1.0.62 (Win32; Dec 7 1998) 369 | User-agent: NetZip Downloader 1.0 Win32(Nov 12 1998) 370 | User-agent: NetZippy+(http://www.innerprise.net/usp-spider.asp) 371 | User-agent: NetZippy+(http:/www.innerprise.net/usp-spider.asp) 372 | User-agent: NICErsPRO 373 | User-agent: NimbleCrawler 374 | User-agent: NPbot 375 | User-agent: NPBot 376 | User-agent: NPBot/3 377 | User-agent: Nutch 378 | User-agent: Nutch* 379 | User-agent: NutchCVS/0.06-dev 380 | User-agent: NutchCVS/0.7.1 381 | User-agent: NutchOrg 382 | User-agent: oBot 383 | User-agent: Ocelli 384 | User-agent: Octopus 385 | User-agent: Offline Explorer 386 | User-agent: Offline.Explorer 387 | User-agent: Offline Explorer/1.2 388 | User-agent: Offline Explorer/1.4 389 | User-agent: Offline Explorer/1.6 390 | User-agent: Offline Explorer/1.7 391 | User-agent: Offline Explorer/1.9 392 | User-agent: Offline Explorer/2.0 393 | User-agent: Offline Explorer/2.1 394 | User-agent: Offline Explorer/2.3 395 | User-agent: Offline Explorer/2.4 396 | User-agent: Offline Explorer/2.5 397 | User-agent: Offline Navigator 398 | User-agent: OmniExplorer_Bot 399 | User-agent: oneriot 400 | User-agent: Openbot 401 | User-agent: Openfind 402 | User-agent: Openfind data gathere 403 | User-agent: Openfind 
data gatherer 404 | User-agent: Oracle Ultra Search 405 | User-agent: OutfoxBot/0.5 406 | User-agent: PageGrabber 407 | User-agent: Papa Foto 408 | User-agent: pavuk 409 | User-agent: PBWF 410 | User-agent: pcBrowser 411 | User-agent: penthesilea 412 | User-agent: PerMan 413 | User-agent: PGBot 414 | User-agent: PhpDig 415 | User-agent: Pingdom GIGRIB (http://www.pingdom.com) 416 | User-agent: postrank 417 | User-agent: ProPowerBot 418 | User-agent: ProPowerBot/2.14 419 | User-agent: ProWebWalker 420 | User-agent: psbot 421 | User-agent: psycheclone 422 | User-agent: Psycheclone 423 | User-agent: Python-urllib 424 | User-agent: QuepasaCreep 425 | User-agent: QueryN Metasearch 426 | User-agent: QueryN.Metasearch 427 | User-agent: radian6 comment reader 428 | User-agent: radian6 Feedfetcher 429 | User-agent: Radiation Retriever 430 | User-agent: Radiation Retriever 1.1 431 | User-agent: RB2B-bot 432 | User-agent: RealDownload 433 | User-agent: RealDownload/4.0.0.40 434 | User-agent: RealDownload/4.0.0.41 435 | User-agent: RealDownload/4.0.0.42 436 | User-agent: ReGet 437 | User-agent: RepoMonkey 438 | User-agent: RepoMonkey Bait & Tackle/v1.01 439 | User-agent: RepoMonkey Bait & Tackle 440 | User-agent: RepoMonkey Bait & Tackle/v1.01 441 | User-agent: research-spider 442 | User-agent: RMA 443 | User-agent: Robozilla 444 | User-agent: Roverbot 445 | User-agent: RufusBot 446 | User-agent: sbider 447 | User-agent: Scooter/1.0 448 | User-agent: Scooter/1.0 scooter@pa.dec.com 449 | User-agent: Scooter/1.1 (custom) 450 | User-agent: Scooter/2.0 G.R.A.B. V1.1.0 451 | User-agent: Scooter/2.0 G.R.A.B. X2.0 452 | User-agent: Scooter2_Mercator_x-x.0 453 | User-agent: Scooter-3.0.EU 454 | User-agent: Scooter-3.0.FS 455 | User-agent: Scooter-3.0.HD 456 | User-agent: Scooter-3.0QI 457 | User-agent: Scooter-3.0.VNS 458 | User-agent: Scooter-3.2 459 | User-agent: Scooter-3.2.BT 460 | User-agent: Scooter-3.2.DIL 461 | User-agent: Scooter-3.2.EX 462 | User-agent: Scooter-3.2.JT 463 | User-agent: Scooter-3.2.NIV 464 | User-agent: Scooter-3.2.SF0 465 | User-agent: Scooter-3.2.snippet 466 | User-agent: Scooter/3.3 467 | User-agent: Scooter-3.3dev 468 | User-agent: Scooter/3.3.QA.pczukor 469 | User-agent: Scooter/3.3_SF 470 | User-agent: Scooter/3.3.vscooter 471 | User-agent: Scooter-ARS-1.1 472 | User-agent: Scooter-ARS-1.1-ih 473 | User-agent: Scooter_bh0-3.0.3 474 | User-agent: Scooter_trk3-3.0.3 475 | User-agent: scooter-venus-3.0.vns 476 | User-agent: Scooter-W3-1.0 477 | User-agent: Scooter-W3.1.2 478 | User-agent: Scrubby 479 | User-agent: SearchDaimon.com-dc 480 | User-agent: searchpreview 481 | User-agent: semalt.com 482 | User-agent: seekbot 483 | User-agent: Seekbot 484 | User-agent: Seekbot/1.0 485 | User-agent: SEOprofiler 486 | User-agent: Shai'Hulud 487 | User-agent: Shim-Crawler 488 | User-agent: ShopWiki 489 | User-agent: ShopWiki/1.0 490 | User-agent: SightupBot 491 | User-agent: SiteBot 492 | User-agent: SiteSnagger 493 | User-agent: Slurp China 494 | User-agent: SlySearch 495 | User-agent: SmartDownload 496 | User-agent: SmartDownload/1.2.76 (Win32; Apr 1 1999) 497 | User-agent: SmartDownload/1.2.77 (Win32; Aug 17 1999) 498 | User-agent: SmartDownload/1.2.77 (Win32; Feb 1 2000) 499 | User-agent: SmartDownload/1.2.77 (Win32; Jun 19 2001) 500 | User-agent: Snapbot 501 | User-agent: Snappy 502 | User-agent: Softlayer Server 503 | User-agent: Sogou web spider 504 | User-agent: sootle 505 | User-agent: sosospider 506 | User-agent: SpankBot 507 | User-agent: spanner 508 | User-agent: spbot 509 | 
User-agent: Speedy 510 | User-agent: SpiderBot 511 | User-agent: Sqworm 512 | User-agent: Sqworm/2.9.85-BETA (beta_release; 20011115-775; i686-pc-linux 513 | User-agent: ssearcher100 514 | User-agent: Stanford 515 | User-agent: Stanford Comp Sci 516 | User-agent: suggybot 517 | User-agent: SuperBot 518 | User-agent: SuperBot/2.6 519 | User-agent: SuperBot/3.0 (Win32) 520 | User-agent: SuperBot/3.1 (Win32) 521 | User-agent: SuperHTTP 522 | User-agent: SuperHTTP/1.0 523 | User-agent: Surfbot 524 | User-agent: SurveyBot 525 | User-agent: suzuran 526 | User-agent: Szukacz 527 | User-agent: Szukacz/1.4 528 | User-agent: tAkeOut 529 | User-agent: Teleport 530 | User-agent: TeleportPro 531 | User-agent: Teleport Pro 532 | User-agent: Teleport Pro/1.29 533 | User-agent: Teleport Pro/1.29.1590 534 | User-agent: Teleport Pro/1.29.1634 535 | User-agent: Teleport Pro/1.29.1718 536 | User-agent: Teleport Pro/1.29.1820 537 | User-agent: Teleport Pro/1.29.1847 538 | User-agent: Telesoft 539 | User-agent: Templeton 540 | User-agent: Teoma 541 | User-agent: The Intraformant 542 | User-agent: The.Intraformant 543 | User-agent: TheNomad 544 | User-agent: TightTwatBot 545 | User-agent: Titan 546 | User-agent: toCrawl 547 | User-agent: toCrawl/UrlDispatcher 548 | User-agent: True_Robot 549 | User-agent: True_Robot/1.0 550 | User-agent: turingos 551 | User-agent: TurnitinBot 552 | User-agent: TurnitinBot/1.5 553 | User-agent: Tweetmeme 554 | User-agent: TwengaBot 555 | User-agent: Twiceler 556 | User-agent: URL Control 557 | User-agent: UrlDispatcher 558 | User-agent: ://URLFAN 559 | User-agent: URL_Spider_Pro 560 | User-agent: URLy Warning 561 | User-agent: URLy.Warning 562 | User-agent: VCI 563 | User-agent: VCI WebViewer VCI WebViewer Win32 564 | User-agent: vobsub 565 | User-agent: VoidEYE 566 | User-agent: vscooter 567 | User-agent: w3mir 568 | User-agent: WatchDog/3.0 569 | User-agent: WebAuto 570 | User-agent: WebAuto/3.40 (Win98; I) 571 | User-agent: WebBandit 572 | User-agent: WebBandit/3.50 573 | User-agent: WebCapture 574 | User-agent: WebCapture 2.0 575 | User-agent: WebCatcher 576 | User-agent: webcopier 577 | User-agent: WebCopier 578 | User-agent: WebCopier v.2.2 579 | User-agent: WebCopier v2.5 580 | User-agent: WebCopier v2.6 581 | User-agent: WebCopier v2.7a 582 | User-agent: WebCopier v2.8 583 | User-agent: WebCopier v3.0 584 | User-agent: WebCopier v3.0.1 585 | User-agent: WebCopier v3.2 586 | User-agent: WebCopier v3.2a 587 | User-agent: webcopy 588 | User-agent: WebCopy 589 | User-agent: webcrawl.net 590 | User-agent: WebEmailExtrac 591 | User-agent: WebEMailExtrac.* 592 | User-agent: WebEnhancer 593 | User-agent: WebFetch 594 | User-agent: webfetch/2.1.0 595 | User-agent: WebFetcher 596 | User-agent: WebGo IS 597 | User-agent: Web Image Collector 598 | User-agent: Web.Image.Collector 599 | User-agent: WebLeacher 600 | User-agent: WebmasterWorld Extractor 601 | User-agent: WebmasterWorldForumBot 602 | User-agent: webmirror 603 | User-agent: WebMirror 604 | User-agent: WebReaper 605 | User-agent: Web Reaper 606 | User-agent: WebReaper [info@webreaper.net] 607 | User-agent: WebReaper v9.1 - www.otway.com/webreaper 608 | User-agent: WebReaper v9.7 - www.webreaper.net 609 | User-agent: WebReaper v9.8 - www.webreaper.net 610 | User-agent: WebReaper vWebReaper v7.3 - www,otway.com/webreaper 611 | User-agent: WebReaper [webreaper@otway.com] 612 | User-agent: WebSauger 613 | User-agent: WebSauger 1.20b 614 | User-agent: WebSauger 1.20j 615 | User-agent: WebSauger 1.20k 616 | User-agent: website 
extractor 617 | User-agent: Website eXtractor 618 | User-agent: Website eXtractor (http:/www.asona.org) 619 | User-agent: Website Quester 620 | User-agent: Website.Quester 621 | User-agent: Website Quester - www.asona.org 622 | User-agent: Website Quester - www.esalesbiz.com/extra/ 623 | User-agent: Webster Pro 624 | User-agent: Webster.Pro 625 | User-agent: WebStripper 626 | User-agent: WebStripper/2.02 627 | User-agent: WebStripper/2.03 628 | User-agent: WebStripper/2.10 629 | User-agent: WebStripper/2.12 630 | User-agent: WebStripper/2.13 631 | User-agent: WebStripper/2.15 632 | User-agent: WebStripper/2.16 633 | User-agent: WebStripper/2.19 634 | User-agent: Web Sucker 635 | User-agent: webvac 636 | User-agent: WebVac 637 | User-agent: WebVulnCrawl 638 | User-agent: WebVulnScan 639 | User-agent: WebWalk 640 | User-agent: WebWasher 641 | User-agent: WebWhacker 642 | User-agent: WebZip 643 | User-agent: WebZIP 644 | User-agent: WebZIP/2.75 (http://www.spidersoft.com) 645 | User-agent: WebZIP/2.75 (http:/www.spidersoft.com) 646 | User-agent: WebZIP/3.65 (http://www.spidersoft.com) 647 | User-agent: WebZIP/3.65 (http:/www.spidersoft.com) 648 | User-agent: WebZIP/3.80 (http://www.spidersoft.com) 649 | User-agent: WebZIP/3.80 (http:/www.spidersoft.com) 650 | User-agent: WebZip/4.0 651 | User-agent: WebZIP/4.0 (http://www.spidersoft.com) 652 | User-agent: WebZIP/4.0 (http:/www.spidersoft.com) 653 | User-agent: WebZIP/4.1 (http://www.spidersoft.com) 654 | User-agent: WebZIP/4.1 (http:/www.spidersoft.com) 655 | User-agent: WebZIP/4.21 656 | User-agent: WebZIP/4.21 (http://www.spidersoft.com) 657 | User-agent: WebZIP/4.21 (http:/www.spidersoft.com) 658 | User-agent: WebZIP/5.0 659 | User-agent: WebZIP/5.0 (http://www.spidersoft.com) 660 | User-agent: WebZIP/5.0 (http:/www.spidersoft.com) 661 | User-agent: WebZIP/5.0 PR1 (http://www.spidersoft.com) 662 | User-agent: WebZIP/5.0 PR1 (http:/www.spidersoft.com) 663 | User-agent: wget 664 | User-agent: wGet 665 | User-agent: Wget 666 | User-agent: Wget/1.10.2 667 | User-agent: Wget/1.5.2 668 | User-agent: Wget/1.5.3 669 | User-agent: Wget/1.6 670 | User-agent: Wget/1.7 671 | User-agent: Wget/1.8 672 | User-agent: Wget/1.8.1 673 | User-agent: Wget/1.8.1+cvs 674 | User-agent: Wget/1.8.2 675 | User-agent: Wget/1.9-beta 676 | User-agent: whitevector crawler 677 | User-agent: Whitevector+Crawler 678 | User-agent: Widow 679 | User-agent: WikioFeedBot 680 | User-agent: wikiwix-bot-3.0 681 | User-agent: Willow 682 | User-agent: WinHTTrack 683 | User-agent: Wise-Guys 684 | User-agent: woozweb-monitoring 685 | User-agent: woriobot 686 | User-agent: WWW-Collector 687 | User-agent: WWW-Collector-E 688 | User-agent: WWWOFFLE 689 | User-agent: Xaldon WebSpider 690 | User-agent: Xaldon WebSpider 2.5.b3 691 | User-agent: Xenu 692 | User-agent: Xenu Link Sleuth 693 | User-agent: Xenu's 694 | User-agent: Xenu's Link Sleuth 1.1c 695 | User-agent: xGet 696 | User-agent: Yahoo-MMCrawler 697 | User-agent: YahooSeeker/CafeKelsa 698 | User-agent: Yeti 699 | User-agent: YodaoBot 700 | User-agent: YRSPider 701 | User-agent: Zao 702 | User-agent: Zealbot 703 | User-agent: Zeus 704 | User-agent: Zeus 11389 Webster Pro V2.9 Win32 705 | User-agent: Zeus 11652 Webster Pro V2.9 Win32 706 | User-agent: Zeus 18018 Webster Pro V2.9 Win32 707 | User-agent: Zeus 26378 Webster Pro V2.9 Win32 708 | User-agent: Zeus 30747 Webster Pro V2.9 Win32 709 | User-agent: Zeus 32297 Webster Pro V2.9 Win32 710 | User-agent: Zeus 39206 Webster Pro V2.9 Win32 711 | User-agent: Zeus 41641 Webster Pro 
V2.9 Win32 712 | User-agent: Zeus 44238 Webster Pro V2.9 Win32 713 | User-agent: Zeus 51070 Webster Pro V2.9 Win32 714 | User-agent: Zeus 51674 Webster Pro V2.9 Win32 715 | User-agent: Zeus 51837 Webster Pro V2.9 Win32 716 | User-agent: Zeus 63567 Webster Pro V2.9 Win32 717 | User-agent: Zeus 6694 Webster Pro V2.9 Win32 718 | User-agent: Zeus 71129 Webster Pro V2.9 Win32 719 | User-agent: Zeus 82016 Webster Pro V2.9 Win32 720 | User-agent: Zeus 82900 Webster Pro V2.9 Win32 721 | User-agent: Zeus 84842 Webster Pro V2.9 Win32 722 | User-agent: Zeus 90872 Webster Pro V2.9 Win32 723 | User-agent: Zeus 94934 Webster Pro V2.9 Win32 724 | User-agent: Zeus 95245 Webster Pro V2.9 Win32 725 | User-agent: Zeus 95351 Webster Pro V2.9 Win32 726 | User-agent: Zeus 97371 Webster Pro V2.9 Win32 727 | User-agent: Zeus Link Scout 728 | User-agent: ZyBorg 729 | Disallow: / 730 | 731 | User-agent: AhrefsBot 732 | User-agent: SemrushBot 733 | User-agent: Sogou web spider 734 | User-agent: sogou spider 735 | User-agent: MJ12bot 736 | User-agent: MJ12bot/v1.4.3 737 | Crawl-delay: 2 738 | 739 | Sitemap: https://22-lr.forumactif.com/sitemap.xml -------------------------------------------------------------------------------- /examples/robots_multiple_agents.py: -------------------------------------------------------------------------------- 1 | import robots 2 | 3 | parser = robots.RobotsParser.from_file("robots_multiple_agents.txt") 4 | 5 | if parser.errors: 6 | print("ERRORS:") 7 | print(parser.errors) 8 | 9 | if parser.errors: 10 | print("WARNINGS:") 11 | print(parser.errors) 12 | 13 | assert parser.can_fetch("GoogleBot", "/") 14 | assert parser.can_fetch("GoogleBot", "/tmp") 15 | assert not parser.can_fetch("GoogleBot", "/tmp/") 16 | 17 | assert parser.can_fetch("FacebookBot", "/") 18 | assert parser.can_fetch("FacebookBot", "/tmp") 19 | assert not parser.can_fetch("FacebookBot", "/tmp/") -------------------------------------------------------------------------------- /examples/robots_multiple_agents.txt: -------------------------------------------------------------------------------- 1 | User-agent: Mediapartners-Google 2 | Disallow: 3 | 4 | User-agent: Mediapartners-Google* 5 | Disallow: 6 | 7 | User-agent: * 8 | Disallow: /abuse 9 | Disallow: /admgt/ 10 | Disallow: /donate 11 | Disallow: /go/ 12 | Disallow: /modcp 13 | Disallow: /post 14 | Disallow: /privmsg 15 | Disallow: /spa/ 16 | Disallow: /sta/ 17 | Disallow: /bw 18 | Disallow: /dx 19 | Disallow: /topicit/index.php/connect 20 | Disallow: /calendar_scheduler.forum 21 | Noindex: /login 22 | 23 | User-agent: 008 24 | User-agent: Accoona 25 | User-agent: aipbot 26 | User-agent: aipbot* 27 | User-agent: aipbot/1.0 28 | User-agent: Alexa 29 | User-agent: Alexa Bitlybot 30 | User-agent: Alexibot 31 | User-agent: AltaVista Intranet V2.0 AVS EVAL search@freeit.com 32 | User-agent: AltaVista Intranet V2.0 Compaq Altavista Eval sveand@altavista.net 33 | User-agent: AltaVista Intranet V2.0 evreka.com crawler@evreka.com 34 | User-agent: AltaVista V2.0B crawler@evreka.com 35 | Disallow: /bad 36 | 37 | User-agent: GoogleBot 38 | User-agent: MicrosoftBot 39 | User-agent: FacebookBot 40 | Disallow: /tmp/ 41 | Disallow: /secrets/ 42 | 43 | User-agent: AhrefsBot 44 | User-agent: SemrushBot 45 | User-agent: Sogou web spider 46 | User-agent: sogou spider 47 | User-agent: MJ12bot 48 | User-agent: MJ12bot/v1.4.3 49 | Crawl-delay: 2 50 | 51 | 52 | -------------------------------------------------------------------------------- /examples/robots_string.py: 
-------------------------------------------------------------------------------- 1 | import robots 2 | 3 | r = """ 4 | # GoogleOnly_System 5 | 6 | user-agent: FooBot 7 | disallow: / 8 | 9 | BAD LINE 10 | """ 11 | 12 | parser = robots.RobotsParser.from_string(r) 13 | 14 | if parser.errors: 15 | print("ERRORS:") 16 | print(parser.errors) 17 | 18 | if parser.warnings: 19 | print("WARNINGS:") 20 | print(parser.warnings) 21 | 22 | assert not parser.can_fetch("FooBot", "/toto") 23 | 24 | r = """ 25 | # CrawlDelayAndCustomAgentTest 26 | 27 | User-agent: * 28 | Crawl-delay: 1 29 | Request-rate: 3/15 30 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 31 | 32 | # Cybermapper knows where to go. 33 | User-agent: cybermapper 34 | Disallow: 35 | """ 36 | 37 | parser = robots.RobotsParser.from_string(r) 38 | 39 | if parser.errors: 40 | print("ERRORS:") 41 | print(parser.errors) 42 | 43 | if parser.warnings: 44 | print("WARNINGS:") 45 | print(parser.warnings) 46 | 47 | assert parser.can_fetch("cybermapper", "/cyberworld/map/index.html") 48 | -------------------------------------------------------------------------------- /examples/robots_url.py: -------------------------------------------------------------------------------- 1 | # Content of http://www.musi-cal.com/robots.txt: 2 | """ 3 | User-agent: * 4 | Disallow: /wp-admin/ 5 | Allow: /wp-admin/admin-ajax.php 6 | """ 7 | 8 | # The first implementation is using the Python standard library urllib.robotparser 9 | 10 | import urllib.robotparser 11 | import robots 12 | 13 | rp = urllib.robotparser.RobotFileParser() 14 | rp.set_url("http://www.musi-cal.com/robots.txt") 15 | rp.read() 16 | 17 | assert rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco") 18 | assert rp.can_fetch("*", "http://www.musi-cal.com/") 19 | assert not rp.can_fetch("*", "http://www.musi-cal.com/wp-admin/") 20 | assert not rp.can_fetch("*", "/wp-admin/") 21 | 22 | # The second implementation is using the robotspy thin layer supporting the same API as 23 | # the Python standard library urllib.robotparser 24 | 25 | parser = robots.RobotFileParser() 26 | parser.set_url("http://www.musi-cal.com/robots.txt") 27 | parser.read() 28 | 29 | assert parser.can_fetch( 30 | "*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco" 31 | ) 32 | assert parser.can_fetch("*", "http://www.musi-cal.com/") 33 | assert not parser.can_fetch("*", "http://www.musi-cal.com/wp-admin/") 34 | assert not parser.can_fetch("*", "/wp-admin/") 35 | 36 | # The third implementation is directly using robots.RobotsParser 37 | 38 | parser = robots.RobotsParser.from_uri("http://www.musi-cal.com/robots.txt") 39 | 40 | if parser.errors: 41 | print("ERRORS:") 42 | print(parser.errors) 43 | 44 | if parser.warnings: 45 | print("WARNINGS:") 46 | print(parser.warnings) 47 | 48 | assert parser.can_fetch( 49 | "*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco" 50 | ) 51 | assert parser.can_fetch("*", "http://www.musi-cal.com/") 52 | assert not parser.can_fetch("*", "http://www.musi-cal.com/wp-admin/") 53 | assert not parser.can_fetch("*", "/wp-admin/") 54 | 55 | # Example with a custom timeout 56 | parser = robots.RobotsParser.from_uri("https://robotspy.org/robots.txt", 2) 57 | 58 | if parser.errors: 59 | print("ERRORS:") 60 | print(parser.errors) 61 | 62 | if parser.warnings: 63 | print("WARNINGS:") 64 | print(parser.warnings) 65 | 66 | assert parser.can_fetch( 67 | "Googlebot", "https://robotspy.org/" 68 | ) 69 | assert parser.can_fetch("*", "https://robotspy.org/") 70 | 71
| # Set a 0 timeout should result in an error 72 | 73 | parser = robots.RobotsParser.from_uri("https://robotspy.org/robots.txt", 0) 74 | assert parser.errors 75 | if parser.errors: 76 | print("ERRORS:") 77 | print(parser.errors) 78 | 79 | 80 | # Timeout error 81 | parser = robots.RobotsParser.from_uri("https://robotspy.org:555/robots.txt", 2) 82 | 83 | # The duration may be greater than the timeout because the urllib.request.urlopen timeout does not equate to a total timeout 84 | assert parser.errors 85 | if parser.errors: 86 | print("ERRORS:") 87 | print(parser.errors) 88 | -------------------------------------------------------------------------------- /make.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | if {%1} == {} ( 4 | goto USAGE 5 | ) 6 | 7 | if {%1} == {help} ( 8 | goto USAGE 9 | ) 10 | 11 | if {%1} == {clean} ( 12 | goto CLEAN 13 | ) 14 | 15 | if {%1} == {fmt} (black robots) & (goto :EOF) 16 | if {%1} == {attributions} (pip-licenses -d -u -f markdown -o license > ATTRIBUTIONS.md) & (goto :EOF) 17 | if {%1} == {lint} (pylint robots) & (goto :EOF) 18 | if {%1} == {test} (pytest tests -vv) & (goto :EOF) 19 | if {%1} == {tree} (pipdeptree) & (goto :EOF) 20 | if {%1} == {type} (mypy robots) & (goto :EOF) 21 | 22 | :CLEAN 23 | rmdir /q /s .cache build dist robotspy.egg-info .pytest_cache robots\__pycache__ tests\__pycache__ 24 | del /q *.bak 25 | goto :EOF 26 | 27 | :USAGE 28 | echo. 29 | echo Usage: 30 | echo. make ^ 31 | echo. 32 | echo The tasks are: 33 | echo. 34 | echo attributions Generate attribution list for software used by robotspy 35 | echo make clean Delete temp files (*.pyc), caches (__pycache__) 36 | echo make fmt Format Python files using Black (Assuming Black installed globally) 37 | echo make help Display this help message 38 | echo make lint Lint Python file using Pylint (Assuming Pylint installed globally) 39 | echo make test Execute tests 40 | echo make tree Display the dependency tree (using pipdeptree) 41 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==8.3.4 2 | -------------------------------------------------------------------------------- /robots/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module robots. 3 | """ 4 | 5 | from robots.parser import RobotsParser 6 | from robots.parser import RequestRate 7 | from robots.robotparser import RobotFileParser 8 | 9 | __version__ = "0.12.0" 10 | 11 | __all__ = ["RobotsParser", "RobotFileParser"] 12 | -------------------------------------------------------------------------------- /robots/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main module for package robots. 
The script is executed when invoking: 3 | python -m robots 4 | 5 | For help, use: 6 | python -m robots -h | --help 7 | 8 | It mimics the behavior of Google robotstxt available at: 9 | https://github.com/google/robotstxt 10 | """ 11 | 12 | import argparse 13 | import pathlib 14 | import sys 15 | import urllib.parse 16 | 17 | import robots 18 | 19 | 20 | def init_cli() -> argparse.ArgumentParser: 21 | """Initialize the argument parser to handle the command line interface.""" 22 | 23 | cli: argparse.ArgumentParser = argparse.ArgumentParser( 24 | usage="%(prog)s <robotstxt> <useragent> <path>", 25 | description=( 26 | "Shows whether a given user agent and path/url combination " 27 | "is allowed or disallowed by a given robots.txt file." 28 | ), 29 | ) 30 | cli.prog = __package__ 31 | cli.add_argument( 32 | "-v", "--version", action="version", version=f"{cli.prog} {robots.__version__}" 33 | ) 34 | cli.add_argument("robotstxt", help="robots.txt file path or URL") 35 | cli.add_argument("useragent", help="User agent name") 36 | cli.add_argument("path", help="Path or URL") 37 | 38 | return cli 39 | 40 | 41 | def is_url(path_uri: str) -> bool: 42 | """Validate if a given string is a URL.""" 43 | 44 | res = urllib.parse.urlsplit(path_uri) 45 | return res.scheme in ("http", "https", "ftp", "file") 46 | 47 | 48 | def normalize_uri(path_uri: str) -> str: 49 | """Convert any path to URI. If not a path, return the URI.""" 50 | 51 | if not isinstance(path_uri, pathlib.Path) and is_url(path_uri): 52 | return path_uri 53 | 54 | return pathlib.Path(path_uri).resolve().as_uri() 55 | 56 | 57 | def create_robots(robots_uri: str) -> robots.RobotsParser: 58 | """Instantiate a RobotsParser object with a URI.""" 59 | 60 | parser: robots.RobotsParser = robots.RobotsParser.from_uri(robots_uri) 61 | return parser 62 | 63 | 64 | def main() -> None: 65 | """Entry point for the package as a Python module (python -m)""" 66 | 67 | cli = init_cli() 68 | args = cli.parse_args() 69 | 70 | robots_uri = normalize_uri(args.robotstxt) 71 | robots_parser = create_robots(robots_uri) 72 | 73 | allowed = robots_parser.can_fetch(args.useragent, args.path) 74 | 75 | allowed_str = "ALLOWED" if allowed else "DISALLOWED" 76 | url_or_path = "url" if is_url(args.path) else "path" 77 | print(f"user-agent '{args.useragent}' with {url_or_path} '{args.path}': {allowed_str}") 78 | 79 | if errors := robots_parser.errors: 80 | for error in errors: 81 | print(f"{error[0]} -> {error[1]}", file=sys.stderr) 82 | 83 | if warnings := robots_parser.warnings: 84 | for warning in warnings: 85 | print(f"{warning[0]} -> {warning[1]}", file=sys.stderr) 86 | 87 | 88 | if __name__ == "__main__": 89 | main() 90 | -------------------------------------------------------------------------------- /robots/parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Alternate implementation of RobotParser (alternative to standard library urllib.robotparser) 3 | 4 | Reference: 5 | Robots Exclusion Protocol (REP) https://www.rfc-editor.org/rfc/rfc9309 6 | """ 7 | 8 | from typing import Dict, Iterator, List, NamedTuple, Tuple, Type, TypeVar 9 | 10 | import enum 11 | import fnmatch 12 | import re 13 | import time 14 | 15 | import urllib.parse 16 | import urllib.error 17 | import urllib.request 18 | 19 | import robots 20 | 21 | # Pattern used to validate a user agent token (not used by the parser) 22 | RE_AGENT_TOKEN = re.compile(r"^[a-zA-Z_-]+$") 23 | 24 | # Pattern to read and identify a product token, user agent.
25 | # Match up to the first invalid character (for example, stops at a number but does not error out) 26 | # Note: the hash character ('#') needs to be escaped in VERBOSE mode, otherwise it would be 27 | # interpreted as a comment. 28 | RE_AGENT = re.compile( 29 | r"^\s*user-agent\s*:?\s*(?P<AGENT>\*|[a-zA-Z_-]+)[^\s]*\s*(?:\#.*)?$", 30 | re.IGNORECASE | re.VERBOSE, 31 | ) 32 | 33 | RE_SITEMAP = re.compile( 34 | r"^\s*sitemap\s*:\s*(?P<SITEMAP>https?://[^\n\s]+)\s*$", 35 | re.IGNORECASE | re.VERBOSE, 36 | ) 37 | 38 | # Product token in the user-agent line: 39 | RE_PRODUCT = re.compile(r"^[a-zA-Z_-]+$|\*") 40 | 41 | # Rule allow 42 | RE_RULE_ALLOW = re.compile( 43 | r"^\s*(?P<RULE>allow)\s*:?\s*(?P<PATH>\*|[^\s#]+)?\s*(?:\#.*)?$", 44 | re.IGNORECASE | re.VERBOSE, 45 | ) 46 | 47 | # Rule disallow 48 | RE_RULE_DISALLOW = re.compile( 49 | r"^\s*(?P<RULE>disallow)\s*:?\s*(?P<PATH>\*|[^\s#]+)?\s*(?:\#.*)?$", 50 | re.IGNORECASE | re.VERBOSE, 51 | ) 52 | 53 | # NamedTuple used to store rules. Each record includes: 54 | # - A path or path pattern 55 | # - A boolean indicating if the given path can be accessed or not 56 | Rule = NamedTuple("Rule", [("path", str), ("allowed", bool)]) 57 | 58 | RequestRate = NamedTuple("RequestRate", [("requests", int), ("seconds", int)]) 59 | 60 | 61 | class State(enum.Enum): 62 | """Define states while parsing the robotstxt file""" 63 | 64 | BEGIN = enum.auto() # Begin parsing 65 | AGENT = enum.auto() # User-agent line 66 | RULE = enum.auto() # Rule line 67 | 68 | 69 | class TokenType(enum.Enum): 70 | """Token definitions for the parser""" 71 | 72 | AGENT = enum.auto() 73 | ALLOW = enum.auto() 74 | DISALLOW = enum.auto() 75 | SITEMAP = enum.auto() 76 | CRAWL_DELAY = enum.auto() 77 | REQ_RATE = enum.auto() 78 | UNEXPECTED = enum.auto() 79 | 80 | 81 | class Errors(enum.Enum): 82 | """Error definitions and messages""" 83 | 84 | WARNING_EMPTY_ALLOW_RULE = ( 85 | "Warning: An empty allow rule has no effect and is confusing" 86 | ) 87 | WARNING_RULE_WITHOUT_AGENT = "Warning: Rule without an agent is ignored" 88 | WARNING_NOTFOUND = "Warning: No remote robots.txt file found" 89 | WARNING_CRAWL_DELAY_IGNORED = "Warning: Directive 'crawl-delay' ignored" 90 | WARNING_REQUEST_RATE_IGNORED = "Warning: Directive 'request-rate' ignored" 91 | WARNING_UNEXPECTED_OR_IGNORED = "Warning: Unexpected or ignored token" 92 | ERROR_NO_FILE_FOUND = "Error: No file found" 93 | 94 | 95 | Token = NamedTuple("Token", [("type", TokenType), ("value", str), ("linenum", int)]) 96 | 97 | 98 | def gen_tokens(gen_func, source): 99 | """Token generator.
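For example (hypothetical two-line input), 'user-agent: FooBot' followed by 'disallow: /x/' yields Token(TokenType.AGENT, 'FooBot', 1) and then Token(TokenType.DISALLOW, '/x/', 2).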
100 | 101 | Emit tokens when parsing the content of a robots.txt 102 | """ 103 | 104 | linenum = 0 105 | for line in gen_func(source): 106 | linenum += 1 107 | # pylint: disable=superfluous-parens 108 | if not (line := line.strip()): 109 | continue # Skip empty lines 110 | if line.startswith("#"): 111 | continue # Skip line comment 112 | if m := RE_AGENT.match(line): 113 | agent = m.group("AGENT") 114 | yield Token(TokenType.AGENT, agent, linenum) 115 | elif m := RE_RULE_ALLOW.match(line): 116 | path = m.group("PATH") 117 | yield Token(TokenType.ALLOW, path, linenum) 118 | elif m := RE_RULE_DISALLOW.match(line): 119 | path = m.group("PATH") 120 | yield Token(TokenType.DISALLOW, path, linenum) 121 | elif m := RE_SITEMAP.match(line): 122 | sitemap = m.group("SITEMAP") 123 | yield Token(TokenType.SITEMAP, sitemap, linenum) 124 | else: 125 | yield Token(TokenType.UNEXPECTED, line, linenum) 126 | 127 | 128 | T = TypeVar("T", bound="RobotsParser") 129 | 130 | 131 | class RobotsParser: 132 | """Encapsulates functions and data to parse a robotstxt file and get 133 | feedback on what a crawler is allowed to access to on a given website. 134 | """ 135 | 136 | AGENT_NOT_VALID = "User agent [%s] is not valid" 137 | UNEXPECTED_LINE = "Unexpected line: [%s]" 138 | 139 | def __init__(self, url=""): 140 | self.agents_rules: Dict[str, List[Rule]] = {} 141 | self._sitemaps = [] 142 | self._errors = [] 143 | self._warnings = [] 144 | self.url = url 145 | self.disallow_all = False 146 | self.allow_all = False 147 | self.timeout = 5 148 | self._host = "" 149 | self._path = "/robots.txt" 150 | self._time = 0.0 # Time the robots.txt is fetched 151 | 152 | @property 153 | def errors(self): 154 | """Property pointing to the errors list""" 155 | return self._errors 156 | 157 | @property 158 | def warnings(self): 159 | """Property pointing to the warning list""" 160 | return self._warnings 161 | 162 | @property 163 | def url(self): 164 | """Url pointing to the robots.txt. For example: https://example.com/robots.com""" 165 | return self._url 166 | 167 | @url.setter 168 | def url(self, url): 169 | """Set the url (example: 'https://www.example.com/robots.txt'), the path of the robots.txt 170 | file, example: '/robots.txt', and the hostname, example 'www.example.com'. 171 | 172 | It discards the scheme ('http' or 'https') for compatibility with the Python standard 173 | library module 'urllib.robotparser'.""" 174 | 175 | self._url = url 176 | self._host, self._path = urllib.parse.urlparse(url)[1:3] 177 | 178 | @property 179 | def host(self): 180 | """Host of the server serving the robots.txt file.""" 181 | return self._host 182 | 183 | @property 184 | def path(self): 185 | """Path of the robots.txt file. 186 | 187 | Example: '/robots.txt'. 
188 | """ 189 | return self._path 190 | 191 | @property 192 | def timestamp(self): 193 | """Property pointing to the time the robots.txt was parsed""" 194 | return self._time 195 | 196 | @timestamp.setter 197 | def timestamp(self, timestamp): 198 | self._time = timestamp 199 | 200 | @property 201 | def sitemaps(self): 202 | """Property pointing to the private sitemaps list""" 203 | return self._sitemaps or None 204 | 205 | @classmethod 206 | def from_string(cls: Type[T], robotstxt: str) -> T: 207 | """Build a robots parser from a string representing the content of a robots.txt file.""" 208 | parser = cls() 209 | gen_string = lambda txt: (line for line in txt.split("\n")) 210 | parser.parse_tokens(gen_tokens(gen_string, robotstxt)) 211 | return parser 212 | 213 | def gen_uri(self, uri: str): 214 | """Instantiate a generator from a URI (either http:// or https:// or file:///""" 215 | 216 | try: 217 | useragent = f"robotspy/{robots.__version__}" 218 | req = urllib.request.Request(uri, headers={'User-Agent': useragent}) 219 | self.timestamp = time.time() 220 | with urllib.request.urlopen(req, timeout=self.timeout) as f: 221 | for line in f: 222 | try: 223 | yield line.decode("utf-8-sig") # uft-8-sig to handle BOM characters 224 | except UnicodeDecodeError as err: 225 | self.allow_all = True 226 | self._errors.append(("robots.txt must be UTF-8 encoded", f"{str(err)} for {uri}")) 227 | return 228 | except urllib.error.HTTPError as err: 229 | if err.code in (401, 403): 230 | self.disallow_all = True 231 | self._errors.append((str(err.code), f"{str(err)} for {uri}")) 232 | elif 400 <= err.code < 500: # Unavailable status 233 | self.allow_all = True 234 | self._warnings.append((str(err.code), f"{str(err)} for {uri}")) 235 | elif 500 <= err.code < 600: # Unreachable status 236 | self.disallow_all = True 237 | self._warnings.append((str(err.code), f"{str(err)} for {uri}")) 238 | self.timestamp = 0 239 | except urllib.error.URLError as err: # Unreachable status? 240 | self.disallow_all = True 241 | now = time.time() 242 | duration = round(now - self.timestamp) 243 | self._errors.append(("", f"{str(err)} for {uri} (duration={duration}s)")) 244 | 245 | @classmethod 246 | def from_uri(cls: Type[T], uri: str, timeout=5) -> T: 247 | """Build a robots parser given a url or uri pointing to a robots.txt.""" 248 | parser = cls() 249 | parser.timeout = timeout 250 | parser.parse_tokens(gen_tokens(parser.gen_uri, uri)) 251 | return parser 252 | 253 | def gen_file(self, filename): 254 | """Instantiate a generator from a file""" 255 | 256 | try: 257 | with open(filename) as f: 258 | for line in f: 259 | 260 | yield line 261 | except FileNotFoundError: 262 | self._errors.append((filename, Errors.ERROR_NO_FILE_FOUND.value)) 263 | 264 | @classmethod 265 | def from_file(cls: Type[T], filename: str) -> T: 266 | """Build a robots parser given a local path pointing to a robots.txt file.""" 267 | parser = cls() 268 | parser.parse_tokens(gen_tokens(parser.gen_file, filename)) 269 | return parser 270 | 271 | @staticmethod 272 | def is_agent_valid(useragent: str) -> bool: 273 | """Helper function not used internally by the parser. Useful to validate user agent token. 
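Illustrative calls (the values mirror the user agent data in tests/test_google.py):

            RobotsParser.is_agent_valid("Foobot")      # True
            RobotsParser.is_agent_valid("Foobot/2.1")  # False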
274 | 275 | https://tools.ietf.org/html/draft-koster-rep#section-2.2.1 276 | User agent in robots.txt should only include the following characters: "a-zA-Z_-" 277 | """ 278 | if RE_AGENT_TOKEN.match(useragent): 279 | return True 280 | return False 281 | 282 | def update_rules(self, agents: List[str], rules: List[Rule]) -> None: 283 | """Sort the rules for a given group. 284 | 285 | The rules are sorted with the longest path first and the allowed first in case of both 286 | 'diasallowed' and 'allowed' rule for same path 287 | See: https://tools.ietf.org/html/draft-koster-rep-00#section-3.2 288 | """ 289 | rules.sort(key=lambda x: (len(x.path), x.allowed), reverse=True) 290 | 291 | # Need to dedup agents `list(set(agents))` caused by intentional lax parsing stopping at the first invalid character. 292 | # For example, GoogleBot and GoogleBot* will both result in googlebot 293 | for agent in list(set(agents)): 294 | if existing_rules := self.agents_rules.get(agent, None): 295 | rules = ( 296 | existing_rules + rules 297 | ) # Combine rules if agent found several times 298 | rules.sort(key=lambda x: (len(x.path), x.allowed), reverse=True) 299 | self.agents_rules[agent] = rules 300 | 301 | def parse_tokens(self, tokens: Iterator) -> None: 302 | """Main function of the parser. 303 | 304 | Parse a robots.txt file and generate a data structure that can then be used by the 305 | Robots object to answer question (can_fetch?) given a URL and a robots ID. 306 | """ 307 | 308 | state = State.BEGIN 309 | current_agents: List[str] = [] 310 | current_rules: List[Rule] = [] 311 | 312 | for token in tokens: 313 | if token.type == TokenType.AGENT: 314 | if state == State.RULE: 315 | self.update_rules(current_agents, current_rules) 316 | current_agents = [] 317 | current_rules = [] 318 | state = State.AGENT 319 | current_agents.append(token.value.lower()) 320 | elif token.type in (TokenType.ALLOW, TokenType.DISALLOW): 321 | if state == State.BEGIN: 322 | self._warnings.append( 323 | ( 324 | f"line {token.linenum}", 325 | Errors.WARNING_RULE_WITHOUT_AGENT.value, 326 | ) 327 | ) 328 | continue # A rule without an agent is ignored 329 | state = State.RULE 330 | if path := token.value: 331 | current_rules.append( 332 | Rule(urllib.parse.unquote(path), token.type == TokenType.ALLOW) 333 | ) 334 | else: 335 | if token.type == TokenType.ALLOW: 336 | self._warnings.append( 337 | (f"line {token.linenum}", Errors.WARNING_EMPTY_ALLOW_RULE.value) 338 | ) 339 | elif token.type == TokenType.SITEMAP: 340 | self._sitemaps.append(token.value) 341 | else: 342 | # Unprocessed or unexpected token 343 | self._warnings.append( 344 | ( 345 | f"line {token.linenum}", 346 | f"{Errors.WARNING_UNEXPECTED_OR_IGNORED.value}: {token.value}", 347 | ) 348 | ) 349 | 350 | self.update_rules(current_agents, current_rules) 351 | 352 | def find_rules(self, agent: str) -> List[Rule]: 353 | """Return rules for a given agent. If agent is not stored, return 354 | rules for wild card agent ('*'), if no rule for '*', return empty list. 355 | """ 356 | 357 | # Crawlers MUST use case-insensitive matching to find the group that matches the product token => 358 | # convert the product token (agent) to lower case. 359 | rules = self.agents_rules.get(agent.lower(), self.agents_rules.get("*", [])) 360 | return rules 361 | 362 | @staticmethod 363 | def dedup_slash(path: str) -> str: 364 | """Replace multiple slashes in a path to one slash. 
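For instance (illustrative paths), '/a//b///c' becomes '/a/b/c', while '/to?u=https://example.com' keeps the '//' of the embedded URL.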
365 | Keep the duplicate slash after https: or http: (a URL can appear in a query string) 366 | Note: This would be a problem with other patterns like file:/// or ftp:// 367 | """ 368 | 369 | # Regex lookbehind with http or https not possible as it has to be fixed length 370 | path = re.sub(r"//+", "/", path) 371 | # Inject back double slash after any scheme contained in the query string (http or https) 372 | return re.sub(r"(https:/|http:/)", r"\1/", path) 373 | 374 | @staticmethod 375 | def normalize_url(url: str) -> Tuple[str, str]: 376 | """Normalize a URL to extract a quoted path to be used to compare with 377 | a saved rule. 378 | 379 | Returns a tuple containing the host part of the URL if any and a normalized path 380 | """ 381 | 382 | url = urllib.parse.unquote(url) 383 | result = urllib.parse.urlsplit(url) 384 | 385 | # extract the path portion of the URL as-is (e.g. preserve a standalone ?) 386 | host_url = urllib.parse.urlunsplit((result.scheme, result.netloc, "", "", "")) 387 | path = url[len(host_url):] or "/" 388 | return result.netloc, RobotsParser.dedup_slash(path) 389 | 390 | @staticmethod 391 | def startswith_pattern(path: str, pattern: str) -> bool: 392 | """A match is intended to be a 'startswith' match. To accommodate, add a 393 | star ('*') at the end of the pattern if it does not exist already. 394 | """ 395 | 396 | if pattern.endswith("$"): 397 | # When ending with '$', needs to be an exact match 398 | return fnmatch.fnmatchcase(path, pattern[:-1]) 399 | 400 | if not pattern.endswith("*"): 401 | pattern += "*" 402 | 403 | # In unix file name pattern a `?' is a single character. In a url path, it is a '?'. To take it as character 404 | # replace with [?] (https://docs.python.org/3/library/fnmatch.html) 405 | pattern = pattern.replace("?", "[?]") 406 | 407 | return fnmatch.fnmatchcase(path, pattern) 408 | 409 | # pylint: disable=too-many-return-statements 410 | def can_fetch(self, useragent: str, url: str) -> bool: 411 | """Answer the question if a user agent can fetch a particular URL. 412 | 413 | The parser checks the group of rules applying to the given robots ID (user-agent), 414 | and then check the rule that may apply to the given URL. 
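        Illustrative usage (hypothetical robots.txt content and URLs):

            parser = RobotsParser.from_string("user-agent: *\ndisallow: /admin/")
            parser.can_fetch("FooBot", "https://example.com/admin/")      # False
            parser.can_fetch("FooBot", "https://example.com/index.html")  # True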
415 | """ 416 | 417 | if self.allow_all: 418 | return True 419 | if self.disallow_all: 420 | return False 421 | 422 | host, path = RobotsParser.normalize_url(url) 423 | 424 | if host and self.host and host != self.host: 425 | return False 426 | 427 | rules = self.find_rules(useragent) 428 | 429 | for rule in rules: 430 | # $ is a special character for robots and indicate the exact end of the pattern 431 | if rule.path.endswith("$") and rule.path[:-1] == path: 432 | return rule.allowed 433 | 434 | if path.startswith(rule.path): 435 | return rule.allowed 436 | 437 | if rule.path != "*" and "*" in rule.path: 438 | if RobotsParser.startswith_pattern(path, rule.path): 439 | return rule.allowed 440 | 441 | if rule.path == "*": 442 | return rule.allowed 443 | 444 | return True 445 | 446 | def __str__(self): 447 | """Produces a robots.txt from the structure of the rules memorized by the parser.""" 448 | 449 | lines = [] 450 | for agent, rules in self.agents_rules.items(): 451 | lines.append(f"User-agent: {agent}") 452 | for rule in rules: 453 | lines.append( 454 | ("Allow" if rule.allowed else "Disallow") + ": " + rule.path 455 | ) 456 | lines.append("") 457 | 458 | if self._sitemaps: 459 | for sitemap in self._sitemaps: 460 | lines.append(f"Sitemap: {sitemap}") 461 | 462 | lines.append("") 463 | 464 | return "\n".join(lines) 465 | -------------------------------------------------------------------------------- /robots/robotparser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Thin facade in front of robots.parser to mimic the api from the Python standard library 3 | urllib.robotparser https://docs.python.org/3/library/urllib.robotparser.html 4 | """ 5 | 6 | import time 7 | from typing import List 8 | from . import parser 9 | 10 | 11 | def gen_lines(lines: List[str]): 12 | """Instantiate a generator from a list""" 13 | return (line for line in lines) 14 | 15 | 16 | class RobotFileParser(parser.RobotsParser): 17 | """Thin wrapper on RobotsParser to enable some level of compatibility with 18 | urllib.robotparser.RobotFileParser. The implementation is incomplete, for 19 | example, crawl_delay and request_rate are hard-coded to return None. The 20 | unit tests take into account the implementation.""" 21 | 22 | def set_url(self, url): 23 | """Sets the URL referring to a robots.txt file.""" 24 | self.url = url 25 | 26 | def read(self): 27 | """Populate the tokens if a URL is assigned to the url attribute""" 28 | if self.url: 29 | self.parse_tokens(parser.gen_tokens(self.gen_uri, self.url)) 30 | else: 31 | self._errors.append( 32 | ( 33 | self.url, 34 | "RobotFileParser.read requires RobotFileParser.url to be set", 35 | ) 36 | ) 37 | 38 | def parse(self, lines): 39 | """Method 'parse' compatible with urllib.robotparser.RobotFileParser. Parses the tokens 40 | given an iterator.""" 41 | self.parse_tokens(parser.gen_tokens(gen_lines, lines)) 42 | 43 | def mtime(self): 44 | """Method 'mtime' compatible with urllib.robotparser.RobotFileParser. Return the timestamp 45 | initialized when parsing a robots.txt url.""" 46 | return self.timestamp 47 | 48 | def modified(self): 49 | """Method 'modified' compatible with urllib.robotparser.RobotFileParser. When invoked, 50 | instantiate the internal timestamp to the current time.""" 51 | self.timestamp = time.time() 52 | 53 | def crawl_delay(self, _: str): 54 | """The 'crawl-delay' directive is not recognize by the Google robots parser. Ignoring it in 55 | robotspy. 
Keep this method for compatibility with urllib.robotparser.""" 56 | self._warnings.append( 57 | ("crawl-delay", parser.Errors.WARNING_CRAWL_DELAY_IGNORED) 58 | ) 59 | 60 | def request_rate(self, _: str): 61 | """The 'request-rate' directive is not recognize by the Google robots parser. Ignoring it in 62 | robotspy. Keep this method for compatibility with urllib.robotparser.""" 63 | self._warnings.append( 64 | ("request-rate", parser.Errors.WARNING_REQUEST_RATE_IGNORED) 65 | ) 66 | 67 | def site_maps(self): 68 | """Method site_maps compatible with urllib.robotparser.RobotFileParser. Return the list of 69 | sitemaps encountered while parsing a robots.txt content.""" 70 | return self.sitemaps 71 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup 3 | 4 | CWD = pathlib.Path(__file__).parent 5 | 6 | README = (CWD / "README.md").read_text() 7 | 8 | setup( 9 | name="robotspy", 10 | version="0.12.0", 11 | description="Robots Exclusion Protocol File Parser", 12 | long_description=README, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/andreburgaud/robotspy", 15 | author="Andre Burgaud", 16 | author_email="andre.burgaud@gmail.com", 17 | license="MIT", 18 | classifiers=[ 19 | "License :: OSI Approved :: MIT License", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.8", 22 | "Programming Language :: Python :: 3.9", 23 | "Programming Language :: Python :: 3.10", 24 | "Programming Language :: Python :: 3.11", 25 | "Programming Language :: Python :: 3.12", 26 | ], 27 | packages=["robots"], 28 | entry_points={ 29 | "console_scripts": [ 30 | "robots=robots.__main__:main", 31 | ] 32 | }, 33 | ) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreburgaud/robotspy/8cd173a5f1370ad9671aea3bb89456ea542ed4ee/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import robots 3 | 4 | 5 | @pytest.fixture(scope='function') 6 | def can_fetch(): 7 | def _parser(robots_txt, agent, path): 8 | p = robots.RobotsParser.from_string(robots_txt) 9 | return p.can_fetch(agent, path) 10 | return _parser 11 | 12 | 13 | def pytest_make_parametrize_id(config, val, argname): 14 | if isinstance(val, str): 15 | if not val: 16 | return f'{argname}=' 17 | output = val.strip() 18 | output = output.split('\n')[0].strip() 19 | return f"{argname}={output}" 20 | return f'{argname}={val}' 21 | -------------------------------------------------------------------------------- /tests/core.py: -------------------------------------------------------------------------------- 1 | ALLOWED = True 2 | DISALLOWED = False 3 | DEFAULT_AGENT = 'test_robotparser' # From urllib.robotparser 4 | FOOBOT_AGENT = 'FooBot' 5 | -------------------------------------------------------------------------------- /tests/test_google.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mostly tests from: 3 | https://github.com/google/robotstxt/blob/master/robots_test.cc 4 | implemented with PyTest and intended to validate the compatibility with the robots.txt 
parser 5 | from Google (written in C++) and released under https://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Each ID_ corresponds to a test or set of tests in robots_test.cc 8 | 9 | For each test a data row contains the following fields: 10 | robotstxt, useragent, url, allowed/disallowed 11 | 12 | allow/disallowed is expressed as a boolean, True/False 13 | 14 | Reference: 15 | Robots Exclusion Protocol (REP) draft-koster-rep-01 16 | https://tools.ietf.org/html/draft-koster-rep 17 | """ 18 | 19 | import pytest 20 | import robots 21 | 22 | from .core import * 23 | 24 | google_only_system = """ 25 | # GoogleOnly_System 26 | 27 | user-agent: FooBot 28 | disallow: / 29 | """ 30 | 31 | google_only_system_data = ( 32 | # Empty robots.txt: everything allowed 33 | ['', FOOBOT_AGENT, '', ALLOWED], 34 | # Empty user agent to be matched: everything allowed 35 | [google_only_system, '', '', ALLOWED], 36 | 37 | # Empty url: implicitly disallowed, see method comment for GetPathParamsQuery in robots.cc. 38 | [google_only_system, FOOBOT_AGENT, '', DISALLOWED], 39 | 40 | # All params empty: same as robots.txt empty, everything allowed. 41 | ['', '', '', ALLOWED], 42 | ) 43 | 44 | 45 | # Rules are colon separated name-value pairs. The following names are provisioned: 46 | # user-agent: 47 | # allow: 48 | # disallow: 49 | # See REP I-D section "Protocol Definition". 50 | # https://tools.ietf.org/html/draft-koster-rep#section-2.1 51 | # Google specific: webmasters sometimes miss the colon separator, but it's 52 | # obvious what they mean by "disallow /", so we assume the colon if it's missing. 53 | 54 | # Note: robotspy discards incorrect lines and does not implicitly assume that it is 55 | # a webmaster mistake if a colon (:) is missing 56 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', google_only_system_data) 57 | def test_useragent_wild_card(robots_txt, agent, path, allowed, can_fetch): 58 | assert can_fetch(robots_txt, agent, path) is allowed 59 | 60 | 61 | linesyntax_line_correct = """ 62 | # ID_LineSyntax_Line (correct) 63 | 64 | user-agent: FooBot 65 | disallow: / 66 | """ 67 | 68 | linesyntax_line_incorrect = """ 69 | # ID_LineSyntax_Line (incorrect) 70 | 71 | foo: FooBot 72 | bar: / 73 | """ 74 | 75 | linesyntax_line_incorrect_accepted = """ 76 | # ID_LineSyntax_Line (mistake - missing ":" - accepted by Google 77 | 78 | user-agent FooBot 79 | disallow / 80 | """ 81 | 82 | url = 'http://foo.bar/x/y' 83 | 84 | linesyntax_line_data = ( 85 | [linesyntax_line_correct, FOOBOT_AGENT, url, DISALLOWED], 86 | [linesyntax_line_incorrect, FOOBOT_AGENT, url, ALLOWED], 87 | [linesyntax_line_incorrect_accepted, FOOBOT_AGENT, url, DISALLOWED], 88 | ) 89 | 90 | 91 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', linesyntax_line_data) 92 | def test_linesyntax_line(robots_txt, agent, path, allowed, can_fetch): 93 | assert can_fetch(robots_txt, agent, path) is allowed 94 | 95 | 96 | # A group is one or more user-agent line followed by rules, and terminated 97 | # by a another user-agent line. Rules for same user-agents are combined 98 | # opaquely into one group. Rules outside groups are ignored. 99 | # See REP I-D section "Protocol Definition". 
100 | # https://tools.ietf.org/html/draft-koster-rep#section-2.1 101 | 102 | linesyntax_group = """ 103 | # ID_LineSyntax_Groups 104 | 105 | allow: /foo/bar/ 106 | 107 | user-agent: FooBot 108 | disallow: / 109 | allow: /x/ 110 | user-agent: BarBot 111 | disallow: / 112 | allow: /y/ 113 | 114 | 115 | allow: /w/ 116 | user-agent: BazBot 117 | 118 | user-agent: FooBot 119 | allow: /z/ 120 | disallow: / 121 | """ 122 | 123 | linesyntax_group_data = ( 124 | [linesyntax_group, FOOBOT_AGENT, 'http://foo.bar/x/b', ALLOWED], 125 | [linesyntax_group, FOOBOT_AGENT, 'http://foo.bar/z/d', ALLOWED], 126 | [linesyntax_group, FOOBOT_AGENT, 'http://foo.bar/y/c', DISALLOWED], 127 | [linesyntax_group, 'BarBot', 'http://foo.bar/y/c', ALLOWED], 128 | [linesyntax_group, 'BarBot', 'http://foo.bar/w/a', ALLOWED], 129 | [linesyntax_group, 'BarBot', 'http://foo.bar/z/d', DISALLOWED], 130 | [linesyntax_group, 'BazBot', 'http://foo.bar/z/d', ALLOWED], 131 | 132 | # Lines with rules outside groups are ignored 133 | [linesyntax_group, FOOBOT_AGENT, 'http://foo.bar/foo/bar/', DISALLOWED], 134 | [linesyntax_group, 'BarBot', 'http://foo.bar/foo/bar/', DISALLOWED], 135 | [linesyntax_group, 'BazBot', 'http://foo.bar/foo/bar/', DISALLOWED], 136 | ) 137 | 138 | 139 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', linesyntax_group_data) 140 | def test_linesyntax_group(robots_txt, agent, path, allowed, can_fetch): 141 | assert can_fetch(robots_txt, agent, path) is allowed 142 | 143 | 144 | # Robot Exclusion Protocol (REP) lines are case insensitive. 145 | # See REP I-D section "Protocol Definition". 146 | # https://tools.ietf.org/html/draft-koster-rep#section-2.1 147 | 148 | line_names_camel = """ 149 | # ID_REPLineNamesCaseInsensitive (camel) 150 | 151 | uSeR-aGeNt: FooBot 152 | AlLoW: /x/ 153 | dIsAlLoW: / 154 | """ 155 | 156 | line_names_lower = """ 157 | # ID_REPLineNamesCaseInsensitive (lower) 158 | 159 | user-agent: FooBot 160 | allow: /x/ 161 | disallow: / 162 | """ 163 | 164 | line_names_upper = """ 165 | # ID_REPLineNamesCaseInsensitive (upper) 166 | 167 | USER-AGENT: FooBot 168 | ALLOW: /x/ 169 | DISALLOW: / 170 | """ 171 | 172 | url_allowed = 'http://foo.bar/x/y' 173 | url_disallowed = 'http://foo.bar/a/b' 174 | 175 | line_names_case_insensitive_data = ( 176 | [line_names_upper, FOOBOT_AGENT, url_allowed, ALLOWED], 177 | [line_names_lower, FOOBOT_AGENT, url_allowed, ALLOWED], 178 | [line_names_camel, FOOBOT_AGENT, url_allowed, ALLOWED], 179 | [line_names_upper, FOOBOT_AGENT, url_disallowed, DISALLOWED], 180 | [line_names_lower, FOOBOT_AGENT, url_disallowed, DISALLOWED], 181 | [line_names_upper, FOOBOT_AGENT, url_disallowed, DISALLOWED], 182 | ) 183 | 184 | 185 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', line_names_case_insensitive_data) 186 | def test_line_names_case_insensitive(robots_txt, agent, path, allowed, can_fetch): 187 | assert can_fetch(robots_txt, agent, path) is allowed 188 | 189 | 190 | # A user-agent line is expected to contain only [a-zA-Z_-] characters and must 191 | # not be empty. See REP I-D section "The user-agent line". 
192 | # https://tools.ietf.org/html/draft-koster-rep#section-2.2.1 193 | # ID_VerifyValidUserAgentsToObey 194 | useragents_data = ( 195 | ['Foobot', True], 196 | ['Foobot-Bar', True], 197 | ['Foo_Bar', True], 198 | #[None, False], 199 | ['', False], 200 | ['ツ', False], 201 | ['Foobot*', False], 202 | ['Foobot/2.1', False], 203 | ['Foobot Bar', False], 204 | ) 205 | 206 | 207 | @pytest.mark.parametrize('agent,valid', useragents_data) 208 | def test_valid_agent(agent, valid): 209 | assert robots.RobotsParser.is_agent_valid(agent) is valid 210 | 211 | 212 | # The following test data is google specific as the Google robots parses the first string of the agent 213 | # and ignore the rest. robotspy is intentionally stricter. 214 | robots_upper = """ 215 | # ID_UserAgentValueCaseInsensitive (upper) 216 | 217 | User-Agent: FOO BAR 218 | Allow: /x/ 219 | Disallow: / 220 | """ 221 | 222 | robots_lower = """ 223 | # ID_UserAgentValueCaseInsensitive (lower) 224 | 225 | User-Agent: foo bar 226 | Allow: /x/ 227 | Disallow: / 228 | """ 229 | 230 | robots_camel = """ 231 | # ID_UserAgentValueCaseInsensitive (camel) 232 | 233 | User-Agent: FoO bAr 234 | Allow: /x/ 235 | Disallow: / 236 | """ 237 | 238 | url_allowed = "http://foo.bar/x/y" 239 | url_disallowed = "http://foo.bar/a/b" 240 | 241 | agent_case_insensitive_google_data = ( 242 | [robots_upper, 'Foo', url_allowed, ALLOWED], 243 | [robots_lower, 'Foo', url_allowed, ALLOWED], 244 | [robots_camel, 'Foo', url_allowed, ALLOWED], 245 | [robots_upper, 'Foo', url_disallowed, DISALLOWED], 246 | [robots_lower, 'Foo', url_disallowed, DISALLOWED], 247 | [robots_camel, 'Foo', url_disallowed, DISALLOWED], 248 | [robots_upper, 'foo', url_allowed, ALLOWED], 249 | [robots_lower, 'foo', url_allowed, ALLOWED], 250 | [robots_camel, 'foo', url_allowed, ALLOWED], 251 | [robots_upper, 'foo', url_disallowed, DISALLOWED], 252 | [robots_lower, 'foo', url_disallowed, DISALLOWED], 253 | [robots_camel, 'foo', url_disallowed, DISALLOWED], 254 | ) 255 | 256 | 257 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', agent_case_insensitive_google_data) 258 | def test_agent_case_insensitive_google(robots_txt, agent, path, allowed, can_fetch): 259 | assert can_fetch(robots_txt, agent, path) is allowed 260 | 261 | 262 | # The following test data is modified from the google test data above and eliminates the space 263 | # It allows to validate the case insensitivity of the user agent. 
264 | robots_upper = """ 265 | # ID_UserAgentValueCaseInsensitive (upper) 266 | 267 | User-Agent: FOO 268 | Allow: /x/ 269 | Disallow: / 270 | """ 271 | 272 | robots_lower = """ 273 | # ID_UserAgentValueCaseInsensitive (lower) 274 | 275 | User-Agent: foo 276 | Allow: /x/ 277 | Disallow: / 278 | """ 279 | 280 | robots_camel = """ 281 | # ID_UserAgentValueCaseInsensitive (camel) 282 | 283 | User-Agent: FoO 284 | Allow: /x/ 285 | Disallow: / 286 | """ 287 | 288 | agent_case_insensitive_data = ( 289 | [robots_upper, 'Foo', url_allowed, ALLOWED], 290 | [robots_lower, 'Foo', url_allowed, ALLOWED], 291 | [robots_camel, 'Foo', url_allowed, ALLOWED], 292 | [robots_upper, 'Foo', url_disallowed, DISALLOWED], 293 | [robots_lower, 'Foo', url_disallowed, DISALLOWED], 294 | [robots_camel, 'Foo', url_disallowed, DISALLOWED], 295 | [robots_upper, 'foo', url_allowed, ALLOWED], 296 | [robots_lower, 'foo', url_allowed, ALLOWED], 297 | [robots_camel, 'foo', url_allowed, ALLOWED], 298 | [robots_upper, 'foo', url_disallowed, DISALLOWED], 299 | [robots_lower, 'foo', url_disallowed, DISALLOWED], 300 | [robots_camel, 'foo', url_disallowed, DISALLOWED], 301 | ) 302 | 303 | 304 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', agent_case_insensitive_data) 305 | def test_agent_case_insensitive(robots_txt, agent, path, allowed, can_fetch): 306 | assert can_fetch(robots_txt, agent, path) is allowed 307 | 308 | 309 | robotstxt_global = """ 310 | # ID_GlobalGroups_Secondary 311 | 312 | user-agent: * 313 | allow: / 314 | user-agent: FooBot 315 | disallow: / 316 | """ 317 | 318 | robotstxt_only_specific = """ 319 | # ID_GlobalGroups_Secondary 320 | 321 | user-agent: FooBot 322 | allow: / 323 | user-agent: BarBot 324 | disallow: / 325 | user-agent: BazBot 326 | disallow: / 327 | """ 328 | 329 | url = 'http://foo.bar/x/y' 330 | 331 | robotstxt_global_data = ( 332 | ['', FOOBOT_AGENT, url, ALLOWED], 333 | [robotstxt_global, FOOBOT_AGENT, url, DISALLOWED], 334 | [robotstxt_global, 'BarBot', url, ALLOWED], 335 | [robotstxt_only_specific, 'QusBot', url, ALLOWED], 336 | ) 337 | 338 | 339 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', robotstxt_global_data) 340 | def test_robotstxt_global(robots_txt, agent, path, allowed, can_fetch): 341 | assert can_fetch(robots_txt, agent, path) is allowed 342 | 343 | 344 | # Matching rules against URIs is case sensitive. 345 | # See REP I-D section "The Allow and Disallow lines". 
346 | # https://tools.ietf.org/html/draft-koster-rep#section-2.2.2 347 | 348 | robots_url_lower = """ 349 | # ID_AllowDisallow_Value_CaseSensitive (lower) 350 | 351 | user-agent: FooBot 352 | disallow: /x/ 353 | """ 354 | 355 | robots_url_upper = """ 356 | # ID_AllowDisallow_Value_CaseSensitive (upper) 357 | user-agent: FooBot 358 | disallow: /X/ 359 | """ 360 | 361 | url = 'http://foo.bar/x/y' 362 | 363 | uri_case_sensitive_data = ( 364 | [robots_url_lower, FOOBOT_AGENT, url, DISALLOWED], 365 | [robots_url_upper, FOOBOT_AGENT, url, ALLOWED], 366 | ) 367 | 368 | 369 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', uri_case_sensitive_data) 370 | def test_uri_case_sensitive(robots_txt, agent, path, allowed, can_fetch): 371 | assert can_fetch(robots_txt, agent, path) is allowed 372 | 373 | 374 | longest_match_01 = """ 375 | # ID_LongestMatch 01 376 | 377 | user-agent: FooBot 378 | disallow: /x/page.html 379 | allow: /x/ 380 | """ 381 | 382 | longest_match_02 = """ 383 | # ID_LongestMatch 02 384 | 385 | user-agent: FooBot 386 | allow: /x/page.html 387 | disallow: /x/ 388 | """ 389 | 390 | longest_match_03 = """ 391 | # ID_LongestMatch 03 392 | 393 | user-agent: FooBot 394 | disallow: 395 | allow: 396 | """ 397 | 398 | longest_match_04 = """ 399 | # ID_LongestMatch 04 400 | 401 | user-agent: FooBot 402 | disallow: / 403 | allow: / 404 | """ 405 | 406 | longest_match_05 = """ 407 | # ID_LongestMatch 05 408 | 409 | user-agent: FooBot 410 | disallow: /x 411 | allow: /x/ 412 | """ 413 | 414 | longest_match_06 = """ 415 | # ID_LongestMatch 06 416 | 417 | user-agent: FooBot 418 | disallow: /x/page.html 419 | allow: /x/page.html 420 | """ 421 | 422 | longest_match_07 = """ 423 | # ID_LongestMatch 07 424 | 425 | user-agent: FooBot 426 | allow: /page 427 | disallow: /*.html 428 | """ 429 | 430 | longest_match_08 = """ 431 | # ID_LongestMatch 08 432 | 433 | user-agent: FooBot 434 | allow: /x/page. 
435 | disallow: /*.html 436 | """ 437 | 438 | longest_match_09 = """ 439 | # ID_LongestMatch 09 440 | 441 | User-agent: * 442 | Disallow: /x/ 443 | User-agent: FooBot 444 | Disallow: /y/ 445 | """ 446 | 447 | url = 'http://foo.bar/x/page.html' 448 | 449 | longest_match_data = ( 450 | [longest_match_01, FOOBOT_AGENT, url, DISALLOWED], 451 | [longest_match_02, FOOBOT_AGENT, url, ALLOWED], 452 | [longest_match_02, FOOBOT_AGENT, 'http://foo.bar/x/', DISALLOWED], 453 | [longest_match_03, FOOBOT_AGENT, url, ALLOWED], 454 | [longest_match_04, FOOBOT_AGENT, url, ALLOWED], 455 | [longest_match_05, FOOBOT_AGENT, 'http://foo.bar/x', DISALLOWED], 456 | [longest_match_05, FOOBOT_AGENT, 'http://foo.bar/x/', ALLOWED], 457 | [longest_match_06, FOOBOT_AGENT, url, ALLOWED], 458 | [longest_match_07, FOOBOT_AGENT, 'http://foo.bar/page.html', DISALLOWED], 459 | [longest_match_07, FOOBOT_AGENT, 'http://foo.bar/page', ALLOWED], 460 | [longest_match_08, FOOBOT_AGENT, url, ALLOWED], 461 | [longest_match_08, FOOBOT_AGENT, 'http://foo.bar/x/y.html', DISALLOWED], 462 | 463 | [longest_match_09, FOOBOT_AGENT, 'http://foo.bar/x/page', ALLOWED], 464 | [longest_match_09, FOOBOT_AGENT, 'http://foo.bar/y/page', DISALLOWED], 465 | 466 | ) 467 | 468 | 469 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', longest_match_data) 470 | def test_longest_match(robots_txt, agent, path, allowed, can_fetch): 471 | assert can_fetch(robots_txt, agent, path) is allowed 472 | 473 | 474 | # Octets in the URI and robots.txt paths outside the range of the US-ASCII 475 | # coded character set, and those in the reserved range defined by RFC3986, 476 | # MUST be percent-encoded as defined by RFC3986 prior to comparison. 477 | # See REP I-D section "The Allow and Disallow lines". 478 | # https://tools.ietf.org/html/draft-koster-rep#section-2.2.2 479 | # 480 | # NOTE: It's up to the caller to percent encode a URL before passing it to the 481 | # parser. Percent encoding URIs in the rules is unnecessary. 482 | 483 | 484 | encoding_01 = """ 485 | # ID_Encoding 01 486 | 487 | User-agent: FooBot 488 | Disallow: / 489 | Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par 490 | """ 491 | 492 | encoding_02 = """ 493 | # ID_Encoding 02 494 | 495 | User-agent: FooBot 496 | Disallow: / 497 | Allow: /foo/bar/ツ 498 | """ 499 | 500 | encoding_03 = """ 501 | # ID_Encoding 03 502 | 503 | User-agent: FooBot 504 | Disallow: / 505 | Allow: /foo/bar/%E3%83%84 506 | """ 507 | 508 | encoding_04 = """ 509 | # ID_Encoding 04 510 | 511 | User-agent: FooBot 512 | Disallow: / 513 | Allow: /foo/bar/%62%61%7A 514 | """ 515 | 516 | # TODO: Revisit the encoding to match Google robots? 
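# Illustration only (not used by the test data below): percent-encoding of non-ASCII
# octets is left to the caller; urllib.parse.quote is one way to do it, and this exact
# call is an assumption for the example, not something robotspy requires.
import urllib.parse

_PERCENT_ENCODED_EXAMPLE = urllib.parse.quote('/foo/bar/ツ', safe='/')  # -> '/foo/bar/%E3%83%84'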
517 | encoding_data = ( 518 | [encoding_01, FOOBOT_AGENT, 'http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par', ALLOWED], 519 | [encoding_02, FOOBOT_AGENT, 'http://foo.bar/foo/bar/%E3%83%84"', ALLOWED], 520 | [encoding_02, FOOBOT_AGENT, 'http://foo.bar/foo/bar/ツ', ALLOWED], # Google -> DISALLOWED 521 | [encoding_03, FOOBOT_AGENT, 'http://foo.bar/foo/bar/%E3%83%84', ALLOWED], 522 | [encoding_03, FOOBOT_AGENT, 'http://foo.bar/foo/bar/ツ', ALLOWED], # Google -> DISALLOWED 523 | [encoding_04, FOOBOT_AGENT, 'http://foo.bar/foo/bar/baz', ALLOWED], # Google -> DISALLOWED 524 | [encoding_04, FOOBOT_AGENT, 'http://foo.bar/foo/bar/%62%61%7A', ALLOWED], 525 | ) 526 | 527 | 528 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', encoding_data) 529 | def test_encoding(robots_txt, agent, path, allowed, can_fetch): 530 | assert can_fetch(robots_txt, agent, path) is allowed 531 | 532 | 533 | special_characters_01 = """ 534 | # ID_SpecialCharacters 01 535 | 536 | User-agent: FooBot 537 | Disallow: /foo/bar/quz 538 | Allow: /foo/*/qux 539 | """ 540 | 541 | special_characters_02 = """ 542 | # ID_SpecialCharacters 02 543 | 544 | User-agent: FooBot 545 | Disallow: /foo/bar$ 546 | Allow: /foo/bar/qux 547 | """ 548 | 549 | special_characters_03 = """ 550 | # ID_SpecialCharacters 03 551 | 552 | User-agent: FooBot 553 | # Disallow: / 554 | Disallow: /foo/quz#qux 555 | Allow: / 556 | """ 557 | 558 | special_characters_data = ( 559 | [special_characters_01, FOOBOT_AGENT, 'http://foo.bar/foo/bar/quz', DISALLOWED], 560 | [special_characters_01, FOOBOT_AGENT, 'http://foo.bar/foo/quz', ALLOWED], 561 | [special_characters_01, FOOBOT_AGENT, 'http://foo.bar/foo//quz', ALLOWED], 562 | [special_characters_01, FOOBOT_AGENT, 'http://foo.bar/foo/bax/quz', ALLOWED], 563 | [special_characters_02, FOOBOT_AGENT, 'http://foo.bar/foo/bar', DISALLOWED], 564 | [special_characters_02, FOOBOT_AGENT, 'http://foo.bar/foo/bar/qux', ALLOWED], 565 | [special_characters_02, FOOBOT_AGENT, 'http://foo.bar/foo/bar/', ALLOWED], 566 | [special_characters_02, FOOBOT_AGENT, 'http://foo.bar/foo/bar/baz', ALLOWED], 567 | [special_characters_03, FOOBOT_AGENT, 'http://foo.bar/foo/bar', ALLOWED], 568 | [special_characters_03, FOOBOT_AGENT, 'http://foo.bar/foo/quz', DISALLOWED], 569 | ) 570 | 571 | # Skip: 572 | # - GoogleOnly_IndexHTMLisDirectory 573 | # - GoogleOnly_LineTooLong 574 | 575 | google_doc_01 = """ 576 | # GoogleOnly_DocumentationChecks 01 577 | 578 | user-agent: FooBot 579 | disallow: / 580 | allow: /fish 581 | """ 582 | 583 | google_doc_02 = """ 584 | # GoogleOnly_DocumentationChecks 02 585 | 586 | user-agent: FooBot 587 | disallow: / 588 | allow: /fish* 589 | """ 590 | 591 | google_doc_03 = """ 592 | # GoogleOnly_DocumentationChecks 03 593 | 594 | user-agent: FooBot 595 | disallow: / 596 | allow: /fish/ 597 | """ 598 | 599 | google_doc_data = ( 600 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/bar', DISALLOWED], 601 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/fish', ALLOWED], 602 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/fish.html', ALLOWED], 603 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/fish/salmon.html', ALLOWED], 604 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/fishheads', ALLOWED], 605 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/fishheads/yummy.html', ALLOWED], 606 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/fish.html?id=anything', ALLOWED], 607 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/Fish.asp', DISALLOWED], 608 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/catfish', DISALLOWED], 609 
| [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/?id=fish', DISALLOWED], 610 | 611 | # "/fish*" equals "/fish" 612 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/bar', DISALLOWED], 613 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/fish', ALLOWED], 614 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/fish.html', ALLOWED], 615 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/fish/salmon.html', ALLOWED], 616 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/fishheads', ALLOWED], 617 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/fishheads/yummy.html', ALLOWED], 618 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/fish.html?id=anything', ALLOWED], 619 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/Fish.bar', DISALLOWED], 620 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/catfish', DISALLOWED], 621 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/?id=fish', DISALLOWED], 622 | 623 | # "/fish/" does not equal "/fish" 624 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/bar', DISALLOWED], 625 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish/', ALLOWED], 626 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish/salmon', ALLOWED], 627 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish/?salmon', ALLOWED], 628 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish/salmon.html', ALLOWED], 629 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish/?id=anything', ALLOWED], 630 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish', DISALLOWED], 631 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish.html', DISALLOWED], 632 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/Fish/Salmon.html', DISALLOWED], 633 | ) 634 | 635 | 636 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', google_doc_data) 637 | def test_google_doc(robots_txt, agent, path, allowed, can_fetch): 638 | assert can_fetch(robots_txt, agent, path) is allowed 639 | 640 | 641 | google_php_01 = """ 642 | # GoogleOnly_DocumentationChecks PHP 01 643 | 644 | user-agent: FooBot 645 | disallow: / 646 | allow: /*.php 647 | """ 648 | 649 | google_php_02 = """ 650 | # GoogleOnly_DocumentationChecks PHP 02 651 | 652 | user-agent: FooBot 653 | disallow: / 654 | allow: /*.php$ 655 | """ 656 | 657 | google_php_03 = """ 658 | # GoogleOnly_DocumentationChecks PHP 03 659 | 660 | user-agent: FooBot 661 | disallow: / 662 | allow: /fish*.php 663 | """ 664 | 665 | google_php_data = ( 666 | # "/*.php" 667 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/bar', DISALLOWED], 668 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/filename.php', ALLOWED], 669 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/folder/filename.php', ALLOWED], 670 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/folder/filename.php?parameters', ALLOWED], 671 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/filename.php/', ALLOWED], 672 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/index?f=filename.php/', ALLOWED], 673 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/php/', DISALLOWED], 674 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/index?php', DISALLOWED], 675 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/windows.PHP', DISALLOWED], 676 | 677 | # "/*.php$" 678 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/bar', DISALLOWED], 679 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/filename.php', ALLOWED], 680 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/folder/filename.php', ALLOWED], 681 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/filename.php?parameters', DISALLOWED], 682 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/filename.php/', DISALLOWED], 683 | [google_php_02, 
FOOBOT_AGENT, 'http://foo.bar/filename.php5', DISALLOWED], 684 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/filename?php', DISALLOWED], 685 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/aaaphpaaa', DISALLOWED], 686 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar//windows.PHP', DISALLOWED], 687 | 688 | # "/fish*.php" 689 | [google_php_03, FOOBOT_AGENT, 'http://foo.bar/bar', DISALLOWED], 690 | [google_php_03, FOOBOT_AGENT, 'http://foo.bar/fish.php', ALLOWED], 691 | [google_php_03, FOOBOT_AGENT, 'http://foo.bar/fishheads/catfish.php?parameters', ALLOWED], 692 | [google_php_03, FOOBOT_AGENT, 'http://foo.bar/Fish.PHP', DISALLOWED], 693 | ) 694 | 695 | 696 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', google_php_data) 697 | def test_google_php(robots_txt, agent, path, allowed, can_fetch): 698 | assert can_fetch(robots_txt, agent, path) is allowed 699 | 700 | 701 | # Order of precedence for group-member records 702 | order_precedence_01 = """ 703 | # GoogleOnly_DocumentationChecks 01 (Order Precedence) 704 | 705 | user-agent: FooBot 706 | allow: /folder 707 | disallow: /folder 708 | """ 709 | 710 | order_precedence_02 = """ 711 | # GoogleOnly_DocumentationChecks 02 (Order Precedence) 712 | 713 | user-agent: FooBot 714 | allow: /folder 715 | disallow: /folder 716 | """ 717 | 718 | order_precedence_03 = """ 719 | # GoogleOnly_DocumentationChecks 03 (Order Precedence) 720 | 721 | user-agent: FooBot 722 | allow: /page 723 | disallow: /*.htm 724 | """ 725 | 726 | order_precedence_04 = """ 727 | # GoogleOnly_DocumentationChecks 04 (Order Precedence) 728 | 729 | user-agent: FooBot 730 | allow: /$ 731 | disallow: / 732 | """ 733 | 734 | order_precedence_data = ( 735 | [order_precedence_01, FOOBOT_AGENT, 'http://example.com/page', ALLOWED], 736 | [order_precedence_02, FOOBOT_AGENT, 'http://example.com/folder/page', ALLOWED], 737 | [order_precedence_03, FOOBOT_AGENT, 'http://example.com/page.htm', DISALLOWED], 738 | [order_precedence_04, FOOBOT_AGENT, 'http://example.com/', ALLOWED], 739 | [order_precedence_04, FOOBOT_AGENT, 'http://example.com/page.html', DISALLOWED], 740 | ) 741 | 742 | 743 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', order_precedence_data) 744 | def test_order_precedence(robots_txt, agent, path, allowed, can_fetch): 745 | assert can_fetch(robots_txt, agent, path) is allowed 746 | -------------------------------------------------------------------------------- /tests/test_google_correctness.py: -------------------------------------------------------------------------------- 1 | 2 | # Code generated from https://github.com/google/robotstxt-spec-test/tree/master/src/main/resources/CTC/ 3 | 4 | 5 | import pytest 6 | from .core import * 7 | 8 | 9 | robots_txt_matching_path_values_20 = """ 10 | user-agent: FooBot 11 | disallow: / 12 | allow: /*.php 13 | """ 14 | 15 | data_matching_path_values_20 = ( 16 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/bar", DISALLOWED], 17 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/filename.php", ALLOWED], 18 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/folder/filename.php", ALLOWED], 19 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/folder/filename.php?parameters", ALLOWED], 20 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar//folder/any.php.file.html", ALLOWED], 21 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/filename.php/", ALLOWED], 22 | [robots_txt_matching_path_values_20, "FooBot", 
"http://foo.bar/index?f=filename.php/", ALLOWED], 23 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/php/", DISALLOWED], 24 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/index?php", DISALLOWED], 25 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/windows.PHP", DISALLOWED], 26 | ) 27 | 28 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_matching_path_values_20) 29 | def test_google_correctness_matching_path_values_20(robots_txt, agent, path, allowed, can_fetch): 30 | assert can_fetch(robots_txt, agent, path) is allowed 31 | 32 | 33 | robots_txt_matching_path_values_21 = """ 34 | user-agent: FooBot 35 | disallow: / 36 | allow: /*.php$ 37 | """ 38 | 39 | data_matching_path_values_21 = ( 40 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/bar", DISALLOWED], 41 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/filename.php", ALLOWED], 42 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/folder/filename.php", ALLOWED], 43 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/filename.php?parameters", DISALLOWED], 44 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/filename.php/", DISALLOWED], 45 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/filename.php5", DISALLOWED], 46 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/php/", DISALLOWED], 47 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/filename?php", DISALLOWED], 48 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/aaaphpaaa", DISALLOWED], 49 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar//windows.PHP", DISALLOWED], 50 | ) 51 | 52 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_matching_path_values_21) 53 | def test_google_correctness_matching_path_values_21(robots_txt, agent, path, allowed, can_fetch): 54 | assert can_fetch(robots_txt, agent, path) is allowed 55 | 56 | 57 | robots_txt_matching_path_values_22 = """ 58 | user-agent: FooBot 59 | disallow: / 60 | allow: /fish*.php 61 | """ 62 | 63 | data_matching_path_values_22 = ( 64 | [robots_txt_matching_path_values_22, "FooBot", "http://foo.bar/bar", DISALLOWED], 65 | [robots_txt_matching_path_values_22, "FooBot", "http://foo.bar/fish.php", ALLOWED], 66 | [robots_txt_matching_path_values_22, "FooBot", "http://foo.bar/fishheads/catfish.php?parameters", ALLOWED], 67 | [robots_txt_matching_path_values_22, "FooBot", "http://foo.bar/Fish.PHP", DISALLOWED], 68 | ) 69 | 70 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_matching_path_values_22) 71 | def test_google_correctness_matching_path_values_22(robots_txt, agent, path, allowed, can_fetch): 72 | assert can_fetch(robots_txt, agent, path) is allowed 73 | 74 | 75 | robots_txt_BOM_characters0 = """ 76 | User-Agent: foo 77 | Disallow: /AnyValue 78 | """ 79 | 80 | data_BOM_characters0 = ( 81 | [robots_txt_BOM_characters0, "foo", "http://example.com/AnyValue", DISALLOWED], 82 | ) 83 | 84 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_BOM_characters0) 85 | def test_google_correctness_BOM_characters0(robots_txt, agent, path, allowed, can_fetch): 86 | assert can_fetch(robots_txt, agent, path) is allowed 87 | 88 | 89 | robots_txt_BOM_characters1 = """ 90 | User-Agent: foo 91 | Disallow: /AnyValue 92 | """ 93 | 94 | data_BOM_characters1 = ( 95 | [robots_txt_BOM_characters1, "foo", "http://example.com/AnyValue", DISALLOWED], 96 | ) 97 | 98 | 
@pytest.mark.parametrize('robots_txt,agent,path,allowed', data_BOM_characters1) 99 | def test_google_correctness_BOM_characters1(robots_txt, agent, path, allowed, can_fetch): 100 | assert can_fetch(robots_txt, agent, path) is allowed 101 | 102 | 103 | robots_txt_BOM_characters2 = """ 104 | User-Agent: foo 105 | Disallow: /AnyValue 106 | """ 107 | 108 | data_BOM_characters2 = ( 109 | [robots_txt_BOM_characters2, "foo", "http://example.com/AnyValue", DISALLOWED], 110 | ) 111 | 112 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_BOM_characters2) 113 | def test_google_correctness_BOM_characters2(robots_txt, agent, path, allowed, can_fetch): 114 | assert can_fetch(robots_txt, agent, path) is allowed 115 | 116 | 117 | robots_txt_BOM_characters3 = """ 118 | User-Agent: foo 119 | Disallow: /AnyValue 120 | """ 121 | 122 | data_BOM_characters3 = ( 123 | # [robots_txt_BOM_characters3, "foo", "http://example.com/AnyValue", ALLOWED], # Fails Google correctness 124 | ) 125 | 126 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_BOM_characters3) 127 | def test_google_correctness_BOM_characters3(robots_txt, agent, path, allowed, can_fetch): 128 | assert can_fetch(robots_txt, agent, path) is allowed 129 | 130 | 131 | robots_txt_BOM_characters4 = """ 132 | User-Agent: foo 133 | Disallow: /AnyValue 134 | """ 135 | 136 | data_BOM_characters4 = ( 137 | # [robots_txt_BOM_characters4, "foo", "http://example.com/AnyValue", ALLOWED], # Fails Google correctness 138 | ) 139 | 140 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_BOM_characters4) 141 | def test_google_correctness_BOM_characters4(robots_txt, agent, path, allowed, can_fetch): 142 | assert can_fetch(robots_txt, agent, path) is allowed 143 | 144 | 145 | robots_txt_empty_string0 = """ 146 | 147 | """ 148 | 149 | data_empty_string0 = ( 150 | [robots_txt_empty_string0, "FooBot", "", ALLOWED], 151 | [robots_txt_empty_string0, "", "", ALLOWED], 152 | ) 153 | 154 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_empty_string0) 155 | def test_google_correctness_empty_string0(robots_txt, agent, path, allowed, can_fetch): 156 | assert can_fetch(robots_txt, agent, path) is allowed 157 | 158 | 159 | robots_txt_empty_string1 = """ 160 | user-agent: FooBot 161 | disallow: / 162 | """ 163 | 164 | data_empty_string1 = ( 165 | [robots_txt_empty_string1, "", "", ALLOWED], 166 | [robots_txt_empty_string1, "FooBot", "", DISALLOWED], 167 | ) 168 | 169 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_empty_string1) 170 | def test_google_correctness_empty_string1(robots_txt, agent, path, allowed, can_fetch): 171 | assert can_fetch(robots_txt, agent, path) is allowed 172 | 173 | 174 | robots_txt_accepted_mistakes0 = """ 175 | user-agent: FooBot 176 | disallow: / 177 | """ 178 | 179 | data_accepted_mistakes0 = ( 180 | [robots_txt_accepted_mistakes0, "FooBot", "http://foo.bar/x/y", DISALLOWED], 181 | ) 182 | 183 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_accepted_mistakes0) 184 | def test_google_correctness_accepted_mistakes0(robots_txt, agent, path, allowed, can_fetch): 185 | assert can_fetch(robots_txt, agent, path) is allowed 186 | 187 | 188 | robots_txt_accepted_mistakes1 = """ 189 | foo: FooBot 190 | bar: / 191 | """ 192 | 193 | data_accepted_mistakes1 = ( 194 | [robots_txt_accepted_mistakes1, "FooBot", "http://foo.bar/x/y", ALLOWED], 195 | ) 196 | 197 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_accepted_mistakes1) 198 | def 
test_google_correctness_accepted_mistakes1(robots_txt, agent, path, allowed, can_fetch): 199 | assert can_fetch(robots_txt, agent, path) is allowed 200 | 201 | 202 | robots_txt_accepted_mistakes2 = """ 203 | user-agent FooBot 204 | disallow / 205 | """ 206 | 207 | data_accepted_mistakes2 = ( 208 | [robots_txt_accepted_mistakes2, "FooBot", "http://foo.bar/x/y", DISALLOWED], 209 | ) 210 | 211 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_accepted_mistakes2) 212 | def test_google_correctness_accepted_mistakes2(robots_txt, agent, path, allowed, can_fetch): 213 | assert can_fetch(robots_txt, agent, path) is allowed 214 | 215 | 216 | robots_txt_uri_case_sensitivity0 = """ 217 | user-agent: FooBot 218 | disallow: /X/ 219 | """ 220 | 221 | data_uri_case_sensitivity0 = ( 222 | [robots_txt_uri_case_sensitivity0, "FooBot", "http://foo.bar/x/y", ALLOWED], 223 | ) 224 | 225 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_uri_case_sensitivity0) 226 | def test_google_correctness_uri_case_sensitivity0(robots_txt, agent, path, allowed, can_fetch): 227 | assert can_fetch(robots_txt, agent, path) is allowed 228 | 229 | 230 | robots_txt_uri_case_sensitivity1 = """ 231 | user-agent: FooBot 232 | disallow: /x/ 233 | """ 234 | 235 | data_uri_case_sensitivity1 = ( 236 | [robots_txt_uri_case_sensitivity1, "FooBot", "http://foo.bar/x/y", DISALLOWED], 237 | ) 238 | 239 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_uri_case_sensitivity1) 240 | def test_google_correctness_uri_case_sensitivity1(robots_txt, agent, path, allowed, can_fetch): 241 | assert can_fetch(robots_txt, agent, path) is allowed 242 | 243 | 244 | robots_txt_global_rules0 = """ 245 | 246 | """ 247 | 248 | data_global_rules0 = ( 249 | [robots_txt_global_rules0, "FooBot", "http://foo.bar/x/y", ALLOWED], 250 | ) 251 | 252 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_global_rules0) 253 | def test_google_correctness_global_rules0(robots_txt, agent, path, allowed, can_fetch): 254 | assert can_fetch(robots_txt, agent, path) is allowed 255 | 256 | 257 | robots_txt_global_rules1 = """ 258 | user-agent: * 259 | disallow: /x 260 | user-agent: FooBot 261 | allow: /x/y 262 | """ 263 | 264 | data_global_rules1 = ( 265 | [robots_txt_global_rules1, "FooBot", "http://foo.bar/x/y", ALLOWED], 266 | [robots_txt_global_rules1, "BarBot", "http://foo.bar/x/y", DISALLOWED], 267 | ) 268 | 269 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_global_rules1) 270 | def test_google_correctness_global_rules1(robots_txt, agent, path, allowed, can_fetch): 271 | assert can_fetch(robots_txt, agent, path) is allowed 272 | 273 | 274 | robots_txt_global_rules2 = """ 275 | user-agent: FooBot 276 | allow: / 277 | user-agent: BarBot 278 | disallow: / 279 | user-agent: BazBot 280 | disallow: / 281 | """ 282 | 283 | data_global_rules2 = ( 284 | [robots_txt_global_rules2, "QuxBot", "http://foo.bar/x/y", ALLOWED], 285 | ) 286 | 287 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_global_rules2) 288 | def test_google_correctness_global_rules2(robots_txt, agent, path, allowed, can_fetch): 289 | assert can_fetch(robots_txt, agent, path) is allowed 290 | 291 | 292 | robots_txt_non_ascii_paths0 = """ 293 | User-agent: FooBot 294 | Disallow: / 295 | Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par 296 | """ 297 | 298 | data_non_ascii_paths0 = ( 299 | [robots_txt_non_ascii_paths0, "FooBot", "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par", ALLOWED], 300 | ) 301 | 302 | 
@pytest.mark.parametrize('robots_txt,agent,path,allowed', data_non_ascii_paths0) 303 | def test_google_correctness_non_ascii_paths0(robots_txt, agent, path, allowed, can_fetch): 304 | assert can_fetch(robots_txt, agent, path) is allowed 305 | 306 | 307 | robots_txt_non_ascii_paths1 = """ 308 | User-agent: FooBot 309 | Disallow: / 310 | Allow: /foo/bar/ツ 311 | """ 312 | 313 | data_non_ascii_paths1 = ( 314 | [robots_txt_non_ascii_paths1, "FooBot", "http://foo.bar/foo/bar/%E3%83%84", ALLOWED], 315 | # [robots_txt_non_ascii_paths1, "FooBot", "http://foo.bar/foo/bar/ツ", DISALLOWED], # Fails Google correctness 316 | ) 317 | 318 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_non_ascii_paths1) 319 | def test_google_correctness_non_ascii_paths1(robots_txt, agent, path, allowed, can_fetch): 320 | assert can_fetch(robots_txt, agent, path) is allowed 321 | 322 | 323 | robots_txt_non_ascii_paths2 = """ 324 | User-agent: FooBot 325 | Disallow: / 326 | Allow: /foo/bar/%E3%83%84 327 | """ 328 | 329 | data_non_ascii_paths2 = ( 330 | [robots_txt_non_ascii_paths2, "FooBot", "http://foo.bar/foo/bar/%E3%83%84", ALLOWED], 331 | # [robots_txt_non_ascii_paths2, "FooBot", "http://foo.bar/foo/bar/ツ", DISALLOWED], # Fails Google correctness 332 | ) 333 | 334 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_non_ascii_paths2) 335 | def test_google_correctness_non_ascii_paths2(robots_txt, agent, path, allowed, can_fetch): 336 | assert can_fetch(robots_txt, agent, path) is allowed 337 | 338 | 339 | robots_txt_non_ascii_paths3 = """ 340 | User-agent: FooBot 341 | Disallow: / 342 | Allow: /foo/bar/%62%61%7A 343 | """ 344 | 345 | data_non_ascii_paths3 = ( 346 | # [robots_txt_non_ascii_paths3, "FooBot", "http://foo.bar/foo/bar/baz", DISALLOWED], # Fails google correctness 347 | [robots_txt_non_ascii_paths3, "FooBot", "http://foo.bar/foo/bar/%62%61%7A", ALLOWED], 348 | ) 349 | 350 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_non_ascii_paths3) 351 | def test_google_correctness_non_ascii_paths3(robots_txt, agent, path, allowed, can_fetch): 352 | assert can_fetch(robots_txt, agent, path) is allowed 353 | 354 | 355 | robots_txt_special_characters0 = """ 356 | User-agent: FooBot 357 | Disallow: /foo/bar/quz 358 | Allow: /foo/*/qux 359 | """ 360 | 361 | data_special_characters0 = ( 362 | [robots_txt_special_characters0, "FooBot", "http://foo.bar/foo/bar/quz", DISALLOWED], 363 | [robots_txt_special_characters0, "FooBot", "http://foo.bar/foo/quz", ALLOWED], 364 | [robots_txt_special_characters0, "FooBot", "http://foo.bar/foo//quz", ALLOWED], 365 | [robots_txt_special_characters0, "FooBot", "http://foo.bar/foo/bax/quz", ALLOWED], 366 | ) 367 | 368 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_special_characters0) 369 | def test_google_correctness_special_characters0(robots_txt, agent, path, allowed, can_fetch): 370 | assert can_fetch(robots_txt, agent, path) is allowed 371 | 372 | 373 | robots_txt_special_characters1 = """ 374 | User-agent: FooBot 375 | Disallow: /foo/bar$ 376 | Allow: /foo/bar/qux 377 | """ 378 | 379 | data_special_characters1 = ( 380 | [robots_txt_special_characters1, "FooBot", "http://foo.bar/foo/bar", DISALLOWED], 381 | [robots_txt_special_characters1, "FooBot", "http://foo.bar/foo/bar/qux", ALLOWED], 382 | [robots_txt_special_characters1, "FooBot", "http://foo.bar/foo/bar/", ALLOWED], 383 | [robots_txt_special_characters1, "FooBot", "http://foo.bar/foo/bar/baz", ALLOWED], 384 | ) 385 | 386 | 
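# The special-character cases above rely on "*" acting as a wildcard and a
# trailing "$" anchoring the end of the URL, per the robots.txt matching rules
# these generated tests mirror. A small illustrative translation of such a rule
# path into a regular expression (not a quote of robots/parser.py, just the
# conventional interpretation of the spec):
#
#     import re
#
#     def pattern_to_regex(pattern: str) -> str:
#         # a trailing "$" anchors the match at the end of the URL path
#         anchored = pattern.endswith("$")
#         if anchored:
#             pattern = pattern[:-1]
#         # "*" matches any sequence of characters; everything else is literal
#         body = ".*".join(re.escape(part) for part in pattern.split("*"))
#         return body + ("$" if anchored else "")
#
# Under that reading, "/foo/bar$" matches "/foo/bar" but not "/foo/bar/" or
# "/foo/bar/baz", which is exactly what the expectations above encode.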
@pytest.mark.parametrize('robots_txt,agent,path,allowed', data_special_characters1) 387 | def test_google_correctness_special_characters1(robots_txt, agent, path, allowed, can_fetch): 388 | assert can_fetch(robots_txt, agent, path) is allowed 389 | 390 | 391 | robots_txt_special_characters2 = """ 392 | User-agent: FooBot 393 | # Disallow: / 394 | Disallow: /foo/quz#qux 395 | Allow: / 396 | """ 397 | 398 | data_special_characters2 = ( 399 | [robots_txt_special_characters2, "FooBot", "http://foo.bar/foo/bar", ALLOWED], 400 | [robots_txt_special_characters2, "FooBot", "http://foo.bar/foo/quz", DISALLOWED], 401 | ) 402 | 403 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_special_characters2) 404 | def test_google_correctness_special_characters2(robots_txt, agent, path, allowed, can_fetch): 405 | assert can_fetch(robots_txt, agent, path) is allowed 406 | 407 | 408 | robots_txt_index_page0 = """ 409 | User-Agent: * 410 | Allow: /allowed-slash/index.html 411 | Disallow: / 412 | """ 413 | 414 | data_index_page0 = ( 415 | # [robots_txt_index_page0, "foobot", "http://foo.com/allowed-slash/", ALLOWED], # google specific - fails google correcness 416 | [robots_txt_index_page0, "foobot", "http://foo.com/allowed-slash/index.htm", DISALLOWED], 417 | [robots_txt_index_page0, "foobot", "http://foo.com/allowed-slash/index.html", ALLOWED], 418 | [robots_txt_index_page0, "foobot", "http://foo.com/anyother-url", DISALLOWED], 419 | ) 420 | 421 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_index_page0) 422 | def test_google_correctness_index_page0(robots_txt, agent, path, allowed, can_fetch): 423 | assert can_fetch(robots_txt, agent, path) is allowed 424 | 425 | 426 | robots_txt_user_agent_name0 = """ 427 | User-Agent: * 428 | Disallow: / 429 | User-Agent: Foo Bar 430 | Allow: /x/ 431 | Disallow: / 432 | """ 433 | 434 | data_user_agent_name0 = ( 435 | [robots_txt_user_agent_name0, "Foo", "http://foo.bar/x/y", ALLOWED], 436 | [robots_txt_user_agent_name0, "Foo Bar", "http://foo.bar/x/y", DISALLOWED], 437 | ) 438 | 439 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_user_agent_name0) 440 | def test_google_correctness_user_agent_name0(robots_txt, agent, path, allowed, can_fetch): 441 | assert can_fetch(robots_txt, agent, path) is allowed 442 | 443 | 444 | robots_txt_user_agent_name1 = """ 445 | user-agent: FOO BAR 446 | allow: /x/ 447 | disallow: / 448 | """ 449 | 450 | data_user_agent_name1 = ( 451 | [robots_txt_user_agent_name1, "Foo", "http://foo.bar/x/y", ALLOWED], 452 | [robots_txt_user_agent_name1, "foo", "http://foo.bar/x/y", ALLOWED], 453 | [robots_txt_user_agent_name1, "Foo", "http://foo.bar/a/b", DISALLOWED], 454 | [robots_txt_user_agent_name1, "foo", "http://foo.bar/a/b", DISALLOWED], 455 | ) 456 | 457 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_user_agent_name1) 458 | def test_google_correctness_user_agent_name1(robots_txt, agent, path, allowed, can_fetch): 459 | assert can_fetch(robots_txt, agent, path) is allowed 460 | 461 | 462 | robots_txt_user_agent_name2 = """ 463 | user-agent: foo bar 464 | allow: /x/ 465 | disallow: / 466 | """ 467 | 468 | data_user_agent_name2 = ( 469 | [robots_txt_user_agent_name2, "Foo", "http://foo.bar/x/y", ALLOWED], 470 | [robots_txt_user_agent_name2, "foo", "http://foo.bar/x/y", ALLOWED], 471 | [robots_txt_user_agent_name2, "Foo", "http://foo.bar/a/b", DISALLOWED], 472 | [robots_txt_user_agent_name2, "foo", "http://foo.bar/a/b", DISALLOWED], 473 | ) 474 | 475 | 
@pytest.mark.parametrize('robots_txt,agent,path,allowed', data_user_agent_name2) 476 | def test_google_correctness_user_agent_name2(robots_txt, agent, path, allowed, can_fetch): 477 | assert can_fetch(robots_txt, agent, path) is allowed 478 | 479 | 480 | robots_txt_user_agent_name3 = """ 481 | user-agent: FoO bAr 482 | allow: /x/ 483 | disallow: / 484 | """ 485 | 486 | data_user_agent_name3 = ( 487 | [robots_txt_user_agent_name3, "Foo", "http://foo.bar/x/y", ALLOWED], 488 | [robots_txt_user_agent_name3, "foo", "http://foo.bar/x/y", ALLOWED], 489 | [robots_txt_user_agent_name3, "Foo", "http://foo.bar/a/b", DISALLOWED], 490 | [robots_txt_user_agent_name3, "foo", "http://foo.bar/a/b", DISALLOWED], 491 | ) 492 | 493 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_user_agent_name3) 494 | def test_google_correctness_user_agent_name3(robots_txt, agent, path, allowed, can_fetch): 495 | assert can_fetch(robots_txt, agent, path) is allowed 496 | 497 | 498 | robots_txt_directives_case_insensitivity0 = """ 499 | USER-AGENT: FooBot 500 | ALLOW: /x/ 501 | DISALLOW: / 502 | """ 503 | 504 | data_directives_case_insensitivity0 = ( 505 | [robots_txt_directives_case_insensitivity0, "FooBot", "http://foo.bar/x/y", ALLOWED], 506 | [robots_txt_directives_case_insensitivity0, "FooBot", "http://foo.bar/a/b", DISALLOWED], 507 | ) 508 | 509 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_directives_case_insensitivity0) 510 | def test_google_correctness_directives_case_insensitivity0(robots_txt, agent, path, allowed, can_fetch): 511 | assert can_fetch(robots_txt, agent, path) is allowed 512 | 513 | 514 | robots_txt_directives_case_insensitivity1 = """ 515 | user-agent: FooBot 516 | allow: /x/ 517 | disallow: / 518 | """ 519 | 520 | data_directives_case_insensitivity1 = ( 521 | [robots_txt_directives_case_insensitivity1, "FooBot", "http://foo.bar/x/y", ALLOWED], 522 | [robots_txt_directives_case_insensitivity1, "FooBot", "http://foo.bar/a/b", DISALLOWED], 523 | ) 524 | 525 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_directives_case_insensitivity1) 526 | def test_google_correctness_directives_case_insensitivity1(robots_txt, agent, path, allowed, can_fetch): 527 | assert can_fetch(robots_txt, agent, path) is allowed 528 | 529 | 530 | robots_txt_directives_case_insensitivity2 = """ 531 | uSeR-aGeNt: FooBot 532 | AlLoW: /x/ 533 | dIsAlLoW: / 534 | """ 535 | 536 | data_directives_case_insensitivity2 = ( 537 | [robots_txt_directives_case_insensitivity2, "FooBot", "http://foo.bar/x/y", ALLOWED], 538 | [robots_txt_directives_case_insensitivity2, "FooBot", "http://foo.bar/a/b", DISALLOWED], 539 | ) 540 | 541 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_directives_case_insensitivity2) 542 | def test_google_correctness_directives_case_insensitivity2(robots_txt, agent, path, allowed, can_fetch): 543 | assert can_fetch(robots_txt, agent, path) is allowed 544 | 545 | 546 | robots_txt_groups0 = """ 547 | allow: /foo/bar/ 548 | 549 | user-agent: FooBot 550 | disallow: / 551 | allow: /x/ 552 | user-agent: BarBot 553 | disallow: / 554 | allow: /y/ 555 | 556 | 557 | allow: /w/ 558 | user-agent: BazBot 559 | 560 | user-agent: FooBot 561 | allow: /z/ 562 | disallow: / 563 | """ 564 | 565 | data_groups0 = ( 566 | [robots_txt_groups0, "FooBot", "http://foo.bar/x/b", ALLOWED], 567 | [robots_txt_groups0, "FooBot", "http://foo.bar/z/d", ALLOWED], 568 | [robots_txt_groups0, "FooBot", "http://foo.bar/y/c", DISALLOWED], 569 | [robots_txt_groups0, "BarBot", 
"http://foo.bar/y/c", ALLOWED], 570 | [robots_txt_groups0, "BarBot", "http://foo.bar/w/a", ALLOWED], 571 | [robots_txt_groups0, "BarBot", "http://foo.bar/z/d", DISALLOWED], 572 | [robots_txt_groups0, "BazBot", "http://foo.bar/z/d", ALLOWED], 573 | [robots_txt_groups0, "FooBot", "http://foo.bar/foo/bar/", DISALLOWED], 574 | [robots_txt_groups0, "BarBot", "http://foo.bar/foo/bar/", DISALLOWED], 575 | [robots_txt_groups0, "BazBot", "http://foo.bar/foo/bar/", DISALLOWED], 576 | ) 577 | 578 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_groups0) 579 | def test_google_correctness_groups0(robots_txt, agent, path, allowed, can_fetch): 580 | assert can_fetch(robots_txt, agent, path) is allowed 581 | 582 | 583 | robots_txt_most_specific_match0 = """ 584 | user-agent: FooBot 585 | disallow: /x/page.html 586 | allow: /x/ 587 | """ 588 | 589 | data_most_specific_match0 = ( 590 | [robots_txt_most_specific_match0, "FooBot", "http://foo.bar/x/page.html", DISALLOWED], 591 | ) 592 | 593 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match0) 594 | def test_google_correctness_most_specific_match0(robots_txt, agent, path, allowed, can_fetch): 595 | assert can_fetch(robots_txt, agent, path) is allowed 596 | 597 | 598 | robots_txt_most_specific_match1 = """ 599 | user-agent: FooBot 600 | allow: /x/page.html 601 | disallow: /x/ 602 | """ 603 | 604 | data_most_specific_match1 = ( 605 | [robots_txt_most_specific_match1, "FooBot", "http://foo.bar/x/page.html", ALLOWED], 606 | [robots_txt_most_specific_match1, "FooBot", "http://foo.bar/x/", DISALLOWED], 607 | ) 608 | 609 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match1) 610 | def test_google_correctness_most_specific_match1(robots_txt, agent, path, allowed, can_fetch): 611 | assert can_fetch(robots_txt, agent, path) is allowed 612 | 613 | 614 | robots_txt_most_specific_match2 = """ 615 | user-agent: FooBot 616 | disallow: 617 | allow: 618 | """ 619 | 620 | data_most_specific_match2 = ( 621 | [robots_txt_most_specific_match2, "FooBot", "http://foo.bar/x/page.html", ALLOWED], 622 | ) 623 | 624 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match2) 625 | def test_google_correctness_most_specific_match2(robots_txt, agent, path, allowed, can_fetch): 626 | assert can_fetch(robots_txt, agent, path) is allowed 627 | 628 | 629 | robots_txt_most_specific_match3 = """ 630 | user-agent: FooBot 631 | disallow: / 632 | allow: / 633 | """ 634 | 635 | data_most_specific_match3 = ( 636 | [robots_txt_most_specific_match3, "FooBot", "http://foo.bar/x/page.html", ALLOWED], 637 | ) 638 | 639 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match3) 640 | def test_google_correctness_most_specific_match3(robots_txt, agent, path, allowed, can_fetch): 641 | assert can_fetch(robots_txt, agent, path) is allowed 642 | 643 | 644 | robots_txt_most_specific_match4 = """ 645 | user-agent: FooBot 646 | disallow: /x 647 | allow: /x/ 648 | """ 649 | 650 | data_most_specific_match4 = ( 651 | [robots_txt_most_specific_match4, "FooBot", "http://foo.bar/x", DISALLOWED], 652 | [robots_txt_most_specific_match4, "FooBot", "http://foo.bar/x/", ALLOWED], 653 | ) 654 | 655 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match4) 656 | def test_google_correctness_most_specific_match4(robots_txt, agent, path, allowed, can_fetch): 657 | assert can_fetch(robots_txt, agent, path) is allowed 658 | 659 | 660 | robots_txt_most_specific_match5 = 
""" 661 | user-agent: FooBot 662 | disallow: /x/page.html 663 | allow: /x/page.html 664 | """ 665 | 666 | data_most_specific_match5 = ( 667 | [robots_txt_most_specific_match5, "FooBot", "http://foo.bar/x/page.html", ALLOWED], 668 | ) 669 | 670 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match5) 671 | def test_google_correctness_most_specific_match5(robots_txt, agent, path, allowed, can_fetch): 672 | assert can_fetch(robots_txt, agent, path) is allowed 673 | 674 | 675 | robots_txt_most_specific_match6 = """ 676 | user-agent: FooBot 677 | allow: /page 678 | disallow: /*.html 679 | """ 680 | 681 | data_most_specific_match6 = ( 682 | [robots_txt_most_specific_match6, "FooBot", "http://foo.bar/page.html", DISALLOWED], 683 | [robots_txt_most_specific_match6, "FooBot", "http://foo.bar/page", ALLOWED], 684 | ) 685 | 686 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match6) 687 | def test_google_correctness_most_specific_match6(robots_txt, agent, path, allowed, can_fetch): 688 | assert can_fetch(robots_txt, agent, path) is allowed 689 | 690 | 691 | robots_txt_most_specific_match7 = """ 692 | user-agent: FooBot 693 | allow: /x/page. 694 | disallow: /*.html 695 | """ 696 | 697 | data_most_specific_match7 = ( 698 | [robots_txt_most_specific_match7, "FooBot", "http://foo.bar/x/page.html", ALLOWED], 699 | [robots_txt_most_specific_match7, "FooBot", "http://foo.bar/x/y.html", DISALLOWED], 700 | ) 701 | 702 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match7) 703 | def test_google_correctness_most_specific_match7(robots_txt, agent, path, allowed, can_fetch): 704 | assert can_fetch(robots_txt, agent, path) is allowed 705 | 706 | 707 | robots_txt_most_specific_match8 = """ 708 | User-agent: * 709 | Disallow: /x/ 710 | User-agent: FooBot 711 | Disallow: /y/ 712 | """ 713 | 714 | data_most_specific_match8 = ( 715 | [robots_txt_most_specific_match8, "FooBot", "http://foo.bar/x/page", ALLOWED], 716 | [robots_txt_most_specific_match8, "FooBot", "http://foo.bar/y/page", DISALLOWED], 717 | ) 718 | 719 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match8) 720 | def test_google_correctness_most_specific_match8(robots_txt, agent, path, allowed, can_fetch): 721 | assert can_fetch(robots_txt, agent, path) is allowed 722 | 723 | 724 | robots_txt_different_line_endings0 = """ 725 | User-Agent: foo 726 | Allow: /some/path 727 | User-Agent: bar 728 | 729 | 730 | Disallow: / 731 | """ 732 | 733 | data_different_line_endings0 = ( 734 | [robots_txt_different_line_endings0, "bar", "http://example.com/page", DISALLOWED], 735 | ) 736 | 737 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_different_line_endings0) 738 | def test_google_correctness_different_line_endings0(robots_txt, agent, path, allowed, can_fetch): 739 | assert can_fetch(robots_txt, agent, path) is allowed 740 | 741 | 742 | robots_txt_different_line_endings1 = """ 743 | User-Agent: foo 744 | Allow: /some/path 745 | User-Agent: bar 746 | 747 | 748 | Disallow: / 749 | """ 750 | 751 | data_different_line_endings1 = ( 752 | [robots_txt_different_line_endings1, "bar", "http://example.com/page", DISALLOWED], 753 | [robots_txt_different_line_endings1, "bar", "http://example.com/page", DISALLOWED], 754 | [robots_txt_different_line_endings1, "bar", "http://example.com/page", DISALLOWED], 755 | ) 756 | 757 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_different_line_endings1) 758 | def 
test_google_correctness_different_line_endings1(robots_txt, agent, path, allowed, can_fetch): 759 | assert can_fetch(robots_txt, agent, path) is allowed 760 | 761 | 762 | robots_txt_different_line_endings2 = """ 763 | User-Agent: foo 764 | User-Agent: bar 765 | 766 | Disallow: / 767 | """ 768 | 769 | data_different_line_endings2 = ( 770 | [robots_txt_different_line_endings2, "bar", "http://example.com/page", DISALLOWED], 771 | ) 772 | 773 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_different_line_endings2) 774 | def test_google_correctness_different_line_endings2(robots_txt, agent, path, allowed, can_fetch): 775 | assert can_fetch(robots_txt, agent, path) is allowed 776 | 777 | 778 | robots_txt_matching_path_values_10 = """ 779 | user-agent: FooBot 780 | disallow: / 781 | allow: /fish 782 | """ 783 | 784 | data_matching_path_values_10 = ( 785 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/bar", DISALLOWED], 786 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/fish", ALLOWED], 787 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/fish.html", ALLOWED], 788 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/fish/salmon.html", ALLOWED], 789 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/fishheads", ALLOWED], 790 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/fishheads/yummy.html", ALLOWED], 791 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/fish.html?id=anything", ALLOWED], 792 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/Fish.asp", DISALLOWED], 793 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/catfish", DISALLOWED], 794 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/?id=fish", DISALLOWED], 795 | ) 796 | 797 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_matching_path_values_10) 798 | def test_google_correctness_matching_path_values_10(robots_txt, agent, path, allowed, can_fetch): 799 | assert can_fetch(robots_txt, agent, path) is allowed 800 | 801 | 802 | robots_txt_matching_path_values_11 = """ 803 | user-agent: FooBot 804 | disallow: / 805 | allow: /fish* 806 | """ 807 | 808 | data_matching_path_values_11 = ( 809 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/bar", DISALLOWED], 810 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/fish", ALLOWED], 811 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/fish.html", ALLOWED], 812 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/fish/salmon.html", ALLOWED], 813 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/fishheads", ALLOWED], 814 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/fishheads/yummy.html", ALLOWED], 815 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/fish.html?id=anything", ALLOWED], 816 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/Fish.bar", DISALLOWED], 817 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/catfish", DISALLOWED], 818 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/?id=fish", DISALLOWED], 819 | ) 820 | 821 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_matching_path_values_11) 822 | def test_google_correctness_matching_path_values_11(robots_txt, agent, path, allowed, can_fetch): 823 | assert can_fetch(robots_txt, agent, path) is allowed 824 | 825 | 826 | robots_txt_matching_path_values_12 = """ 827 | user-agent: FooBot 828 | 
disallow: / 829 | allow: /fish/ 830 | """ 831 | 832 | data_matching_path_values_12 = ( 833 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/bar", DISALLOWED], 834 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish/", ALLOWED], 835 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish/salmon", ALLOWED], 836 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish/?salmon", ALLOWED], 837 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish/salmon.html", ALLOWED], 838 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish/?id=anything", ALLOWED], 839 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish", DISALLOWED], 840 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish.html", DISALLOWED], 841 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/Fish/Salmon.html", DISALLOWED], 842 | ) 843 | 844 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_matching_path_values_12) 845 | def test_google_correctness_matching_path_values_12(robots_txt, agent, path, allowed, can_fetch): 846 | assert can_fetch(robots_txt, agent, path) is allowed 847 | 848 | 849 | robots_txt_order_of_precedence0 = """ 850 | user-agent: FooBot 851 | allow: /p 852 | disallow: / 853 | """ 854 | 855 | data_order_of_precedence0 = ( 856 | [robots_txt_order_of_precedence0, "FooBot", "http://example.com/page", ALLOWED], 857 | ) 858 | 859 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_order_of_precedence0) 860 | def test_google_correctness_order_of_precedence0(robots_txt, agent, path, allowed, can_fetch): 861 | assert can_fetch(robots_txt, agent, path) is allowed 862 | 863 | 864 | robots_txt_order_of_precedence1 = """ 865 | user-agent: FooBot 866 | allow: /folder 867 | disallow: /folder 868 | """ 869 | 870 | data_order_of_precedence1 = ( 871 | [robots_txt_order_of_precedence1, "FooBot", "http://example.com/folder/page", ALLOWED], 872 | ) 873 | 874 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_order_of_precedence1) 875 | def test_google_correctness_order_of_precedence1(robots_txt, agent, path, allowed, can_fetch): 876 | assert can_fetch(robots_txt, agent, path) is allowed 877 | 878 | 879 | robots_txt_order_of_precedence2 = """ 880 | user-agent: FooBot 881 | allow: /page 882 | disallow: /*.htm 883 | """ 884 | 885 | data_order_of_precedence2 = ( 886 | [robots_txt_order_of_precedence2, "FooBot", "http://example.com/page.htm", DISALLOWED], 887 | ) 888 | 889 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_order_of_precedence2) 890 | def test_google_correctness_order_of_precedence2(robots_txt, agent, path, allowed, can_fetch): 891 | assert can_fetch(robots_txt, agent, path) is allowed 892 | 893 | 894 | robots_txt_order_of_precedence3 = """ 895 | user-agent: FooBot 896 | allow: /$ 897 | disallow: / 898 | """ 899 | 900 | data_order_of_precedence3 = ( 901 | [robots_txt_order_of_precedence3, "FooBot", "http://example.com/", ALLOWED], 902 | [robots_txt_order_of_precedence3, "FooBot", "http://example.com/page.html", DISALLOWED], 903 | ) 904 | 905 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_order_of_precedence3) 906 | def test_google_correctness_order_of_precedence3(robots_txt, agent, path, allowed, can_fetch): 907 | assert can_fetch(robots_txt, agent, path) is allowed 908 | 909 | -------------------------------------------------------------------------------- /tests/test_google_stress.py: 
-------------------------------------------------------------------------------- 1 | 2 | # Code generated from https://github.com/google/robotstxt-spec-test/tree/master/src/main/resources/CTC/ 3 | 4 | 5 | import pytest 6 | from .core import * 7 | 8 | 9 | robots_txt_638845 = """ 10 | # For more information about the robots.txt standard, see: 11 | # http://www.robotstxt.org/orig.html 12 | # 13 | 14 | User-agent: * 15 | Disallow: /main/ 16 | Disallow: /store/ 17 | Disallow: /scp/ 18 | Disallow: /mods/ 19 | Disallow: /view/ 20 | Disallow: /deps/ 21 | Disallow: /setup/ 22 | Disallow: /language/ 23 | Disallow: /libs/ 24 | Disallow: /data/ 25 | Disallow: /media/ 26 | Disallow: /parts/ 27 | Disallow: /plugins/ 28 | Disallow: /help/ 29 | Disallow: /tmp/ 30 | 31 | """ 32 | 33 | data_638845 = ( 34 | [robots_txt_638845, "foobot", "http://example.com/", ALLOWED], 35 | [robots_txt_638845, "foobot", "http://example.com/index.html", ALLOWED], 36 | [robots_txt_638845, "foobot", "http://example.com/scp/data", DISALLOWED], 37 | [robots_txt_638845, "foobot", "http://example.com/medi", ALLOWED], 38 | [robots_txt_638845, "foobot", "http://example.com/media", ALLOWED], 39 | [robots_txt_638845, "foobot", "http://example.com/loogs?user=admin", ALLOWED], 40 | ) 41 | 42 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_638845) 43 | def test_google_stress_638845(robots_txt, agent, path, allowed, can_fetch): 44 | assert can_fetch(robots_txt, agent, path) is allowed 45 | 46 | 47 | robots_txt_308278 = """ 48 | User-agent: * 49 | Disallow: /asdf-login 50 | Disallow: /asdf-admin 51 | Disallow: /databack/ 52 | Disallow: /data/* 53 | Disallow: /?*/ 54 | Disallow: /author/ 55 | Disallow: /id/*/page/ 56 | Disallow: /id/*/data/ 57 | Sitemap: http://example.com/page-sitemap.xml 58 | """ 59 | 60 | data_308278 = ( 61 | [robots_txt_308278, "foobot", "http://example.com/asdf-login", DISALLOWED], 62 | [robots_txt_308278, "foobot", "http://example.com/asdf-login/", DISALLOWED], 63 | [robots_txt_308278, "foobot", "http://example.com/", ALLOWED], 64 | [robots_txt_308278, "foobot", "http://example.com/databack", ALLOWED], 65 | [robots_txt_308278, "foobot", "http://example.com/databack/recent", DISALLOWED], 66 | [robots_txt_308278, "foobot", "http://example.com/foo/?user=admin/data", ALLOWED], 67 | [robots_txt_308278, "foobot", "http://example.com/?user=admin/data", DISALLOWED], 68 | [robots_txt_308278, "foobot", "http://example.com/id/page/", ALLOWED], 69 | [robots_txt_308278, "foobot", "http://example.com/id/some/page/", DISALLOWED], 70 | [robots_txt_308278, "foobot", "http://example.com/id/some/data", ALLOWED], 71 | [robots_txt_308278, "foobot", "http://example.com/id/some/data/more", DISALLOWED], 72 | ) 73 | 74 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_308278) 75 | def test_google_stress_308278(robots_txt, agent, path, allowed, can_fetch): 76 | assert can_fetch(robots_txt, agent, path) is allowed 77 | 78 | 79 | robots_txt_943687 = """ 80 | # Today I heard something new and unmemorable 81 | # If I don’t like something, I’ll stay away from it 82 | # Everyone was busy, so I went to the movie alone 83 | # 84 | # For more information about the robots.txt standard, see: 85 | # http://www.robotstxt.org/orig.html 86 | # 87 | # For syntax checking, see: 88 | # http://example.com/robots-checker.phtml 89 | 90 | User-agent: * 91 | Disallow: /admin/ 92 | Disallow: /bin/ 93 | Disallow: /cache/ 94 | Disallow: /clion/ 95 | Disallow: /components/ 96 | Disallow: /excludes/ 97 | Disallow: /deinstallation/ 98 | 
Disallow: /layouts/ 99 | Disallow: /libraries/ 100 | Disallow: /logs/ 101 | Disallow: /plugins/ 102 | Disallow: /tmp/ 103 | 104 | """ 105 | 106 | data_943687 = ( 107 | [robots_txt_943687, "foobot", "http://www.example.com/foo/bar", ALLOWED], 108 | [robots_txt_943687, "foobot", "http://www.example.com/admin/settings", DISALLOWED], 109 | [robots_txt_943687, "foobot", "http://www.example.com/bin/sh", DISALLOWED], 110 | [robots_txt_943687, "foo-bot", "http://www.example.com/search?req=123", ALLOWED], 111 | [robots_txt_943687, "foo_bot", "http://www.example.com/log/113", ALLOWED], 112 | [robots_txt_943687, "foo_bot", "http://www.example.com/logs/113", DISALLOWED], 113 | [robots_txt_943687, "foo-bot", "http://www.example.com/example/admin", ALLOWED], 114 | [robots_txt_943687, "foobot", "http://www.example.com/admin", ALLOWED], 115 | [robots_txt_943687, "foobot", "http://www.example.com/admin/", DISALLOWED], 116 | [robots_txt_943687, "foo_bot", "http://www.example.com/dev/null", ALLOWED], 117 | [robots_txt_943687, "foo_bot", "http://www.example.com/tmp/null", DISALLOWED], 118 | ) 119 | 120 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_943687) 121 | def test_google_stress_943687(robots_txt, agent, path, allowed, can_fetch): 122 | assert can_fetch(robots_txt, agent, path) is allowed 123 | 124 | 125 | robots_txt_584234 = """ 126 | User-agent: barbot 127 | Disallow: / 128 | 129 | User-agent: bazbot 130 | Disallow: / 131 | 132 | User-agent: qux_bot 133 | Crawl-delay: 1 134 | 135 | User-agent: * 136 | Allow: / 137 | 138 | User-agent: * 139 | Crawl-delay: 1 140 | """ 141 | 142 | data_584234 = ( 143 | [robots_txt_584234, "barbot", "http://example.com/foo/bar", DISALLOWED], 144 | [robots_txt_584234, "barbot", "http://example.com/foo/foo/foo", DISALLOWED], 145 | [robots_txt_584234, "barbot", "http://example.com/index.html", DISALLOWED], 146 | [robots_txt_584234, "bazbot", "http://example.com/secrets/123", DISALLOWED], 147 | [robots_txt_584234, "bazbot", "http://example.com/log?id=113", DISALLOWED], 148 | [robots_txt_584234, "qux_bot", "http://example.com/index.html", ALLOWED], 149 | [robots_txt_584234, "qux_bot", "http://example.com/foo/bar", ALLOWED], 150 | [robots_txt_584234, "qux_bot", "http://example.com/", ALLOWED], 151 | [robots_txt_584234, "foobot", "http://example.com/foo/bar", ALLOWED], 152 | [robots_txt_584234, "foobot", "http://example.com/log?id=113", ALLOWED], 153 | ) 154 | 155 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_584234) 156 | def test_google_stress_584234(robots_txt, agent, path, allowed, can_fetch): 157 | assert can_fetch(robots_txt, agent, path) is allowed 158 | 159 | 160 | robots_txt_912555 = """ 161 | User-Agent: * 162 | Disallow: /error$ 163 | Disallow: /jm/com.example.FooController 164 | Disallow: /log 165 | Disallow: /admin$ 166 | Disallow: /adminactions$ 167 | Disallow: /adminactions? 
168 | Disallow: /baz 169 | Disallow: /jm/com.example.BarController 170 | Sitemap: https://example.com/sitemap.xml 171 | """ 172 | 173 | data_912555 = ( 174 | [robots_txt_912555, "foobot", "http://example.com/error?user=admin", ALLOWED], 175 | [robots_txt_912555, "foobot", "http://example.com/error", DISALLOWED], 176 | [robots_txt_912555, "foo_bot", "http://example.com/search/foo", ALLOWED], 177 | [robots_txt_912555, "foo_bot", "http://example.com/log", DISALLOWED], 178 | [robots_txt_912555, "foo-bot", "http://example.com/adminactions", DISALLOWED], 179 | [robots_txt_912555, "foo-bot", "http://example.com/adminactions?id=123", DISALLOWED], 180 | [robots_txt_912555, "foo-bot", "http://example.com/adminactions/new", ALLOWED], 181 | [robots_txt_912555, "foobot", "http://example.com/jm/test.txt", ALLOWED], 182 | [robots_txt_912555, "foobot", "http://example.com/jm/com.example.BarController", DISALLOWED], 183 | [robots_txt_912555, "foobot", "http://example.com/foo/bar", ALLOWED], 184 | ) 185 | 186 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_912555) 187 | def test_google_stress_912555(robots_txt, agent, path, allowed, can_fetch): 188 | assert can_fetch(robots_txt, agent, path) is allowed 189 | 190 | 191 | robots_txt_174022 = """ 192 | User-agent: * 193 | Disallow: /view-responses.html 194 | Disallow: /help.html 195 | Disallow: /chat/reviews/view/ 196 | Disallow: /chat/view/ 197 | Disallow: /chat/view/hg/ 198 | Disallow: /chat/view/asd/ 199 | Disallow: /chat/asd/ 200 | Disallow: /chat/trackback/ 201 | Disallow: /chat/wp/ 202 | Disallow: /chat/*/reviews/view/$ 203 | Disallow: /chat/*/view/$ 204 | Disallow: /chat/*/view/hg/$ 205 | Disallow: /chat/*/view/asd/$ 206 | Disallow: /chat/*/asd/$ 207 | Disallow: /chat/*/trackback/$ 208 | Disallow: /contact-someone.html 209 | """ 210 | 211 | data_174022 = ( 212 | [robots_txt_174022, "FooBot", "http://example.com/", ALLOWED], 213 | [robots_txt_174022, "foobot", "http://example.com/search?req=123", ALLOWED], 214 | [robots_txt_174022, "Foo_Bot", "http://example.com/view-responses.html", DISALLOWED], 215 | [robots_txt_174022, "barbot", "http://example.com/chat/", ALLOWED], 216 | [robots_txt_174022, "BarBot", "http://example.com/chat/reviews/view/112", DISALLOWED], 217 | [robots_txt_174022, "BazBot", "http://example.com/chat/view", ALLOWED], 218 | [robots_txt_174022, "BazBot", "http://example.com/chat/view/hg", DISALLOWED], 219 | [robots_txt_174022, "FooBot", "http://example.com/chat/foo/bar/baz/view/", DISALLOWED], 220 | [robots_txt_174022, "barbot", "http://example.com/chat/something/asd/", DISALLOWED], 221 | [robots_txt_174022, "BarBot", "http://example.com/chat/asd/", DISALLOWED], 222 | [robots_txt_174022, "QuxBot", "http://example.com/contact-someone.html?user=foo", DISALLOWED], 223 | ) 224 | 225 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_174022) 226 | def test_google_stress_174022(robots_txt, agent, path, allowed, can_fetch): 227 | assert can_fetch(robots_txt, agent, path) is allowed 228 | 229 | 230 | robots_txt_860237 = """ 231 | User-Agent: * 232 | Crawl-delay : 60 233 | Disallow : /*baz* 234 | Disallow : /*qux* 235 | 236 | User-agent: XYZ123bot 237 | Crawl-delay : 60 238 | Disallow: / 239 | 240 | """ 241 | 242 | data_860237 = ( 243 | [robots_txt_860237, "Foobot", "http://example.com/", ALLOWED], 244 | [robots_txt_860237, "foo-bot", "http://example.com/foo/bar", ALLOWED], 245 | [robots_txt_860237, "foo_bot", "http://example.com/robots.txt", ALLOWED], 246 | [robots_txt_860237, "foo_bot", 
"http://example.com/new_baz", DISALLOWED], 247 | [robots_txt_860237, "foo_bot", "http://example.com/baz/new", DISALLOWED], 248 | [robots_txt_860237, "foo-bot", "http://example.com/move/qux/add", DISALLOWED], 249 | [robots_txt_860237, "foo_bot", "http://example.com/baznew/start", DISALLOWED], 250 | [robots_txt_860237, "Foobot", "http://example.com/foo_qux_bar", DISALLOWED], 251 | [robots_txt_860237, "XYZ123bot", "http://example.com/robots.txt", ALLOWED], 252 | [robots_txt_860237, "XYZ", "http://example.com/robots.txt", DISALLOWED], 253 | ) 254 | 255 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_860237) 256 | def test_google_stress_860237(robots_txt, agent, path, allowed, can_fetch): 257 | assert can_fetch(robots_txt, agent, path) is allowed 258 | 259 | 260 | robots_txt_777406 = """ 261 | User-agent: * 262 | Allow: / 263 | 264 | # Optimization for Baz Bot 265 | User-Agent: FunBot-Baz-Mobile 266 | User-Agent: FunBot-Baz 267 | Disallow: /_api/* 268 | Disallow: /_misc* 269 | Disallow: /media/v1/view/* 270 | 271 | Sitemap: https://www.example.com/sitemap.xml 272 | """ 273 | 274 | data_777406 = ( 275 | [robots_txt_777406, "foobot", "http://www.example.com/foo/bar", ALLOWED], 276 | [robots_txt_777406, "foo_bot", "http://www.example.com/", ALLOWED], 277 | [robots_txt_777406, "foo-bot", "http://www.example.com/robots.txt", ALLOWED], 278 | [robots_txt_777406, "FunBot-Baz-Mobile", "http://www.example.com/_api/index.html", DISALLOWED], 279 | [robots_txt_777406, "FunBot-Baz-Mobile", "http://www.example.com/_misc", DISALLOWED], 280 | [robots_txt_777406, "FunBot-Baz-Mobile", "http://www.example.com/_media/v2/foo", ALLOWED], 281 | [robots_txt_777406, "FunBot-Baz-Mobile", "http://www.example.com/media/v1/view/", DISALLOWED], 282 | [robots_txt_777406, "FunBot-Baz", "http://www.example.com/media/v1/view/foo", DISALLOWED], 283 | [robots_txt_777406, "foo-bot", "http://www.example.com/media/v1/view/foo", ALLOWED], 284 | [robots_txt_777406, "foo_bot", "http://www.example.com/_misc/index.html", ALLOWED], 285 | ) 286 | 287 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_777406) 288 | def test_google_stress_777406(robots_txt, agent, path, allowed, can_fetch): 289 | assert can_fetch(robots_txt, agent, path) is allowed 290 | 291 | 292 | robots_txt_768939 = """ 293 | User-agent: * 294 | Crawl-delay: 3500 295 | Disallow: /ab_controller 296 | Disallow: /ab_imports 297 | Disallow: /ab_content/bar 298 | Disallow: /ab_content/cache 299 | Disallow: /ab_content/baz 300 | """ 301 | 302 | data_768939 = ( 303 | [robots_txt_768939, "foobot", "http://www.example.com/ab_controller", DISALLOWED], 304 | [robots_txt_768939, "foo_bot", "http://www.example.com/ab_controller-foo", DISALLOWED], 305 | [robots_txt_768939, "foo-bot", "http://www.example.com/ab_imports/foo.txt", DISALLOWED], 306 | [robots_txt_768939, "foobot", "http://www.example.com/foo/bar", ALLOWED], 307 | [robots_txt_768939, "foobot", "http://www.example.com/ab_content/foo", ALLOWED], 308 | [robots_txt_768939, "foo_bot", "http://www.example.com/ab_content/bar/foo.bar", DISALLOWED], 309 | [robots_txt_768939, "foo-bot", "http://www.example.com/ab_content/cache-foo", DISALLOWED], 310 | [robots_txt_768939, "foo-bot", "http://www.example.com/", ALLOWED], 311 | ) 312 | 313 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_768939) 314 | def test_google_stress_768939(robots_txt, agent, path, allowed, can_fetch): 315 | assert can_fetch(robots_txt, agent, path) is allowed 316 | 317 | 318 | robots_txt_517712 = """ 319 | # Some 
comment 320 | # http://www.exapmle.com/something.html 321 | 322 | 323 | 324 | # Some more explanation to lines below 325 | # (and some line wrapping) 326 | 327 | User-agent: * 328 | Disallow: 329 | 330 | 331 | 332 | # Some comments regarding some specific robot restrictions 333 | # maybe regarding his functionality 334 | # and some website to visit 335 | # http://www.example.com/some/help/about/quxbot?arg=123 336 | 337 | User-Agent: Quxbot 338 | Disallow: /*dispatch_request$ 339 | Disallow: /*directory_ctors$ 340 | """ 341 | 342 | data_517712 = ( 343 | [robots_txt_517712, "foobot", "http://example.com/", ALLOWED], 344 | [robots_txt_517712, "FooBot", "http://example.com/search?req=123", ALLOWED], 345 | [robots_txt_517712, "foobot", "http://example.com/foo/bar/dispatch_request", ALLOWED], 346 | [robots_txt_517712, "foo-bot", "http://example.com/bar/baz/foler_ctors", ALLOWED], 347 | [robots_txt_517712, "Quxbot", "http://example.com/", ALLOWED], 348 | [robots_txt_517712, "barbot", "http://example.com/robots.txt", ALLOWED], 349 | [robots_txt_517712, "Quxbot", "http://example.com/baz/dispatch_request", DISALLOWED], 350 | [robots_txt_517712, "Quxbot", "http://example.com/baz/dispatch_request?args=123", ALLOWED], 351 | [robots_txt_517712, "Quxbot", "http://example.com/new_directory_ctors", DISALLOWED], 352 | [robots_txt_517712, "Quxbot", "http://example.com/bar/baz/directory_ctors", DISALLOWED], 353 | ) 354 | 355 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_517712) 356 | def test_google_stress_517712(robots_txt, agent, path, allowed, can_fetch): 357 | assert can_fetch(robots_txt, agent, path) is allowed 358 | 359 | 360 | robots_txt_894248 = """ 361 | User-agent: * 362 | Disallow: /ab-baz/ 363 | Allow: /ab-baz/baz-ajax.php 364 | 365 | Sitemap: https://example.com/ab-sitemap.xml 366 | """ 367 | 368 | data_894248 = ( 369 | [robots_txt_894248, "FooBot", "http://example.com/", ALLOWED], 370 | [robots_txt_894248, "Foo_Bot", "http://example.com/foo/bar.php", ALLOWED], 371 | [robots_txt_894248, "foobot", "http://example.com/ab-baz/index.htm", DISALLOWED], 372 | [robots_txt_894248, "foo-bot", "http://example.com/ab-baz/foo/bar", DISALLOWED], 373 | [robots_txt_894248, "foo_bot", "http://example.com/ab-baz/baz-ajax.php", ALLOWED], 374 | [robots_txt_894248, "foo-bot", "http://example.com/ab-baz/baz-ajax.php?user=123", ALLOWED], 375 | ) 376 | 377 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_894248) 378 | def test_google_stress_894248(robots_txt, agent, path, allowed, can_fetch): 379 | assert can_fetch(robots_txt, agent, path) is allowed 380 | 381 | 382 | robots_txt_155227 = """ 383 | User-agent: * 384 | Crawl-delay: 10 385 | # Foo 386 | Disallow: /asdf-main/ 387 | Disallow: /asdf-media/ 388 | Disallow: /asdf-shared/ 389 | # Bar 390 | Disallow: /asdf-control.php 391 | Disallow: /asdf-control-sample.php 392 | Disallow: /asdf-settings.php 393 | """ 394 | 395 | data_155227 = ( 396 | [robots_txt_155227, "foobot", "http://example.com/", ALLOWED], 397 | [robots_txt_155227, "foo_bot", "http://example.com/bar/index.html", ALLOWED], 398 | [robots_txt_155227, "foo-bot", "http://example.com/asdf-control.pdf", ALLOWED], 399 | [robots_txt_155227, "foobot", "http://example.com/asdf-control.php", DISALLOWED], 400 | [robots_txt_155227, "foobot", "http://example.com/asdf-control-sample.php", DISALLOWED], 401 | [robots_txt_155227, "foobot", "http://example.com/asdf-control-simple.php", ALLOWED], 402 | [robots_txt_155227, "FooBot", "http://example.com/asdf-settings.php", DISALLOWED], 403 | 
[robots_txt_155227, "Foo-Bot", "http://example.com/asdf-shared/index.html", DISALLOWED], 404 | ) 405 | 406 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_155227) 407 | def test_google_stress_155227(robots_txt, agent, path, allowed, can_fetch): 408 | assert can_fetch(robots_txt, agent, path) is allowed 409 | 410 | 411 | robots_txt_701159 = """ 412 | User-agent: foofoobot* 413 | Disallow: /workers/ 414 | Disallow: /media/common/ 415 | Disallow: /misc/ 416 | Disallow: /bin/ 417 | Disallow: /trash/ 418 | 419 | User-agent: barbarbot* 420 | Disallow: /workers/ 421 | Disallow: /media/common/ 422 | Disallow: /misc/ 423 | Disallow: /bin/ 424 | Disallow: /trash/ 425 | 426 | User-agent: quxbot 427 | Disallow: /workers/ 428 | Disallow: /media/common/ 429 | Disallow: /misc/ 430 | Disallow: /bin/ 431 | Disallow: /trash/ 432 | 433 | User-agent: ddbot 434 | Disallow: /workers/ 435 | Disallow: /media/common/ 436 | Disallow: /misc/ 437 | Disallow: /bin/ 438 | Disallow: /trash/ 439 | 440 | User-agent: toebot 441 | Disallow: /workers/ 442 | Disallow: /media/common/ 443 | Disallow: /misc/ 444 | Disallow: /bin/ 445 | Disallow: /trash/ 446 | 447 | User-agent: io_tester 448 | Disallow: /workers/ 449 | Disallow: /media/common/ 450 | Disallow: /misc/ 451 | Disallow: /bin/ 452 | Disallow: /trash/ 453 | 454 | User-agent: * 455 | Disallow: / 456 | 457 | 458 | Sitemap: http://www.example.com/sitemap.xml 459 | """ 460 | 461 | data_701159 = ( 462 | [robots_txt_701159, "foofoobot-exp", "http://example.com/workers/log", DISALLOWED], 463 | [robots_txt_701159, "foofoobot", "http://example.com/trash/index.html", DISALLOWED], 464 | [robots_txt_701159, "barbarbot-prod", "http://example.com/bin/bash", DISALLOWED], 465 | [robots_txt_701159, "barbarbot-prod", "http://example.com/foo/bar", DISALLOWED], 466 | [robots_txt_701159, "barbarbot", "http://example.com/bin/bash", DISALLOWED], 467 | [robots_txt_701159, "barbarbot", "http://example.com/foo/bar", ALLOWED], 468 | [robots_txt_701159, "quxbot", "http://example.com/qux/qux/qux", ALLOWED], 469 | [robots_txt_701159, "quxbot", "http://example.com/trash/view.html", DISALLOWED], 470 | [robots_txt_701159, "io_tester", "http://example.com/search?req=123", ALLOWED], 471 | [robots_txt_701159, "io_tester", "http://example.com/media/common/123", DISALLOWED], 472 | [robots_txt_701159, "foo_bot", "http://example.com/search?req=123", DISALLOWED], 473 | ) 474 | 475 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_701159) 476 | def test_google_stress_701159(robots_txt, agent, path, allowed, can_fetch): 477 | assert can_fetch(robots_txt, agent, path) is allowed 478 | 479 | 480 | robots_txt_541230 = """ 481 | User-agent: * 482 | Allow: /*.js 483 | Allow: /*.css 484 | Allow: /*.jpg 485 | Allow: /*.png 486 | Allow: /*.gif 487 | Allow: /*?page 488 | Allow: /*?ref= 489 | Disallow: /*? 
490 | Disallow: /stat/ 491 | Disallow: /id/1 492 | Disallow: /id/3 493 | Disallow: /register 494 | Disallow: /id/5 495 | Disallow: /id/7 496 | Disallow: /id/8 497 | Disallow: /id/9 498 | Disallow: /id/sub/ 499 | Disallow: /panel/ 500 | Disallow: /admin/ 501 | Disallow: /informer/ 502 | Disallow: /secure/ 503 | Disallow: /poll/ 504 | Disallow: /search/ 505 | Disallow: /abnl/ 506 | Disallow: /*_escaped_pattern_= 507 | Disallow: /*-*-*-*-321$ 508 | Disallow: /baz/order/ 509 | Disallow: /baz/printorder/ 510 | Disallow: /baz/checkout/ 511 | Disallow: /baz/user/ 512 | Disallow: /baz/search 513 | Disallow: /*0-*-0-03$ 514 | Disallow: /*-0-0- 515 | 516 | Sitemap: http://example.com/sitemap.xml 517 | Sitemap: http://example.com/sitemap-forum.xml 518 | """ 519 | 520 | data_541230 = ( 521 | [robots_txt_541230, "foobot", "http://example.com/foo.js", ALLOWED], 522 | [robots_txt_541230, "foobot", "http://example.com/foo/bar.css", ALLOWED], 523 | [robots_txt_541230, "foobot", "http://example.com/x/y/z?ref=bar", ALLOWED], 524 | [robots_txt_541230, "foobot", "http://example.com/x/y/z", ALLOWED], 525 | [robots_txt_541230, "foobot", "http://example.com/status/x", ALLOWED], 526 | [robots_txt_541230, "foobot", "http://example.com/stat/perf", DISALLOWED], 527 | [robots_txt_541230, "foobot", "http://example.com/id/13579", DISALLOWED], 528 | [robots_txt_541230, "foobot", "http://example.com/id/24680", ALLOWED], 529 | [robots_txt_541230, "foobot", "http://example.com/search/stats", DISALLOWED], 530 | [robots_txt_541230, "foobot", "http://example.com/foo_bar_escaped_pattern_=123", DISALLOWED], 531 | [robots_txt_541230, "foobot", "http://example.com/foo-bar-vaz-qux-321", DISALLOWED], 532 | [robots_txt_541230, "foobot", "http://example.com/foo-bar-vaz-qux-3216", ALLOWED], 533 | [robots_txt_541230, "foobot", "http://example.com/-0-0-312", DISALLOWED], 534 | [robots_txt_541230, "foobot", "http://example.com/baz", ALLOWED], 535 | [robots_txt_541230, "foobot", "http://example.com/baz/user/123", DISALLOWED], 536 | ) 537 | 538 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_541230) 539 | def test_google_stress_541230(robots_txt, agent, path, allowed, can_fetch): 540 | assert can_fetch(robots_txt, agent, path) is allowed 541 | 542 | 543 | robots_txt_824664 = """ 544 | Sitemap: http://example.com/sitemap.xml 545 | Sitemap: http://example.com/news-sitemap.xml 546 | User-agent: * 547 | Disallow: /controller/ 548 | Allow: /controller/admin-ajax.php 549 | """ 550 | 551 | data_824664 = ( 552 | [robots_txt_824664, "foo-bot", "http://example.com/index.html", ALLOWED], 553 | [robots_txt_824664, "foo-bot", "http://example.com/controller/index.html", DISALLOWED], 554 | [robots_txt_824664, "foo_bot", "http://example.com/controller/foo/bar/index.htm", DISALLOWED], 555 | [robots_txt_824664, "foobot", "http://example.com/controller/admin-ajax.php", ALLOWED], 556 | [robots_txt_824664, "foobot", "http://example.com/log?id=234", ALLOWED], 557 | ) 558 | 559 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_824664) 560 | def test_google_stress_824664(robots_txt, agent, path, allowed, can_fetch): 561 | assert can_fetch(robots_txt, agent, path) is allowed 562 | 563 | 564 | robots_txt_327748 = """ 565 | User-agent: asdfbot 566 | Disallow: / 567 | User-agent: * 568 | Disallow: 569 | Crawl-delay: 15 570 | Sitemap: http://example.com/sitemap.xml 571 | """ 572 | 573 | data_327748 = ( 574 | [robots_txt_327748, "foobot", "http://m.example.com/", ALLOWED], 575 | [robots_txt_327748, "FooBot", 
"http://m.example.com/foo/bar/baz.php", ALLOWED], 576 | [robots_txt_327748, "Foo_Bot", "http://m.example.com/index.html", ALLOWED], 577 | [robots_txt_327748, "asdfbot", "http://m.example.com/", DISALLOWED], 578 | [robots_txt_327748, "asdfbot", "http://m.example.com/foo/bar/baz.js", DISALLOWED], 579 | [robots_txt_327748, "asdfbot", "http://m.example.com/robots.txt", DISALLOWED], 580 | ) 581 | 582 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_327748) 583 | def test_google_stress_327748(robots_txt, agent, path, allowed, can_fetch): 584 | assert can_fetch(robots_txt, agent, path) is allowed 585 | 586 | 587 | robots_txt_278501 = """ 588 | User-agent: * 589 | Disallow: /dump-* 590 | Disallow: /vlog/dump-* 591 | Disallow: /_pcms/preview/ 592 | Disallow: /tf/manage-roles/ 593 | 594 | Sitemap: https://www.example.com/sitemap.xml 595 | Disallow: /_pcms/preview/ 596 | Disallow: /tf/manage-roles/ 597 | """ 598 | 599 | data_278501 = ( 600 | [robots_txt_278501, "foobot", "http://www.example.com/index.html", ALLOWED], 601 | [robots_txt_278501, "foo-bot", "http://www.example.com/dump-", DISALLOWED], 602 | [robots_txt_278501, "foobot", "http://www.example.com/dump", ALLOWED], 603 | [robots_txt_278501, "foo_bot", "http://www.example.com/dump-786", DISALLOWED], 604 | [robots_txt_278501, "foo-bot", "http://www.example.com/vlog/123", ALLOWED], 605 | [robots_txt_278501, "foo-bot", "http://www.example.com/vlog/dump-123", DISALLOWED], 606 | [robots_txt_278501, "foobot", "http://www.example.com/_pcms/test.txt", ALLOWED], 607 | [robots_txt_278501, "foo_bot", "http://www.example.com/_pcms/preview/test.txt", DISALLOWED], 608 | [robots_txt_278501, "foo-bot", "http://www.example.com/pcms/preview/test.txt", ALLOWED], 609 | [robots_txt_278501, "foo_bot", "http://www.example.com/tf/manage-roles/foo/bar", DISALLOWED], 610 | [robots_txt_278501, "foobot", "http://www.example.com/tf/manage-roles/", DISALLOWED], 611 | [robots_txt_278501, "foo_bot", "http://www.example.com/tf/index.html", ALLOWED], 612 | ) 613 | 614 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_278501) 615 | def test_google_stress_278501(robots_txt, agent, path, allowed, can_fetch): 616 | assert can_fetch(robots_txt, agent, path) is allowed 617 | 618 | 619 | robots_txt_974982 = """ 620 | # Some Robots Txt 621 | 622 | 623 | User-agent: * 624 | Disallow: /data 625 | Disallow: /find 626 | Disallow: /stuff$ 627 | Disallow: /stuff/ 628 | Disallow: /contacts/ 629 | Disallow: /dynamic/ 630 | Disallow:/*?creator=* 631 | Disallow:/*&creator=* 632 | Disallow:/*?finder=* 633 | Disallow:/*&finder=* 634 | Disallow:/*?locator=* 635 | Disallow:/*&locator=* 636 | Disallow:/*?viewer=* 637 | Disallow:/*&viewer=* 638 | Disallow:/*?format=json 639 | Disallow:/*&format=json 640 | Disallow:/*?format=page-context 641 | Disallow:/*&format=page-context 642 | Disallow:/*?format=main-content 643 | Disallow:/*&format=main-content 644 | Disallow:/*?format=json-pretty 645 | Disallow:/*&format=json-pretty 646 | Disallow:/*?format=ical 647 | Disallow:/*&format=ical 648 | Disallow:/*?someStuff=* 649 | Disallow:/*&someStuff=* 650 | 651 | 652 | Sitemap: https://example.com/sitemap.xml 653 | """ 654 | 655 | data_974982 = ( 656 | [robots_txt_974982, "foobot", "http://www.example.com/", ALLOWED], 657 | [robots_txt_974982, "foobot", "http://www.example.com/robots.txt", ALLOWED], 658 | [robots_txt_974982, "foobot", "http://www.example.com/find", DISALLOWED], 659 | [robots_txt_974982, "foobot", "http://www.example.com/find/", DISALLOWED], 660 | [robots_txt_974982, 
"foobot", "http://www.example.com/find?id=123", DISALLOWED], 661 | [robots_txt_974982, "foobot", "http://www.example.com/stuff", DISALLOWED], 662 | [robots_txt_974982, "foobot", "http://www.example.com/stuffstats", ALLOWED], 663 | [robots_txt_974982, "foobot", "http://www.example.com/stuff/new", DISALLOWED], 664 | [robots_txt_974982, "foobot", "http://www.example.com/foo?creator=bar", DISALLOWED], 665 | [robots_txt_974982, "foobot", "http://www.example.com/foo?finder=baz", DISALLOWED], 666 | [robots_txt_974982, "foobot", "http://www.example.com/foo?creator=bar&finder=baz", DISALLOWED], 667 | [robots_txt_974982, "foobot", "http://www.example.com/foo?viewer=qux", DISALLOWED], 668 | [robots_txt_974982, "foobot", "http://www.example.com/foo?creator=bar&stuff=baz", DISALLOWED], 669 | [robots_txt_974982, "foobot", "http://www.example.com/contacts/index.html", DISALLOWED], 670 | ) 671 | 672 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_974982) 673 | def test_google_stress_974982(robots_txt, agent, path, allowed, can_fetch): 674 | assert can_fetch(robots_txt, agent, path) is allowed 675 | 676 | 677 | robots_txt_371856 = """ 678 | User-agent: Foobot 679 | User-agent: Barbot 680 | User-agent: Bazbot 681 | User-agent: Quxbot 682 | Crawl-delay: 10 683 | Disallow: 684 | 685 | User-agent: * 686 | Disallow: / 687 | """ 688 | 689 | data_371856 = ( 690 | [robots_txt_371856, "Foobot", "http://example.com/foo/bar", ALLOWED], 691 | [robots_txt_371856, "Barbot", "http://example.com/foo/bar", ALLOWED], 692 | [robots_txt_371856, "Bazbot", "http://example.com/foo/baz", ALLOWED], 693 | [robots_txt_371856, "Bazbot", "http://example.com/", ALLOWED], 694 | [robots_txt_371856, "Bazbot", "http://example.com/index.html", ALLOWED], 695 | [robots_txt_371856, "zazbot", "http://example.com/", DISALLOWED], 696 | [robots_txt_371856, "zazbot", "http://example.com/index.html", DISALLOWED], 697 | [robots_txt_371856, "zazbot", "http://example.com/foo/zaz", DISALLOWED], 698 | ) 699 | 700 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_371856) 701 | def test_google_stress_371856(robots_txt, agent, path, allowed, can_fetch): 702 | assert can_fetch(robots_txt, agent, path) is allowed 703 | 704 | 705 | robots_txt_923994 = """ 706 | User-agent: * 707 | Disallow: /resources/bazbaz/baz/more_stuff 708 | Disallow: /wha/some_dir/files 709 | Disallow: /lib 710 | Disallow: /sys 711 | Disallow: /foo 712 | Disallow: /bar 713 | Disallow: /baz 714 | Sitemap: http://www.example.com/wha/some_dir/resources/sitemap.xml 715 | 716 | User-agent: quxbot 717 | Disallow: /resources/bazbaz/baz/more_stuff 718 | Disallow: /wha/some_dir/files 719 | Disallow: /lib 720 | Disallow: /sys 721 | Disallow: /foo 722 | Disallow: /bar 723 | Disallow: /baz 724 | Disallow: /users/big_foo/some_stuff 725 | Disallow: /users/big_foo/other_stuff 726 | Disallow: /en/stuff/arr 727 | Disallow: /en/stuff/dep 728 | Disallow: /sk/stuff/pri 729 | Disallow: /sk/stuff/odl 730 | Disallow: /cz/stuff/pri 731 | Disallow: /cz/stuff/odl 732 | Disallow: /hu/stuff/rke 733 | Disallow: /hu/stuff/ind 734 | Disallow: /addfightyos 735 | Disallow: /addfightnope 736 | Crawl-delay: 29 737 | """ 738 | 739 | data_923994 = ( 740 | [robots_txt_923994, "foobot", "http://example.com/home", ALLOWED], 741 | [robots_txt_923994, "foobot", "http://example.com/foo?id=12", DISALLOWED], 742 | [robots_txt_923994, "foobot", "http://example.com/qux", ALLOWED], 743 | [robots_txt_923994, "foobot", "http://example.com/home/scripts/s.js", ALLOWED], 744 | [robots_txt_923994, "foobot", 
"http://example.com/baz/112", DISALLOWED], 745 | [robots_txt_923994, "foobot", "http://example.com/resources/index.html", ALLOWED], 746 | [robots_txt_923994, "foobot", "http://example.com/resources/bazbaz/baz/more_stuff", DISALLOWED], 747 | [robots_txt_923994, "quxbot", "http://example.com/resources/bazbaz/baz/more_stuff", DISALLOWED], 748 | [robots_txt_923994, "quxbot", "http://example.com/users/big_foo/some_stuff/new", DISALLOWED], 749 | [robots_txt_923994, "quxbot", "http://example.com/addfightyos", DISALLOWED], 750 | [robots_txt_923994, "foobot", "http://example.com/addfight/new", ALLOWED], 751 | [robots_txt_923994, "quxbot", "http://example.com/addfight/new", ALLOWED], 752 | [robots_txt_923994, "quxbot", "http://example.com/addfightnope?dest=ULLI", DISALLOWED], 753 | [robots_txt_923994, "quxbot", "http://example.com/cz", ALLOWED], 754 | ) 755 | 756 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_923994) 757 | def test_google_stress_923994(robots_txt, agent, path, allowed, can_fetch): 758 | assert can_fetch(robots_txt, agent, path) is allowed 759 | 760 | 761 | robots_txt_797409 = """ 762 | User-agent: quxbot 763 | Disallow: / 764 | User-agent: * 765 | Disallow: 766 | Sitemap: https://example.com/sitemap.xml 767 | """ 768 | 769 | data_797409 = ( 770 | [robots_txt_797409, "foobot", "http://example.com/foo/bar", ALLOWED], 771 | [robots_txt_797409, "foobot", "http://example.com/", ALLOWED], 772 | [robots_txt_797409, "foo_bot", "http://example.com/log?id=132", ALLOWED], 773 | [robots_txt_797409, "quxbot", "http://example.com/", DISALLOWED], 774 | [robots_txt_797409, "quxbot", "http://example.com/baz/baz", DISALLOWED], 775 | [robots_txt_797409, "quxbot", "http://example.com/index.htm", DISALLOWED], 776 | ) 777 | 778 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_797409) 779 | def test_google_stress_797409(robots_txt, agent, path, allowed, can_fetch): 780 | assert can_fetch(robots_txt, agent, path) is allowed 781 | 782 | 783 | robots_txt_715135 = """ 784 | User-agent: admin 785 | Disallow: 786 | 787 | User-agent: * 788 | Disallow: /buzz 789 | Allow: / 790 | 791 | Sitemap: http://example.com/sitemap.xml 792 | 793 | """ 794 | 795 | data_715135 = ( 796 | [robots_txt_715135, "foobot", "http://example.com/buzz/settings", DISALLOWED], 797 | [robots_txt_715135, "foobot", "http://example.com/buzz-lite", DISALLOWED], 798 | [robots_txt_715135, "barbot", "http://example.com/qux/bar", ALLOWED], 799 | [robots_txt_715135, "quxbot", "http://example.com/buzz", DISALLOWED], 800 | [robots_txt_715135, "bazbot", "http://example.com/prod/buzz", ALLOWED], 801 | [robots_txt_715135, "barbot", "http://example.com/anotherbuzz/x", ALLOWED], 802 | [robots_txt_715135, "foobot", "http://example.com/rebuzz/x", ALLOWED], 803 | [robots_txt_715135, "foobot", "http://example.com/buzz/buzz/buzz", DISALLOWED], 804 | [robots_txt_715135, "foo-bot", "http://example.com/searc/buzz", ALLOWED], 805 | [robots_txt_715135, "bar-bot", "http://example.com/buzz/searc", DISALLOWED], 806 | [robots_txt_715135, "admin", "http://example.com/buzz/ses", ALLOWED], 807 | [robots_txt_715135, "admin", "http://example.com/foo/bar", ALLOWED], 808 | [robots_txt_715135, "admin", "http://example.com/buzz", ALLOWED], 809 | ) 810 | 811 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_715135) 812 | def test_google_stress_715135(robots_txt, agent, path, allowed, can_fetch): 813 | assert can_fetch(robots_txt, agent, path) is allowed 814 | 815 | 816 | robots_txt_478151 = """ 817 | 818 | 819 | User-agent: 
Whoosh-Qux 820 | Allow: / 821 | 822 | User-agent: Baz-Qux 823 | Allow: / 824 | 825 | User-agent: barbot 826 | Allow: / 827 | Disallow: /braa 828 | 829 | User-agent: BeepBot 830 | Disallow: /braa 831 | 832 | User-agent: Sample-web-crawler 833 | Disallow: /braa 834 | 835 | User-agent: * 836 | Disallow: / 837 | 838 | User-agent: * 839 | Disallow: /braa 840 | 841 | Sitemap: /sitemap.xml 842 | """ 843 | 844 | data_478151 = ( 845 | [robots_txt_478151, "Whoosh-Qux", "http://example.com/robots.txt", ALLOWED], 846 | [robots_txt_478151, "Baz-Qux", "http://example.com/foo/bar", ALLOWED], 847 | [robots_txt_478151, "BeepBot", "http://example.com/braallaboration/index.htm", DISALLOWED], 848 | [robots_txt_478151, "BeepBot", "http://example.com/foo/bar", ALLOWED], 849 | [robots_txt_478151, "BeepBot", "http://example.com/", ALLOWED], 850 | [robots_txt_478151, "BeepBot", "http://example.com/braa/balt", DISALLOWED], 851 | [robots_txt_478151, "foobot", "http://example.com/index.htm", DISALLOWED], 852 | [robots_txt_478151, "foo_bot", "http://example.com/braabalt", DISALLOWED], 853 | [robots_txt_478151, "foo-bot", "http://example.com/", DISALLOWED], 854 | ) 855 | 856 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_478151) 857 | def test_google_stress_478151(robots_txt, agent, path, allowed, can_fetch): 858 | assert can_fetch(robots_txt, agent, path) is allowed 859 | 860 | 861 | robots_txt_369883 = """ 862 | User-agent: * 863 |
864 | Allow: / 865 |
866 | User-agent: BarBot 867 |
868 | Disallow: / 869 |
870 | User-agent: AB42bot 871 |
872 | Disallow: / 873 |
874 | sitemap: http://example.com/sitemap.xml 875 | """ 876 | 877 | data_369883 = ( 878 | [robots_txt_369883, "foobot", "http://example.com/", ALLOWED], 879 | [robots_txt_369883, "foo-bot", "http://example.com/foo/bar", ALLOWED], 880 | [robots_txt_369883, "foo_bot", "http://example.com/robots.txt", ALLOWED], 881 | [robots_txt_369883, "BarBot", "http://example.com/", DISALLOWED], 882 | [robots_txt_369883, "BarBot", "http://example.com/foo/bar/baz", DISALLOWED], 883 | [robots_txt_369883, "BarBot", "http://example.com/robots.txt", DISALLOWED], 884 | [robots_txt_369883, "AB42bot", "http://example.com/foo/bar", ALLOWED], 885 | [robots_txt_369883, "AB42bot", "http://example.com/", ALLOWED], 886 | [robots_txt_369883, "AB", "http://example.com/", DISALLOWED], 887 | [robots_txt_369883, "AB", "http://example.com/robots.txt", DISALLOWED], 888 | ) 889 | 890 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_369883) 891 | def test_google_stress_369883(robots_txt, agent, path, allowed, can_fetch): 892 | assert can_fetch(robots_txt, agent, path) is allowed 893 | 894 | 895 | robots_txt_434582 = """ 896 | # 897 | # robots.txt 898 | # 899 | # This is robots.txt 900 | # and it saves server resources 901 | # some more comment lines 902 | # and an empty one 903 | # 904 | # Don't forget to put robots.txt in root of your host 905 | # Used: http://example.com/robots.txt 906 | # Ignored: http://example.com/site/robots.txt 907 | # 908 | # For more information about the robots.txt standard, see: 909 | # http://www.robotstxt.org/robotstxt.html 910 | 911 | User-agent: * 912 | Crawl-delay: 15 913 | # Foo 914 | Allow: /stuff/*.css$ 915 | Allow: /stuff/*.css? 916 | Allow: /stuff/*.js$ 917 | Allow: /stuff/*.js? 918 | Allow: /stuff/*.gif 919 | Allow: /stuff/*.jpg 920 | Allow: /stuff/*.jpeg 921 | Allow: /stuff/*.png 922 | Allow: /things/*.css$ 923 | Allow: /things/*.css? 924 | Allow: /things/*.js$ 925 | Allow: /things/*.js? 926 | Allow: /things/*.gif 927 | Allow: /things/*.jpg 928 | Allow: /things/*.jpeg 929 | Allow: /things/*.png 930 | Allow: /data/*.css$ 931 | Allow: /data/*.css? 932 | Allow: /data/*.js$ 933 | Allow: /data/*.js? 934 | Allow: /data/*.gif 935 | Allow: /data/*.jpg 936 | Allow: /data/*.jpeg 937 | Allow: /data/*.png 938 | Allow: /more_data/*.css$ 939 | Allow: /more_data/*.css? 940 | Allow: /more_data/*.js$ 941 | Allow: /more_data/*.js? 
942 | Allow: /more_data/*.gif 943 | Allow: /more_data/*.jpg 944 | Allow: /more_data/*.jpeg 945 | Allow: /more_data/*.png 946 | # Bar 947 | Disallow: /something/ 948 | Disallow: /stuff/ 949 | Disallow: /things/ 950 | Disallow: /data/ 951 | Disallow: /scripts/ 952 | Disallow: /more_data/ 953 | # Baz 954 | Disallow: /SOME_TEXT.txt 955 | Disallow: /some_script.php 956 | Disallow: /INSTALL.foo.txt 957 | Disallow: /INSTALL.bar.txt 958 | Disallow: /INSTALL.baz.txt 959 | Disallow: /get.php 960 | Disallow: /GET.txt 961 | Disallow: /LICENSE.txt 962 | Disallow: /HELPERS.txt 963 | Disallow: /update.php 964 | Disallow: /UPGRADE.txt 965 | Disallow: /what.php 966 | # Some more stuff to disallow 967 | Disallow: /?q=main/ 968 | Disallow: /?q=comment/reply/ 969 | Disallow: /?q=filter/ads/ 970 | Disallow: /?q=data/add/ 971 | Disallow: /?q=find/ 972 | Disallow: /?q=baz/password/ 973 | Disallow: /?q=baz/register/ 974 | Disallow: /?q=baz/login/ 975 | Disallow: /?q=baz/logout/ 976 | """ 977 | 978 | data_434582 = ( 979 | [robots_txt_434582, "foobot", "https://www.example.com/", ALLOWED], 980 | [robots_txt_434582, "foobot", "https://www.example.com/help.html", ALLOWED], 981 | [robots_txt_434582, "foobot", "https://www.example.com/some.css", ALLOWED], 982 | [robots_txt_434582, "foobot", "https://www.example.com/foo/some.css", ALLOWED], 983 | [robots_txt_434582, "foobot", "https://www.example.com/stuff/some.css", ALLOWED], 984 | [robots_txt_434582, "foobot", "https://www.example.com/stuff/some.html", DISALLOWED], 985 | [robots_txt_434582, "foobot", "https://www.example.com/stuff/some.jpeg", ALLOWED], 986 | [robots_txt_434582, "foobot", "https://www.example.com/things/some.css?user=main", ALLOWED], 987 | [robots_txt_434582, "foobot", "https://www.example.com/things/some.jpeg?user=main", ALLOWED], 988 | [robots_txt_434582, "foobot", "https://www.example.com/something/foo.cpp", DISALLOWED], 989 | [robots_txt_434582, "foobot", "https://www.example.com/more_data/dark", DISALLOWED], 990 | [robots_txt_434582, "foobot", "https://www.example.com/some_script.php", DISALLOWED], 991 | [robots_txt_434582, "foobot", "https://www.example.com/upgrade.txt", ALLOWED], 992 | [robots_txt_434582, "foobot", "https://www.example.com/UPGRADE.txt", DISALLOWED], 993 | [robots_txt_434582, "foobot", "https://www.example.com/data/main", DISALLOWED], 994 | [robots_txt_434582, "foobot", "https://www.example.com/?q=baz/", ALLOWED], 995 | [robots_txt_434582, "foobot", "https://www.example.com/?q=baz/login", ALLOWED], 996 | [robots_txt_434582, "foobot", "https://www.example.com/?q=baz/login/", DISALLOWED], 997 | [robots_txt_434582, "foobot", "https://www.example.com/?q=data/discard/", ALLOWED], 998 | [robots_txt_434582, "foobot", "https://www.example.com/?q=data/add/", DISALLOWED], 999 | ) 1000 | 1001 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_434582) 1002 | def test_google_stress_434582(robots_txt, agent, path, allowed, can_fetch): 1003 | assert can_fetch(robots_txt, agent, path) is allowed 1004 | 1005 | -------------------------------------------------------------------------------- /tests/test_network.py: -------------------------------------------------------------------------------- 1 | """ 2 | NetworkTestCase from: 3 | https://github.com/python/cpython/blob/a796d8ef9dd1af65f7e4d7a857b56f35b7cb6e78/Lib/test/test_robotparser.py 4 | converted to PyTest 5 | """ 6 | 7 | import pytest 8 | import robots 9 | from .core import * 10 | 11 | BASE_URL = 'http://www.pythontest.net' 12 | 13 | 14 | @pytest.fixture(scope='module') 15 | 
def parser(): 16 | p = robots.RobotsParser.from_uri(f'{BASE_URL}/elsewhere/robots.txt') 17 | return p 18 | 19 | 20 | def test_basic_disallow_all(parser): 21 | assert not parser.disallow_all 22 | 23 | 24 | def test_basic_allow_all(parser): 25 | assert not parser.allow_all 26 | 27 | 28 | can_fetch_data = ( 29 | ['*', f'{BASE_URL}/elsewhere', ALLOWED], 30 | ['Nutch', f'{BASE_URL}/', DISALLOWED], 31 | ['Nutch', f'{BASE_URL}/brian', DISALLOWED], 32 | ['Nutch', f'{BASE_URL}/brian/', ALLOWED], 33 | ['Nutch', f'{BASE_URL}/webstats', DISALLOWED], 34 | ['Nutch', f'{BASE_URL}/webstats/', DISALLOWED], 35 | ['*', f'{BASE_URL}/webstats', ALLOWED], 36 | ['*', f'{BASE_URL}/webstats/', DISALLOWED], 37 | ['*', f'{BASE_URL}/', ALLOWED], 38 | ) 39 | 40 | 41 | @pytest.mark.parametrize('agent,path,allowed', can_fetch_data) 42 | def test_can_fetch(agent, path, allowed, parser): 43 | assert parser.can_fetch(agent, path) is allowed 44 | 45 | 46 | def test_404(): 47 | p = robots.RobotsParser.from_uri('https://robotspy.org/non_existing_robots.txt') 48 | assert p.allow_all # no robots file => allow access to all paths 49 | assert p.can_fetch('FooBot', '/admin') 50 | 51 | 52 | def test_utf16(): 53 | p = robots.RobotsParser.from_uri('https://robotspy.org/tests/robots_utf16.txt') 54 | assert p.allow_all # robots file with unexpected encoding (must be UTF-8) => allow access to all paths 55 | assert p.can_fetch('FooBot', '/admin') 56 | 57 | def test_short_timeout(): 58 | p = robots.RobotsParser.from_uri("https://robotspy.org/robots.txt", 0) 59 | assert p.errors 60 | assert p.disallow_all 61 | assert not p.can_fetch('FooBot', '/admin') 62 | 63 | def test_error_timeout(): 64 | p = robots.RobotsParser.from_uri("https://robotspy.org:555/robots.txt", 1) 65 | 66 | # The duration may be greater than the timeout because the urllib.request.urlopen timeout does not equate to a total timeout 67 | assert p.errors 68 | assert p.disallow_all 69 | assert not p.can_fetch('FooBot', '/admin') 70 | -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for robots.RobotsParser 3 | """ 4 | 5 | import pytest 6 | import robots 7 | 8 | url_data = ( 9 | ['https://example.com/index', 'example.com', '/index'], 10 | ['https://example.com/', 'example.com', '/'], 11 | ['https://example.com', 'example.com', '/'], 12 | ['http://example.com//%7Ejoe/index.html', 'example.com', '/~joe/index.html'] 13 | ) 14 | 15 | 16 | @pytest.mark.parametrize('url,host,path', url_data) 17 | def test_normalize_url(url, host, path): 18 | h, p = robots.RobotsParser.normalize_url(url) 19 | assert (h, p) == (host, path) 20 | 21 | 22 | dedup_data = ( 23 | ['///path///index.html', '/path/index.html'], 24 | ['/path/index.html', '/path/index.html'], 25 | ['//', '/'], 26 | ['/', '/'], 27 | ['/foo/bar?qux=taz&baz=http://foo.bar?tar&par', '/foo/bar?qux=taz&baz=http://foo.bar?tar&par'], 28 | ['///foo//bar?qux=taz&baz=http://foo.bar?tar&par', '/foo/bar?qux=taz&baz=http://foo.bar?tar&par'], 29 | ['///foo//bar?qux=taz&baz=https://foo.bar?tar&par', '/foo/bar?qux=taz&baz=https://foo.bar?tar&par'] 30 | ) 31 | 32 | 33 | @pytest.mark.parametrize('path,dedup', dedup_data) 34 | def test_dedup_slash(path, dedup): 35 | assert robots.RobotsParser.dedup_slash(path) == dedup 36 | 37 | 38 | path_pattern_data = ( 39 | ['/path/index.html', '/path/*.html', True], 40 | ['/path/index.html', '/path/*.html$', True], 41 | ['/path/index.html?test=1',
'/path/*.html$', False], 42 | ['/path/index.html?test=1', '/path*', True], 43 | ['/path/index.html?test=1', '/p*/i*', True], 44 | ['/path', '/p*/i*', False], 45 | ) 46 | 47 | 48 | @pytest.mark.parametrize('path,pattern,expected', path_pattern_data) 49 | def test_startswith_pattern(path, pattern, expected): 50 | assert robots.RobotsParser.startswith_pattern(path, pattern) is expected 51 | -------------------------------------------------------------------------------- /tests/test_robotparser.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import threading 4 | import unittest 5 | import robots 6 | from http.server import BaseHTTPRequestHandler, HTTPServer 7 | 8 | HOST = 'localhost' 9 | 10 | 11 | class BaseRobotTest(unittest.TestCase): 12 | robots_txt = '' 13 | agent = 'test_robotparser' 14 | good = [] 15 | bad = [] 16 | site_maps = None 17 | 18 | def setUp(self): 19 | lines = io.StringIO(self.robots_txt).readlines() 20 | self.parser = robots.RobotFileParser() 21 | self.parser.parse(lines) 22 | 23 | def get_agent_and_url(self, url): 24 | if isinstance(url, tuple): 25 | agent, url = url 26 | return agent, url 27 | return self.agent, url 28 | 29 | def test_good_urls(self): 30 | for url in self.good: 31 | agent, url = self.get_agent_and_url(url) 32 | with self.subTest(url=url, agent=agent): 33 | self.assertTrue(self.parser.can_fetch(agent, url)) 34 | 35 | def test_bad_urls(self): 36 | for url in self.bad: 37 | agent, url = self.get_agent_and_url(url) 38 | with self.subTest(url=url, agent=agent): 39 | self.assertFalse(self.parser.can_fetch(agent, url)) 40 | 41 | def test_site_maps(self): 42 | self.assertEqual(self.parser.site_maps(), self.site_maps) 43 | 44 | 45 | class UserAgentWildcardTest(BaseRobotTest): 46 | robots_txt = """\ 47 | User-agent: * 48 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 49 | Disallow: /tmp/ # these will soon disappear 50 | Disallow: /foo.html 51 | """ 52 | good = ['/', '/test.html'] 53 | bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html'] 54 | 55 | 56 | class CrawlDelayAndCustomAgentTest(BaseRobotTest): 57 | robots_txt = """\ 58 | # robots.txt for http://www.example.com/ 59 | 60 | User-agent: * 61 | Crawl-delay: 1 62 | Request-rate: 3/15 63 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 64 | 65 | # Cybermapper knows where to go. 
66 | User-agent: cybermapper 67 | Disallow: 68 | """ 69 | good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')] 70 | bad = ['/cyberworld/map/index.html'] 71 | 72 | 73 | class SitemapTest(BaseRobotTest): 74 | robots_txt = """\ 75 | # robots.txt for http://www.example.com/ 76 | 77 | User-agent: * 78 | Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml 79 | Sitemap: http://www.google.com/hostednews/sitemap_index.xml 80 | Request-rate: 3/15 81 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 82 | 83 | """ 84 | good = ['/', '/test.html'] 85 | bad = ['/cyberworld/map/index.html'] 86 | site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml', 87 | 'http://www.google.com/hostednews/sitemap_index.xml'] 88 | 89 | 90 | class RejectAllRobotsTest(BaseRobotTest): 91 | robots_txt = """\ 92 | # go away 93 | User-agent: * 94 | Disallow: / 95 | """ 96 | good = [] 97 | bad = ['/cyberworld/map/index.html', '/', '/tmp/'] 98 | 99 | 100 | class BaseRequestRateTest(BaseRobotTest): 101 | request_rate = None 102 | crawl_delay = None 103 | 104 | def test_request_rate(self): 105 | parser = self.parser 106 | for url in self.good + self.bad: 107 | agent, url = self.get_agent_and_url(url) 108 | with self.subTest(url=url, agent=agent): 109 | self.assertEqual(parser.crawl_delay(agent), self.crawl_delay) 110 | 111 | parsed_request_rate = parser.request_rate(agent) 112 | self.assertEqual(parsed_request_rate, self.request_rate) 113 | if self.request_rate is not None: 114 | self.assertIsInstance( 115 | parsed_request_rate, 116 | robots.RequestRate 117 | ) 118 | self.assertEqual( 119 | parsed_request_rate.requests, 120 | self.request_rate.requests 121 | ) 122 | self.assertEqual( 123 | parsed_request_rate.seconds, 124 | self.request_rate.seconds 125 | ) 126 | 127 | 128 | class EmptyFileTest(BaseRequestRateTest): 129 | robots_txt = '' 130 | good = ['/foo'] 131 | 132 | 133 | class CrawlDelayAndRequestRateTest(BaseRequestRateTest): 134 | robots_txt = """\ 135 | User-agent: figtree 136 | Crawl-delay: 3 137 | Request-rate: 9/30 138 | Disallow: /tmp 139 | Disallow: /a%3cd.html 140 | Disallow: /a%2fb.html 141 | Disallow: /%7ejoe/index.html 142 | """ 143 | agent = 'figtree' 144 | # request_rate = robots.RequestRate(9, 30) 145 | request_rate = None # BGD: crawl-delay ignored 146 | # crawl_delay = 3 147 | crawl_delay = None # BGD: crawl-delay ignored 148 | # good = [('figtree', '/foo.html')] 149 | good = ['/foo.html'] 150 | bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', 151 | '/a%2fb.html', '/~joe/index.html'] 152 | 153 | 154 | # Different behavior than urllib.robotparser that applies the same rule to 'figtree' and 155 | # 'FigTree Robot libwww-perl/5.04' 156 | class DifferentAgentTest(CrawlDelayAndRequestRateTest): 157 | agent = 'FigTree Robot libwww-perl/5.04' 158 | good = ['/foo.html', '/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', 159 | '/a%2fb.html', '/~joe/index.html'] 160 | bad = [] 161 | 162 | 163 | class InvalidRequestRateTest(BaseRobotTest): 164 | robots_txt = """\ 165 | User-agent: * 166 | Disallow: /tmp/ 167 | Disallow: /a%3Cd.html 168 | Disallow: /a/b.html 169 | Disallow: /%7ejoe/index.html 170 | Crawl-delay: 3 171 | Request-rate: 9/banana 172 | """ 173 | good = ['/tmp'] 174 | bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html', 175 | '/%7Ejoe/index.html'] 176 | crawl_delay = 3 177 | 178 | 179 | class InvalidCrawlDelayTest(BaseRobotTest): 180 | # From bug report #523041 181 | robots_txt = """\ 
182 | User-Agent: * 183 | Disallow: /. 184 | Crawl-delay: pears 185 | """ 186 | good = ['/foo.html'] 187 | # bug report says "/" should be denied, but that is not in the RFC 188 | bad = [] 189 | 190 | 191 | class AnotherInvalidRequestRateTest(BaseRobotTest): 192 | # also test that Allow and Diasallow works well with each other 193 | robots_txt = """\ 194 | User-agent: Googlebot 195 | Allow: /folder1/myfile.html 196 | Disallow: /folder1/ 197 | Request-rate: whale/banana 198 | """ 199 | agent = 'Googlebot' 200 | good = ['/folder1/myfile.html'] 201 | bad = ['/folder1/anotherfile.html'] 202 | 203 | 204 | class UserAgentOrderingTest(BaseRobotTest): 205 | # the order of User-agent should be correct. note 206 | # that this file is incorrect because "Googlebot" is a 207 | # substring of "Googlebot-Mobile" 208 | robots_txt = """\ 209 | User-agent: Googlebot 210 | Disallow: / 211 | 212 | User-agent: Googlebot-Mobile 213 | Allow: / 214 | """ 215 | agent = 'Googlebot' 216 | bad = ['/something.jpg'] 217 | 218 | 219 | # Different behavior than urllib.robotparser that applies the same rule for googlebot and 220 | # googlebot-mobile 221 | class UserAgentGoogleMobileTest(UserAgentOrderingTest): 222 | agent = 'Googlebot-Mobile' 223 | good = ['/something.jpg'] 224 | bad = [] 225 | 226 | 227 | class GoogleURLOrderingTest(BaseRobotTest): 228 | # Google also got the order wrong. You need 229 | # to specify the URLs from more specific to more general 230 | robots_txt = """\ 231 | User-agent: Googlebot 232 | Allow: /folder1/myfile.html 233 | Disallow: /folder1/ 234 | """ 235 | agent = 'googlebot' 236 | good = ['/folder1/myfile.html'] 237 | bad = ['/folder1/anotherfile.html'] 238 | 239 | 240 | class DisallowQueryStringTest(BaseRobotTest): 241 | # see issue #6325 for details 242 | robots_txt = """\ 243 | User-agent: * 244 | Disallow: /some/path?name=value 245 | """ 246 | good = ['/some/path'] 247 | bad = ['/some/path?name=value'] 248 | 249 | 250 | class UseFirstUserAgentWildcardTest(BaseRobotTest): 251 | # obey first * entry (#4108) 252 | robots_txt = """\ 253 | User-agent: * 254 | Disallow: /some/path 255 | 256 | User-agent: * 257 | Disallow: /another/path 258 | """ 259 | 260 | # urllib.robotparser does not 261 | # combine the rules for the same useragent 262 | 263 | # good = ['/another/path'] 264 | bad = ['/some/path', '/another/path'] 265 | 266 | 267 | class EmptyQueryStringTest(BaseRobotTest): 268 | # normalize the URL first (#17403) 269 | robots_txt = """\ 270 | User-agent: * 271 | Allow: /some/path? 272 | Disallow: /another/path? 273 | """ 274 | good = ['/some/path?'] 275 | bad = ['/another/path?'] 276 | 277 | 278 | class DefaultEntryTest(BaseRequestRateTest): 279 | robots_txt = """\ 280 | User-agent: * 281 | Crawl-delay: 1 282 | Request-rate: 3/15 283 | Disallow: /cyberworld/map/ 284 | """ 285 | # request_rate = robots.RequestRate(3, 15) 286 | request_rate = None # BGD: crawl-delay ignored 287 | # crawl_delay = 1 288 | crawl_delay = None # BGD: crawl-delay ignored 289 | good = ['/', '/test.html'] 290 | bad = ['/cyberworld/map/index.html'] 291 | 292 | 293 | class StringFormattingTest(BaseRobotTest): 294 | robots_txt = """ 295 | User-agent: * 296 | Crawl-delay: 1 297 | Request-rate: 3/15 298 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 299 | 300 | # Cybermapper knows where to go. 
301 | User-agent: cybermapper 302 | Disallow: /some/path 303 | """ 304 | 305 | expected_output = """User-agent: * 306 | Disallow: /cyberworld/map/ 307 | 308 | User-agent: cybermapper 309 | Disallow: /some/path 310 | """ 311 | 312 | # Intentionally, robotspy does not handle crawl-delay or request rate, hence those are not 313 | # printed out 314 | def test_string_formatting(self): 315 | self.assertEqual(str(self.parser), self.expected_output) 316 | 317 | 318 | class RobotHandler(BaseHTTPRequestHandler): 319 | 320 | def do_GET(self): 321 | self.send_error(403, "Forbidden access") 322 | 323 | def log_message(self, format, *args): 324 | pass 325 | 326 | 327 | class PasswordProtectedSiteTestCase(unittest.TestCase): 328 | 329 | def setUp(self): 330 | self.server = HTTPServer((HOST, 0), RobotHandler) 331 | 332 | self.t = threading.Thread( 333 | name='HTTPServer serving', 334 | target=self.server.serve_forever, 335 | # Short poll interval to make the test finish quickly. 336 | # Time between requests is short enough that we won't wake 337 | # up spuriously too many times. 338 | kwargs={'poll_interval': 0.01}) 339 | self.t.daemon = True # In case this function raises. 340 | self.t.start() 341 | 342 | def tearDown(self): 343 | self.server.shutdown() 344 | self.t.join() 345 | self.server.server_close() 346 | 347 | def testPasswordProtectedSite(self): 348 | addr = self.server.server_address 349 | url = 'http://' + HOST + ':' + str(addr[1]) 350 | robots_url = url + "/robots.txt" 351 | parser = robots.RobotFileParser() 352 | parser.set_url(url) 353 | parser.read() 354 | self.assertFalse(parser.can_fetch("*", robots_url)) 355 | 356 | 357 | class NetworkTestCase(unittest.TestCase): 358 | 359 | base_url = 'http://www.pythontest.net/' 360 | robots_txt = '{}elsewhere/robots.txt'.format(base_url) 361 | 362 | @classmethod 363 | def setUpClass(cls): 364 | cls.parser = robots.RobotFileParser(cls.robots_txt) 365 | cls.parser.read() 366 | 367 | def url(self, path): 368 | return '{}{}{}'.format( 369 | self.base_url, path, '/' if not os.path.splitext(path)[1] else '' 370 | ) 371 | 372 | def test_basic(self): 373 | self.assertFalse(self.parser.disallow_all) 374 | self.assertFalse(self.parser.allow_all) 375 | self.assertGreater(self.parser.mtime(), 0) 376 | self.assertFalse(self.parser.crawl_delay('*')) 377 | self.assertFalse(self.parser.request_rate('*')) 378 | 379 | def test_can_fetch(self): 380 | self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere'))) 381 | self.assertFalse(self.parser.can_fetch('Nutch', self.base_url)) 382 | self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian'))) # Different from urllib.robotparser 383 | self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats'))) 384 | self.assertFalse(self.parser.can_fetch('*', self.url('webstats'))) 385 | self.assertTrue(self.parser.can_fetch('*', self.base_url)) 386 | 387 | def test_read_404(self): 388 | parser = robots.RobotFileParser(self.url('i-robot.txt')) 389 | parser.read() 390 | self.assertTrue(parser.allow_all) 391 | self.assertFalse(parser.disallow_all) 392 | self.assertEqual(parser.mtime(), 0) 393 | self.assertIsNone(parser.crawl_delay('*')) 394 | self.assertIsNone(parser.request_rate('*')) 395 | 396 | 397 | if __name__ == '__main__': 398 | unittest.main() 399 | -------------------------------------------------------------------------------- /tests/test_robots.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mostly tests from: 3 | 
https://github.com/python/cpython/blob/a796d8ef9dd1af65f7e4d7a857b56f35b7cb6e78/Lib/test/test_robotparser.py 4 | converted to PyTest and intended to validate the compatibility with the Python standard library 5 | package: urllib.robotparser 6 | 7 | For each test a data row contains the following fields: 8 | robotstxt, useragent, url, allowed/disallowed 9 | 10 | allow/disallowed is expressed as a boolean, True/False 11 | """ 12 | 13 | import pytest 14 | import robots 15 | from .core import * 16 | 17 | 18 | # Same robots.txt as http://www.pythontest.net/elsewhere/robots.txt 19 | network = """ 20 | # NetworkTestCase 21 | 22 | User-agent: Nutch 23 | Disallow: / 24 | Allow: /brian/ 25 | 26 | User-agent: * 27 | Disallow: /webstats/ 28 | """ 29 | 30 | network_data = ( 31 | [network, '*', '/elsewhere/', ALLOWED], 32 | [network, 'Nutch', '/', DISALLOWED], 33 | [network, 'Nutch', '/brian', DISALLOWED], 34 | [network, 'Nutch', '/brian/', ALLOWED], 35 | [network, 'Nutch', '/webstats/', DISALLOWED], 36 | [network, '*', '/webstats/', DISALLOWED], 37 | [network, '*', '/', ALLOWED], 38 | ) 39 | 40 | 41 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', network_data) 42 | def test_py01_network(robots_txt, agent, path, allowed, can_fetch): 43 | assert can_fetch(robots_txt, agent, path) is allowed 44 | 45 | 46 | useragent_wild_card = """ 47 | # UserAgentWildcardTest 48 | 49 | User-agent: * 50 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 51 | Disallow: /tmp/ # these will soon disappear 52 | Disallow: /foo.html 53 | """ 54 | 55 | useragent_wild_card_data = ( 56 | [useragent_wild_card, DEFAULT_AGENT, '/', ALLOWED], 57 | [useragent_wild_card, DEFAULT_AGENT, '/test.html', ALLOWED], 58 | [useragent_wild_card, DEFAULT_AGENT, '/cyberworld/map/index.html', DISALLOWED], 59 | [useragent_wild_card, DEFAULT_AGENT, '/tmp/xxx', DISALLOWED], 60 | [useragent_wild_card, DEFAULT_AGENT, '/foo.html', DISALLOWED], 61 | ) 62 | 63 | 64 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', useragent_wild_card_data) 65 | def test_useragent_wild_card(robots_txt, agent, path, allowed, can_fetch): 66 | assert can_fetch(robots_txt, agent, path) is allowed 67 | 68 | 69 | # This test does not take into account crawl-delay. See crawl_delay_request_rate for that. 70 | crawl_delay_custom_agent = """ 71 | # CrawlDelayAndCustomAgentTest 72 | 73 | User-agent: * 74 | Crawl-delay: 1 75 | Request-rate: 3/15 76 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 77 | 78 | # Cybermapper knows where to go. 
79 | User-agent: cybermapper 80 | Disallow: 81 | """ 82 | 83 | crawl_delay_custom_agent_data = ( 84 | [crawl_delay_custom_agent, DEFAULT_AGENT, '/', ALLOWED], 85 | [crawl_delay_custom_agent, DEFAULT_AGENT, '/test.html', ALLOWED], 86 | [crawl_delay_custom_agent, 'cybermapper', '/cyberworld/map/index.html', ALLOWED], 87 | [crawl_delay_custom_agent, DEFAULT_AGENT, '/cyberworld/map/index.html', DISALLOWED], 88 | ) 89 | 90 | 91 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', crawl_delay_custom_agent_data) 92 | def test_crawl_delay_custom_agent(robots_txt, agent, path, allowed, can_fetch): 93 | assert can_fetch(robots_txt, agent, path) is allowed 94 | 95 | 96 | sitemap = """ 97 | # SitemapTest 98 | 99 | User-agent: * 100 | Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml 101 | Sitemap: http://www.google.com/hostednews/sitemap_index.xml 102 | Request-rate: 3/15 103 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 104 | """ 105 | 106 | sitemap_data = ( 107 | [sitemap, DEFAULT_AGENT, '/', ALLOWED], 108 | [sitemap, DEFAULT_AGENT, '/test.html', ALLOWED], 109 | [sitemap, DEFAULT_AGENT, '/cyberworld/map/index.html', DISALLOWED], 110 | ) 111 | 112 | 113 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', sitemap_data) 114 | def test_sitemap(robots_txt, agent, path, allowed, can_fetch): 115 | assert can_fetch(robots_txt, agent, path) is allowed 116 | 117 | 118 | reject_all = """ 119 | # RejectAllRobotsTest 120 | 121 | User-agent: * 122 | Disallow: / 123 | """ 124 | 125 | reject_all_data = ( 126 | [reject_all, DEFAULT_AGENT, '/cyberworld/map/index.html', DISALLOWED], 127 | [reject_all, DEFAULT_AGENT, '/', DISALLOWED], 128 | [reject_all, DEFAULT_AGENT, '/tmp/', DISALLOWED], 129 | ) 130 | 131 | 132 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', reject_all_data) 133 | def test_reject_all(robots_txt, agent, path, allowed, can_fetch): 134 | assert can_fetch(robots_txt, agent, path) is allowed 135 | 136 | 137 | # TODO: implement handling request-rate and crawl-delay 138 | # Following tests take into account crawl-delay and request-rate 139 | 140 | empty_data = ( 141 | ['# Empty', DEFAULT_AGENT, '/foo', ALLOWED], 142 | ['# Empty', '', '', ALLOWED], # No user agent, no path provided 143 | ) 144 | 145 | 146 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', empty_data) 147 | def test_empty(robots_txt, agent, path, allowed, can_fetch): 148 | assert can_fetch(robots_txt, agent, path) is allowed 149 | 150 | 151 | crawl_delay_request_rate = """ 152 | # CrawlDelayAndRequestRate 153 | 154 | User-agent: figtree 155 | Crawl-delay: 3 156 | Request-rate: 9/30 157 | Disallow: /tmp 158 | Disallow: /a%3cd.html 159 | Disallow: /a%2fb.html 160 | Disallow: /%7ejoe/index.html 161 | """ 162 | 163 | crawl_delay_request_rate_data = ( 164 | [crawl_delay_request_rate, 'figtree', '/foo.html', ALLOWED], 165 | [crawl_delay_request_rate, 'figtree', '/tmp', DISALLOWED], 166 | [crawl_delay_request_rate, 'figtree', '/tmp/a.html', DISALLOWED], 167 | [crawl_delay_request_rate, 'figtree', '/a%3cd.html', DISALLOWED], 168 | [crawl_delay_request_rate, 'figtree', '/a%3Cd.html', DISALLOWED], 169 | [crawl_delay_request_rate, 'figtree', '/a%2fb.html', DISALLOWED], 170 | [crawl_delay_request_rate, 'figtree', '/~joe/index.html', DISALLOWED], 171 | ) 172 | 173 | 174 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', crawl_delay_request_rate_data) 175 | def test_crawl_delay_request_rate(robots_txt, agent, path, allowed, can_fetch): 176 | assert can_fetch(robots_txt, 
agent, path) is allowed 177 | 178 | 179 | crawl_delay_request_rate_diff_agent_data = ( 180 | [crawl_delay_request_rate, '/foo.html', ALLOWED], 181 | [crawl_delay_request_rate, '/tmp', ALLOWED], 182 | [crawl_delay_request_rate, '/tmp/a.html', ALLOWED], 183 | [crawl_delay_request_rate, '/a%3cd.html', ALLOWED], 184 | [crawl_delay_request_rate, '/a%3Cd.html', ALLOWED], 185 | [crawl_delay_request_rate, '/a%2fb.html', ALLOWED], 186 | [crawl_delay_request_rate, '/~joe/index.html', ALLOWED], 187 | ) 188 | 189 | 190 | # The behavior is different than urllib.robotparser that applies 'figtree' and 191 | # 'FigTree Robot libwww-perl/5.04' with the same rules. 192 | @pytest.mark.parametrize('robots_txt,path,allowed', crawl_delay_request_rate_diff_agent_data) 193 | def test_different_agent(robots_txt, path, allowed, can_fetch): 194 | agent = 'FigTree Robot libwww-perl/5.04' 195 | assert can_fetch(robots_txt, agent, path) is allowed 196 | 197 | 198 | invalid_request_rate = """ 199 | # InvalidRequestRate 200 | 201 | User-agent: * 202 | Disallow: /tmp/ 203 | Disallow: /a%3Cd.html 204 | Disallow: /a/b.html 205 | Disallow: /%7ejoe/index.html 206 | Crawl-delay: 3 207 | Request-rate: 9/banana 208 | """ 209 | 210 | invalid_request_rate_data = ( 211 | [invalid_request_rate, DEFAULT_AGENT, '/tmp', ALLOWED], 212 | [invalid_request_rate, DEFAULT_AGENT, '/tmp/', DISALLOWED], 213 | [invalid_request_rate, DEFAULT_AGENT, '/tmp/a.html', DISALLOWED], 214 | [invalid_request_rate, DEFAULT_AGENT, '/a%3cd.html', DISALLOWED], 215 | [invalid_request_rate, DEFAULT_AGENT, '/a%3Cd.html', DISALLOWED], 216 | [invalid_request_rate, DEFAULT_AGENT, '/a/b.html', DISALLOWED], 217 | [invalid_request_rate, DEFAULT_AGENT, '/%7Ejoe/index.html', DISALLOWED], 218 | ) 219 | 220 | 221 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', invalid_request_rate_data) 222 | def test_invalid_request_rate(robots_txt, agent, path, allowed, can_fetch): 223 | assert can_fetch(robots_txt, agent, path) is allowed 224 | 225 | 226 | invalid_crawl_delay = """ 227 | # InvalidCrawlDelay 228 | 229 | User-Agent: * 230 | Disallow: /. 231 | Crawl-delay: pears 232 | """ 233 | 234 | 235 | def test_invalid_crawl_delay(can_fetch): 236 | assert can_fetch(invalid_crawl_delay, DEFAULT_AGENT, '/foo.html') is ALLOWED 237 | 238 | 239 | other_invalid_request_rate = """ 240 | # OtherInvalidCrawlDelay 241 | 242 | User-agent: Googlebot 243 | Allow: /folder1/myfile.html 244 | Disallow: /folder1/ 245 | Request-rate: whale/banana 246 | """ 247 | 248 | other_invalid_request_rate_data = ( 249 | [other_invalid_request_rate, 'Googlebot', '/folder1/myfile.html', ALLOWED], 250 | [other_invalid_request_rate, 'Googlebot', '/folder1/anotherfile.html', DISALLOWED], 251 | ) 252 | 253 | 254 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', other_invalid_request_rate_data) 255 | def test_other_invalid_request_rate(robots_txt, agent, path, allowed, can_fetch): 256 | assert can_fetch(robots_txt, agent, path) is allowed 257 | 258 | 259 | useragent_ordering = """ 260 | # UserAgentOrdering 261 | 262 | User-agent: Googlebot 263 | Disallow: / 264 | 265 | User-agent: Googlebot-Mobile 266 | Allow: / 267 | """ 268 | 269 | 270 | def test_useragent_ordering(can_fetch): 271 | assert can_fetch(useragent_ordering, 'Googlebot', '/something.jpg') is DISALLOWED 272 | 273 | 274 | # Different behavior than urllib.robotparser that applies the same rule to googlebot and 275 | # googlebot-mobile. 
It ends up validating if the ua saved by the parser is in the ua that 276 | # we want to validate (if 'googlebot' in 'googlebot-mobile') and disallow for google-mobile 277 | # Google robots respects Googlebot-Mobile as a different ua and allow. Same for robotspy. 278 | def test_useragent_google_mobile(can_fetch): 279 | assert can_fetch(useragent_ordering, 'Googlebot-Mobile', '/something.jpg') is ALLOWED 280 | 281 | 282 | google_url_ordering = """ 283 | # GoogleURLOrdering 284 | 285 | User-agent: Googlebot 286 | Allow: /folder1/myfile.html 287 | Disallow: /folder1/ 288 | """ 289 | 290 | google_url_ordering_data = ( 291 | [google_url_ordering, 'googlebot', '/folder1/myfile.html', ALLOWED], 292 | [google_url_ordering, 'googlebot', '/folder1/anotherfile.html', DISALLOWED], 293 | ) 294 | 295 | 296 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', google_url_ordering_data) 297 | def test_google_url_ordering(robots_txt, agent, path, allowed, can_fetch): 298 | assert can_fetch(robots_txt, agent, path) is allowed 299 | 300 | 301 | disallow_query_string = """ 302 | # DisallowQueryString 303 | 304 | User-agent: * 305 | Disallow: /some/path?name=value 306 | """ 307 | 308 | disallow_query_string_data = [ 309 | [disallow_query_string, DEFAULT_AGENT, '/some/path', ALLOWED], 310 | [disallow_query_string, DEFAULT_AGENT, '/some/path?name=value', DISALLOWED], 311 | ] 312 | 313 | 314 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', disallow_query_string_data) 315 | def test_disallow_query_string(robots_txt, agent, path, allowed, can_fetch): 316 | assert can_fetch(robots_txt, agent, path) is allowed 317 | 318 | 319 | use_first_useragent_wildcard = """ 320 | # UseFirstUserAgentWildcard 321 | 322 | User-agent: * 323 | Disallow: /some/path 324 | 325 | User-agent: * 326 | Disallow: /another/path 327 | """ 328 | 329 | test_use_first_useragent_wildcard = ( 330 | [use_first_useragent_wildcard, DEFAULT_AGENT, '/another/path', DISALLOWED], 331 | [use_first_useragent_wildcard, DEFAULT_AGENT, '/some/path', DISALLOWED], 332 | ) 333 | 334 | 335 | # The logic in robotspy is to combine the entries with the same useragent, as per the specs: 336 | # https://tools.ietf.org/html/draft-koster-rep-00#section-2.2.1 337 | # TODO: consider renaming this test combine_rules or something similar 338 | # Mark it as a difference with urllib.robotparser in the Differences section in the README 339 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', test_use_first_useragent_wildcard) 340 | def test_use_first_useragent_wildcard(robots_txt, agent, path, allowed, can_fetch): 341 | assert can_fetch(robots_txt, agent, path) is allowed 342 | 343 | 344 | empty_query_string = """ 345 | # EmptyQueryString 346 | 347 | User-agent: * 348 | Allow: /some/path? 349 | Disallow: /another/path? 
350 | """ 351 | 352 | empty_query_string_data = ( 353 | [empty_query_string, DEFAULT_AGENT, '/some/path?', ALLOWED], 354 | [empty_query_string, DEFAULT_AGENT, '/another/path?', DISALLOWED], 355 | ) 356 | 357 | 358 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', empty_query_string_data) 359 | def test_empty_query_string(robots_txt, agent, path, allowed, can_fetch): 360 | assert can_fetch(robots_txt, agent, path) is allowed 361 | 362 | 363 | default_entry = """ 364 | # DefaultEntry 365 | 366 | User-agent: * 367 | Crawl-delay: 1 368 | Request-rate: 3/15 369 | Disallow: /cyberworld/map/ 370 | """ 371 | 372 | default_entry_data = ( 373 | [default_entry, DEFAULT_AGENT, '/', ALLOWED], 374 | [default_entry, DEFAULT_AGENT, '/test.html', ALLOWED], 375 | [default_entry, DEFAULT_AGENT, '/cyberworld/map/index.html', DISALLOWED], 376 | ) 377 | 378 | 379 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', default_entry_data) 380 | def test_default_entry(robots_txt, agent, path, allowed, can_fetch): 381 | assert can_fetch(robots_txt, agent, path) is allowed 382 | 383 | 384 | robots_input = """ 385 | # StringFormatting 386 | 387 | User-agent: * 388 | Crawl-delay: 1 389 | Request-rate: 3/15 390 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 391 | 392 | # Cybermapper knows where to go. 393 | User-agent: cybermapper 394 | Disallow: /some/path 395 | """ 396 | 397 | expected_robots_output = """User-agent: * 398 | Disallow: /cyberworld/map/ 399 | 400 | User-agent: cybermapper 401 | Disallow: /some/path 402 | """ 403 | 404 | 405 | def test_string_formatting(): 406 | parser = robots.RobotsParser().from_string(robots_input) 407 | print(str(parser)) 408 | assert str(parser) == expected_robots_output 409 | 410 | 411 | robots_sitemap_input = """ 412 | # StringFormatting 413 | 414 | User-agent: * 415 | Crawl-delay: 1 416 | Request-rate: 3/15 417 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 418 | 419 | # Cybermapper knows where to go. 420 | User-agent: cybermapper 421 | Disallow: /some/path 422 | 423 | Sitemap: https://www.example.com/sitemap1.xml 424 | Sitemap: https://www.example.com/sitemap2.xml 425 | """ 426 | 427 | expected_robots_sitemap_output = """User-agent: * 428 | Disallow: /cyberworld/map/ 429 | 430 | User-agent: cybermapper 431 | Disallow: /some/path 432 | 433 | Sitemap: https://www.example.com/sitemap1.xml 434 | Sitemap: https://www.example.com/sitemap2.xml 435 | """ 436 | 437 | 438 | def test_string_formatting_sitemaps(): 439 | parser = robots.RobotsParser().from_string(robots_sitemap_input) 440 | print(str(parser)) 441 | assert str(parser) == expected_robots_sitemap_output 442 | --------------------------------------------------------------------------------