├── .dockerignore ├── .github ├── dependabot.yml └── workflows │ └── tests.yml ├── .gitignore ├── .pylintrc ├── Dockerfile ├── LICENSE.md ├── Makefile ├── README.md ├── examples ├── issue_209.py ├── merge_group.py ├── merge_group.txt ├── mistake2.txt ├── robots.txt ├── robots_308278.txt ├── robots_541230.txt ├── robots_file.py ├── robots_file_large.py ├── robots_file_large.txt ├── robots_multiple_agents.py ├── robots_multiple_agents.txt ├── robots_string.py └── robots_url.py ├── make.bat ├── requirements.txt ├── robots ├── __init__.py ├── __main__.py ├── parser.py └── robotparser.py ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── core.py ├── test_google.py ├── test_google_correctness.py ├── test_google_stress.py ├── test_network.py ├── test_parser.py ├── test_robotparser.py └── test_robots.py /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !README.md 3 | !LICENSE.md -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 10 8 | ignore: 9 | - dependency-name: twine 10 | versions: 11 | - 3.4.0 12 | - 3.4.1 13 | - dependency-name: tqdm 14 | versions: 15 | - 4.56.1 16 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Test RobotsPy with Pytest 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Display Python version 20 | run: python -c "import sys; print(sys.version)" 21 | - name: Install Dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install -r requirements.txt 25 | - name: Execute pytest 26 | run: pytest tests -vv 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv* 2 | .idea/ 3 | .vscode/ 4 | tmp/ 5 | dist/ 6 | build/ 7 | __pycache__/ 8 | .mypy_cache/ 9 | .pytest_cache/ 10 | *.py[cod] 11 | robotspy.egg-info 12 | *.bak -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [DESIGN] 2 | max-attributes=12 3 | good-names=f,m,T -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | FROM python:3.12.5-alpine3.20 4 | 5 | ENV ROBOTSPY_VERSION=0.9.0 \ 6 | maintainer="andre.burgaud@gmail.com" 7 | 8 | LABEL robotspy.version=$ROBOTSPY_VERSION 9 | LABEL python.version=$PYTHON_VERSION 10 | 11 | RUN pip install --no-cache-dir --upgrade pip && \ 12 | pip install --no-cache-dir robotspy==$ROBOTSPY_VERSION 13 | 14 | ENTRYPOINT ["robots"] 15 | 16 | CMD ["--help"] -------------------------------------------------------------------------------- /LICENSE.md: 
-------------------------------------------------------------------------------- 1 | Copyright 2020 Andre Burgaud 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := help 2 | PROJECT := robotspy 3 | VERSION := $(shell echo `grep __version__ robots/__init__.py | cut -d '"' -f 2`) 4 | 5 | # twine installed globally 6 | check: 7 | twine check dist/* 8 | 9 | clean: 10 | find . -name '*.pyc' -delete || true 11 | find . -name '__pycache__' -type d | xargs rm -rf || true 12 | find . -name '.pytest_cache' -type d | xargs rm -rf || true 13 | rm *.bak || true 14 | rm -rf .cache build dist robotspy.egg-info || true 15 | 16 | # twine installed globally 17 | deploy: 18 | twine upload dist/* 19 | 20 | difflib: SHELL:=/bin/bash 21 | difflib: 22 | diff -w <(pip freeze) <(cat requirements.txt) 23 | 24 | 25 | dist: version test clean wheel check 26 | 27 | docker: 28 | docker build -t 'andreburgaud/${PROJECT}:${VERSION}' . 
29 | 30 | docker-scout: docker 31 | docker scout cves 'andreburgaud/${PROJECT}:${VERSION}' 32 | 33 | docker-deploy: docker-scout 34 | docker push 'docker.io/andreburgaud/${PROJECT}:${VERSION}' 35 | docker tag 'andreburgaud/${PROJECT}:${VERSION}' 'docker.io/andreburgaud/${PROJECT}:latest' 36 | docker push 'docker.io/andreburgaud/${PROJECT}:latest' 37 | 38 | # black installed globally 39 | fmt: 40 | black robots 41 | 42 | help: 43 | @echo 'Makefile for RobotsPy (Python robots.txt parser)' 44 | @echo 45 | @echo 'Usage:' 46 | @echo ' make check Check the wheel' 47 | @echo ' make clean Delete temp files (*.pyc), caches (__pycache__)' 48 | @echo ' make deploy Deploy package to the Cheese Shop (PyPI)' 49 | @echo ' make difflib Identify differences between the installed libraries and the requirements.txt file' 50 | @echo ' make dist Clean, generate the distribution and check' 51 | @echo ' make docker Build a Docker image using the Dockerfile at the root of the repo' 52 | @echo ' make docker-scout Validate the image against CVEs (requires docker scout to be installed on the build system)' 53 | @echo ' make docker-deploy Push the Docker image to Docker Hub (requires a Docker Hub account)' 54 | @echo ' make fmt Format Python files using Black (installed globally)' 55 | @echo ' make freeze Update the requirements.txt excluding the local package (robotspy)' 56 | @echo ' make help Display this help message' 57 | @echo ' make lint Lint Python files using Pylint (installed globally)' 58 | @echo ' make test Execute tests' 59 | @echo ' make type Type checking using Mypy (installed globally)' 60 | @echo ' make version Display current package version' 61 | @echo ' make wheel Build the wheel' 62 | 63 | # pylint installed globally 64 | lint: 65 | pylint robots 66 | 67 | tag: 68 | git push 69 | git tag -a ${VERSION} -m 'Version ${VERSION}' 70 | git push --tags 71 | 72 | test: 73 | pytest tests -vv 74 | 75 | # mypy installed globally 76 | type: 77 | mypy --check-untyped-defs robots 78 | 79 | version: 80 | @echo 'robots version: ${VERSION}' 81 | @perl -pi.bak -e 's/version="(\d+\.\d+\.\d+.*)"/version="${VERSION}"/' setup.py 82 | 83 | wheel: 84 | python setup.py sdist bdist_wheel 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Robots Exclusion Standard Parser for Python 2 | 3 | The `robotspy` Python module implements a parser for `robots.txt` files. The recommended class to use is 4 | `robots.RobotsParser`. 5 | 6 | A thin facade `robots.RobotFileParser` can also be used as a substitute for [`urllib.robotparser.RobotFileParser`](https://docs.python.org/3/library/urllib.robotparser.html), 7 | available in the Python standard library. The class `robots.RobotFileParser` exposes an API that is mostly compatible 8 | with `urllib.robotparser.RobotFileParser`. 9 | 10 | The main reasons for this rewrite are the following: 11 | 12 | 1. It was initially intended to experiment with parsing `robots.txt` files for a link checker project (not implemented yet). 13 | 1. It (mostly) follows the specs from the [RFC 9309 - Robots Exclusion Protocol](https://www.rfc-editor.org/rfc/rfc9309). 14 | 1. It does not try to be compliant with commonly accepted directives that are not in the current specs, such as `request-rate` 15 | and `crawl-delay`, but it currently supports `sitemaps`. 16 | 1.
It satisfies the same tests as the [Google Robots.txt Parser](https://github.com/google/robotstxt), except for some custom behaviors specific to Google Robots. 17 | 18 | To use the `robots` command line tool (CLI) in a Docker container, read the following section, **Docker Image**. 19 | 20 | To install `robotspy` globally as a tool on your system with `pipx`, skip to the **Global Installation** section. 21 | 22 | If you are interested in using `robotspy` in a local Python environment or as a library, skip to the **Module Installation** section. 23 | 24 | ## Docker Image 25 | 26 | The Robotspy CLI, `robots`, is available as a [Docker](https://www.docker.com/) automated build image at https://hub.docker.com/r/andreburgaud/robotspy. 27 | 28 | If you already have [Docker](https://docs.docker.com/get-docker/) installed on your machine, first pull the image from Docker Hub: 29 | 30 | ``` 31 | $ docker pull andreburgaud/robotspy 32 | ``` 33 | 34 | Then, you can exercise the tool against the following remote Python `robots.txt` test file located at http://www.pythontest.net/elsewhere/robots.txt: 35 | 36 | ``` 37 | # Used by NetworkTestCase in Lib/test/test_robotparser.py 38 | 39 | User-agent: Nutch 40 | Disallow: / 41 | Allow: /brian/ 42 | 43 | User-agent: * 44 | Disallow: /webstats/ 45 | ``` 46 | 47 | The following examples demonstrate how to use the `robots` command line with the Docker container: 48 | 49 | ``` 50 | $ # Example 1: User agent "Johnny" is allowed to access path "/" 51 | $ docker run --rm andreburgaud/robotspy http://www.pythontest.net/elsewhere/robots.txt Johnny / 52 | user-agent 'Johnny' with path '/': ALLOWED 53 | ``` 54 | 55 | ``` 56 | $ # Example 2: User agent "Nutch" is not allowed to access path "/brian" 57 | $ docker run --rm andreburgaud/robotspy http://www.pythontest.net/elsewhere/robots.txt Nutch /brian 58 | user-agent 'Nutch' with path '/brian': DISALLOWED 59 | ``` 60 | 61 | ``` 62 | $ # Example 3: User agent "Johnny" is not allowed to access path "/webstats/" 63 | $ docker run --rm andreburgaud/robotspy http://www.pythontest.net/elsewhere/robots.txt Johnny /webstats/ 64 | user-agent 'Johnny' with path '/webstats/': DISALLOWED 65 | ``` 66 | 67 | The arguments are the following: 68 | 69 | 1. Location of the robots.txt file (`http://www.pythontest.net/elsewhere/robots.txt`) 70 | 1. User agent name (`Johnny`) 71 | 1. Path or URL (`/`) 72 | 73 | Without any argument, `robots` displays the help: 74 | 75 | ``` 76 | $ docker run --rm andreburgaud/robotspy 77 | usage: robots 78 | 79 | Shows whether the given user agent and path combination are allowed or disallowed by the given robots.txt file. 80 | 81 | positional arguments: 82 | robotstxt robots.txt file path or URL 83 | useragent User agent name 84 | path Path or URI 85 | 86 | optional arguments: 87 | -h, --help show this help message and exit 88 | -v, --version show program's version number and exit 89 | ``` 90 | 91 | To use the CLI `robots` as a global tool, continue to the following section. If you want to use `robotspy` as a Python module, skip to **Module Installation**. 92 | 93 | ## Global Installation with pipx 94 | 95 | If you only want to use the command line tool `robots`, you may want to use [pipx](https://pipxproject.github.io/pipx/installation/) to install it as a global tool on your system.
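If `pipx` is not already available on your system, one common way to install it (this is generic pipx setup, not specific to `robotspy`; see the pipx documentation linked above for the recommended method on your platform) is:

```bash
$ python -m pip install --user pipx
$ python -m pipx ensurepath
```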
96 | 97 | To install `robotspy` using `pipx`, execute the following command: 98 | 99 | ```bash 100 | $ pipx install robotspy 101 | ``` 102 | 103 | When `robotspy` is installed globally on your system, you can invoke it from any folder. For example, you can execute: 104 | 105 | ```bash 106 | $ robots --version 107 | robots 0.8.0 108 | ``` 109 | 110 | You can find more detailed usage examples in the **Usage** section. 111 | 112 | ## Module Installation 113 | 114 | **Note**: Python 3.8 or newer required 115 | 116 | Preferably, install the `robotspy` package in a Python virtual environment created 117 | in a new directory, as follows: 118 | 119 | ``` 120 | $ mkdir project && cd project 121 | $ python -m venv .venv 122 | $ . .venv/bin/activate 123 | (.venv) $ python -m pip install --upgrade pip 124 | (.venv) $ python -m pip install --upgrade setuptools 125 | (.venv) $ python -m pip install robotspy 126 | (.venv) $ python -m robots --help 127 | ... 128 | ``` 129 | 130 | On Windows: 131 | 132 | ``` 133 | C:/> mkdir project && cd project 134 | C:/> python -m venv .venv 135 | C:/> .venv\scripts\activate 136 | (.venv) c:\> python -m pip install --upgrade pip 137 | (.venv) c:\> python -m pip install --upgrade setuptools 138 | (.venv) c:\> python -m pip install robotspy 139 | (.venv) c:\> python -m robots --help 140 | ... 141 | ``` 142 | 143 | ## Usage 144 | 145 | The `robotspy` package can be imported as a module and also exposes an executable, `robots`, invocable with 146 | `python -m`. If installed globally with `pipx`, the command `robots` can be invoked from any folder. The usage examples in the following section use the command `robots`, but you can also substitute it with `python -m robots` in a virtual environment. 147 | 148 | ### Execute the Tool 149 | 150 | After installing `robotspy`, you can validate the installation by running the following command: 151 | 152 | ``` 153 | $ robots --help 154 | usage: robots 155 | 156 | Shows whether the given user agent and path combination are allowed or disallowed by the given robots.txt file. 157 | 158 | positional arguments: 159 | robotstxt robots.txt file path or URL 160 | useragent User agent name 161 | path Path or URI 162 | 163 | optional arguments: 164 | -h, --help show this help message and exit 165 | -v, --version show program's version number and exit 166 | ``` 167 | 168 | ### Examples 169 | 170 | The content of http://www.pythontest.net/elsewhere/robots.txt is the following: 171 | 172 | ``` 173 | # Used by NetworkTestCase in Lib/test/test_robotparser.py 174 | 175 | User-agent: Nutch 176 | Disallow: / 177 | Allow: /brian/ 178 | 179 | User-agent: * 180 | Disallow: /webstats/ 181 | ``` 182 | 183 | To check if the user agent `Nutch` can fetch the path `/brian/`, you can execute: 184 | 185 | ``` 186 | $ robots http://www.pythontest.net/elsewhere/robots.txt Nutch /brian/ 187 | user-agent 'Nutch' with path '/brian/': ALLOWED 188 | ``` 189 | 190 | Or, you can pass the full URL, http://www.pythontest.net/brian/: 191 | 192 | ``` 193 | $ robots http://www.pythontest.net/elsewhere/robots.txt Nutch http://www.pythontest.net/brian/ 194 | user-agent 'Nutch' with url 'http://www.pythontest.net/brian/': ALLOWED 195 | ``` 196 | 197 | Can user agent `Nutch` fetch the path `/brian`? 198 | 199 | ``` 200 | $ robots http://www.pythontest.net/elsewhere/robots.txt Nutch /brian 201 | user-agent 'Nutch' with path '/brian': DISALLOWED 202 | ``` 203 | 204 | Or, `/`?
205 | 206 | ``` 207 | $ robots http://www.pythontest.net/elsewhere/robots.txt Nutch / 208 | user-agent 'Nutch' with path '/': DISALLOWED 209 | ``` 210 | 211 | How about user agent `Johnny`? 212 | 213 | ``` 214 | $ robots http://www.pythontest.net/elsewhere/robots.txt Johnny / 215 | user-agent 'Johnny' with path '/': ALLOWED 216 | ``` 217 | 218 | ### Use the Module in a Project 219 | 220 | If you have a virtual environment with the `robotspy` package installed, you can use the `robots` module from the Python shell: 221 | 222 | ``` 223 | (.venv) $ python 224 | >>> import robots 225 | >>> parser = robots.RobotsParser.from_uri('http://www.pythontest.net/elsewhere/robots.txt') 226 | >>> useragent = 'Nutch' 227 | >>> path = '/brian/' 228 | >>> result = parser.can_fetch(useragent, path) 229 | >>> print(f'Can {useragent} fetch {path}? {result}') 230 | Can Nutch fetch /brian/? True 231 | >>> 232 | ``` 233 | 234 | ### Bug in the Python standard library 235 | 236 | There is a bug in [`urllib.robotparser`](https://docs.python.org/3/library/urllib.robotparser.html) 237 | from the Python standard library that causes the following test to differ from the example above with `robotspy`. 238 | 239 | The example with `urllib.robotparser` is the following: 240 | 241 | ``` 242 | $ python 243 | >>> import urllib.robotparser 244 | >>> rp = urllib.robotparser.RobotFileParser() 245 | >>> rp.set_url('http://www.pythontest.net/elsewhere/robots.txt') 246 | >>> rp.read() 247 | >>> rp.can_fetch('Nutch', '/brian/') 248 | False 249 | ``` 250 | 251 | Notice that the result is `False` whereas `robotspy` returns `True`. 252 | 253 | Bug [bpo-39187](https://bugs.python.org/issue39187) was open to raise awareness on this issue and PR 254 | https://github.com/python/cpython/pull/17794 was submitted as a possible fix. `robotspy` does not 255 | exhibit this problem. 256 | 257 | ## Development 258 | 259 | The main development dependency is `pytest` for executing the tests. It is automatically 260 | installed if you perform the following steps: 261 | 262 | ``` 263 | $ git clone https://github.com/andreburgaud/robotspy 264 | $ cd robotspy 265 | $ python -m venv .venv --prompt robots 266 | $ . .venv/bin/activate 267 | (robots) $ python -m pip install -r requirements.txt 268 | (robots) $ python -m pip install -e . 269 | (robots) $ make test 270 | (robots) $ deactivate 271 | $ 272 | ``` 273 | 274 | On Windows: 275 | 276 | ``` 277 | C:/> git clone https://github.com/andreburgaud/robotspy 278 | C:/> cd robotspy 279 | C:/> python -m venv .venv --prompt robotspy 280 | C:/> .venv\scripts\activate 281 | (robots) c:\> python -m pip install -r requirements.txt 282 | (robots) c:\> python -m pip install -e . 283 | (robots) c:\> make test 284 | (robots) c:\> deactivate 285 | ``` 286 | 287 | ## Global Tools 288 | 289 | The following tools were used during the development of `robotspy`: 290 | 291 | * [Black](https://github.com/psf/black) 292 | * [Mypy](http://mypy-lang.org/) 293 | * [Pylint](https://www.pylint.org/) 294 | * [twine](https://pypi.org/project/twine/) 295 | 296 | See the build file, `Makefile` or `make.bat` on Windows, for the commands and parameters. 297 | 298 | ## Release History 299 | 300 | * 0.10.0: 301 | * Fixed bugs in the URL path pattern matching ('?' is now handled correctly as the character '?' 
instead of matching any one character) 302 | * Added tests 308278 and 541230 from the Google project https://github.com/google/robotstxt-spec-test 303 | * Contribution from https://github.com/kox-solid 304 | * 0.9.0: 305 | * Updated the parser to behave like the Google robots parser. It now handles the product token in the user-agent line up to the last correct character instead of discarding it. See [issue #209](https://github.com/andreburgaud/robotspy/issues/209) for more details. 306 | * Contribution from https://github.com/kox-solid 307 | * 0.8.0: 308 | * Addressed an issue raised when a robots.txt file is not UTF-8 encoded 309 | * Added a user agent to fetch the robots.txt, as some websites, such as pages hosted on Cloudflare, may return a 403 error 310 | * Updated the documentation to link to RFC 9309, Robots Exclusion Protocol (REP) 311 | * Added a GitHub action job to execute the tests against Python versions 3.8 to 3.12 312 | * Contribution from https://github.com/tumma72 313 | * 0.7.0: 314 | * Fixed bug with the argument path when using the CLI 315 | * Print 'url' when the argument is a URL, 'path' otherwise 316 | * 0.6.0: 317 | * Simplified dependencies by keeping only `pytest` in `requirements.txt` 318 | * 0.5.0: 319 | * Updated all libraries. Tested with Python 3.9. 320 | * 0.4.0: 321 | * Fixed issue with robots.txt files pointed to by relative paths 322 | * Integration of [Mypy](http://mypy-lang.org/), [Black](https://github.com/psf/black) and [Pylint](https://www.pylint.org/) as dependencies to ease cross-platform development 323 | * Added a limited `make.bat` build file for Windows 324 | * Git-ignored VS Code files, the `tmp` directory, and multiple virtual envs (`.venv*`) 325 | * Fixed case-insensitivity issues on Windows 326 | * Tests successful on Windows 327 | * Added an ATTRIBUTIONS file and a build task to generate it 328 | * Upgraded `pyparsing` and `certifi` 329 | * 0.3.3: 330 | * Upgraded `tqdm` and `cryptography` packages 331 | * 0.3.2: 332 | * Upgraded `bleach`, `tqdm`, and `setuptools` packages 333 | * 0.3.1: 334 | * Updated `idna` and `wcwidth` packages 335 | * Added `pipdeptree` package to provide visibility on dependencies 336 | * Fixed `mypy` errors 337 | * Explicitly ignored `pylint` errors related to commonly used names like `f`, `m`, or `T` 338 | * 0.3.0: Updated `bleach` package to address CVE-2020-6802 339 | * 0.2.0: Updated the documentation 340 | * 0.1.0: Initial release 341 | 342 | ## License 343 | 344 | [MIT License](LICENSE.md) -------------------------------------------------------------------------------- /examples/issue_209.py: -------------------------------------------------------------------------------- 1 | import robots 2 | 3 | content = """ 4 | User-agent: mozilla/5 5 | Disallow: / 6 | """ 7 | 8 | check_url = "https://example.com" 9 | user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36" 10 | 11 | parser = robots.RobotsParser.from_string(content) 12 | 13 | print(parser.can_fetch(user_agent, check_url)) 14 | print(parser.is_agent_valid(user_agent)) 15 | 16 | 17 | content = """ 18 | User-agent: mozilla 19 | Disallow: / 20 | """ 21 | 22 | check_url = "https://example.com" 23 | user_agent = "Mozilla" 24 | 25 | parser = robots.RobotsParser.from_string(content) 26 | 27 | print(parser.can_fetch(user_agent, check_url)) 28 | print(parser.is_agent_valid(user_agent)) -------------------------------------------------------------------------------- /examples/merge_group.py:
-------------------------------------------------------------------------------- 1 | import robots 2 | 3 | parser = robots.RobotsParser.from_file("merge_group.txt") 4 | 5 | assert parser.can_fetch("ExampleBot", "/") 6 | assert not parser.can_fetch("ExampleBot", "/foo") 7 | assert not parser.can_fetch("ExampleBot", "/bar") 8 | assert not parser.can_fetch("ExampleBot", "/baz") 9 | -------------------------------------------------------------------------------- /examples/merge_group.txt: -------------------------------------------------------------------------------- 1 | user-agent: ExampleBot 2 | disallow: /foo 3 | disallow: /bar 4 | 5 | user-agent: ExampleBot 6 | disallow: /baz -------------------------------------------------------------------------------- /examples/mistake2.txt: -------------------------------------------------------------------------------- 1 | user-agent FooBot 2 | disallow / 3 | -------------------------------------------------------------------------------- /examples/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /tmp/ 3 | Disallow: /a%3Cd.html 4 | Disallow: /a/b.html 5 | Disallow: /%7ejoe/index.html 6 | -------------------------------------------------------------------------------- /examples/robots_308278.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /asdf-login 3 | Disallow: /asdf-admin 4 | Disallow: /databack/ 5 | Disallow: /data/* 6 | Disallow: /?*/ 7 | Disallow: /author/ 8 | Disallow: /id/*/page/ 9 | Disallow: /id/*/data/ 10 | Sitemap: http://example.com/page-sitemap.xml 11 | -------------------------------------------------------------------------------- /examples/robots_541230.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: /*.js 3 | Allow: /*.css 4 | Allow: /*.jpg 5 | Allow: /*.png 6 | Allow: /*.gif 7 | Allow: /*?page 8 | Allow: /*?ref= 9 | Disallow: /*? 
10 | Disallow: /stat/ 11 | Disallow: /id/1 12 | Disallow: /id/3 13 | Disallow: /register 14 | Disallow: /id/5 15 | Disallow: /id/7 16 | Disallow: /id/8 17 | Disallow: /id/9 18 | Disallow: /id/sub 19 | Disallow: /panel/ 20 | Disallow: /admin/ 21 | Disallow: /informer/ 22 | Disallow: /secure/ 23 | Disallow: /poll/ 24 | Disallow: /search/ 25 | Disallow: /abnl/ 26 | Disallow: /*_escaped_pattern_= 27 | Disallow: /*-*-*-*-321$ 28 | Disallow: /baz/order/ 29 | Disallow: /baz/printorder/ 30 | Disallow: /baz/checkout/ 31 | Disallow: /baz/user/ 32 | Disallow: /baz/search 33 | Disallow: /*0-*-0-03$ 34 | Disallow: /*-0-0- 35 | 36 | Sitemap: http://example.com/sitemap.xml 37 | Sitemap: http://example.com/sitemap-forum.xml -------------------------------------------------------------------------------- /examples/robots_file.py: -------------------------------------------------------------------------------- 1 | import robots 2 | 3 | AGENT = "test_robotparser" 4 | 5 | parser = robots.RobotsParser.from_file("robots.txt") 6 | 7 | if parser.errors: 8 | print("ERRORS:") 9 | print(parser.errors) 10 | 11 | if parser.errors: 12 | print("WARNINGS:") 13 | print(parser.errors) 14 | 15 | assert parser.can_fetch(AGENT, "/tmp") 16 | assert not parser.can_fetch(AGENT, "/tmp/") 17 | assert not parser.can_fetch(AGENT, "/tmp/a.html") 18 | assert not parser.can_fetch(AGENT, "/a%3cd.html") 19 | assert not parser.can_fetch(AGENT, "/a%3Cd.html") 20 | assert not parser.can_fetch(AGENT, "/a/b.html") 21 | assert not parser.can_fetch(AGENT, "/%7Ejoe/index.html") 22 | -------------------------------------------------------------------------------- /examples/robots_file_large.py: -------------------------------------------------------------------------------- 1 | import robots 2 | 3 | parser = robots.RobotsParser.from_file("robots_file_large.txt") 4 | 5 | if parser.errors: 6 | print("ERRORS:") 7 | print(parser.errors) 8 | 9 | if parser.errors: 10 | print("WARNINGS:") 11 | print(parser.errors) 12 | 13 | assert parser.can_fetch("Googlebot", "/") 14 | assert not parser.can_fetch("Exabot", "/") -------------------------------------------------------------------------------- /examples/robots_file_large.txt: -------------------------------------------------------------------------------- 1 | User-agent: Mediapartners-Google 2 | Disallow: 3 | 4 | User-agent: Mediapartners-Google* 5 | Disallow: 6 | 7 | User-agent: * 8 | Disallow: /abuse 9 | Disallow: /admgt/ 10 | Disallow: /donate 11 | Disallow: /go/ 12 | Disallow: /modcp 13 | Disallow: /post 14 | Disallow: /privmsg 15 | Disallow: /spa/ 16 | Disallow: /sta/ 17 | Disallow: /bw 18 | Disallow: /dx 19 | Disallow: /topicit/index.php/connect 20 | Disallow: /calendar_scheduler.forum 21 | Noindex: /login 22 | 23 | User-agent: 008 24 | User-agent: Accoona 25 | User-agent: aipbot 26 | User-agent: aipbot* 27 | User-agent: aipbot/1.0 28 | User-agent: Alexa 29 | User-agent: Alexa Bitlybot 30 | User-agent: Alexibot 31 | User-agent: AltaVista Intranet V2.0 AVS EVAL search@freeit.com 32 | User-agent: AltaVista Intranet V2.0 Compaq Altavista Eval sveand@altavista.net 33 | User-agent: AltaVista Intranet V2.0 evreka.com crawler@evreka.com 34 | User-agent: AltaVista V2.0B crawler@evreka.com 35 | User-agent: Anonymous 36 | User-agent: ApocalXExplorerBot 37 | User-agent: appie 38 | User-agent: Aqua_Products 39 | User-agent: Argus/1.1 40 | User-agent: Artabus 41 | User-agent: Ask Jeeves 42 | User-agent: asterias 43 | User-agent: atSpider 44 | User-agent: attentio 45 | User-agent: AV Fetch 1.0 46 | User-agent: 
AVSearch-3.0(AltaVista/AVC) 47 | User-agent: AWS Cloud Based 48 | User-agent: b2w 49 | User-agent: b2w/0.1 50 | User-agent: BackDoorBot 51 | User-agent: BackDoorBot/1.0 52 | User-agent: BacklinkCrawler 53 | User-agent: becomebot 54 | User-agent: BecomeBot 55 | User-agent: BigBrother 56 | User-agent: BIGLOTRON (BETA 2;GNU/Linux) 57 | User-agent: BizInformation 58 | User-agent: Black Hole 59 | User-agent: Black.Hole 60 | User-agent: BlackWidow 61 | User-agent: BlowFish 62 | User-agent: BlowFish/1.0 63 | User-agent: BoardPulse 64 | User-agent: boitho.com-dc 65 | User-agent: Bookmark search tool 66 | User-agent: bot/1.0 67 | User-agent: BotALot 68 | User-agent: Bot mailto:craftbot@yahoo.com 69 | User-agent: BotRightHere 70 | User-agent: BrandProtect 71 | User-agent: BuiltBotTough 72 | User-agent: Bullseye 73 | User-agent: Bullseye/1.0 74 | User-agent: BunnySlippers 75 | User-agent: CazoodleBot 76 | User-agent: Cegbfeieh 77 | User-agent: cfetch 78 | User-agent: cfetch/1.0 79 | User-agent: CheeseBot 80 | User-agent: CherryPicker 81 | User-agent: CherryPicker /1.0 82 | User-agent: CherryPickerElite/1.0 83 | User-agent: CherryPickerSE/1.0 84 | User-agent: ChinaClaw 85 | User-agent: Collage 86 | User-agent: cometrics-bot 87 | User-agent: complex_network_group 88 | User-agent: convera 89 | User-agent: ConveraCrawler 90 | User-agent: ConveraCrawler/0.2 91 | User-agent: ConveraCrawler/0.9d 92 | User-agent: Convera Internet Spider V6.x 93 | User-agent: ConveraMultiMediaCrawler/0.1 94 | User-agent: Copernic 95 | User-agent: CopyRightCheck 96 | User-agent: cosmos 97 | User-agent: Crescent 98 | User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0 99 | User-agent: Crescent Internet ToolPak HTTPOLE Control v.1.0 100 | User-agent: Curl 101 | User-agent: Custo 102 | User-agent: CydralSpider 103 | User-agent: Deepnet Explorer 104 | User-agent: default.ida 105 | User-agent: DigExt 106 | User-agent: DISCo 107 | User-agent: discobot 108 | User-agent: DISCoFinder 109 | User-agent: DISCo Pump 110 | User-agent: DISCo Pump 3.0 111 | User-agent: DISCo Pump 3.1 112 | User-agent: DISCo Pump 3.2 113 | User-agent: DittoSpyder 114 | User-agent: DOC 115 | User-agent: dotbot 116 | User-agent: DotBot 117 | User-agent: DotBot/1.1 118 | User-agent: Download Demon 119 | User-agent: Download Demon/3.2.0.8 120 | User-agent: Download Demon/3.5.0.11 121 | User-agent: Download Ninja 122 | User-agent: Download Wonder 123 | User-agent: DSurf 124 | User-agent: Dulance bot 125 | User-agent: dumbot 126 | User-agent: eCatch 127 | User-agent: eCatch/3.0 128 | User-agent: echo! 
129 | User-agent: EchO!/2.0 130 | User-agent: EirGrabber 131 | User-agent: EliteSys Entry 132 | User-agent: EmailCollector 133 | User-agent: Email Extractor 134 | User-agent: EmailSiphon 135 | User-agent: EmailSmartz 136 | User-agent: EmailWolf 137 | User-agent: Enterprise_Search 138 | User-agent: Enterprise_Search/1.0 139 | User-agent: EroCrawler 140 | User-agent: es 141 | User-agent: ESIRover 142 | User-agent: e-SocietyRobot 143 | User-agent: Exabot 144 | User-agent: Exabot/2.0 145 | User-agent: Exabot-Images 146 | User-agent: Express WebPictures 147 | User-agent: Express WebPictures (www.express-soft.com) 148 | User-agent: ExtractorPro 149 | User-agent: EyeNetIE 150 | User-agent: FairAd Client 151 | User-agent: Fairshare 152 | User-agent: Fasterfox 153 | User-agent: Fetch 154 | User-agent: findlinks 155 | User-agent: Flaming AttackBot 156 | User-agent: Flamingo_SearchEngine 157 | User-agent: FlashGet 158 | User-agent: FlashGet WebWasher 3.2 159 | User-agent: Foobot 160 | User-agent: FreeFind 161 | User-agent: FreeWebMonitoring SiteChecker/0.1 162 | User-agent: FrontPage 163 | User-agent: FrontPage [NC,OR] 164 | User-agent: FurlBot 165 | User-agent: Gaisbot 166 | User-agent: Gaisbot/3.0 167 | User-agent: GetBot 168 | User-agent: GetRight 169 | User-agent: GetRight/2.11 170 | User-agent: GetRight/3.1 171 | User-agent: GetRight/3.2 172 | User-agent: GetRight/3.3 173 | User-agent: GetRight/3.3.3 174 | User-agent: GetRight/3.3.4 175 | User-agent: GetRight/4.0.0 176 | User-agent: GetRight/4.1.0 177 | User-agent: GetRight/4.1.1 178 | User-agent: GetRight/4.1.2 179 | User-agent: GetRight/4.2 180 | User-agent: GetRight/4.2b (Portuguxeas) 181 | User-agent: GetRight/4.2c 182 | User-agent: GetRight/4.3 183 | User-agent: GetRight/4.5 184 | User-agent: GetRight/4.5a 185 | User-agent: GetRight/4.5b 186 | User-agent: GetRight/4.5b1 187 | User-agent: GetRight/4.5b2 188 | User-agent: GetRight/4.5b3 189 | User-agent: GetRight/4.5b6 190 | User-agent: GetRight/4.5b7 191 | User-agent: GetRight/4.5c 192 | User-agent: GetRight/4.5d 193 | User-agent: GetRight/4.5e 194 | User-agent: GetRight/5.0beta1 195 | User-agent: GetRight/5.0beta2 196 | User-agent: GetUrl 197 | User-agent: GetWeb! 
198 | User-agent: Gigabot 199 | User-agent: Gigabot/3.0 200 | User-agent: Go-Ahead-Got-It 201 | User-agent: Go!Zilla 202 | User-agent: Go!Zilla 3.3 (www.gozilla.com) 203 | User-agent: Go!Zilla 3.5 (www.gozilla.com) 204 | User-agent: Go!Zilla (www.gozilla.com) 205 | User-agent: GrabNet 206 | User-agent: Grafula 207 | User-agent: grub 208 | User-agent: grub-client 209 | User-agent: Hackertarget.com 210 | User-agent: Harvest 211 | User-agent: Harvest/1.5 212 | User-agent: Hatena Antenna 213 | User-agent: HavIndex 214 | User-agent: heritrix 215 | User-agent: hloader 216 | User-agent: HMView 217 | User-agent: httplib 218 | User-agent: httrack 219 | User-agent: HTTrack 220 | User-agent: HTTrack 3.0 221 | User-agent: HTTrack 3.0x 222 | User-agent: HTTrack [NC,OR] 223 | User-agent: humanlinks 224 | User-agent: ichiro 225 | User-agent: IconSurf 226 | User-agent: Igentia 227 | User-agent: Image Collector 228 | User-agent: Image Stripper 229 | User-agent: Image Sucker 230 | User-agent: Indy Library 231 | User-agent: Indy Library [NC,OR] 232 | User-agent: InfoNaviRobot 233 | User-agent: InfoSpiders 234 | User-agent: InterGET 235 | User-agent: Internet Explore 236 | User-agent: Internet Ninja 237 | User-agent: Internet Ninja 4.0 238 | User-agent: Internet Ninja 5.0 239 | User-agent: Internet Ninja 6.0 240 | User-agent: InternetSupervision 241 | User-agent: IRLbot 242 | User-agent: Iron 243 | User-agent: Iron33/1.0.2 244 | User-agent: Jeeves 245 | User-agent: JennyBot 246 | User-agent: Jetbot 247 | User-agent: Jetbot/1.0 248 | User-agent: JetCar 249 | User-agent: Jobo 250 | User-agent: JOC Web Spider 251 | User-agent: kalooga 252 | User-agent: KDD Exploror 253 | User-agent: Kenjin Spider 254 | User-agent: Kenjin.Spider 255 | User-agent: Keyword Density 256 | User-agent: Keyword.Density 257 | User-agent: Keyword Density/0.9 258 | User-agent: larbin 259 | User-agent: Larbin 260 | User-agent: larbin_2.6.2 (kabura@sushi.com) 261 | User-agent: larbin_2.6.2 kabura@sushi.com 262 | User-agent: larbin_2.6.2 (larbin2.6.2@unspecified.mail) 263 | User-agent: larbin_2.6.2 larbin2.6.2@unspecified.mail 264 | User-agent: larbin_2.6.2 larbin@correa.org 265 | User-agent: larbin_2.6.2 listonATccDOTgatechDOTedu 266 | User-agent: larbin_2.6.2 (listonATccDOTgatechDOTedu) 267 | User-agent: larbin_2.6.2 (vitalbox1@hotmail.com) 268 | User-agent: larbin_2.6.2 vitalbox1@hotmail.com 269 | User-agent: larbin (samualt9@bigfoot.com) 270 | User-agent: larbin samualt9@bigfoot.com 271 | User-agent: LBot 272 | User-agent: LeechFTP 273 | User-agent: LexiBot 274 | User-agent: libWeb/clsHTTP 275 | User-agent: libWeb/clsHTTPDisallow: / 276 | User-agent: libwww 277 | User-agent: LightningDownload 278 | User-agent: Linguee 279 | User-agent: LinkedIn 280 | User-agent: LinkextractorPro 281 | User-agent: Linknzbot 282 | User-agent: Linknzbot* 283 | User-agent: Linknzbot 2004 284 | User-agent: LinkScan 285 | User-agent: LinkScan/8.1a Unix 286 | User-agent: LinkScan/8.1a.Unix 287 | User-agent: LinkScan/8.1a Unix Disallow: / 288 | User-agent: linksmanager 289 | User-agent: LinksManager 290 | User-agent: LinksManager.com_bot 291 | User-agent: LinkWalker 292 | User-agent: LjSEEK 293 | User-agent: LNSpiderguy 294 | User-agent: looksmart 295 | User-agent: LWP 296 | User-agent: LWP* 297 | User-agent: lwp-trivial 298 | User-agent: lwp-trivial/1.34 299 | User-agent: magpie-crawler 300 | User-agent: Mail Sweeper 301 | User-agent: Marketwirebot 302 | User-agent: Mass Downloader 303 | User-agent: Mass Downloader/2.2 304 | User-agent: Mata Hari 305 | 
User-agent: Mata.Hari 306 | User-agent: MegaIndex.ru 307 | User-agent: MegaIndex.ru/2.0 308 | User-agent: MetagerBot 309 | User-agent: MetaURI 310 | User-agent: Microsoft.URL 311 | User-agent: Microsoft URL Control 312 | User-agent: Microsoft URL Control* 313 | User-agent: Microsoft.URL.Control 314 | User-agent: Microsoft URL Control - 5.01.4511 315 | User-agent: Microsoft URL Control - 6.00.8169 316 | User-agent: Microsoft URL Control - 6.01.9782 317 | User-agent: MIDown tool 318 | User-agent: MIIxpc 319 | User-agent: MIIxpc/4.2 320 | User-agent: Missigua Locator 321 | User-agent: Mister PiX 322 | User-agent: Mister.PiX 323 | User-agent: Mister Pix II 2.01 324 | User-agent: Mister Pix II 2.02a 325 | User-agent: Mister PiX version.dll 326 | User-agent: MLBot 327 | User-agent: moget 328 | User-agent: moget/2.1 329 | User-agent: mozilla 330 | User-agent: Mozilla 331 | User-agent: Mozilla/2.0 (compatible; Ask Jeeves) 332 | User-agent: mozilla/3 333 | User-agent: mozilla/4 334 | User-agent: Mozilla/4.0 (compatible; BullsEye; Windows 95) 335 | User-agent: Mozilla/4.0 (compatible; MSIE 4.0; Windows 2000) 336 | User-agent: Mozilla/4.0 (compatible; MSIE 4.0; Windows 95) 337 | User-agent: Mozilla/4.0 (compatible; MSIE 4.0; Windows 98) 338 | User-agent: Mozilla/4.0 (compatible; MSIE 4.0; Windows ME) 339 | User-agent: Mozilla/4.0 (compatible; MSIE 4.0; Windows NT) 340 | User-agent: Mozilla/4.0 (compatible; MSIE 4.0; Windows XP) 341 | User-agent: mozilla/5 342 | User-agent: MRSPUTNIK 343 | User-agent: MSIECrawler 344 | User-agent: MSRBOT 345 | User-agent: MS Search 4.0 Robot 346 | User-agent: MS Search 5.0 Robot 347 | User-agent: munky 348 | User-agent: naver 349 | User-agent: Naverbot 350 | User-agent: NaverBot 351 | User-agent: NaverBot-1.0 352 | User-agent: Navroad 353 | User-agent: NearSite 354 | User-agent: NetAnts 355 | User-agent: NetAnts/1.10 356 | User-agent: NetAnts/1.23 357 | User-agent: NetAnts/1.24 358 | User-agent: NetAnts/1.25 359 | User-agent: NetAttache 360 | User-agent: NetAttache Light 1.1 361 | User-agent: Netcraft Web Server Survey 362 | User-agent: NetMechanic 363 | User-agent: NetSpider 364 | User-agent: Net Vampire 365 | User-agent: Net Vampire/3.0 366 | User-agent: NetZIP 367 | User-agent: NetZip-Downloader 368 | User-agent: NetZip-Downloader/1.0.62 (Win32; Dec 7 1998) 369 | User-agent: NetZip Downloader 1.0 Win32(Nov 12 1998) 370 | User-agent: NetZippy+(http://www.innerprise.net/usp-spider.asp) 371 | User-agent: NetZippy+(http:/www.innerprise.net/usp-spider.asp) 372 | User-agent: NICErsPRO 373 | User-agent: NimbleCrawler 374 | User-agent: NPbot 375 | User-agent: NPBot 376 | User-agent: NPBot/3 377 | User-agent: Nutch 378 | User-agent: Nutch* 379 | User-agent: NutchCVS/0.06-dev 380 | User-agent: NutchCVS/0.7.1 381 | User-agent: NutchOrg 382 | User-agent: oBot 383 | User-agent: Ocelli 384 | User-agent: Octopus 385 | User-agent: Offline Explorer 386 | User-agent: Offline.Explorer 387 | User-agent: Offline Explorer/1.2 388 | User-agent: Offline Explorer/1.4 389 | User-agent: Offline Explorer/1.6 390 | User-agent: Offline Explorer/1.7 391 | User-agent: Offline Explorer/1.9 392 | User-agent: Offline Explorer/2.0 393 | User-agent: Offline Explorer/2.1 394 | User-agent: Offline Explorer/2.3 395 | User-agent: Offline Explorer/2.4 396 | User-agent: Offline Explorer/2.5 397 | User-agent: Offline Navigator 398 | User-agent: OmniExplorer_Bot 399 | User-agent: oneriot 400 | User-agent: Openbot 401 | User-agent: Openfind 402 | User-agent: Openfind data gathere 403 | User-agent: Openfind 
data gatherer 404 | User-agent: Oracle Ultra Search 405 | User-agent: OutfoxBot/0.5 406 | User-agent: PageGrabber 407 | User-agent: Papa Foto 408 | User-agent: pavuk 409 | User-agent: PBWF 410 | User-agent: pcBrowser 411 | User-agent: penthesilea 412 | User-agent: PerMan 413 | User-agent: PGBot 414 | User-agent: PhpDig 415 | User-agent: Pingdom GIGRIB (http://www.pingdom.com) 416 | User-agent: postrank 417 | User-agent: ProPowerBot 418 | User-agent: ProPowerBot/2.14 419 | User-agent: ProWebWalker 420 | User-agent: psbot 421 | User-agent: psycheclone 422 | User-agent: Psycheclone 423 | User-agent: Python-urllib 424 | User-agent: QuepasaCreep 425 | User-agent: QueryN Metasearch 426 | User-agent: QueryN.Metasearch 427 | User-agent: radian6 comment reader 428 | User-agent: radian6 Feedfetcher 429 | User-agent: Radiation Retriever 430 | User-agent: Radiation Retriever 1.1 431 | User-agent: RB2B-bot 432 | User-agent: RealDownload 433 | User-agent: RealDownload/4.0.0.40 434 | User-agent: RealDownload/4.0.0.41 435 | User-agent: RealDownload/4.0.0.42 436 | User-agent: ReGet 437 | User-agent: RepoMonkey 438 | User-agent: RepoMonkey Bait & Tackle/v1.01 439 | User-agent: RepoMonkey Bait & Tackle 440 | User-agent: RepoMonkey Bait & Tackle/v1.01 441 | User-agent: research-spider 442 | User-agent: RMA 443 | User-agent: Robozilla 444 | User-agent: Roverbot 445 | User-agent: RufusBot 446 | User-agent: sbider 447 | User-agent: Scooter/1.0 448 | User-agent: Scooter/1.0 scooter@pa.dec.com 449 | User-agent: Scooter/1.1 (custom) 450 | User-agent: Scooter/2.0 G.R.A.B. V1.1.0 451 | User-agent: Scooter/2.0 G.R.A.B. X2.0 452 | User-agent: Scooter2_Mercator_x-x.0 453 | User-agent: Scooter-3.0.EU 454 | User-agent: Scooter-3.0.FS 455 | User-agent: Scooter-3.0.HD 456 | User-agent: Scooter-3.0QI 457 | User-agent: Scooter-3.0.VNS 458 | User-agent: Scooter-3.2 459 | User-agent: Scooter-3.2.BT 460 | User-agent: Scooter-3.2.DIL 461 | User-agent: Scooter-3.2.EX 462 | User-agent: Scooter-3.2.JT 463 | User-agent: Scooter-3.2.NIV 464 | User-agent: Scooter-3.2.SF0 465 | User-agent: Scooter-3.2.snippet 466 | User-agent: Scooter/3.3 467 | User-agent: Scooter-3.3dev 468 | User-agent: Scooter/3.3.QA.pczukor 469 | User-agent: Scooter/3.3_SF 470 | User-agent: Scooter/3.3.vscooter 471 | User-agent: Scooter-ARS-1.1 472 | User-agent: Scooter-ARS-1.1-ih 473 | User-agent: Scooter_bh0-3.0.3 474 | User-agent: Scooter_trk3-3.0.3 475 | User-agent: scooter-venus-3.0.vns 476 | User-agent: Scooter-W3-1.0 477 | User-agent: Scooter-W3.1.2 478 | User-agent: Scrubby 479 | User-agent: SearchDaimon.com-dc 480 | User-agent: searchpreview 481 | User-agent: semalt.com 482 | User-agent: seekbot 483 | User-agent: Seekbot 484 | User-agent: Seekbot/1.0 485 | User-agent: SEOprofiler 486 | User-agent: Shai'Hulud 487 | User-agent: Shim-Crawler 488 | User-agent: ShopWiki 489 | User-agent: ShopWiki/1.0 490 | User-agent: SightupBot 491 | User-agent: SiteBot 492 | User-agent: SiteSnagger 493 | User-agent: Slurp China 494 | User-agent: SlySearch 495 | User-agent: SmartDownload 496 | User-agent: SmartDownload/1.2.76 (Win32; Apr 1 1999) 497 | User-agent: SmartDownload/1.2.77 (Win32; Aug 17 1999) 498 | User-agent: SmartDownload/1.2.77 (Win32; Feb 1 2000) 499 | User-agent: SmartDownload/1.2.77 (Win32; Jun 19 2001) 500 | User-agent: Snapbot 501 | User-agent: Snappy 502 | User-agent: Softlayer Server 503 | User-agent: Sogou web spider 504 | User-agent: sootle 505 | User-agent: sosospider 506 | User-agent: SpankBot 507 | User-agent: spanner 508 | User-agent: spbot 509 | 
User-agent: Speedy 510 | User-agent: SpiderBot 511 | User-agent: Sqworm 512 | User-agent: Sqworm/2.9.85-BETA (beta_release; 20011115-775; i686-pc-linux 513 | User-agent: ssearcher100 514 | User-agent: Stanford 515 | User-agent: Stanford Comp Sci 516 | User-agent: suggybot 517 | User-agent: SuperBot 518 | User-agent: SuperBot/2.6 519 | User-agent: SuperBot/3.0 (Win32) 520 | User-agent: SuperBot/3.1 (Win32) 521 | User-agent: SuperHTTP 522 | User-agent: SuperHTTP/1.0 523 | User-agent: Surfbot 524 | User-agent: SurveyBot 525 | User-agent: suzuran 526 | User-agent: Szukacz 527 | User-agent: Szukacz/1.4 528 | User-agent: tAkeOut 529 | User-agent: Teleport 530 | User-agent: TeleportPro 531 | User-agent: Teleport Pro 532 | User-agent: Teleport Pro/1.29 533 | User-agent: Teleport Pro/1.29.1590 534 | User-agent: Teleport Pro/1.29.1634 535 | User-agent: Teleport Pro/1.29.1718 536 | User-agent: Teleport Pro/1.29.1820 537 | User-agent: Teleport Pro/1.29.1847 538 | User-agent: Telesoft 539 | User-agent: Templeton 540 | User-agent: Teoma 541 | User-agent: The Intraformant 542 | User-agent: The.Intraformant 543 | User-agent: TheNomad 544 | User-agent: TightTwatBot 545 | User-agent: Titan 546 | User-agent: toCrawl 547 | User-agent: toCrawl/UrlDispatcher 548 | User-agent: True_Robot 549 | User-agent: True_Robot/1.0 550 | User-agent: turingos 551 | User-agent: TurnitinBot 552 | User-agent: TurnitinBot/1.5 553 | User-agent: Tweetmeme 554 | User-agent: TwengaBot 555 | User-agent: Twiceler 556 | User-agent: URL Control 557 | User-agent: UrlDispatcher 558 | User-agent: ://URLFAN 559 | User-agent: URL_Spider_Pro 560 | User-agent: URLy Warning 561 | User-agent: URLy.Warning 562 | User-agent: VCI 563 | User-agent: VCI WebViewer VCI WebViewer Win32 564 | User-agent: vobsub 565 | User-agent: VoidEYE 566 | User-agent: vscooter 567 | User-agent: w3mir 568 | User-agent: WatchDog/3.0 569 | User-agent: WebAuto 570 | User-agent: WebAuto/3.40 (Win98; I) 571 | User-agent: WebBandit 572 | User-agent: WebBandit/3.50 573 | User-agent: WebCapture 574 | User-agent: WebCapture 2.0 575 | User-agent: WebCatcher 576 | User-agent: webcopier 577 | User-agent: WebCopier 578 | User-agent: WebCopier v.2.2 579 | User-agent: WebCopier v2.5 580 | User-agent: WebCopier v2.6 581 | User-agent: WebCopier v2.7a 582 | User-agent: WebCopier v2.8 583 | User-agent: WebCopier v3.0 584 | User-agent: WebCopier v3.0.1 585 | User-agent: WebCopier v3.2 586 | User-agent: WebCopier v3.2a 587 | User-agent: webcopy 588 | User-agent: WebCopy 589 | User-agent: webcrawl.net 590 | User-agent: WebEmailExtrac 591 | User-agent: WebEMailExtrac.* 592 | User-agent: WebEnhancer 593 | User-agent: WebFetch 594 | User-agent: webfetch/2.1.0 595 | User-agent: WebFetcher 596 | User-agent: WebGo IS 597 | User-agent: Web Image Collector 598 | User-agent: Web.Image.Collector 599 | User-agent: WebLeacher 600 | User-agent: WebmasterWorld Extractor 601 | User-agent: WebmasterWorldForumBot 602 | User-agent: webmirror 603 | User-agent: WebMirror 604 | User-agent: WebReaper 605 | User-agent: Web Reaper 606 | User-agent: WebReaper [info@webreaper.net] 607 | User-agent: WebReaper v9.1 - www.otway.com/webreaper 608 | User-agent: WebReaper v9.7 - www.webreaper.net 609 | User-agent: WebReaper v9.8 - www.webreaper.net 610 | User-agent: WebReaper vWebReaper v7.3 - www,otway.com/webreaper 611 | User-agent: WebReaper [webreaper@otway.com] 612 | User-agent: WebSauger 613 | User-agent: WebSauger 1.20b 614 | User-agent: WebSauger 1.20j 615 | User-agent: WebSauger 1.20k 616 | User-agent: website 
extractor 617 | User-agent: Website eXtractor 618 | User-agent: Website eXtractor (http:/www.asona.org) 619 | User-agent: Website Quester 620 | User-agent: Website.Quester 621 | User-agent: Website Quester - www.asona.org 622 | User-agent: Website Quester - www.esalesbiz.com/extra/ 623 | User-agent: Webster Pro 624 | User-agent: Webster.Pro 625 | User-agent: WebStripper 626 | User-agent: WebStripper/2.02 627 | User-agent: WebStripper/2.03 628 | User-agent: WebStripper/2.10 629 | User-agent: WebStripper/2.12 630 | User-agent: WebStripper/2.13 631 | User-agent: WebStripper/2.15 632 | User-agent: WebStripper/2.16 633 | User-agent: WebStripper/2.19 634 | User-agent: Web Sucker 635 | User-agent: webvac 636 | User-agent: WebVac 637 | User-agent: WebVulnCrawl 638 | User-agent: WebVulnScan 639 | User-agent: WebWalk 640 | User-agent: WebWasher 641 | User-agent: WebWhacker 642 | User-agent: WebZip 643 | User-agent: WebZIP 644 | User-agent: WebZIP/2.75 (http://www.spidersoft.com) 645 | User-agent: WebZIP/2.75 (http:/www.spidersoft.com) 646 | User-agent: WebZIP/3.65 (http://www.spidersoft.com) 647 | User-agent: WebZIP/3.65 (http:/www.spidersoft.com) 648 | User-agent: WebZIP/3.80 (http://www.spidersoft.com) 649 | User-agent: WebZIP/3.80 (http:/www.spidersoft.com) 650 | User-agent: WebZip/4.0 651 | User-agent: WebZIP/4.0 (http://www.spidersoft.com) 652 | User-agent: WebZIP/4.0 (http:/www.spidersoft.com) 653 | User-agent: WebZIP/4.1 (http://www.spidersoft.com) 654 | User-agent: WebZIP/4.1 (http:/www.spidersoft.com) 655 | User-agent: WebZIP/4.21 656 | User-agent: WebZIP/4.21 (http://www.spidersoft.com) 657 | User-agent: WebZIP/4.21 (http:/www.spidersoft.com) 658 | User-agent: WebZIP/5.0 659 | User-agent: WebZIP/5.0 (http://www.spidersoft.com) 660 | User-agent: WebZIP/5.0 (http:/www.spidersoft.com) 661 | User-agent: WebZIP/5.0 PR1 (http://www.spidersoft.com) 662 | User-agent: WebZIP/5.0 PR1 (http:/www.spidersoft.com) 663 | User-agent: wget 664 | User-agent: wGet 665 | User-agent: Wget 666 | User-agent: Wget/1.10.2 667 | User-agent: Wget/1.5.2 668 | User-agent: Wget/1.5.3 669 | User-agent: Wget/1.6 670 | User-agent: Wget/1.7 671 | User-agent: Wget/1.8 672 | User-agent: Wget/1.8.1 673 | User-agent: Wget/1.8.1+cvs 674 | User-agent: Wget/1.8.2 675 | User-agent: Wget/1.9-beta 676 | User-agent: whitevector crawler 677 | User-agent: Whitevector+Crawler 678 | User-agent: Widow 679 | User-agent: WikioFeedBot 680 | User-agent: wikiwix-bot-3.0 681 | User-agent: Willow 682 | User-agent: WinHTTrack 683 | User-agent: Wise-Guys 684 | User-agent: woozweb-monitoring 685 | User-agent: woriobot 686 | User-agent: WWW-Collector 687 | User-agent: WWW-Collector-E 688 | User-agent: WWWOFFLE 689 | User-agent: Xaldon WebSpider 690 | User-agent: Xaldon WebSpider 2.5.b3 691 | User-agent: Xenu 692 | User-agent: Xenu Link Sleuth 693 | User-agent: Xenu's 694 | User-agent: Xenu's Link Sleuth 1.1c 695 | User-agent: xGet 696 | User-agent: Yahoo-MMCrawler 697 | User-agent: YahooSeeker/CafeKelsa 698 | User-agent: Yeti 699 | User-agent: YodaoBot 700 | User-agent: YRSPider 701 | User-agent: Zao 702 | User-agent: Zealbot 703 | User-agent: Zeus 704 | User-agent: Zeus 11389 Webster Pro V2.9 Win32 705 | User-agent: Zeus 11652 Webster Pro V2.9 Win32 706 | User-agent: Zeus 18018 Webster Pro V2.9 Win32 707 | User-agent: Zeus 26378 Webster Pro V2.9 Win32 708 | User-agent: Zeus 30747 Webster Pro V2.9 Win32 709 | User-agent: Zeus 32297 Webster Pro V2.9 Win32 710 | User-agent: Zeus 39206 Webster Pro V2.9 Win32 711 | User-agent: Zeus 41641 Webster Pro 
V2.9 Win32 712 | User-agent: Zeus 44238 Webster Pro V2.9 Win32 713 | User-agent: Zeus 51070 Webster Pro V2.9 Win32 714 | User-agent: Zeus 51674 Webster Pro V2.9 Win32 715 | User-agent: Zeus 51837 Webster Pro V2.9 Win32 716 | User-agent: Zeus 63567 Webster Pro V2.9 Win32 717 | User-agent: Zeus 6694 Webster Pro V2.9 Win32 718 | User-agent: Zeus 71129 Webster Pro V2.9 Win32 719 | User-agent: Zeus 82016 Webster Pro V2.9 Win32 720 | User-agent: Zeus 82900 Webster Pro V2.9 Win32 721 | User-agent: Zeus 84842 Webster Pro V2.9 Win32 722 | User-agent: Zeus 90872 Webster Pro V2.9 Win32 723 | User-agent: Zeus 94934 Webster Pro V2.9 Win32 724 | User-agent: Zeus 95245 Webster Pro V2.9 Win32 725 | User-agent: Zeus 95351 Webster Pro V2.9 Win32 726 | User-agent: Zeus 97371 Webster Pro V2.9 Win32 727 | User-agent: Zeus Link Scout 728 | User-agent: ZyBorg 729 | Disallow: / 730 | 731 | User-agent: AhrefsBot 732 | User-agent: SemrushBot 733 | User-agent: Sogou web spider 734 | User-agent: sogou spider 735 | User-agent: MJ12bot 736 | User-agent: MJ12bot/v1.4.3 737 | Crawl-delay: 2 738 | 739 | Sitemap: https://22-lr.forumactif.com/sitemap.xml -------------------------------------------------------------------------------- /examples/robots_multiple_agents.py: -------------------------------------------------------------------------------- 1 | import robots 2 | 3 | parser = robots.RobotsParser.from_file("robots_multiple_agents.txt") 4 | 5 | if parser.errors: 6 | print("ERRORS:") 7 | print(parser.errors) 8 | 9 | if parser.errors: 10 | print("WARNINGS:") 11 | print(parser.errors) 12 | 13 | assert parser.can_fetch("GoogleBot", "/") 14 | assert parser.can_fetch("GoogleBot", "/tmp") 15 | assert not parser.can_fetch("GoogleBot", "/tmp/") 16 | 17 | assert parser.can_fetch("FacebookBot", "/") 18 | assert parser.can_fetch("FacebookBot", "/tmp") 19 | assert not parser.can_fetch("FacebookBot", "/tmp/") -------------------------------------------------------------------------------- /examples/robots_multiple_agents.txt: -------------------------------------------------------------------------------- 1 | User-agent: Mediapartners-Google 2 | Disallow: 3 | 4 | User-agent: Mediapartners-Google* 5 | Disallow: 6 | 7 | User-agent: * 8 | Disallow: /abuse 9 | Disallow: /admgt/ 10 | Disallow: /donate 11 | Disallow: /go/ 12 | Disallow: /modcp 13 | Disallow: /post 14 | Disallow: /privmsg 15 | Disallow: /spa/ 16 | Disallow: /sta/ 17 | Disallow: /bw 18 | Disallow: /dx 19 | Disallow: /topicit/index.php/connect 20 | Disallow: /calendar_scheduler.forum 21 | Noindex: /login 22 | 23 | User-agent: 008 24 | User-agent: Accoona 25 | User-agent: aipbot 26 | User-agent: aipbot* 27 | User-agent: aipbot/1.0 28 | User-agent: Alexa 29 | User-agent: Alexa Bitlybot 30 | User-agent: Alexibot 31 | User-agent: AltaVista Intranet V2.0 AVS EVAL search@freeit.com 32 | User-agent: AltaVista Intranet V2.0 Compaq Altavista Eval sveand@altavista.net 33 | User-agent: AltaVista Intranet V2.0 evreka.com crawler@evreka.com 34 | User-agent: AltaVista V2.0B crawler@evreka.com 35 | Disallow: /bad 36 | 37 | User-agent: GoogleBot 38 | User-agent: MicrosoftBot 39 | User-agent: FacebookBot 40 | Disallow: /tmp/ 41 | Disallow: /secrets/ 42 | 43 | User-agent: AhrefsBot 44 | User-agent: SemrushBot 45 | User-agent: Sogou web spider 46 | User-agent: sogou spider 47 | User-agent: MJ12bot 48 | User-agent: MJ12bot/v1.4.3 49 | Crawl-delay: 2 50 | 51 | 52 | -------------------------------------------------------------------------------- /examples/robots_string.py: 
-------------------------------------------------------------------------------- 1 | import robots 2 | 3 | r = """ 4 | # GoogleOnly_System 5 | 6 | user-agent: FooBot 7 | disallow: / 8 | 9 | BAD LINE 10 | """ 11 | 12 | parser = robots.RobotsParser.from_string(r) 13 | 14 | if parser.errors: 15 | print("ERRORS:") 16 | print(parser.errors) 17 | 18 | if parser.warnings: 19 | print("WARNINGS:") 20 | print(parser.warnings) 21 | 22 | assert not parser.can_fetch("FooBot", "/toto") 23 | 24 | r = """ 25 | # CrawlDelayAndCustomAgentTest 26 | 27 | User-agent: * 28 | Crawl-delay: 1 29 | Request-rate: 3/15 30 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 31 | 32 | # Cybermapper knows where to go. 33 | User-agent: cybermapper 34 | Disallow: 35 | """ 36 | 37 | parser = robots.RobotsParser.from_string(r) 38 | 39 | if parser.errors: 40 | print("ERRORS:") 41 | print(parser.errors) 42 | 43 | if parser.warnings: 44 | print("WARNINGS:") 45 | print(parser.warnings) 46 | 47 | assert parser.can_fetch("cybermapper", "/cyberworld/map/index.html") 48 | -------------------------------------------------------------------------------- /examples/robots_url.py: -------------------------------------------------------------------------------- 1 | # Content of http://www.musi-cal.com/robots.txt: 2 | """ 3 | User-agent: * 4 | Disallow: /wp-admin/ 5 | Allow: /wp-admin/admin-ajax.php 6 | """ 7 | 8 | # The first implementation is using the Python standard library urllib.robotparser 9 | 10 | import urllib.robotparser 11 | import robots 12 | 13 | rp = urllib.robotparser.RobotFileParser() 14 | rp.set_url("http://www.musi-cal.com/robots.txt") 15 | rp.read() 16 | 17 | assert rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco") 18 | assert rp.can_fetch("*", "http://www.musi-cal.com/") 19 | assert not rp.can_fetch("*", "http://www.musi-cal.com/wp-admin/") 20 | assert not rp.can_fetch("*", "/wp-admin/") 21 | 22 | # The second implementation is using the robotspy thin layer supporting the same API as 23 | # the Python standard library urllib.robotparser 24 | 25 | parser = robots.RobotFileParser() 26 | parser.set_url("http://www.musi-cal.com/robots.txt") 27 | parser.read() 28 | 29 | assert parser.can_fetch( 30 | "*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco" 31 | ) 32 | assert parser.can_fetch("*", "http://www.musi-cal.com/") 33 | assert not parser.can_fetch("*", "http://www.musi-cal.com/wp-admin/") 34 | assert not parser.can_fetch("*", "/wp-admin/") 35 | 36 | # The third implementation is directly using robots.RobotsParser 37 | 38 | parser = robots.RobotsParser.from_uri("http://www.musi-cal.com/robots.txt") 39 | 40 | if parser.errors: 41 | print("ERRORS:") 42 | print(parser.errors) 43 | 44 | if parser.warnings: 45 | print("WARNINGS:") 46 | print(parser.warnings) 47 | 48 | assert parser.can_fetch( 49 | "*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco" 50 | ) 51 | assert parser.can_fetch("*", "http://www.musi-cal.com/") 52 | assert not parser.can_fetch("*", "http://www.musi-cal.com/wp-admin/") 53 | assert not parser.can_fetch("*", "/wp-admin/") 54 | 55 | # Example with a custom timeout 56 | parser = robots.RobotsParser.from_uri("https://robotspy.org/robots.txt", 2) 57 | 58 | if parser.errors: 59 | print("ERRORS:") 60 | print(parser.errors) 61 | 62 | if parser.warnings: 63 | print("WARNINGS:") 64 | print(parser.warnings) 65 | 66 | assert parser.can_fetch( 67 | "Googlebot", "https://robotspy.org/" 68 | ) 69 | assert parser.can_fetch("*", "https://robotspy.org/") 70 | 71
| # Set a 0 timeout should result in an error 72 | 73 | parser = robots.RobotsParser.from_uri("https://robotspy.org/robots.txt", 0) 74 | assert parser.errors 75 | if parser.errors: 76 | print("ERRORS:") 77 | print(parser.errors) 78 | 79 | 80 | # Timeout error 81 | parser = robots.RobotsParser.from_uri("https://robotspy.org:555/robots.txt", 2) 82 | 83 | # The duration may be greater than the timeout because the urllib.request.urlopen timeout does not equate to a total timeout 84 | assert parser.errors 85 | if parser.errors: 86 | print("ERRORS:") 87 | print(parser.errors) 88 | -------------------------------------------------------------------------------- /make.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | if {%1} == {} ( 4 | goto USAGE 5 | ) 6 | 7 | if {%1} == {help} ( 8 | goto USAGE 9 | ) 10 | 11 | if {%1} == {clean} ( 12 | goto CLEAN 13 | ) 14 | 15 | if {%1} == {fmt} (black robots) & (goto :EOF) 16 | if {%1} == {attributions} (pip-licenses -d -u -f markdown -o license > ATTRIBUTIONS.md) & (goto :EOF) 17 | if {%1} == {lint} (pylint robots) & (goto :EOF) 18 | if {%1} == {test} (pytest tests -vv) & (goto :EOF) 19 | if {%1} == {tree} (pipdeptree) & (goto :EOF) 20 | if {%1} == {type} (mypy robots) & (goto :EOF) 21 | 22 | :CLEAN 23 | rmdir /q /s .cache build dist robotspy.egg-info .pytest_cache robots\__pycache__ tests\__pycache__ 24 | del /q *.bak 25 | goto :EOF 26 | 27 | :USAGE 28 | echo. 29 | echo Usage: 30 | echo. make ^ 31 | echo. 32 | echo The tasks are: 33 | echo. 34 | echo attributions Generate attribution list for software used by robotspy 35 | echo make clean Delete temp files (*.pyc), caches (__pycache__) 36 | echo make fmt Format Python files using Black (Assuming Black installed globally) 37 | echo make help Display this help message 38 | echo make lint Lint Python file using Pylint (Assuming Pylint installed globally) 39 | echo make test Execute tests 40 | echo make tree Display the dependency tree (using pipdeptree) 41 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==8.3.4 2 | -------------------------------------------------------------------------------- /robots/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module robots. 3 | """ 4 | 5 | from robots.parser import RobotsParser 6 | from robots.parser import RequestRate 7 | from robots.robotparser import RobotFileParser 8 | 9 | __version__ = "0.12.0" 10 | 11 | __all__ = ["RobotsParser", "RobotFileParser"] 12 | -------------------------------------------------------------------------------- /robots/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main module for package robots. 
The script is executed when invoking: 3 | python -m robots 4 | 5 | For help, use: 6 | python -m robots -h | --help 7 | 8 | It mimics the behavior of Google robotstxt available at: 9 | https://github.com/google/robotstxt 10 | """ 11 | 12 | import argparse 13 | import pathlib 14 | import sys 15 | import urllib.parse 16 | 17 | import robots 18 | 19 | 20 | def init_cli() -> argparse.ArgumentParser: 21 | """Initialize the argument parser to handle the command line interface.""" 22 | 23 | cli: argparse.ArgumentParser = argparse.ArgumentParser( 24 | usage="%(prog)s <robotstxt> <useragent> <path>", 25 | description=( 26 | "Shows whether a given user agent and path/url combination " 27 | "is allowed or disallowed by a given robots.txt file." 28 | ), 29 | ) 30 | cli.prog = __package__ 31 | cli.add_argument( 32 | "-v", "--version", action="version", version=f"{cli.prog} {robots.__version__}" 33 | ) 34 | cli.add_argument("robotstxt", help="robots.txt file path or URL") 35 | cli.add_argument("useragent", help="User agent name") 36 | cli.add_argument("path", help="Path or URL") 37 | 38 | return cli 39 | 40 | 41 | def is_url(path_uri: str) -> bool: 42 | """Validate if a given string is a URL.""" 43 | 44 | res = urllib.parse.urlsplit(path_uri) 45 | return res.scheme in ("http", "https", "ftp", "file") 46 | 47 | 48 | def normalize_uri(path_uri: str) -> str: 49 | """Convert any path to URI. If not a path, return the URI.""" 50 | 51 | if not isinstance(path_uri, pathlib.Path) and is_url(path_uri): 52 | return path_uri 53 | 54 | return pathlib.Path(path_uri).resolve().as_uri() 55 | 56 | 57 | def create_robots(robots_uri: str) -> robots.RobotsParser: 58 | """Instantiate a RobotsParser object with a URI.""" 59 | 60 | parser: robots.RobotsParser = robots.RobotsParser.from_uri(robots_uri) 61 | return parser 62 | 63 | 64 | def main() -> None: 65 | """Entry point for the package as a Python module (python -m)""" 66 | 67 | cli = init_cli() 68 | args = cli.parse_args() 69 | 70 | robots_uri = normalize_uri(args.robotstxt) 71 | robots_parser = create_robots(robots_uri) 72 | 73 | allowed = robots_parser.can_fetch(args.useragent, args.path) 74 | 75 | allowed_str = "ALLOWED" if allowed else "DISALLOWED" 76 | url_or_path = "url" if is_url(args.path) else "path" 77 | print(f"user-agent '{args.useragent}' with {url_or_path} '{args.path}': {allowed_str}") 78 | 79 | if errors := robots_parser.errors: 80 | for error in errors: 81 | print(f"{error[0]} -> {error[1]}", file=sys.stderr) 82 | 83 | if warnings := robots_parser.warnings: 84 | for warning in warnings: 85 | print(f"{warning[0]} -> {warning[1]}", file=sys.stderr) 86 | 87 | 88 | if __name__ == "__main__": 89 | main() 90 | -------------------------------------------------------------------------------- /robots/parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Alternate implementation of RobotParser (alternative to standard library urllib.robotparser) 3 | 4 | Reference: 5 | Robots Exclusion Protocol (REP) https://www.rfc-editor.org/rfc/rfc9309 6 | """ 7 | 8 | from typing import Dict, Iterator, List, NamedTuple, Tuple, Type, TypeVar 9 | 10 | import enum 11 | import fnmatch 12 | import re 13 | import time 14 | 15 | import urllib.parse 16 | import urllib.error 17 | import urllib.request 18 | 19 | import robots 20 | 21 | # Pattern used to validate a user agent token (not used by the parser) 22 | RE_AGENT_TOKEN = re.compile(r"^[a-zA-Z_-]+$") 23 | 24 | # Pattern to read and identify a product token, user agent.
25 | # Match up to the first invalid character (for example, stops at a number but does not error out) 26 | # Note: the hash character ('#') needs to be escaped in VERBOSE mode, otherwise it would be 27 | # interpreted as a comment. 28 | RE_AGENT = re.compile( 29 | r"^\s*user-agent\s*:?\s*(?P<AGENT>\*|[a-zA-Z_-]+)[^\s]*\s*(?:\#.*)?$", 30 | re.IGNORECASE | re.VERBOSE, 31 | ) 32 | 33 | RE_SITEMAP = re.compile( 34 | r"^\s*sitemap\s*:\s*(?P<SITEMAP>https?://[^\n\s]+)\s*$", 35 | re.IGNORECASE | re.VERBOSE, 36 | ) 37 | 38 | # Product token in the user-agent line: 39 | RE_PRODUCT = re.compile(r"^[a-zA-Z_-]+$|\*") 40 | 41 | # Rule allow 42 | RE_RULE_ALLOW = re.compile( 43 | r"^\s*(?P<RULE>allow)\s*:?\s*(?P<PATH>\*|[^\s#]+)?\s*(?:\#.*)?$", 44 | re.IGNORECASE | re.VERBOSE, 45 | ) 46 | 47 | # Rule disallow 48 | RE_RULE_DISALLOW = re.compile( 49 | r"^\s*(?P<RULE>disallow)\s*:?\s*(?P<PATH>\*|[^\s#]+)?\s*(?:\#.*)?$", 50 | re.IGNORECASE | re.VERBOSE, 51 | ) 52 | 53 | # NamedTuple used to store rules. Each record includes: 54 | # - A path or path pattern 55 | # - A boolean indicating if the given path can be accessed or not 56 | Rule = NamedTuple("Rule", [("path", str), ("allowed", bool)]) 57 | 58 | RequestRate = NamedTuple("RequestRate", [("requests", int), ("seconds", int)]) 59 | 60 | 61 | class State(enum.Enum): 62 | """Define states while parsing the robotstxt file""" 63 | 64 | BEGIN = enum.auto() # Begin parsing 65 | AGENT = enum.auto() # User-agent line 66 | RULE = enum.auto() # Rule line 67 | 68 | 69 | class TokenType(enum.Enum): 70 | """Token definitions for the parser""" 71 | 72 | AGENT = enum.auto() 73 | ALLOW = enum.auto() 74 | DISALLOW = enum.auto() 75 | SITEMAP = enum.auto() 76 | CRAWL_DELAY = enum.auto() 77 | REQ_RATE = enum.auto() 78 | UNEXPECTED = enum.auto() 79 | 80 | 81 | class Errors(enum.Enum): 82 | """Error definitions and messages""" 83 | 84 | WARNING_EMPTY_ALLOW_RULE = ( 85 | "Warning: An empty allow rule has no effect and is confusing" 86 | ) 87 | WARNING_RULE_WITHOUT_AGENT = "Warning: Rule without an agent is ignored" 88 | WARNING_NOTFOUND = "Warning: No remote robots.txt file found" 89 | WARNING_CRAWL_DELAY_IGNORED = "Warning: Directive 'crawl-delay' ignored" 90 | WARNING_REQUEST_RATE_IGNORED = "Warning: Directive 'request-rate' ignored" 91 | WARNING_UNEXPECTED_OR_IGNORED = "Warning: Unexpected or ignored token" 92 | ERROR_NO_FILE_FOUND = "Error: No file found" 93 | 94 | 95 | Token = NamedTuple("Token", [("type", TokenType), ("value", str), ("linenum", int)]) 96 | 97 | 98 | def gen_tokens(gen_func, source): 99 | """Token generator.
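For example (hypothetical two-line input), 'user-agent: FooBot' followed by 'disallow: /x/' yields Token(TokenType.AGENT, 'FooBot', 1) and then Token(TokenType.DISALLOW, '/x/', 2).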
100 | 101 | Emit tokens when parsing the content of a robots.txt 102 | """ 103 | 104 | linenum = 0 105 | for line in gen_func(source): 106 | linenum += 1 107 | # pylint: disable=superfluous-parens 108 | if not (line := line.strip()): 109 | continue # Skip empty lines 110 | if line.startswith("#"): 111 | continue # Skip line comment 112 | if m := RE_AGENT.match(line): 113 | agent = m.group("AGENT") 114 | yield Token(TokenType.AGENT, agent, linenum) 115 | elif m := RE_RULE_ALLOW.match(line): 116 | path = m.group("PATH") 117 | yield Token(TokenType.ALLOW, path, linenum) 118 | elif m := RE_RULE_DISALLOW.match(line): 119 | path = m.group("PATH") 120 | yield Token(TokenType.DISALLOW, path, linenum) 121 | elif m := RE_SITEMAP.match(line): 122 | sitemap = m.group("SITEMAP") 123 | yield Token(TokenType.SITEMAP, sitemap, linenum) 124 | else: 125 | yield Token(TokenType.UNEXPECTED, line, linenum) 126 | 127 | 128 | T = TypeVar("T", bound="RobotsParser") 129 | 130 | 131 | class RobotsParser: 132 | """Encapsulates functions and data to parse a robotstxt file and get 133 | feedback on what a crawler is allowed to access to on a given website. 134 | """ 135 | 136 | AGENT_NOT_VALID = "User agent [%s] is not valid" 137 | UNEXPECTED_LINE = "Unexpected line: [%s]" 138 | 139 | def __init__(self, url=""): 140 | self.agents_rules: Dict[str, List[Rule]] = {} 141 | self._sitemaps = [] 142 | self._errors = [] 143 | self._warnings = [] 144 | self.url = url 145 | self.disallow_all = False 146 | self.allow_all = False 147 | self.timeout = 5 148 | self._host = "" 149 | self._path = "/robots.txt" 150 | self._time = 0.0 # Time the robots.txt is fetched 151 | 152 | @property 153 | def errors(self): 154 | """Property pointing to the errors list""" 155 | return self._errors 156 | 157 | @property 158 | def warnings(self): 159 | """Property pointing to the warning list""" 160 | return self._warnings 161 | 162 | @property 163 | def url(self): 164 | """Url pointing to the robots.txt. For example: https://example.com/robots.com""" 165 | return self._url 166 | 167 | @url.setter 168 | def url(self, url): 169 | """Set the url (example: 'https://www.example.com/robots.txt'), the path of the robots.txt 170 | file, example: '/robots.txt', and the hostname, example 'www.example.com'. 171 | 172 | It discards the scheme ('http' or 'https') for compatibility with the Python standard 173 | library module 'urllib.robotparser'.""" 174 | 175 | self._url = url 176 | self._host, self._path = urllib.parse.urlparse(url)[1:3] 177 | 178 | @property 179 | def host(self): 180 | """Host of the server serving the robots.txt file.""" 181 | return self._host 182 | 183 | @property 184 | def path(self): 185 | """Path of the robots.txt file. 186 | 187 | Example: '/robots.txt'. 
188 | """ 189 | return self._path 190 | 191 | @property 192 | def timestamp(self): 193 | """Property pointing to the time the robots.txt was parsed""" 194 | return self._time 195 | 196 | @timestamp.setter 197 | def timestamp(self, timestamp): 198 | self._time = timestamp 199 | 200 | @property 201 | def sitemaps(self): 202 | """Property pointing to the private sitemaps list""" 203 | return self._sitemaps or None 204 | 205 | @classmethod 206 | def from_string(cls: Type[T], robotstxt: str) -> T: 207 | """Build a robots parser from a string representing the content of a robots.txt file.""" 208 | parser = cls() 209 | gen_string = lambda txt: (line for line in txt.split("\n")) 210 | parser.parse_tokens(gen_tokens(gen_string, robotstxt)) 211 | return parser 212 | 213 | def gen_uri(self, uri: str): 214 | """Instantiate a generator from a URI (either http:// or https:// or file:///""" 215 | 216 | try: 217 | useragent = f"robotspy/{robots.__version__}" 218 | req = urllib.request.Request(uri, headers={'User-Agent': useragent}) 219 | self.timestamp = time.time() 220 | with urllib.request.urlopen(req, timeout=self.timeout) as f: 221 | for line in f: 222 | try: 223 | yield line.decode("utf-8-sig") # uft-8-sig to handle BOM characters 224 | except UnicodeDecodeError as err: 225 | self.allow_all = True 226 | self._errors.append(("robots.txt must be UTF-8 encoded", f"{str(err)} for {uri}")) 227 | return 228 | except urllib.error.HTTPError as err: 229 | if err.code in (401, 403): 230 | self.disallow_all = True 231 | self._errors.append((str(err.code), f"{str(err)} for {uri}")) 232 | elif 400 <= err.code < 500: # Unavailable status 233 | self.allow_all = True 234 | self._warnings.append((str(err.code), f"{str(err)} for {uri}")) 235 | elif 500 <= err.code < 600: # Unreachable status 236 | self.disallow_all = True 237 | self._warnings.append((str(err.code), f"{str(err)} for {uri}")) 238 | self.timestamp = 0 239 | except urllib.error.URLError as err: # Unreachable status? 240 | self.disallow_all = True 241 | now = time.time() 242 | duration = round(now - self.timestamp) 243 | self._errors.append(("", f"{str(err)} for {uri} (duration={duration}s)")) 244 | 245 | @classmethod 246 | def from_uri(cls: Type[T], uri: str, timeout=5) -> T: 247 | """Build a robots parser given a url or uri pointing to a robots.txt.""" 248 | parser = cls() 249 | parser.timeout = timeout 250 | parser.parse_tokens(gen_tokens(parser.gen_uri, uri)) 251 | return parser 252 | 253 | def gen_file(self, filename): 254 | """Instantiate a generator from a file""" 255 | 256 | try: 257 | with open(filename) as f: 258 | for line in f: 259 | 260 | yield line 261 | except FileNotFoundError: 262 | self._errors.append((filename, Errors.ERROR_NO_FILE_FOUND.value)) 263 | 264 | @classmethod 265 | def from_file(cls: Type[T], filename: str) -> T: 266 | """Build a robots parser given a local path pointing to a robots.txt file.""" 267 | parser = cls() 268 | parser.parse_tokens(gen_tokens(parser.gen_file, filename)) 269 | return parser 270 | 271 | @staticmethod 272 | def is_agent_valid(useragent: str) -> bool: 273 | """Helper function not used internally by the parser. Useful to validate user agent token. 
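Illustrative calls (the values mirror the user agent data in tests/test_google.py):

            RobotsParser.is_agent_valid("Foobot")      # True
            RobotsParser.is_agent_valid("Foobot/2.1")  # False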
274 | 275 | https://tools.ietf.org/html/draft-koster-rep#section-2.2.1 276 | User agent in robots.txt should only include the following characters: "a-zA-Z_-" 277 | """ 278 | if RE_AGENT_TOKEN.match(useragent): 279 | return True 280 | return False 281 | 282 | def update_rules(self, agents: List[str], rules: List[Rule]) -> None: 283 | """Sort the rules for a given group. 284 | 285 | The rules are sorted with the longest path first and the allowed first in case of both 286 | 'diasallowed' and 'allowed' rule for same path 287 | See: https://tools.ietf.org/html/draft-koster-rep-00#section-3.2 288 | """ 289 | rules.sort(key=lambda x: (len(x.path), x.allowed), reverse=True) 290 | 291 | # Need to dedup agents `list(set(agents))` caused by intentional lax parsing stopping at the first invalid character. 292 | # For example, GoogleBot and GoogleBot* will both result in googlebot 293 | for agent in list(set(agents)): 294 | if existing_rules := self.agents_rules.get(agent, None): 295 | rules = ( 296 | existing_rules + rules 297 | ) # Combine rules if agent found several times 298 | rules.sort(key=lambda x: (len(x.path), x.allowed), reverse=True) 299 | self.agents_rules[agent] = rules 300 | 301 | def parse_tokens(self, tokens: Iterator) -> None: 302 | """Main function of the parser. 303 | 304 | Parse a robots.txt file and generate a data structure that can then be used by the 305 | Robots object to answer question (can_fetch?) given a URL and a robots ID. 306 | """ 307 | 308 | state = State.BEGIN 309 | current_agents: List[str] = [] 310 | current_rules: List[Rule] = [] 311 | 312 | for token in tokens: 313 | if token.type == TokenType.AGENT: 314 | if state == State.RULE: 315 | self.update_rules(current_agents, current_rules) 316 | current_agents = [] 317 | current_rules = [] 318 | state = State.AGENT 319 | current_agents.append(token.value.lower()) 320 | elif token.type in (TokenType.ALLOW, TokenType.DISALLOW): 321 | if state == State.BEGIN: 322 | self._warnings.append( 323 | ( 324 | f"line {token.linenum}", 325 | Errors.WARNING_RULE_WITHOUT_AGENT.value, 326 | ) 327 | ) 328 | continue # A rule without an agent is ignored 329 | state = State.RULE 330 | if path := token.value: 331 | current_rules.append( 332 | Rule(urllib.parse.unquote(path), token.type == TokenType.ALLOW) 333 | ) 334 | else: 335 | if token.type == TokenType.ALLOW: 336 | self._warnings.append( 337 | (f"line {token.linenum}", Errors.WARNING_EMPTY_ALLOW_RULE.value) 338 | ) 339 | elif token.type == TokenType.SITEMAP: 340 | self._sitemaps.append(token.value) 341 | else: 342 | # Unprocessed or unexpected token 343 | self._warnings.append( 344 | ( 345 | f"line {token.linenum}", 346 | f"{Errors.WARNING_UNEXPECTED_OR_IGNORED.value}: {token.value}", 347 | ) 348 | ) 349 | 350 | self.update_rules(current_agents, current_rules) 351 | 352 | def find_rules(self, agent: str) -> List[Rule]: 353 | """Return rules for a given agent. If agent is not stored, return 354 | rules for wild card agent ('*'), if no rule for '*', return empty list. 355 | """ 356 | 357 | # Crawlers MUST use case-insensitive matching to find the group that matches the product token => 358 | # convert the product token (agent) to lower case. 359 | rules = self.agents_rules.get(agent.lower(), self.agents_rules.get("*", [])) 360 | return rules 361 | 362 | @staticmethod 363 | def dedup_slash(path: str) -> str: 364 | """Replace multiple slashes in a path to one slash. 
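For instance (illustrative paths), '/a//b///c' becomes '/a/b/c', while '/to?u=https://example.com' keeps the '//' of the embedded URL.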
365 | Keep the duplicate slash after https: or http: (a URL can appear in a query string) 366 | Note: This would be a problem with other patterns like file:/// or ftp:// 367 | """ 368 | 369 | # Regex lookbehind with http or https not possible as it has to be fixed length 370 | path = re.sub(r"//+", "/", path) 371 | # Inject back double slash after any scheme contained in the query string (http or https) 372 | return re.sub(r"(https:/|http:/)", r"\1/", path) 373 | 374 | @staticmethod 375 | def normalize_url(url: str) -> Tuple[str, str]: 376 | """Normalize a URL to extract a quoted path to be used to compare with 377 | a saved rule. 378 | 379 | Returns a tuple containing the host part of the URL if any and a normalized path 380 | """ 381 | 382 | url = urllib.parse.unquote(url) 383 | result = urllib.parse.urlsplit(url) 384 | 385 | # extract the path portion of the URL as-is (e.g. preserve a standalone ?) 386 | host_url = urllib.parse.urlunsplit((result.scheme, result.netloc, "", "", "")) 387 | path = url[len(host_url):] or "/" 388 | return result.netloc, RobotsParser.dedup_slash(path) 389 | 390 | @staticmethod 391 | def startswith_pattern(path: str, pattern: str) -> bool: 392 | """A match is intended to be a 'startswith' match. To accommodate, add a 393 | star ('*') at the end of the pattern if it does not exist already. 394 | """ 395 | 396 | if pattern.endswith("$"): 397 | # When ending with '$', needs to be an exact match 398 | return fnmatch.fnmatchcase(path, pattern[:-1]) 399 | 400 | if not pattern.endswith("*"): 401 | pattern += "*" 402 | 403 | # In unix file name pattern a `?' is a single character. In a url path, it is a '?'. To take it as character 404 | # replace with [?] (https://docs.python.org/3/library/fnmatch.html) 405 | pattern = pattern.replace("?", "[?]") 406 | 407 | return fnmatch.fnmatchcase(path, pattern) 408 | 409 | # pylint: disable=too-many-return-statements 410 | def can_fetch(self, useragent: str, url: str) -> bool: 411 | """Answer the question if a user agent can fetch a particular URL. 412 | 413 | The parser checks the group of rules applying to the given robots ID (user-agent), 414 | and then check the rule that may apply to the given URL. 
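        Illustrative usage (hypothetical robots.txt content and URLs):

            parser = RobotsParser.from_string("user-agent: *\ndisallow: /admin/")
            parser.can_fetch("FooBot", "https://example.com/admin/")      # False
            parser.can_fetch("FooBot", "https://example.com/index.html")  # True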
415 | """ 416 | 417 | if self.allow_all: 418 | return True 419 | if self.disallow_all: 420 | return False 421 | 422 | host, path = RobotsParser.normalize_url(url) 423 | 424 | if host and self.host and host != self.host: 425 | return False 426 | 427 | rules = self.find_rules(useragent) 428 | 429 | for rule in rules: 430 | # $ is a special character for robots and indicate the exact end of the pattern 431 | if rule.path.endswith("$") and rule.path[:-1] == path: 432 | return rule.allowed 433 | 434 | if path.startswith(rule.path): 435 | return rule.allowed 436 | 437 | if rule.path != "*" and "*" in rule.path: 438 | if RobotsParser.startswith_pattern(path, rule.path): 439 | return rule.allowed 440 | 441 | if rule.path == "*": 442 | return rule.allowed 443 | 444 | return True 445 | 446 | def __str__(self): 447 | """Produces a robots.txt from the structure of the rules memorized by the parser.""" 448 | 449 | lines = [] 450 | for agent, rules in self.agents_rules.items(): 451 | lines.append(f"User-agent: {agent}") 452 | for rule in rules: 453 | lines.append( 454 | ("Allow" if rule.allowed else "Disallow") + ": " + rule.path 455 | ) 456 | lines.append("") 457 | 458 | if self._sitemaps: 459 | for sitemap in self._sitemaps: 460 | lines.append(f"Sitemap: {sitemap}") 461 | 462 | lines.append("") 463 | 464 | return "\n".join(lines) 465 | -------------------------------------------------------------------------------- /robots/robotparser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Thin facade in front of robots.parser to mimic the api from the Python standard library 3 | urllib.robotparser https://docs.python.org/3/library/urllib.robotparser.html 4 | """ 5 | 6 | import time 7 | from typing import List 8 | from . import parser 9 | 10 | 11 | def gen_lines(lines: List[str]): 12 | """Instantiate a generator from a list""" 13 | return (line for line in lines) 14 | 15 | 16 | class RobotFileParser(parser.RobotsParser): 17 | """Thin wrapper on RobotsParser to enable some level of compatibility with 18 | urllib.robotparser.RobotFileParser. The implementation is incomplete, for 19 | example, crawl_delay and request_rate are hard-coded to return None. The 20 | unit tests take into account the implementation.""" 21 | 22 | def set_url(self, url): 23 | """Sets the URL referring to a robots.txt file.""" 24 | self.url = url 25 | 26 | def read(self): 27 | """Populate the tokens if a URL is assigned to the url attribute""" 28 | if self.url: 29 | self.parse_tokens(parser.gen_tokens(self.gen_uri, self.url)) 30 | else: 31 | self._errors.append( 32 | ( 33 | self.url, 34 | "RobotFileParser.read requires RobotFileParser.url to be set", 35 | ) 36 | ) 37 | 38 | def parse(self, lines): 39 | """Method 'parse' compatible with urllib.robotparser.RobotFileParser. Parses the tokens 40 | given an iterator.""" 41 | self.parse_tokens(parser.gen_tokens(gen_lines, lines)) 42 | 43 | def mtime(self): 44 | """Method 'mtime' compatible with urllib.robotparser.RobotFileParser. Return the timestamp 45 | initialized when parsing a robots.txt url.""" 46 | return self.timestamp 47 | 48 | def modified(self): 49 | """Method 'modified' compatible with urllib.robotparser.RobotFileParser. When invoked, 50 | instantiate the internal timestamp to the current time.""" 51 | self.timestamp = time.time() 52 | 53 | def crawl_delay(self, _: str): 54 | """The 'crawl-delay' directive is not recognize by the Google robots parser. Ignoring it in 55 | robotspy. 
Keep this method for compatibility with urllib.robotparser.""" 56 | self._warnings.append( 57 | ("crawl-delay", parser.Errors.WARNING_CRAWL_DELAY_IGNORED) 58 | ) 59 | 60 | def request_rate(self, _: str): 61 | """The 'request-rate' directive is not recognize by the Google robots parser. Ignoring it in 62 | robotspy. Keep this method for compatibility with urllib.robotparser.""" 63 | self._warnings.append( 64 | ("request-rate", parser.Errors.WARNING_REQUEST_RATE_IGNORED) 65 | ) 66 | 67 | def site_maps(self): 68 | """Method site_maps compatible with urllib.robotparser.RobotFileParser. Return the list of 69 | sitemaps encountered while parsing a robots.txt content.""" 70 | return self.sitemaps 71 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup 3 | 4 | CWD = pathlib.Path(__file__).parent 5 | 6 | README = (CWD / "README.md").read_text() 7 | 8 | setup( 9 | name="robotspy", 10 | version="0.12.0", 11 | description="Robots Exclusion Protocol File Parser", 12 | long_description=README, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/andreburgaud/robotspy", 15 | author="Andre Burgaud", 16 | author_email="andre.burgaud@gmail.com", 17 | license="MIT", 18 | classifiers=[ 19 | "License :: OSI Approved :: MIT License", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.8", 22 | "Programming Language :: Python :: 3.9", 23 | "Programming Language :: Python :: 3.10", 24 | "Programming Language :: Python :: 3.11", 25 | "Programming Language :: Python :: 3.12", 26 | ], 27 | packages=["robots"], 28 | entry_points={ 29 | "console_scripts": [ 30 | "robots=robots.__main__:main", 31 | ] 32 | }, 33 | ) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreburgaud/robotspy/8cd173a5f1370ad9671aea3bb89456ea542ed4ee/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import robots 3 | 4 | 5 | @pytest.fixture(scope='function') 6 | def can_fetch(): 7 | def _parser(robots_txt, agent, path): 8 | p = robots.RobotsParser.from_string(robots_txt) 9 | return p.can_fetch(agent, path) 10 | return _parser 11 | 12 | 13 | def pytest_make_parametrize_id(config, val, argname): 14 | if isinstance(val, str): 15 | if not val: 16 | return f'{argname}=' 17 | output = val.strip() 18 | output = output.split('\n')[0].strip() 19 | return f"{argname}={output}" 20 | return f'{argname}={val}' 21 | -------------------------------------------------------------------------------- /tests/core.py: -------------------------------------------------------------------------------- 1 | ALLOWED = True 2 | DISALLOWED = False 3 | DEFAULT_AGENT = 'test_robotparser' # From urllib.robotparser 4 | FOOBOT_AGENT = 'FooBot' 5 | -------------------------------------------------------------------------------- /tests/test_google.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mostly tests from: 3 | https://github.com/google/robotstxt/blob/master/robots_test.cc 4 | implemented with PyTest and intended to validate the compatibility with the robots.txt 
parser 5 | from Google (written in C++) and released under https://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Each ID_ corresponds to a test or set of tests in robots_test.cc 8 | 9 | For each test a data row contains the following fields: 10 | robotstxt, useragent, url, allowed/disallowed 11 | 12 | allow/disallowed is expressed as a boolean, True/False 13 | 14 | Reference: 15 | Robots Exclusion Protocol (REP) draft-koster-rep-01 16 | https://tools.ietf.org/html/draft-koster-rep 17 | """ 18 | 19 | import pytest 20 | import robots 21 | 22 | from .core import * 23 | 24 | google_only_system = """ 25 | # GoogleOnly_System 26 | 27 | user-agent: FooBot 28 | disallow: / 29 | """ 30 | 31 | google_only_system_data = ( 32 | # Empty robots.txt: everything allowed 33 | ['', FOOBOT_AGENT, '', ALLOWED], 34 | # Empty user agent to be matched: everything allowed 35 | [google_only_system, '', '', ALLOWED], 36 | 37 | # Empty url: implicitly disallowed, see method comment for GetPathParamsQuery in robots.cc. 38 | [google_only_system, FOOBOT_AGENT, '', DISALLOWED], 39 | 40 | # All params empty: same as robots.txt empty, everything allowed. 41 | ['', '', '', ALLOWED], 42 | ) 43 | 44 | 45 | # Rules are colon separated name-value pairs. The following names are provisioned: 46 | # user-agent: 47 | # allow: 48 | # disallow: 49 | # See REP I-D section "Protocol Definition". 50 | # https://tools.ietf.org/html/draft-koster-rep#section-2.1 51 | # Google specific: webmasters sometimes miss the colon separator, but it's 52 | # obvious what they mean by "disallow /", so we assume the colon if it's missing. 53 | 54 | # Note: robotspy discards incorrect lines and does not implicitly assume that it is 55 | # a webmaster mistake if a colon (:) is missing 56 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', google_only_system_data) 57 | def test_useragent_wild_card(robots_txt, agent, path, allowed, can_fetch): 58 | assert can_fetch(robots_txt, agent, path) is allowed 59 | 60 | 61 | linesyntax_line_correct = """ 62 | # ID_LineSyntax_Line (correct) 63 | 64 | user-agent: FooBot 65 | disallow: / 66 | """ 67 | 68 | linesyntax_line_incorrect = """ 69 | # ID_LineSyntax_Line (incorrect) 70 | 71 | foo: FooBot 72 | bar: / 73 | """ 74 | 75 | linesyntax_line_incorrect_accepted = """ 76 | # ID_LineSyntax_Line (mistake - missing ":" - accepted by Google 77 | 78 | user-agent FooBot 79 | disallow / 80 | """ 81 | 82 | url = 'http://foo.bar/x/y' 83 | 84 | linesyntax_line_data = ( 85 | [linesyntax_line_correct, FOOBOT_AGENT, url, DISALLOWED], 86 | [linesyntax_line_incorrect, FOOBOT_AGENT, url, ALLOWED], 87 | [linesyntax_line_incorrect_accepted, FOOBOT_AGENT, url, DISALLOWED], 88 | ) 89 | 90 | 91 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', linesyntax_line_data) 92 | def test_linesyntax_line(robots_txt, agent, path, allowed, can_fetch): 93 | assert can_fetch(robots_txt, agent, path) is allowed 94 | 95 | 96 | # A group is one or more user-agent line followed by rules, and terminated 97 | # by a another user-agent line. Rules for same user-agents are combined 98 | # opaquely into one group. Rules outside groups are ignored. 99 | # See REP I-D section "Protocol Definition". 
100 | # https://tools.ietf.org/html/draft-koster-rep#section-2.1 101 | 102 | linesyntax_group = """ 103 | # ID_LineSyntax_Groups 104 | 105 | allow: /foo/bar/ 106 | 107 | user-agent: FooBot 108 | disallow: / 109 | allow: /x/ 110 | user-agent: BarBot 111 | disallow: / 112 | allow: /y/ 113 | 114 | 115 | allow: /w/ 116 | user-agent: BazBot 117 | 118 | user-agent: FooBot 119 | allow: /z/ 120 | disallow: / 121 | """ 122 | 123 | linesyntax_group_data = ( 124 | [linesyntax_group, FOOBOT_AGENT, 'http://foo.bar/x/b', ALLOWED], 125 | [linesyntax_group, FOOBOT_AGENT, 'http://foo.bar/z/d', ALLOWED], 126 | [linesyntax_group, FOOBOT_AGENT, 'http://foo.bar/y/c', DISALLOWED], 127 | [linesyntax_group, 'BarBot', 'http://foo.bar/y/c', ALLOWED], 128 | [linesyntax_group, 'BarBot', 'http://foo.bar/w/a', ALLOWED], 129 | [linesyntax_group, 'BarBot', 'http://foo.bar/z/d', DISALLOWED], 130 | [linesyntax_group, 'BazBot', 'http://foo.bar/z/d', ALLOWED], 131 | 132 | # Lines with rules outside groups are ignored 133 | [linesyntax_group, FOOBOT_AGENT, 'http://foo.bar/foo/bar/', DISALLOWED], 134 | [linesyntax_group, 'BarBot', 'http://foo.bar/foo/bar/', DISALLOWED], 135 | [linesyntax_group, 'BazBot', 'http://foo.bar/foo/bar/', DISALLOWED], 136 | ) 137 | 138 | 139 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', linesyntax_group_data) 140 | def test_linesyntax_group(robots_txt, agent, path, allowed, can_fetch): 141 | assert can_fetch(robots_txt, agent, path) is allowed 142 | 143 | 144 | # Robot Exclusion Protocol (REP) lines are case insensitive. 145 | # See REP I-D section "Protocol Definition". 146 | # https://tools.ietf.org/html/draft-koster-rep#section-2.1 147 | 148 | line_names_camel = """ 149 | # ID_REPLineNamesCaseInsensitive (camel) 150 | 151 | uSeR-aGeNt: FooBot 152 | AlLoW: /x/ 153 | dIsAlLoW: / 154 | """ 155 | 156 | line_names_lower = """ 157 | # ID_REPLineNamesCaseInsensitive (lower) 158 | 159 | user-agent: FooBot 160 | allow: /x/ 161 | disallow: / 162 | """ 163 | 164 | line_names_upper = """ 165 | # ID_REPLineNamesCaseInsensitive (upper) 166 | 167 | USER-AGENT: FooBot 168 | ALLOW: /x/ 169 | DISALLOW: / 170 | """ 171 | 172 | url_allowed = 'http://foo.bar/x/y' 173 | url_disallowed = 'http://foo.bar/a/b' 174 | 175 | line_names_case_insensitive_data = ( 176 | [line_names_upper, FOOBOT_AGENT, url_allowed, ALLOWED], 177 | [line_names_lower, FOOBOT_AGENT, url_allowed, ALLOWED], 178 | [line_names_camel, FOOBOT_AGENT, url_allowed, ALLOWED], 179 | [line_names_upper, FOOBOT_AGENT, url_disallowed, DISALLOWED], 180 | [line_names_lower, FOOBOT_AGENT, url_disallowed, DISALLOWED], 181 | [line_names_upper, FOOBOT_AGENT, url_disallowed, DISALLOWED], 182 | ) 183 | 184 | 185 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', line_names_case_insensitive_data) 186 | def test_line_names_case_insensitive(robots_txt, agent, path, allowed, can_fetch): 187 | assert can_fetch(robots_txt, agent, path) is allowed 188 | 189 | 190 | # A user-agent line is expected to contain only [a-zA-Z_-] characters and must 191 | # not be empty. See REP I-D section "The user-agent line". 
192 | # https://tools.ietf.org/html/draft-koster-rep#section-2.2.1 193 | # ID_VerifyValidUserAgentsToObey 194 | useragents_data = ( 195 | ['Foobot', True], 196 | ['Foobot-Bar', True], 197 | ['Foo_Bar', True], 198 | #[None, False], 199 | ['', False], 200 | ['ツ', False], 201 | ['Foobot*', False], 202 | ['Foobot/2.1', False], 203 | ['Foobot Bar', False], 204 | ) 205 | 206 | 207 | @pytest.mark.parametrize('agent,valid', useragents_data) 208 | def test_valid_agent(agent, valid): 209 | assert robots.RobotsParser.is_agent_valid(agent) is valid 210 | 211 | 212 | # The following test data is google specific as the Google robots parses the first string of the agent 213 | # and ignore the rest. robotspy is intentionally stricter. 214 | robots_upper = """ 215 | # ID_UserAgentValueCaseInsensitive (upper) 216 | 217 | User-Agent: FOO BAR 218 | Allow: /x/ 219 | Disallow: / 220 | """ 221 | 222 | robots_lower = """ 223 | # ID_UserAgentValueCaseInsensitive (lower) 224 | 225 | User-Agent: foo bar 226 | Allow: /x/ 227 | Disallow: / 228 | """ 229 | 230 | robots_camel = """ 231 | # ID_UserAgentValueCaseInsensitive (camel) 232 | 233 | User-Agent: FoO bAr 234 | Allow: /x/ 235 | Disallow: / 236 | """ 237 | 238 | url_allowed = "http://foo.bar/x/y" 239 | url_disallowed = "http://foo.bar/a/b" 240 | 241 | agent_case_insensitive_google_data = ( 242 | [robots_upper, 'Foo', url_allowed, ALLOWED], 243 | [robots_lower, 'Foo', url_allowed, ALLOWED], 244 | [robots_camel, 'Foo', url_allowed, ALLOWED], 245 | [robots_upper, 'Foo', url_disallowed, DISALLOWED], 246 | [robots_lower, 'Foo', url_disallowed, DISALLOWED], 247 | [robots_camel, 'Foo', url_disallowed, DISALLOWED], 248 | [robots_upper, 'foo', url_allowed, ALLOWED], 249 | [robots_lower, 'foo', url_allowed, ALLOWED], 250 | [robots_camel, 'foo', url_allowed, ALLOWED], 251 | [robots_upper, 'foo', url_disallowed, DISALLOWED], 252 | [robots_lower, 'foo', url_disallowed, DISALLOWED], 253 | [robots_camel, 'foo', url_disallowed, DISALLOWED], 254 | ) 255 | 256 | 257 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', agent_case_insensitive_google_data) 258 | def test_agent_case_insensitive_google(robots_txt, agent, path, allowed, can_fetch): 259 | assert can_fetch(robots_txt, agent, path) is allowed 260 | 261 | 262 | # The following test data is modified from the google test data above and eliminates the space 263 | # It allows to validate the case insensitivity of the user agent. 
264 | robots_upper = """ 265 | # ID_UserAgentValueCaseInsensitive (upper) 266 | 267 | User-Agent: FOO 268 | Allow: /x/ 269 | Disallow: / 270 | """ 271 | 272 | robots_lower = """ 273 | # ID_UserAgentValueCaseInsensitive (lower) 274 | 275 | User-Agent: foo 276 | Allow: /x/ 277 | Disallow: / 278 | """ 279 | 280 | robots_camel = """ 281 | # ID_UserAgentValueCaseInsensitive (camel) 282 | 283 | User-Agent: FoO 284 | Allow: /x/ 285 | Disallow: / 286 | """ 287 | 288 | agent_case_insensitive_data = ( 289 | [robots_upper, 'Foo', url_allowed, ALLOWED], 290 | [robots_lower, 'Foo', url_allowed, ALLOWED], 291 | [robots_camel, 'Foo', url_allowed, ALLOWED], 292 | [robots_upper, 'Foo', url_disallowed, DISALLOWED], 293 | [robots_lower, 'Foo', url_disallowed, DISALLOWED], 294 | [robots_camel, 'Foo', url_disallowed, DISALLOWED], 295 | [robots_upper, 'foo', url_allowed, ALLOWED], 296 | [robots_lower, 'foo', url_allowed, ALLOWED], 297 | [robots_camel, 'foo', url_allowed, ALLOWED], 298 | [robots_upper, 'foo', url_disallowed, DISALLOWED], 299 | [robots_lower, 'foo', url_disallowed, DISALLOWED], 300 | [robots_camel, 'foo', url_disallowed, DISALLOWED], 301 | ) 302 | 303 | 304 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', agent_case_insensitive_data) 305 | def test_agent_case_insensitive(robots_txt, agent, path, allowed, can_fetch): 306 | assert can_fetch(robots_txt, agent, path) is allowed 307 | 308 | 309 | robotstxt_global = """ 310 | # ID_GlobalGroups_Secondary 311 | 312 | user-agent: * 313 | allow: / 314 | user-agent: FooBot 315 | disallow: / 316 | """ 317 | 318 | robotstxt_only_specific = """ 319 | # ID_GlobalGroups_Secondary 320 | 321 | user-agent: FooBot 322 | allow: / 323 | user-agent: BarBot 324 | disallow: / 325 | user-agent: BazBot 326 | disallow: / 327 | """ 328 | 329 | url = 'http://foo.bar/x/y' 330 | 331 | robotstxt_global_data = ( 332 | ['', FOOBOT_AGENT, url, ALLOWED], 333 | [robotstxt_global, FOOBOT_AGENT, url, DISALLOWED], 334 | [robotstxt_global, 'BarBot', url, ALLOWED], 335 | [robotstxt_only_specific, 'QusBot', url, ALLOWED], 336 | ) 337 | 338 | 339 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', robotstxt_global_data) 340 | def test_robotstxt_global(robots_txt, agent, path, allowed, can_fetch): 341 | assert can_fetch(robots_txt, agent, path) is allowed 342 | 343 | 344 | # Matching rules against URIs is case sensitive. 345 | # See REP I-D section "The Allow and Disallow lines". 
346 | # https://tools.ietf.org/html/draft-koster-rep#section-2.2.2 347 | 348 | robots_url_lower = """ 349 | # ID_AllowDisallow_Value_CaseSensitive (lower) 350 | 351 | user-agent: FooBot 352 | disallow: /x/ 353 | """ 354 | 355 | robots_url_upper = """ 356 | # ID_AllowDisallow_Value_CaseSensitive (upper) 357 | user-agent: FooBot 358 | disallow: /X/ 359 | """ 360 | 361 | url = 'http://foo.bar/x/y' 362 | 363 | uri_case_sensitive_data = ( 364 | [robots_url_lower, FOOBOT_AGENT, url, DISALLOWED], 365 | [robots_url_upper, FOOBOT_AGENT, url, ALLOWED], 366 | ) 367 | 368 | 369 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', uri_case_sensitive_data) 370 | def test_uri_case_sensitive(robots_txt, agent, path, allowed, can_fetch): 371 | assert can_fetch(robots_txt, agent, path) is allowed 372 | 373 | 374 | longest_match_01 = """ 375 | # ID_LongestMatch 01 376 | 377 | user-agent: FooBot 378 | disallow: /x/page.html 379 | allow: /x/ 380 | """ 381 | 382 | longest_match_02 = """ 383 | # ID_LongestMatch 02 384 | 385 | user-agent: FooBot 386 | allow: /x/page.html 387 | disallow: /x/ 388 | """ 389 | 390 | longest_match_03 = """ 391 | # ID_LongestMatch 03 392 | 393 | user-agent: FooBot 394 | disallow: 395 | allow: 396 | """ 397 | 398 | longest_match_04 = """ 399 | # ID_LongestMatch 04 400 | 401 | user-agent: FooBot 402 | disallow: / 403 | allow: / 404 | """ 405 | 406 | longest_match_05 = """ 407 | # ID_LongestMatch 05 408 | 409 | user-agent: FooBot 410 | disallow: /x 411 | allow: /x/ 412 | """ 413 | 414 | longest_match_06 = """ 415 | # ID_LongestMatch 06 416 | 417 | user-agent: FooBot 418 | disallow: /x/page.html 419 | allow: /x/page.html 420 | """ 421 | 422 | longest_match_07 = """ 423 | # ID_LongestMatch 07 424 | 425 | user-agent: FooBot 426 | allow: /page 427 | disallow: /*.html 428 | """ 429 | 430 | longest_match_08 = """ 431 | # ID_LongestMatch 08 432 | 433 | user-agent: FooBot 434 | allow: /x/page. 
435 | disallow: /*.html 436 | """ 437 | 438 | longest_match_09 = """ 439 | # ID_LongestMatch 09 440 | 441 | User-agent: * 442 | Disallow: /x/ 443 | User-agent: FooBot 444 | Disallow: /y/ 445 | """ 446 | 447 | url = 'http://foo.bar/x/page.html' 448 | 449 | longest_match_data = ( 450 | [longest_match_01, FOOBOT_AGENT, url, DISALLOWED], 451 | [longest_match_02, FOOBOT_AGENT, url, ALLOWED], 452 | [longest_match_02, FOOBOT_AGENT, 'http://foo.bar/x/', DISALLOWED], 453 | [longest_match_03, FOOBOT_AGENT, url, ALLOWED], 454 | [longest_match_04, FOOBOT_AGENT, url, ALLOWED], 455 | [longest_match_05, FOOBOT_AGENT, 'http://foo.bar/x', DISALLOWED], 456 | [longest_match_05, FOOBOT_AGENT, 'http://foo.bar/x/', ALLOWED], 457 | [longest_match_06, FOOBOT_AGENT, url, ALLOWED], 458 | [longest_match_07, FOOBOT_AGENT, 'http://foo.bar/page.html', DISALLOWED], 459 | [longest_match_07, FOOBOT_AGENT, 'http://foo.bar/page', ALLOWED], 460 | [longest_match_08, FOOBOT_AGENT, url, ALLOWED], 461 | [longest_match_08, FOOBOT_AGENT, 'http://foo.bar/x/y.html', DISALLOWED], 462 | 463 | [longest_match_09, FOOBOT_AGENT, 'http://foo.bar/x/page', ALLOWED], 464 | [longest_match_09, FOOBOT_AGENT, 'http://foo.bar/y/page', DISALLOWED], 465 | 466 | ) 467 | 468 | 469 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', longest_match_data) 470 | def test_longest_match(robots_txt, agent, path, allowed, can_fetch): 471 | assert can_fetch(robots_txt, agent, path) is allowed 472 | 473 | 474 | # Octets in the URI and robots.txt paths outside the range of the US-ASCII 475 | # coded character set, and those in the reserved range defined by RFC3986, 476 | # MUST be percent-encoded as defined by RFC3986 prior to comparison. 477 | # See REP I-D section "The Allow and Disallow lines". 478 | # https://tools.ietf.org/html/draft-koster-rep#section-2.2.2 479 | # 480 | # NOTE: It's up to the caller to percent encode a URL before passing it to the 481 | # parser. Percent encoding URIs in the rules is unnecessary. 482 | 483 | 484 | encoding_01 = """ 485 | # ID_Encoding 01 486 | 487 | User-agent: FooBot 488 | Disallow: / 489 | Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par 490 | """ 491 | 492 | encoding_02 = """ 493 | # ID_Encoding 02 494 | 495 | User-agent: FooBot 496 | Disallow: / 497 | Allow: /foo/bar/ツ 498 | """ 499 | 500 | encoding_03 = """ 501 | # ID_Encoding 03 502 | 503 | User-agent: FooBot 504 | Disallow: / 505 | Allow: /foo/bar/%E3%83%84 506 | """ 507 | 508 | encoding_04 = """ 509 | # ID_Encoding 04 510 | 511 | User-agent: FooBot 512 | Disallow: / 513 | Allow: /foo/bar/%62%61%7A 514 | """ 515 | 516 | # TODO: Revisit the encoding to match Google robots? 
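# Illustration only (not used by the test data below): percent-encoding of non-ASCII
# octets is left to the caller; urllib.parse.quote is one way to do it, and this exact
# call is an assumption for the example, not something robotspy requires.
import urllib.parse

_PERCENT_ENCODED_EXAMPLE = urllib.parse.quote('/foo/bar/ツ', safe='/')  # -> '/foo/bar/%E3%83%84'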
517 | encoding_data = ( 518 | [encoding_01, FOOBOT_AGENT, 'http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par', ALLOWED], 519 | [encoding_02, FOOBOT_AGENT, 'http://foo.bar/foo/bar/%E3%83%84"', ALLOWED], 520 | [encoding_02, FOOBOT_AGENT, 'http://foo.bar/foo/bar/ツ', ALLOWED], # Google -> DISALLOWED 521 | [encoding_03, FOOBOT_AGENT, 'http://foo.bar/foo/bar/%E3%83%84', ALLOWED], 522 | [encoding_03, FOOBOT_AGENT, 'http://foo.bar/foo/bar/ツ', ALLOWED], # Google -> DISALLOWED 523 | [encoding_04, FOOBOT_AGENT, 'http://foo.bar/foo/bar/baz', ALLOWED], # Google -> DISALLOWED 524 | [encoding_04, FOOBOT_AGENT, 'http://foo.bar/foo/bar/%62%61%7A', ALLOWED], 525 | ) 526 | 527 | 528 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', encoding_data) 529 | def test_encoding(robots_txt, agent, path, allowed, can_fetch): 530 | assert can_fetch(robots_txt, agent, path) is allowed 531 | 532 | 533 | special_characters_01 = """ 534 | # ID_SpecialCharacters 01 535 | 536 | User-agent: FooBot 537 | Disallow: /foo/bar/quz 538 | Allow: /foo/*/qux 539 | """ 540 | 541 | special_characters_02 = """ 542 | # ID_SpecialCharacters 02 543 | 544 | User-agent: FooBot 545 | Disallow: /foo/bar$ 546 | Allow: /foo/bar/qux 547 | """ 548 | 549 | special_characters_03 = """ 550 | # ID_SpecialCharacters 03 551 | 552 | User-agent: FooBot 553 | # Disallow: / 554 | Disallow: /foo/quz#qux 555 | Allow: / 556 | """ 557 | 558 | special_characters_data = ( 559 | [special_characters_01, FOOBOT_AGENT, 'http://foo.bar/foo/bar/quz', DISALLOWED], 560 | [special_characters_01, FOOBOT_AGENT, 'http://foo.bar/foo/quz', ALLOWED], 561 | [special_characters_01, FOOBOT_AGENT, 'http://foo.bar/foo//quz', ALLOWED], 562 | [special_characters_01, FOOBOT_AGENT, 'http://foo.bar/foo/bax/quz', ALLOWED], 563 | [special_characters_02, FOOBOT_AGENT, 'http://foo.bar/foo/bar', DISALLOWED], 564 | [special_characters_02, FOOBOT_AGENT, 'http://foo.bar/foo/bar/qux', ALLOWED], 565 | [special_characters_02, FOOBOT_AGENT, 'http://foo.bar/foo/bar/', ALLOWED], 566 | [special_characters_02, FOOBOT_AGENT, 'http://foo.bar/foo/bar/baz', ALLOWED], 567 | [special_characters_03, FOOBOT_AGENT, 'http://foo.bar/foo/bar', ALLOWED], 568 | [special_characters_03, FOOBOT_AGENT, 'http://foo.bar/foo/quz', DISALLOWED], 569 | ) 570 | 571 | # Skip: 572 | # - GoogleOnly_IndexHTMLisDirectory 573 | # - GoogleOnly_LineTooLong 574 | 575 | google_doc_01 = """ 576 | # GoogleOnly_DocumentationChecks 01 577 | 578 | user-agent: FooBot 579 | disallow: / 580 | allow: /fish 581 | """ 582 | 583 | google_doc_02 = """ 584 | # GoogleOnly_DocumentationChecks 02 585 | 586 | user-agent: FooBot 587 | disallow: / 588 | allow: /fish* 589 | """ 590 | 591 | google_doc_03 = """ 592 | # GoogleOnly_DocumentationChecks 03 593 | 594 | user-agent: FooBot 595 | disallow: / 596 | allow: /fish/ 597 | """ 598 | 599 | google_doc_data = ( 600 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/bar', DISALLOWED], 601 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/fish', ALLOWED], 602 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/fish.html', ALLOWED], 603 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/fish/salmon.html', ALLOWED], 604 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/fishheads', ALLOWED], 605 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/fishheads/yummy.html', ALLOWED], 606 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/fish.html?id=anything', ALLOWED], 607 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/Fish.asp', DISALLOWED], 608 | [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/catfish', DISALLOWED], 609 
| [google_doc_01, FOOBOT_AGENT, 'http://foo.bar/?id=fish', DISALLOWED], 610 | 611 | # "/fish*" equals "/fish" 612 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/bar', DISALLOWED], 613 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/fish', ALLOWED], 614 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/fish.html', ALLOWED], 615 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/fish/salmon.html', ALLOWED], 616 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/fishheads', ALLOWED], 617 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/fishheads/yummy.html', ALLOWED], 618 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/fish.html?id=anything', ALLOWED], 619 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/Fish.bar', DISALLOWED], 620 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/catfish', DISALLOWED], 621 | [google_doc_02, FOOBOT_AGENT, 'http://foo.bar/?id=fish', DISALLOWED], 622 | 623 | # "/fish/" does not equal "/fish" 624 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/bar', DISALLOWED], 625 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish/', ALLOWED], 626 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish/salmon', ALLOWED], 627 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish/?salmon', ALLOWED], 628 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish/salmon.html', ALLOWED], 629 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish/?id=anything', ALLOWED], 630 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish', DISALLOWED], 631 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/fish.html', DISALLOWED], 632 | [google_doc_03, FOOBOT_AGENT, 'http://foo.bar/Fish/Salmon.html', DISALLOWED], 633 | ) 634 | 635 | 636 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', google_doc_data) 637 | def test_google_doc(robots_txt, agent, path, allowed, can_fetch): 638 | assert can_fetch(robots_txt, agent, path) is allowed 639 | 640 | 641 | google_php_01 = """ 642 | # GoogleOnly_DocumentationChecks PHP 01 643 | 644 | user-agent: FooBot 645 | disallow: / 646 | allow: /*.php 647 | """ 648 | 649 | google_php_02 = """ 650 | # GoogleOnly_DocumentationChecks PHP 02 651 | 652 | user-agent: FooBot 653 | disallow: / 654 | allow: /*.php$ 655 | """ 656 | 657 | google_php_03 = """ 658 | # GoogleOnly_DocumentationChecks PHP 03 659 | 660 | user-agent: FooBot 661 | disallow: / 662 | allow: /fish*.php 663 | """ 664 | 665 | google_php_data = ( 666 | # "/*.php" 667 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/bar', DISALLOWED], 668 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/filename.php', ALLOWED], 669 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/folder/filename.php', ALLOWED], 670 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/folder/filename.php?parameters', ALLOWED], 671 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/filename.php/', ALLOWED], 672 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/index?f=filename.php/', ALLOWED], 673 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/php/', DISALLOWED], 674 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/index?php', DISALLOWED], 675 | [google_php_01, FOOBOT_AGENT, 'http://foo.bar/windows.PHP', DISALLOWED], 676 | 677 | # "/*.php$" 678 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/bar', DISALLOWED], 679 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/filename.php', ALLOWED], 680 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/folder/filename.php', ALLOWED], 681 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/filename.php?parameters', DISALLOWED], 682 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/filename.php/', DISALLOWED], 683 | [google_php_02, 
FOOBOT_AGENT, 'http://foo.bar/filename.php5', DISALLOWED], 684 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/filename?php', DISALLOWED], 685 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar/aaaphpaaa', DISALLOWED], 686 | [google_php_02, FOOBOT_AGENT, 'http://foo.bar//windows.PHP', DISALLOWED], 687 | 688 | # "/fish*.php" 689 | [google_php_03, FOOBOT_AGENT, 'http://foo.bar/bar', DISALLOWED], 690 | [google_php_03, FOOBOT_AGENT, 'http://foo.bar/fish.php', ALLOWED], 691 | [google_php_03, FOOBOT_AGENT, 'http://foo.bar/fishheads/catfish.php?parameters', ALLOWED], 692 | [google_php_03, FOOBOT_AGENT, 'http://foo.bar/Fish.PHP', DISALLOWED], 693 | ) 694 | 695 | 696 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', google_php_data) 697 | def test_google_php(robots_txt, agent, path, allowed, can_fetch): 698 | assert can_fetch(robots_txt, agent, path) is allowed 699 | 700 | 701 | # Order of precedence for group-member records 702 | order_precedence_01 = """ 703 | # GoogleOnly_DocumentationChecks 01 (Order Precedence) 704 | 705 | user-agent: FooBot 706 | allow: /folder 707 | disallow: /folder 708 | """ 709 | 710 | order_precedence_02 = """ 711 | # GoogleOnly_DocumentationChecks 02 (Order Precedence) 712 | 713 | user-agent: FooBot 714 | allow: /folder 715 | disallow: /folder 716 | """ 717 | 718 | order_precedence_03 = """ 719 | # GoogleOnly_DocumentationChecks 03 (Order Precedence) 720 | 721 | user-agent: FooBot 722 | allow: /page 723 | disallow: /*.htm 724 | """ 725 | 726 | order_precedence_04 = """ 727 | # GoogleOnly_DocumentationChecks 04 (Order Precedence) 728 | 729 | user-agent: FooBot 730 | allow: /$ 731 | disallow: / 732 | """ 733 | 734 | order_precedence_data = ( 735 | [order_precedence_01, FOOBOT_AGENT, 'http://example.com/page', ALLOWED], 736 | [order_precedence_02, FOOBOT_AGENT, 'http://example.com/folder/page', ALLOWED], 737 | [order_precedence_03, FOOBOT_AGENT, 'http://example.com/page.htm', DISALLOWED], 738 | [order_precedence_04, FOOBOT_AGENT, 'http://example.com/', ALLOWED], 739 | [order_precedence_04, FOOBOT_AGENT, 'http://example.com/page.html', DISALLOWED], 740 | ) 741 | 742 | 743 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', order_precedence_data) 744 | def test_order_precedence(robots_txt, agent, path, allowed, can_fetch): 745 | assert can_fetch(robots_txt, agent, path) is allowed 746 | -------------------------------------------------------------------------------- /tests/test_google_correctness.py: -------------------------------------------------------------------------------- 1 | 2 | # Code generated from https://github.com/google/robotstxt-spec-test/tree/master/src/main/resources/CTC/ 3 | 4 | 5 | import pytest 6 | from .core import * 7 | 8 | 9 | robots_txt_matching_path_values_20 = """ 10 | user-agent: FooBot 11 | disallow: / 12 | allow: /*.php 13 | """ 14 | 15 | data_matching_path_values_20 = ( 16 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/bar", DISALLOWED], 17 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/filename.php", ALLOWED], 18 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/folder/filename.php", ALLOWED], 19 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/folder/filename.php?parameters", ALLOWED], 20 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar//folder/any.php.file.html", ALLOWED], 21 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/filename.php/", ALLOWED], 22 | [robots_txt_matching_path_values_20, "FooBot", 
"http://foo.bar/index?f=filename.php/", ALLOWED], 23 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/php/", DISALLOWED], 24 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/index?php", DISALLOWED], 25 | [robots_txt_matching_path_values_20, "FooBot", "http://foo.bar/windows.PHP", DISALLOWED], 26 | ) 27 | 28 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_matching_path_values_20) 29 | def test_google_correctness_matching_path_values_20(robots_txt, agent, path, allowed, can_fetch): 30 | assert can_fetch(robots_txt, agent, path) is allowed 31 | 32 | 33 | robots_txt_matching_path_values_21 = """ 34 | user-agent: FooBot 35 | disallow: / 36 | allow: /*.php$ 37 | """ 38 | 39 | data_matching_path_values_21 = ( 40 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/bar", DISALLOWED], 41 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/filename.php", ALLOWED], 42 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/folder/filename.php", ALLOWED], 43 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/filename.php?parameters", DISALLOWED], 44 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/filename.php/", DISALLOWED], 45 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/filename.php5", DISALLOWED], 46 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/php/", DISALLOWED], 47 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/filename?php", DISALLOWED], 48 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar/aaaphpaaa", DISALLOWED], 49 | [robots_txt_matching_path_values_21, "FooBot", "http://foo.bar//windows.PHP", DISALLOWED], 50 | ) 51 | 52 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_matching_path_values_21) 53 | def test_google_correctness_matching_path_values_21(robots_txt, agent, path, allowed, can_fetch): 54 | assert can_fetch(robots_txt, agent, path) is allowed 55 | 56 | 57 | robots_txt_matching_path_values_22 = """ 58 | user-agent: FooBot 59 | disallow: / 60 | allow: /fish*.php 61 | """ 62 | 63 | data_matching_path_values_22 = ( 64 | [robots_txt_matching_path_values_22, "FooBot", "http://foo.bar/bar", DISALLOWED], 65 | [robots_txt_matching_path_values_22, "FooBot", "http://foo.bar/fish.php", ALLOWED], 66 | [robots_txt_matching_path_values_22, "FooBot", "http://foo.bar/fishheads/catfish.php?parameters", ALLOWED], 67 | [robots_txt_matching_path_values_22, "FooBot", "http://foo.bar/Fish.PHP", DISALLOWED], 68 | ) 69 | 70 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_matching_path_values_22) 71 | def test_google_correctness_matching_path_values_22(robots_txt, agent, path, allowed, can_fetch): 72 | assert can_fetch(robots_txt, agent, path) is allowed 73 | 74 | 75 | robots_txt_BOM_characters0 = """ 76 | User-Agent: foo 77 | Disallow: /AnyValue 78 | """ 79 | 80 | data_BOM_characters0 = ( 81 | [robots_txt_BOM_characters0, "foo", "http://example.com/AnyValue", DISALLOWED], 82 | ) 83 | 84 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_BOM_characters0) 85 | def test_google_correctness_BOM_characters0(robots_txt, agent, path, allowed, can_fetch): 86 | assert can_fetch(robots_txt, agent, path) is allowed 87 | 88 | 89 | robots_txt_BOM_characters1 = """ 90 | User-Agent: foo 91 | Disallow: /AnyValue 92 | """ 93 | 94 | data_BOM_characters1 = ( 95 | [robots_txt_BOM_characters1, "foo", "http://example.com/AnyValue", DISALLOWED], 96 | ) 97 | 98 | 
@pytest.mark.parametrize('robots_txt,agent,path,allowed', data_BOM_characters1) 99 | def test_google_correctness_BOM_characters1(robots_txt, agent, path, allowed, can_fetch): 100 | assert can_fetch(robots_txt, agent, path) is allowed 101 | 102 | 103 | robots_txt_BOM_characters2 = """ 104 | User-Agent: foo 105 | Disallow: /AnyValue 106 | """ 107 | 108 | data_BOM_characters2 = ( 109 | [robots_txt_BOM_characters2, "foo", "http://example.com/AnyValue", DISALLOWED], 110 | ) 111 | 112 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_BOM_characters2) 113 | def test_google_correctness_BOM_characters2(robots_txt, agent, path, allowed, can_fetch): 114 | assert can_fetch(robots_txt, agent, path) is allowed 115 | 116 | 117 | robots_txt_BOM_characters3 = """ 118 | User-Agent: foo 119 | Disallow: /AnyValue 120 | """ 121 | 122 | data_BOM_characters3 = ( 123 | # [robots_txt_BOM_characters3, "foo", "http://example.com/AnyValue", ALLOWED], # Fails Google correctness 124 | ) 125 | 126 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_BOM_characters3) 127 | def test_google_correctness_BOM_characters3(robots_txt, agent, path, allowed, can_fetch): 128 | assert can_fetch(robots_txt, agent, path) is allowed 129 | 130 | 131 | robots_txt_BOM_characters4 = """ 132 | User-Agent: foo 133 | Disallow: /AnyValue 134 | """ 135 | 136 | data_BOM_characters4 = ( 137 | # [robots_txt_BOM_characters4, "foo", "http://example.com/AnyValue", ALLOWED], # Fails Google correctness 138 | ) 139 | 140 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_BOM_characters4) 141 | def test_google_correctness_BOM_characters4(robots_txt, agent, path, allowed, can_fetch): 142 | assert can_fetch(robots_txt, agent, path) is allowed 143 | 144 | 145 | robots_txt_empty_string0 = """ 146 | 147 | """ 148 | 149 | data_empty_string0 = ( 150 | [robots_txt_empty_string0, "FooBot", "", ALLOWED], 151 | [robots_txt_empty_string0, "", "", ALLOWED], 152 | ) 153 | 154 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_empty_string0) 155 | def test_google_correctness_empty_string0(robots_txt, agent, path, allowed, can_fetch): 156 | assert can_fetch(robots_txt, agent, path) is allowed 157 | 158 | 159 | robots_txt_empty_string1 = """ 160 | user-agent: FooBot 161 | disallow: / 162 | """ 163 | 164 | data_empty_string1 = ( 165 | [robots_txt_empty_string1, "", "", ALLOWED], 166 | [robots_txt_empty_string1, "FooBot", "", DISALLOWED], 167 | ) 168 | 169 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_empty_string1) 170 | def test_google_correctness_empty_string1(robots_txt, agent, path, allowed, can_fetch): 171 | assert can_fetch(robots_txt, agent, path) is allowed 172 | 173 | 174 | robots_txt_accepted_mistakes0 = """ 175 | user-agent: FooBot 176 | disallow: / 177 | """ 178 | 179 | data_accepted_mistakes0 = ( 180 | [robots_txt_accepted_mistakes0, "FooBot", "http://foo.bar/x/y", DISALLOWED], 181 | ) 182 | 183 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_accepted_mistakes0) 184 | def test_google_correctness_accepted_mistakes0(robots_txt, agent, path, allowed, can_fetch): 185 | assert can_fetch(robots_txt, agent, path) is allowed 186 | 187 | 188 | robots_txt_accepted_mistakes1 = """ 189 | foo: FooBot 190 | bar: / 191 | """ 192 | 193 | data_accepted_mistakes1 = ( 194 | [robots_txt_accepted_mistakes1, "FooBot", "http://foo.bar/x/y", ALLOWED], 195 | ) 196 | 197 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_accepted_mistakes1) 198 | def 
test_google_correctness_accepted_mistakes1(robots_txt, agent, path, allowed, can_fetch): 199 | assert can_fetch(robots_txt, agent, path) is allowed 200 | 201 | 202 | robots_txt_accepted_mistakes2 = """ 203 | user-agent FooBot 204 | disallow / 205 | """ 206 | 207 | data_accepted_mistakes2 = ( 208 | [robots_txt_accepted_mistakes2, "FooBot", "http://foo.bar/x/y", DISALLOWED], 209 | ) 210 | 211 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_accepted_mistakes2) 212 | def test_google_correctness_accepted_mistakes2(robots_txt, agent, path, allowed, can_fetch): 213 | assert can_fetch(robots_txt, agent, path) is allowed 214 | 215 | 216 | robots_txt_uri_case_sensitivity0 = """ 217 | user-agent: FooBot 218 | disallow: /X/ 219 | """ 220 | 221 | data_uri_case_sensitivity0 = ( 222 | [robots_txt_uri_case_sensitivity0, "FooBot", "http://foo.bar/x/y", ALLOWED], 223 | ) 224 | 225 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_uri_case_sensitivity0) 226 | def test_google_correctness_uri_case_sensitivity0(robots_txt, agent, path, allowed, can_fetch): 227 | assert can_fetch(robots_txt, agent, path) is allowed 228 | 229 | 230 | robots_txt_uri_case_sensitivity1 = """ 231 | user-agent: FooBot 232 | disallow: /x/ 233 | """ 234 | 235 | data_uri_case_sensitivity1 = ( 236 | [robots_txt_uri_case_sensitivity1, "FooBot", "http://foo.bar/x/y", DISALLOWED], 237 | ) 238 | 239 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_uri_case_sensitivity1) 240 | def test_google_correctness_uri_case_sensitivity1(robots_txt, agent, path, allowed, can_fetch): 241 | assert can_fetch(robots_txt, agent, path) is allowed 242 | 243 | 244 | robots_txt_global_rules0 = """ 245 | 246 | """ 247 | 248 | data_global_rules0 = ( 249 | [robots_txt_global_rules0, "FooBot", "http://foo.bar/x/y", ALLOWED], 250 | ) 251 | 252 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_global_rules0) 253 | def test_google_correctness_global_rules0(robots_txt, agent, path, allowed, can_fetch): 254 | assert can_fetch(robots_txt, agent, path) is allowed 255 | 256 | 257 | robots_txt_global_rules1 = """ 258 | user-agent: * 259 | disallow: /x 260 | user-agent: FooBot 261 | allow: /x/y 262 | """ 263 | 264 | data_global_rules1 = ( 265 | [robots_txt_global_rules1, "FooBot", "http://foo.bar/x/y", ALLOWED], 266 | [robots_txt_global_rules1, "BarBot", "http://foo.bar/x/y", DISALLOWED], 267 | ) 268 | 269 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_global_rules1) 270 | def test_google_correctness_global_rules1(robots_txt, agent, path, allowed, can_fetch): 271 | assert can_fetch(robots_txt, agent, path) is allowed 272 | 273 | 274 | robots_txt_global_rules2 = """ 275 | user-agent: FooBot 276 | allow: / 277 | user-agent: BarBot 278 | disallow: / 279 | user-agent: BazBot 280 | disallow: / 281 | """ 282 | 283 | data_global_rules2 = ( 284 | [robots_txt_global_rules2, "QuxBot", "http://foo.bar/x/y", ALLOWED], 285 | ) 286 | 287 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_global_rules2) 288 | def test_google_correctness_global_rules2(robots_txt, agent, path, allowed, can_fetch): 289 | assert can_fetch(robots_txt, agent, path) is allowed 290 | 291 | 292 | robots_txt_non_ascii_paths0 = """ 293 | User-agent: FooBot 294 | Disallow: / 295 | Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par 296 | """ 297 | 298 | data_non_ascii_paths0 = ( 299 | [robots_txt_non_ascii_paths0, "FooBot", "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par", ALLOWED], 300 | ) 301 | 302 | 
@pytest.mark.parametrize('robots_txt,agent,path,allowed', data_non_ascii_paths0) 303 | def test_google_correctness_non_ascii_paths0(robots_txt, agent, path, allowed, can_fetch): 304 | assert can_fetch(robots_txt, agent, path) is allowed 305 | 306 | 307 | robots_txt_non_ascii_paths1 = """ 308 | User-agent: FooBot 309 | Disallow: / 310 | Allow: /foo/bar/ツ 311 | """ 312 | 313 | data_non_ascii_paths1 = ( 314 | [robots_txt_non_ascii_paths1, "FooBot", "http://foo.bar/foo/bar/%E3%83%84", ALLOWED], 315 | # [robots_txt_non_ascii_paths1, "FooBot", "http://foo.bar/foo/bar/ツ", DISALLOWED], # Fails Google correctness 316 | ) 317 | 318 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_non_ascii_paths1) 319 | def test_google_correctness_non_ascii_paths1(robots_txt, agent, path, allowed, can_fetch): 320 | assert can_fetch(robots_txt, agent, path) is allowed 321 | 322 | 323 | robots_txt_non_ascii_paths2 = """ 324 | User-agent: FooBot 325 | Disallow: / 326 | Allow: /foo/bar/%E3%83%84 327 | """ 328 | 329 | data_non_ascii_paths2 = ( 330 | [robots_txt_non_ascii_paths2, "FooBot", "http://foo.bar/foo/bar/%E3%83%84", ALLOWED], 331 | # [robots_txt_non_ascii_paths2, "FooBot", "http://foo.bar/foo/bar/ツ", DISALLOWED], # Fails Google correctness 332 | ) 333 | 334 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_non_ascii_paths2) 335 | def test_google_correctness_non_ascii_paths2(robots_txt, agent, path, allowed, can_fetch): 336 | assert can_fetch(robots_txt, agent, path) is allowed 337 | 338 | 339 | robots_txt_non_ascii_paths3 = """ 340 | User-agent: FooBot 341 | Disallow: / 342 | Allow: /foo/bar/%62%61%7A 343 | """ 344 | 345 | data_non_ascii_paths3 = ( 346 | # [robots_txt_non_ascii_paths3, "FooBot", "http://foo.bar/foo/bar/baz", DISALLOWED], # Fails google correctness 347 | [robots_txt_non_ascii_paths3, "FooBot", "http://foo.bar/foo/bar/%62%61%7A", ALLOWED], 348 | ) 349 | 350 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_non_ascii_paths3) 351 | def test_google_correctness_non_ascii_paths3(robots_txt, agent, path, allowed, can_fetch): 352 | assert can_fetch(robots_txt, agent, path) is allowed 353 | 354 | 355 | robots_txt_special_characters0 = """ 356 | User-agent: FooBot 357 | Disallow: /foo/bar/quz 358 | Allow: /foo/*/qux 359 | """ 360 | 361 | data_special_characters0 = ( 362 | [robots_txt_special_characters0, "FooBot", "http://foo.bar/foo/bar/quz", DISALLOWED], 363 | [robots_txt_special_characters0, "FooBot", "http://foo.bar/foo/quz", ALLOWED], 364 | [robots_txt_special_characters0, "FooBot", "http://foo.bar/foo//quz", ALLOWED], 365 | [robots_txt_special_characters0, "FooBot", "http://foo.bar/foo/bax/quz", ALLOWED], 366 | ) 367 | 368 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_special_characters0) 369 | def test_google_correctness_special_characters0(robots_txt, agent, path, allowed, can_fetch): 370 | assert can_fetch(robots_txt, agent, path) is allowed 371 | 372 | 373 | robots_txt_special_characters1 = """ 374 | User-agent: FooBot 375 | Disallow: /foo/bar$ 376 | Allow: /foo/bar/qux 377 | """ 378 | 379 | data_special_characters1 = ( 380 | [robots_txt_special_characters1, "FooBot", "http://foo.bar/foo/bar", DISALLOWED], 381 | [robots_txt_special_characters1, "FooBot", "http://foo.bar/foo/bar/qux", ALLOWED], 382 | [robots_txt_special_characters1, "FooBot", "http://foo.bar/foo/bar/", ALLOWED], 383 | [robots_txt_special_characters1, "FooBot", "http://foo.bar/foo/bar/baz", ALLOWED], 384 | ) 385 | 386 | 
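# The special-character cases above rely on "*" acting as a wildcard and a
# trailing "$" anchoring the end of the URL, per the robots.txt matching rules
# these generated tests mirror. A small illustrative translation of such a rule
# path into a regular expression (not a quote of robots/parser.py, just the
# conventional interpretation of the spec):
#
#     import re
#
#     def pattern_to_regex(pattern: str) -> str:
#         # a trailing "$" anchors the match at the end of the URL path
#         anchored = pattern.endswith("$")
#         if anchored:
#             pattern = pattern[:-1]
#         # "*" matches any sequence of characters; everything else is literal
#         body = ".*".join(re.escape(part) for part in pattern.split("*"))
#         return body + ("$" if anchored else "")
#
# Under that reading, "/foo/bar$" matches "/foo/bar" but not "/foo/bar/" or
# "/foo/bar/baz", which is exactly what the expectations above encode.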
@pytest.mark.parametrize('robots_txt,agent,path,allowed', data_special_characters1) 387 | def test_google_correctness_special_characters1(robots_txt, agent, path, allowed, can_fetch): 388 | assert can_fetch(robots_txt, agent, path) is allowed 389 | 390 | 391 | robots_txt_special_characters2 = """ 392 | User-agent: FooBot 393 | # Disallow: / 394 | Disallow: /foo/quz#qux 395 | Allow: / 396 | """ 397 | 398 | data_special_characters2 = ( 399 | [robots_txt_special_characters2, "FooBot", "http://foo.bar/foo/bar", ALLOWED], 400 | [robots_txt_special_characters2, "FooBot", "http://foo.bar/foo/quz", DISALLOWED], 401 | ) 402 | 403 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_special_characters2) 404 | def test_google_correctness_special_characters2(robots_txt, agent, path, allowed, can_fetch): 405 | assert can_fetch(robots_txt, agent, path) is allowed 406 | 407 | 408 | robots_txt_index_page0 = """ 409 | User-Agent: * 410 | Allow: /allowed-slash/index.html 411 | Disallow: / 412 | """ 413 | 414 | data_index_page0 = ( 415 | # [robots_txt_index_page0, "foobot", "http://foo.com/allowed-slash/", ALLOWED], # google specific - fails google correcness 416 | [robots_txt_index_page0, "foobot", "http://foo.com/allowed-slash/index.htm", DISALLOWED], 417 | [robots_txt_index_page0, "foobot", "http://foo.com/allowed-slash/index.html", ALLOWED], 418 | [robots_txt_index_page0, "foobot", "http://foo.com/anyother-url", DISALLOWED], 419 | ) 420 | 421 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_index_page0) 422 | def test_google_correctness_index_page0(robots_txt, agent, path, allowed, can_fetch): 423 | assert can_fetch(robots_txt, agent, path) is allowed 424 | 425 | 426 | robots_txt_user_agent_name0 = """ 427 | User-Agent: * 428 | Disallow: / 429 | User-Agent: Foo Bar 430 | Allow: /x/ 431 | Disallow: / 432 | """ 433 | 434 | data_user_agent_name0 = ( 435 | [robots_txt_user_agent_name0, "Foo", "http://foo.bar/x/y", ALLOWED], 436 | [robots_txt_user_agent_name0, "Foo Bar", "http://foo.bar/x/y", DISALLOWED], 437 | ) 438 | 439 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_user_agent_name0) 440 | def test_google_correctness_user_agent_name0(robots_txt, agent, path, allowed, can_fetch): 441 | assert can_fetch(robots_txt, agent, path) is allowed 442 | 443 | 444 | robots_txt_user_agent_name1 = """ 445 | user-agent: FOO BAR 446 | allow: /x/ 447 | disallow: / 448 | """ 449 | 450 | data_user_agent_name1 = ( 451 | [robots_txt_user_agent_name1, "Foo", "http://foo.bar/x/y", ALLOWED], 452 | [robots_txt_user_agent_name1, "foo", "http://foo.bar/x/y", ALLOWED], 453 | [robots_txt_user_agent_name1, "Foo", "http://foo.bar/a/b", DISALLOWED], 454 | [robots_txt_user_agent_name1, "foo", "http://foo.bar/a/b", DISALLOWED], 455 | ) 456 | 457 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_user_agent_name1) 458 | def test_google_correctness_user_agent_name1(robots_txt, agent, path, allowed, can_fetch): 459 | assert can_fetch(robots_txt, agent, path) is allowed 460 | 461 | 462 | robots_txt_user_agent_name2 = """ 463 | user-agent: foo bar 464 | allow: /x/ 465 | disallow: / 466 | """ 467 | 468 | data_user_agent_name2 = ( 469 | [robots_txt_user_agent_name2, "Foo", "http://foo.bar/x/y", ALLOWED], 470 | [robots_txt_user_agent_name2, "foo", "http://foo.bar/x/y", ALLOWED], 471 | [robots_txt_user_agent_name2, "Foo", "http://foo.bar/a/b", DISALLOWED], 472 | [robots_txt_user_agent_name2, "foo", "http://foo.bar/a/b", DISALLOWED], 473 | ) 474 | 475 | 
@pytest.mark.parametrize('robots_txt,agent,path,allowed', data_user_agent_name2) 476 | def test_google_correctness_user_agent_name2(robots_txt, agent, path, allowed, can_fetch): 477 | assert can_fetch(robots_txt, agent, path) is allowed 478 | 479 | 480 | robots_txt_user_agent_name3 = """ 481 | user-agent: FoO bAr 482 | allow: /x/ 483 | disallow: / 484 | """ 485 | 486 | data_user_agent_name3 = ( 487 | [robots_txt_user_agent_name3, "Foo", "http://foo.bar/x/y", ALLOWED], 488 | [robots_txt_user_agent_name3, "foo", "http://foo.bar/x/y", ALLOWED], 489 | [robots_txt_user_agent_name3, "Foo", "http://foo.bar/a/b", DISALLOWED], 490 | [robots_txt_user_agent_name3, "foo", "http://foo.bar/a/b", DISALLOWED], 491 | ) 492 | 493 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_user_agent_name3) 494 | def test_google_correctness_user_agent_name3(robots_txt, agent, path, allowed, can_fetch): 495 | assert can_fetch(robots_txt, agent, path) is allowed 496 | 497 | 498 | robots_txt_directives_case_insensitivity0 = """ 499 | USER-AGENT: FooBot 500 | ALLOW: /x/ 501 | DISALLOW: / 502 | """ 503 | 504 | data_directives_case_insensitivity0 = ( 505 | [robots_txt_directives_case_insensitivity0, "FooBot", "http://foo.bar/x/y", ALLOWED], 506 | [robots_txt_directives_case_insensitivity0, "FooBot", "http://foo.bar/a/b", DISALLOWED], 507 | ) 508 | 509 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_directives_case_insensitivity0) 510 | def test_google_correctness_directives_case_insensitivity0(robots_txt, agent, path, allowed, can_fetch): 511 | assert can_fetch(robots_txt, agent, path) is allowed 512 | 513 | 514 | robots_txt_directives_case_insensitivity1 = """ 515 | user-agent: FooBot 516 | allow: /x/ 517 | disallow: / 518 | """ 519 | 520 | data_directives_case_insensitivity1 = ( 521 | [robots_txt_directives_case_insensitivity1, "FooBot", "http://foo.bar/x/y", ALLOWED], 522 | [robots_txt_directives_case_insensitivity1, "FooBot", "http://foo.bar/a/b", DISALLOWED], 523 | ) 524 | 525 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_directives_case_insensitivity1) 526 | def test_google_correctness_directives_case_insensitivity1(robots_txt, agent, path, allowed, can_fetch): 527 | assert can_fetch(robots_txt, agent, path) is allowed 528 | 529 | 530 | robots_txt_directives_case_insensitivity2 = """ 531 | uSeR-aGeNt: FooBot 532 | AlLoW: /x/ 533 | dIsAlLoW: / 534 | """ 535 | 536 | data_directives_case_insensitivity2 = ( 537 | [robots_txt_directives_case_insensitivity2, "FooBot", "http://foo.bar/x/y", ALLOWED], 538 | [robots_txt_directives_case_insensitivity2, "FooBot", "http://foo.bar/a/b", DISALLOWED], 539 | ) 540 | 541 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_directives_case_insensitivity2) 542 | def test_google_correctness_directives_case_insensitivity2(robots_txt, agent, path, allowed, can_fetch): 543 | assert can_fetch(robots_txt, agent, path) is allowed 544 | 545 | 546 | robots_txt_groups0 = """ 547 | allow: /foo/bar/ 548 | 549 | user-agent: FooBot 550 | disallow: / 551 | allow: /x/ 552 | user-agent: BarBot 553 | disallow: / 554 | allow: /y/ 555 | 556 | 557 | allow: /w/ 558 | user-agent: BazBot 559 | 560 | user-agent: FooBot 561 | allow: /z/ 562 | disallow: / 563 | """ 564 | 565 | data_groups0 = ( 566 | [robots_txt_groups0, "FooBot", "http://foo.bar/x/b", ALLOWED], 567 | [robots_txt_groups0, "FooBot", "http://foo.bar/z/d", ALLOWED], 568 | [robots_txt_groups0, "FooBot", "http://foo.bar/y/c", DISALLOWED], 569 | [robots_txt_groups0, "BarBot", 
"http://foo.bar/y/c", ALLOWED], 570 | [robots_txt_groups0, "BarBot", "http://foo.bar/w/a", ALLOWED], 571 | [robots_txt_groups0, "BarBot", "http://foo.bar/z/d", DISALLOWED], 572 | [robots_txt_groups0, "BazBot", "http://foo.bar/z/d", ALLOWED], 573 | [robots_txt_groups0, "FooBot", "http://foo.bar/foo/bar/", DISALLOWED], 574 | [robots_txt_groups0, "BarBot", "http://foo.bar/foo/bar/", DISALLOWED], 575 | [robots_txt_groups0, "BazBot", "http://foo.bar/foo/bar/", DISALLOWED], 576 | ) 577 | 578 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_groups0) 579 | def test_google_correctness_groups0(robots_txt, agent, path, allowed, can_fetch): 580 | assert can_fetch(robots_txt, agent, path) is allowed 581 | 582 | 583 | robots_txt_most_specific_match0 = """ 584 | user-agent: FooBot 585 | disallow: /x/page.html 586 | allow: /x/ 587 | """ 588 | 589 | data_most_specific_match0 = ( 590 | [robots_txt_most_specific_match0, "FooBot", "http://foo.bar/x/page.html", DISALLOWED], 591 | ) 592 | 593 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match0) 594 | def test_google_correctness_most_specific_match0(robots_txt, agent, path, allowed, can_fetch): 595 | assert can_fetch(robots_txt, agent, path) is allowed 596 | 597 | 598 | robots_txt_most_specific_match1 = """ 599 | user-agent: FooBot 600 | allow: /x/page.html 601 | disallow: /x/ 602 | """ 603 | 604 | data_most_specific_match1 = ( 605 | [robots_txt_most_specific_match1, "FooBot", "http://foo.bar/x/page.html", ALLOWED], 606 | [robots_txt_most_specific_match1, "FooBot", "http://foo.bar/x/", DISALLOWED], 607 | ) 608 | 609 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match1) 610 | def test_google_correctness_most_specific_match1(robots_txt, agent, path, allowed, can_fetch): 611 | assert can_fetch(robots_txt, agent, path) is allowed 612 | 613 | 614 | robots_txt_most_specific_match2 = """ 615 | user-agent: FooBot 616 | disallow: 617 | allow: 618 | """ 619 | 620 | data_most_specific_match2 = ( 621 | [robots_txt_most_specific_match2, "FooBot", "http://foo.bar/x/page.html", ALLOWED], 622 | ) 623 | 624 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match2) 625 | def test_google_correctness_most_specific_match2(robots_txt, agent, path, allowed, can_fetch): 626 | assert can_fetch(robots_txt, agent, path) is allowed 627 | 628 | 629 | robots_txt_most_specific_match3 = """ 630 | user-agent: FooBot 631 | disallow: / 632 | allow: / 633 | """ 634 | 635 | data_most_specific_match3 = ( 636 | [robots_txt_most_specific_match3, "FooBot", "http://foo.bar/x/page.html", ALLOWED], 637 | ) 638 | 639 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match3) 640 | def test_google_correctness_most_specific_match3(robots_txt, agent, path, allowed, can_fetch): 641 | assert can_fetch(robots_txt, agent, path) is allowed 642 | 643 | 644 | robots_txt_most_specific_match4 = """ 645 | user-agent: FooBot 646 | disallow: /x 647 | allow: /x/ 648 | """ 649 | 650 | data_most_specific_match4 = ( 651 | [robots_txt_most_specific_match4, "FooBot", "http://foo.bar/x", DISALLOWED], 652 | [robots_txt_most_specific_match4, "FooBot", "http://foo.bar/x/", ALLOWED], 653 | ) 654 | 655 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match4) 656 | def test_google_correctness_most_specific_match4(robots_txt, agent, path, allowed, can_fetch): 657 | assert can_fetch(robots_txt, agent, path) is allowed 658 | 659 | 660 | robots_txt_most_specific_match5 = 
""" 661 | user-agent: FooBot 662 | disallow: /x/page.html 663 | allow: /x/page.html 664 | """ 665 | 666 | data_most_specific_match5 = ( 667 | [robots_txt_most_specific_match5, "FooBot", "http://foo.bar/x/page.html", ALLOWED], 668 | ) 669 | 670 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match5) 671 | def test_google_correctness_most_specific_match5(robots_txt, agent, path, allowed, can_fetch): 672 | assert can_fetch(robots_txt, agent, path) is allowed 673 | 674 | 675 | robots_txt_most_specific_match6 = """ 676 | user-agent: FooBot 677 | allow: /page 678 | disallow: /*.html 679 | """ 680 | 681 | data_most_specific_match6 = ( 682 | [robots_txt_most_specific_match6, "FooBot", "http://foo.bar/page.html", DISALLOWED], 683 | [robots_txt_most_specific_match6, "FooBot", "http://foo.bar/page", ALLOWED], 684 | ) 685 | 686 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match6) 687 | def test_google_correctness_most_specific_match6(robots_txt, agent, path, allowed, can_fetch): 688 | assert can_fetch(robots_txt, agent, path) is allowed 689 | 690 | 691 | robots_txt_most_specific_match7 = """ 692 | user-agent: FooBot 693 | allow: /x/page. 694 | disallow: /*.html 695 | """ 696 | 697 | data_most_specific_match7 = ( 698 | [robots_txt_most_specific_match7, "FooBot", "http://foo.bar/x/page.html", ALLOWED], 699 | [robots_txt_most_specific_match7, "FooBot", "http://foo.bar/x/y.html", DISALLOWED], 700 | ) 701 | 702 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match7) 703 | def test_google_correctness_most_specific_match7(robots_txt, agent, path, allowed, can_fetch): 704 | assert can_fetch(robots_txt, agent, path) is allowed 705 | 706 | 707 | robots_txt_most_specific_match8 = """ 708 | User-agent: * 709 | Disallow: /x/ 710 | User-agent: FooBot 711 | Disallow: /y/ 712 | """ 713 | 714 | data_most_specific_match8 = ( 715 | [robots_txt_most_specific_match8, "FooBot", "http://foo.bar/x/page", ALLOWED], 716 | [robots_txt_most_specific_match8, "FooBot", "http://foo.bar/y/page", DISALLOWED], 717 | ) 718 | 719 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_most_specific_match8) 720 | def test_google_correctness_most_specific_match8(robots_txt, agent, path, allowed, can_fetch): 721 | assert can_fetch(robots_txt, agent, path) is allowed 722 | 723 | 724 | robots_txt_different_line_endings0 = """ 725 | User-Agent: foo 726 | Allow: /some/path 727 | User-Agent: bar 728 | 729 | 730 | Disallow: / 731 | """ 732 | 733 | data_different_line_endings0 = ( 734 | [robots_txt_different_line_endings0, "bar", "http://example.com/page", DISALLOWED], 735 | ) 736 | 737 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_different_line_endings0) 738 | def test_google_correctness_different_line_endings0(robots_txt, agent, path, allowed, can_fetch): 739 | assert can_fetch(robots_txt, agent, path) is allowed 740 | 741 | 742 | robots_txt_different_line_endings1 = """ 743 | User-Agent: foo 744 | Allow: /some/path 745 | User-Agent: bar 746 | 747 | 748 | Disallow: / 749 | """ 750 | 751 | data_different_line_endings1 = ( 752 | [robots_txt_different_line_endings1, "bar", "http://example.com/page", DISALLOWED], 753 | [robots_txt_different_line_endings1, "bar", "http://example.com/page", DISALLOWED], 754 | [robots_txt_different_line_endings1, "bar", "http://example.com/page", DISALLOWED], 755 | ) 756 | 757 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_different_line_endings1) 758 | def 
test_google_correctness_different_line_endings1(robots_txt, agent, path, allowed, can_fetch): 759 | assert can_fetch(robots_txt, agent, path) is allowed 760 | 761 | 762 | robots_txt_different_line_endings2 = """ 763 | User-Agent: foo 764 | User-Agent: bar 765 | 766 | Disallow: / 767 | """ 768 | 769 | data_different_line_endings2 = ( 770 | [robots_txt_different_line_endings2, "bar", "http://example.com/page", DISALLOWED], 771 | ) 772 | 773 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_different_line_endings2) 774 | def test_google_correctness_different_line_endings2(robots_txt, agent, path, allowed, can_fetch): 775 | assert can_fetch(robots_txt, agent, path) is allowed 776 | 777 | 778 | robots_txt_matching_path_values_10 = """ 779 | user-agent: FooBot 780 | disallow: / 781 | allow: /fish 782 | """ 783 | 784 | data_matching_path_values_10 = ( 785 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/bar", DISALLOWED], 786 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/fish", ALLOWED], 787 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/fish.html", ALLOWED], 788 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/fish/salmon.html", ALLOWED], 789 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/fishheads", ALLOWED], 790 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/fishheads/yummy.html", ALLOWED], 791 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/fish.html?id=anything", ALLOWED], 792 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/Fish.asp", DISALLOWED], 793 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/catfish", DISALLOWED], 794 | [robots_txt_matching_path_values_10, "FooBot", "http://foo.bar/?id=fish", DISALLOWED], 795 | ) 796 | 797 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_matching_path_values_10) 798 | def test_google_correctness_matching_path_values_10(robots_txt, agent, path, allowed, can_fetch): 799 | assert can_fetch(robots_txt, agent, path) is allowed 800 | 801 | 802 | robots_txt_matching_path_values_11 = """ 803 | user-agent: FooBot 804 | disallow: / 805 | allow: /fish* 806 | """ 807 | 808 | data_matching_path_values_11 = ( 809 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/bar", DISALLOWED], 810 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/fish", ALLOWED], 811 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/fish.html", ALLOWED], 812 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/fish/salmon.html", ALLOWED], 813 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/fishheads", ALLOWED], 814 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/fishheads/yummy.html", ALLOWED], 815 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/fish.html?id=anything", ALLOWED], 816 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/Fish.bar", DISALLOWED], 817 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/catfish", DISALLOWED], 818 | [robots_txt_matching_path_values_11, "FooBot", "http://foo.bar/?id=fish", DISALLOWED], 819 | ) 820 | 821 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_matching_path_values_11) 822 | def test_google_correctness_matching_path_values_11(robots_txt, agent, path, allowed, can_fetch): 823 | assert can_fetch(robots_txt, agent, path) is allowed 824 | 825 | 826 | robots_txt_matching_path_values_12 = """ 827 | user-agent: FooBot 828 | 
disallow: / 829 | allow: /fish/ 830 | """ 831 | 832 | data_matching_path_values_12 = ( 833 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/bar", DISALLOWED], 834 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish/", ALLOWED], 835 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish/salmon", ALLOWED], 836 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish/?salmon", ALLOWED], 837 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish/salmon.html", ALLOWED], 838 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish/?id=anything", ALLOWED], 839 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish", DISALLOWED], 840 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/fish.html", DISALLOWED], 841 | [robots_txt_matching_path_values_12, "FooBot", "http://foo.bar/Fish/Salmon.html", DISALLOWED], 842 | ) 843 | 844 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_matching_path_values_12) 845 | def test_google_correctness_matching_path_values_12(robots_txt, agent, path, allowed, can_fetch): 846 | assert can_fetch(robots_txt, agent, path) is allowed 847 | 848 | 849 | robots_txt_order_of_precedence0 = """ 850 | user-agent: FooBot 851 | allow: /p 852 | disallow: / 853 | """ 854 | 855 | data_order_of_precedence0 = ( 856 | [robots_txt_order_of_precedence0, "FooBot", "http://example.com/page", ALLOWED], 857 | ) 858 | 859 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_order_of_precedence0) 860 | def test_google_correctness_order_of_precedence0(robots_txt, agent, path, allowed, can_fetch): 861 | assert can_fetch(robots_txt, agent, path) is allowed 862 | 863 | 864 | robots_txt_order_of_precedence1 = """ 865 | user-agent: FooBot 866 | allow: /folder 867 | disallow: /folder 868 | """ 869 | 870 | data_order_of_precedence1 = ( 871 | [robots_txt_order_of_precedence1, "FooBot", "http://example.com/folder/page", ALLOWED], 872 | ) 873 | 874 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_order_of_precedence1) 875 | def test_google_correctness_order_of_precedence1(robots_txt, agent, path, allowed, can_fetch): 876 | assert can_fetch(robots_txt, agent, path) is allowed 877 | 878 | 879 | robots_txt_order_of_precedence2 = """ 880 | user-agent: FooBot 881 | allow: /page 882 | disallow: /*.htm 883 | """ 884 | 885 | data_order_of_precedence2 = ( 886 | [robots_txt_order_of_precedence2, "FooBot", "http://example.com/page.htm", DISALLOWED], 887 | ) 888 | 889 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_order_of_precedence2) 890 | def test_google_correctness_order_of_precedence2(robots_txt, agent, path, allowed, can_fetch): 891 | assert can_fetch(robots_txt, agent, path) is allowed 892 | 893 | 894 | robots_txt_order_of_precedence3 = """ 895 | user-agent: FooBot 896 | allow: /$ 897 | disallow: / 898 | """ 899 | 900 | data_order_of_precedence3 = ( 901 | [robots_txt_order_of_precedence3, "FooBot", "http://example.com/", ALLOWED], 902 | [robots_txt_order_of_precedence3, "FooBot", "http://example.com/page.html", DISALLOWED], 903 | ) 904 | 905 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_order_of_precedence3) 906 | def test_google_correctness_order_of_precedence3(robots_txt, agent, path, allowed, can_fetch): 907 | assert can_fetch(robots_txt, agent, path) is allowed 908 | 909 | -------------------------------------------------------------------------------- /tests/test_google_stress.py: 
-------------------------------------------------------------------------------- 1 | 2 | # Code generated from https://github.com/google/robotstxt-spec-test/tree/master/src/main/resources/CTC/ 3 | 4 | 5 | import pytest 6 | from .core import * 7 | 8 | 9 | robots_txt_638845 = """ 10 | # For more information about the robots.txt standard, see: 11 | # http://www.robotstxt.org/orig.html 12 | # 13 | 14 | User-agent: * 15 | Disallow: /main/ 16 | Disallow: /store/ 17 | Disallow: /scp/ 18 | Disallow: /mods/ 19 | Disallow: /view/ 20 | Disallow: /deps/ 21 | Disallow: /setup/ 22 | Disallow: /language/ 23 | Disallow: /libs/ 24 | Disallow: /data/ 25 | Disallow: /media/ 26 | Disallow: /parts/ 27 | Disallow: /plugins/ 28 | Disallow: /help/ 29 | Disallow: /tmp/ 30 | 31 | """ 32 | 33 | data_638845 = ( 34 | [robots_txt_638845, "foobot", "http://example.com/", ALLOWED], 35 | [robots_txt_638845, "foobot", "http://example.com/index.html", ALLOWED], 36 | [robots_txt_638845, "foobot", "http://example.com/scp/data", DISALLOWED], 37 | [robots_txt_638845, "foobot", "http://example.com/medi", ALLOWED], 38 | [robots_txt_638845, "foobot", "http://example.com/media", ALLOWED], 39 | [robots_txt_638845, "foobot", "http://example.com/loogs?user=admin", ALLOWED], 40 | ) 41 | 42 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_638845) 43 | def test_google_stress_638845(robots_txt, agent, path, allowed, can_fetch): 44 | assert can_fetch(robots_txt, agent, path) is allowed 45 | 46 | 47 | robots_txt_308278 = """ 48 | User-agent: * 49 | Disallow: /asdf-login 50 | Disallow: /asdf-admin 51 | Disallow: /databack/ 52 | Disallow: /data/* 53 | Disallow: /?*/ 54 | Disallow: /author/ 55 | Disallow: /id/*/page/ 56 | Disallow: /id/*/data/ 57 | Sitemap: http://example.com/page-sitemap.xml 58 | """ 59 | 60 | data_308278 = ( 61 | [robots_txt_308278, "foobot", "http://example.com/asdf-login", DISALLOWED], 62 | [robots_txt_308278, "foobot", "http://example.com/asdf-login/", DISALLOWED], 63 | [robots_txt_308278, "foobot", "http://example.com/", ALLOWED], 64 | [robots_txt_308278, "foobot", "http://example.com/databack", ALLOWED], 65 | [robots_txt_308278, "foobot", "http://example.com/databack/recent", DISALLOWED], 66 | [robots_txt_308278, "foobot", "http://example.com/foo/?user=admin/data", ALLOWED], 67 | [robots_txt_308278, "foobot", "http://example.com/?user=admin/data", DISALLOWED], 68 | [robots_txt_308278, "foobot", "http://example.com/id/page/", ALLOWED], 69 | [robots_txt_308278, "foobot", "http://example.com/id/some/page/", DISALLOWED], 70 | [robots_txt_308278, "foobot", "http://example.com/id/some/data", ALLOWED], 71 | [robots_txt_308278, "foobot", "http://example.com/id/some/data/more", DISALLOWED], 72 | ) 73 | 74 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_308278) 75 | def test_google_stress_308278(robots_txt, agent, path, allowed, can_fetch): 76 | assert can_fetch(robots_txt, agent, path) is allowed 77 | 78 | 79 | robots_txt_943687 = """ 80 | # Today I heard something new and unmemorable 81 | # If I don’t like something, I’ll stay away from it 82 | # Everyone was busy, so I went to the movie alone 83 | # 84 | # For more information about the robots.txt standard, see: 85 | # http://www.robotstxt.org/orig.html 86 | # 87 | # For syntax checking, see: 88 | # http://example.com/robots-checker.phtml 89 | 90 | User-agent: * 91 | Disallow: /admin/ 92 | Disallow: /bin/ 93 | Disallow: /cache/ 94 | Disallow: /clion/ 95 | Disallow: /components/ 96 | Disallow: /excludes/ 97 | Disallow: /deinstallation/ 98 | 
Disallow: /layouts/ 99 | Disallow: /libraries/ 100 | Disallow: /logs/ 101 | Disallow: /plugins/ 102 | Disallow: /tmp/ 103 | 104 | """ 105 | 106 | data_943687 = ( 107 | [robots_txt_943687, "foobot", "http://www.example.com/foo/bar", ALLOWED], 108 | [robots_txt_943687, "foobot", "http://www.example.com/admin/settings", DISALLOWED], 109 | [robots_txt_943687, "foobot", "http://www.example.com/bin/sh", DISALLOWED], 110 | [robots_txt_943687, "foo-bot", "http://www.example.com/search?req=123", ALLOWED], 111 | [robots_txt_943687, "foo_bot", "http://www.example.com/log/113", ALLOWED], 112 | [robots_txt_943687, "foo_bot", "http://www.example.com/logs/113", DISALLOWED], 113 | [robots_txt_943687, "foo-bot", "http://www.example.com/example/admin", ALLOWED], 114 | [robots_txt_943687, "foobot", "http://www.example.com/admin", ALLOWED], 115 | [robots_txt_943687, "foobot", "http://www.example.com/admin/", DISALLOWED], 116 | [robots_txt_943687, "foo_bot", "http://www.example.com/dev/null", ALLOWED], 117 | [robots_txt_943687, "foo_bot", "http://www.example.com/tmp/null", DISALLOWED], 118 | ) 119 | 120 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_943687) 121 | def test_google_stress_943687(robots_txt, agent, path, allowed, can_fetch): 122 | assert can_fetch(robots_txt, agent, path) is allowed 123 | 124 | 125 | robots_txt_584234 = """ 126 | User-agent: barbot 127 | Disallow: / 128 | 129 | User-agent: bazbot 130 | Disallow: / 131 | 132 | User-agent: qux_bot 133 | Crawl-delay: 1 134 | 135 | User-agent: * 136 | Allow: / 137 | 138 | User-agent: * 139 | Crawl-delay: 1 140 | """ 141 | 142 | data_584234 = ( 143 | [robots_txt_584234, "barbot", "http://example.com/foo/bar", DISALLOWED], 144 | [robots_txt_584234, "barbot", "http://example.com/foo/foo/foo", DISALLOWED], 145 | [robots_txt_584234, "barbot", "http://example.com/index.html", DISALLOWED], 146 | [robots_txt_584234, "bazbot", "http://example.com/secrets/123", DISALLOWED], 147 | [robots_txt_584234, "bazbot", "http://example.com/log?id=113", DISALLOWED], 148 | [robots_txt_584234, "qux_bot", "http://example.com/index.html", ALLOWED], 149 | [robots_txt_584234, "qux_bot", "http://example.com/foo/bar", ALLOWED], 150 | [robots_txt_584234, "qux_bot", "http://example.com/", ALLOWED], 151 | [robots_txt_584234, "foobot", "http://example.com/foo/bar", ALLOWED], 152 | [robots_txt_584234, "foobot", "http://example.com/log?id=113", ALLOWED], 153 | ) 154 | 155 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_584234) 156 | def test_google_stress_584234(robots_txt, agent, path, allowed, can_fetch): 157 | assert can_fetch(robots_txt, agent, path) is allowed 158 | 159 | 160 | robots_txt_912555 = """ 161 | User-Agent: * 162 | Disallow: /error$ 163 | Disallow: /jm/com.example.FooController 164 | Disallow: /log 165 | Disallow: /admin$ 166 | Disallow: /adminactions$ 167 | Disallow: /adminactions? 
168 | Disallow: /baz 169 | Disallow: /jm/com.example.BarController 170 | Sitemap: https://example.com/sitemap.xml 171 | """ 172 | 173 | data_912555 = ( 174 | [robots_txt_912555, "foobot", "http://example.com/error?user=admin", ALLOWED], 175 | [robots_txt_912555, "foobot", "http://example.com/error", DISALLOWED], 176 | [robots_txt_912555, "foo_bot", "http://example.com/search/foo", ALLOWED], 177 | [robots_txt_912555, "foo_bot", "http://example.com/log", DISALLOWED], 178 | [robots_txt_912555, "foo-bot", "http://example.com/adminactions", DISALLOWED], 179 | [robots_txt_912555, "foo-bot", "http://example.com/adminactions?id=123", DISALLOWED], 180 | [robots_txt_912555, "foo-bot", "http://example.com/adminactions/new", ALLOWED], 181 | [robots_txt_912555, "foobot", "http://example.com/jm/test.txt", ALLOWED], 182 | [robots_txt_912555, "foobot", "http://example.com/jm/com.example.BarController", DISALLOWED], 183 | [robots_txt_912555, "foobot", "http://example.com/foo/bar", ALLOWED], 184 | ) 185 | 186 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_912555) 187 | def test_google_stress_912555(robots_txt, agent, path, allowed, can_fetch): 188 | assert can_fetch(robots_txt, agent, path) is allowed 189 | 190 | 191 | robots_txt_174022 = """ 192 | User-agent: * 193 | Disallow: /view-responses.html 194 | Disallow: /help.html 195 | Disallow: /chat/reviews/view/ 196 | Disallow: /chat/view/ 197 | Disallow: /chat/view/hg/ 198 | Disallow: /chat/view/asd/ 199 | Disallow: /chat/asd/ 200 | Disallow: /chat/trackback/ 201 | Disallow: /chat/wp/ 202 | Disallow: /chat/*/reviews/view/$ 203 | Disallow: /chat/*/view/$ 204 | Disallow: /chat/*/view/hg/$ 205 | Disallow: /chat/*/view/asd/$ 206 | Disallow: /chat/*/asd/$ 207 | Disallow: /chat/*/trackback/$ 208 | Disallow: /contact-someone.html 209 | """ 210 | 211 | data_174022 = ( 212 | [robots_txt_174022, "FooBot", "http://example.com/", ALLOWED], 213 | [robots_txt_174022, "foobot", "http://example.com/search?req=123", ALLOWED], 214 | [robots_txt_174022, "Foo_Bot", "http://example.com/view-responses.html", DISALLOWED], 215 | [robots_txt_174022, "barbot", "http://example.com/chat/", ALLOWED], 216 | [robots_txt_174022, "BarBot", "http://example.com/chat/reviews/view/112", DISALLOWED], 217 | [robots_txt_174022, "BazBot", "http://example.com/chat/view", ALLOWED], 218 | [robots_txt_174022, "BazBot", "http://example.com/chat/view/hg", DISALLOWED], 219 | [robots_txt_174022, "FooBot", "http://example.com/chat/foo/bar/baz/view/", DISALLOWED], 220 | [robots_txt_174022, "barbot", "http://example.com/chat/something/asd/", DISALLOWED], 221 | [robots_txt_174022, "BarBot", "http://example.com/chat/asd/", DISALLOWED], 222 | [robots_txt_174022, "QuxBot", "http://example.com/contact-someone.html?user=foo", DISALLOWED], 223 | ) 224 | 225 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_174022) 226 | def test_google_stress_174022(robots_txt, agent, path, allowed, can_fetch): 227 | assert can_fetch(robots_txt, agent, path) is allowed 228 | 229 | 230 | robots_txt_860237 = """ 231 | User-Agent: * 232 | Crawl-delay : 60 233 | Disallow : /*baz* 234 | Disallow : /*qux* 235 | 236 | User-agent: XYZ123bot 237 | Crawl-delay : 60 238 | Disallow: / 239 | 240 | """ 241 | 242 | data_860237 = ( 243 | [robots_txt_860237, "Foobot", "http://example.com/", ALLOWED], 244 | [robots_txt_860237, "foo-bot", "http://example.com/foo/bar", ALLOWED], 245 | [robots_txt_860237, "foo_bot", "http://example.com/robots.txt", ALLOWED], 246 | [robots_txt_860237, "foo_bot", 
"http://example.com/new_baz", DISALLOWED], 247 | [robots_txt_860237, "foo_bot", "http://example.com/baz/new", DISALLOWED], 248 | [robots_txt_860237, "foo-bot", "http://example.com/move/qux/add", DISALLOWED], 249 | [robots_txt_860237, "foo_bot", "http://example.com/baznew/start", DISALLOWED], 250 | [robots_txt_860237, "Foobot", "http://example.com/foo_qux_bar", DISALLOWED], 251 | [robots_txt_860237, "XYZ123bot", "http://example.com/robots.txt", ALLOWED], 252 | [robots_txt_860237, "XYZ", "http://example.com/robots.txt", DISALLOWED], 253 | ) 254 | 255 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_860237) 256 | def test_google_stress_860237(robots_txt, agent, path, allowed, can_fetch): 257 | assert can_fetch(robots_txt, agent, path) is allowed 258 | 259 | 260 | robots_txt_777406 = """ 261 | User-agent: * 262 | Allow: / 263 | 264 | # Optimization for Baz Bot 265 | User-Agent: FunBot-Baz-Mobile 266 | User-Agent: FunBot-Baz 267 | Disallow: /_api/* 268 | Disallow: /_misc* 269 | Disallow: /media/v1/view/* 270 | 271 | Sitemap: https://www.example.com/sitemap.xml 272 | """ 273 | 274 | data_777406 = ( 275 | [robots_txt_777406, "foobot", "http://www.example.com/foo/bar", ALLOWED], 276 | [robots_txt_777406, "foo_bot", "http://www.example.com/", ALLOWED], 277 | [robots_txt_777406, "foo-bot", "http://www.example.com/robots.txt", ALLOWED], 278 | [robots_txt_777406, "FunBot-Baz-Mobile", "http://www.example.com/_api/index.html", DISALLOWED], 279 | [robots_txt_777406, "FunBot-Baz-Mobile", "http://www.example.com/_misc", DISALLOWED], 280 | [robots_txt_777406, "FunBot-Baz-Mobile", "http://www.example.com/_media/v2/foo", ALLOWED], 281 | [robots_txt_777406, "FunBot-Baz-Mobile", "http://www.example.com/media/v1/view/", DISALLOWED], 282 | [robots_txt_777406, "FunBot-Baz", "http://www.example.com/media/v1/view/foo", DISALLOWED], 283 | [robots_txt_777406, "foo-bot", "http://www.example.com/media/v1/view/foo", ALLOWED], 284 | [robots_txt_777406, "foo_bot", "http://www.example.com/_misc/index.html", ALLOWED], 285 | ) 286 | 287 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_777406) 288 | def test_google_stress_777406(robots_txt, agent, path, allowed, can_fetch): 289 | assert can_fetch(robots_txt, agent, path) is allowed 290 | 291 | 292 | robots_txt_768939 = """ 293 | User-agent: * 294 | Crawl-delay: 3500 295 | Disallow: /ab_controller 296 | Disallow: /ab_imports 297 | Disallow: /ab_content/bar 298 | Disallow: /ab_content/cache 299 | Disallow: /ab_content/baz 300 | """ 301 | 302 | data_768939 = ( 303 | [robots_txt_768939, "foobot", "http://www.example.com/ab_controller", DISALLOWED], 304 | [robots_txt_768939, "foo_bot", "http://www.example.com/ab_controller-foo", DISALLOWED], 305 | [robots_txt_768939, "foo-bot", "http://www.example.com/ab_imports/foo.txt", DISALLOWED], 306 | [robots_txt_768939, "foobot", "http://www.example.com/foo/bar", ALLOWED], 307 | [robots_txt_768939, "foobot", "http://www.example.com/ab_content/foo", ALLOWED], 308 | [robots_txt_768939, "foo_bot", "http://www.example.com/ab_content/bar/foo.bar", DISALLOWED], 309 | [robots_txt_768939, "foo-bot", "http://www.example.com/ab_content/cache-foo", DISALLOWED], 310 | [robots_txt_768939, "foo-bot", "http://www.example.com/", ALLOWED], 311 | ) 312 | 313 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_768939) 314 | def test_google_stress_768939(robots_txt, agent, path, allowed, can_fetch): 315 | assert can_fetch(robots_txt, agent, path) is allowed 316 | 317 | 318 | robots_txt_517712 = """ 319 | # Some 
comment 320 | # http://www.exapmle.com/something.html 321 | 322 | 323 | 324 | # Some more explanation to lines below 325 | # (and some line wrapping) 326 | 327 | User-agent: * 328 | Disallow: 329 | 330 | 331 | 332 | # Some comments regarding some specific robot restrictions 333 | # maybe regarding his functionality 334 | # and some website to visit 335 | # http://www.example.com/some/help/about/quxbot?arg=123 336 | 337 | User-Agent: Quxbot 338 | Disallow: /*dispatch_request$ 339 | Disallow: /*directory_ctors$ 340 | """ 341 | 342 | data_517712 = ( 343 | [robots_txt_517712, "foobot", "http://example.com/", ALLOWED], 344 | [robots_txt_517712, "FooBot", "http://example.com/search?req=123", ALLOWED], 345 | [robots_txt_517712, "foobot", "http://example.com/foo/bar/dispatch_request", ALLOWED], 346 | [robots_txt_517712, "foo-bot", "http://example.com/bar/baz/foler_ctors", ALLOWED], 347 | [robots_txt_517712, "Quxbot", "http://example.com/", ALLOWED], 348 | [robots_txt_517712, "barbot", "http://example.com/robots.txt", ALLOWED], 349 | [robots_txt_517712, "Quxbot", "http://example.com/baz/dispatch_request", DISALLOWED], 350 | [robots_txt_517712, "Quxbot", "http://example.com/baz/dispatch_request?args=123", ALLOWED], 351 | [robots_txt_517712, "Quxbot", "http://example.com/new_directory_ctors", DISALLOWED], 352 | [robots_txt_517712, "Quxbot", "http://example.com/bar/baz/directory_ctors", DISALLOWED], 353 | ) 354 | 355 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_517712) 356 | def test_google_stress_517712(robots_txt, agent, path, allowed, can_fetch): 357 | assert can_fetch(robots_txt, agent, path) is allowed 358 | 359 | 360 | robots_txt_894248 = """ 361 | User-agent: * 362 | Disallow: /ab-baz/ 363 | Allow: /ab-baz/baz-ajax.php 364 | 365 | Sitemap: https://example.com/ab-sitemap.xml 366 | """ 367 | 368 | data_894248 = ( 369 | [robots_txt_894248, "FooBot", "http://example.com/", ALLOWED], 370 | [robots_txt_894248, "Foo_Bot", "http://example.com/foo/bar.php", ALLOWED], 371 | [robots_txt_894248, "foobot", "http://example.com/ab-baz/index.htm", DISALLOWED], 372 | [robots_txt_894248, "foo-bot", "http://example.com/ab-baz/foo/bar", DISALLOWED], 373 | [robots_txt_894248, "foo_bot", "http://example.com/ab-baz/baz-ajax.php", ALLOWED], 374 | [robots_txt_894248, "foo-bot", "http://example.com/ab-baz/baz-ajax.php?user=123", ALLOWED], 375 | ) 376 | 377 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_894248) 378 | def test_google_stress_894248(robots_txt, agent, path, allowed, can_fetch): 379 | assert can_fetch(robots_txt, agent, path) is allowed 380 | 381 | 382 | robots_txt_155227 = """ 383 | User-agent: * 384 | Crawl-delay: 10 385 | # Foo 386 | Disallow: /asdf-main/ 387 | Disallow: /asdf-media/ 388 | Disallow: /asdf-shared/ 389 | # Bar 390 | Disallow: /asdf-control.php 391 | Disallow: /asdf-control-sample.php 392 | Disallow: /asdf-settings.php 393 | """ 394 | 395 | data_155227 = ( 396 | [robots_txt_155227, "foobot", "http://example.com/", ALLOWED], 397 | [robots_txt_155227, "foo_bot", "http://example.com/bar/index.html", ALLOWED], 398 | [robots_txt_155227, "foo-bot", "http://example.com/asdf-control.pdf", ALLOWED], 399 | [robots_txt_155227, "foobot", "http://example.com/asdf-control.php", DISALLOWED], 400 | [robots_txt_155227, "foobot", "http://example.com/asdf-control-sample.php", DISALLOWED], 401 | [robots_txt_155227, "foobot", "http://example.com/asdf-control-simple.php", ALLOWED], 402 | [robots_txt_155227, "FooBot", "http://example.com/asdf-settings.php", DISALLOWED], 403 | 
[robots_txt_155227, "Foo-Bot", "http://example.com/asdf-shared/index.html", DISALLOWED], 404 | ) 405 | 406 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_155227) 407 | def test_google_stress_155227(robots_txt, agent, path, allowed, can_fetch): 408 | assert can_fetch(robots_txt, agent, path) is allowed 409 | 410 | 411 | robots_txt_701159 = """ 412 | User-agent: foofoobot* 413 | Disallow: /workers/ 414 | Disallow: /media/common/ 415 | Disallow: /misc/ 416 | Disallow: /bin/ 417 | Disallow: /trash/ 418 | 419 | User-agent: barbarbot* 420 | Disallow: /workers/ 421 | Disallow: /media/common/ 422 | Disallow: /misc/ 423 | Disallow: /bin/ 424 | Disallow: /trash/ 425 | 426 | User-agent: quxbot 427 | Disallow: /workers/ 428 | Disallow: /media/common/ 429 | Disallow: /misc/ 430 | Disallow: /bin/ 431 | Disallow: /trash/ 432 | 433 | User-agent: ddbot 434 | Disallow: /workers/ 435 | Disallow: /media/common/ 436 | Disallow: /misc/ 437 | Disallow: /bin/ 438 | Disallow: /trash/ 439 | 440 | User-agent: toebot 441 | Disallow: /workers/ 442 | Disallow: /media/common/ 443 | Disallow: /misc/ 444 | Disallow: /bin/ 445 | Disallow: /trash/ 446 | 447 | User-agent: io_tester 448 | Disallow: /workers/ 449 | Disallow: /media/common/ 450 | Disallow: /misc/ 451 | Disallow: /bin/ 452 | Disallow: /trash/ 453 | 454 | User-agent: * 455 | Disallow: / 456 | 457 | 458 | Sitemap: http://www.example.com/sitemap.xml 459 | """ 460 | 461 | data_701159 = ( 462 | [robots_txt_701159, "foofoobot-exp", "http://example.com/workers/log", DISALLOWED], 463 | [robots_txt_701159, "foofoobot", "http://example.com/trash/index.html", DISALLOWED], 464 | [robots_txt_701159, "barbarbot-prod", "http://example.com/bin/bash", DISALLOWED], 465 | [robots_txt_701159, "barbarbot-prod", "http://example.com/foo/bar", DISALLOWED], 466 | [robots_txt_701159, "barbarbot", "http://example.com/bin/bash", DISALLOWED], 467 | [robots_txt_701159, "barbarbot", "http://example.com/foo/bar", ALLOWED], 468 | [robots_txt_701159, "quxbot", "http://example.com/qux/qux/qux", ALLOWED], 469 | [robots_txt_701159, "quxbot", "http://example.com/trash/view.html", DISALLOWED], 470 | [robots_txt_701159, "io_tester", "http://example.com/search?req=123", ALLOWED], 471 | [robots_txt_701159, "io_tester", "http://example.com/media/common/123", DISALLOWED], 472 | [robots_txt_701159, "foo_bot", "http://example.com/search?req=123", DISALLOWED], 473 | ) 474 | 475 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_701159) 476 | def test_google_stress_701159(robots_txt, agent, path, allowed, can_fetch): 477 | assert can_fetch(robots_txt, agent, path) is allowed 478 | 479 | 480 | robots_txt_541230 = """ 481 | User-agent: * 482 | Allow: /*.js 483 | Allow: /*.css 484 | Allow: /*.jpg 485 | Allow: /*.png 486 | Allow: /*.gif 487 | Allow: /*?page 488 | Allow: /*?ref= 489 | Disallow: /*? 
490 | Disallow: /stat/ 491 | Disallow: /id/1 492 | Disallow: /id/3 493 | Disallow: /register 494 | Disallow: /id/5 495 | Disallow: /id/7 496 | Disallow: /id/8 497 | Disallow: /id/9 498 | Disallow: /id/sub/ 499 | Disallow: /panel/ 500 | Disallow: /admin/ 501 | Disallow: /informer/ 502 | Disallow: /secure/ 503 | Disallow: /poll/ 504 | Disallow: /search/ 505 | Disallow: /abnl/ 506 | Disallow: /*_escaped_pattern_= 507 | Disallow: /*-*-*-*-321$ 508 | Disallow: /baz/order/ 509 | Disallow: /baz/printorder/ 510 | Disallow: /baz/checkout/ 511 | Disallow: /baz/user/ 512 | Disallow: /baz/search 513 | Disallow: /*0-*-0-03$ 514 | Disallow: /*-0-0- 515 | 516 | Sitemap: http://example.com/sitemap.xml 517 | Sitemap: http://example.com/sitemap-forum.xml 518 | """ 519 | 520 | data_541230 = ( 521 | [robots_txt_541230, "foobot", "http://example.com/foo.js", ALLOWED], 522 | [robots_txt_541230, "foobot", "http://example.com/foo/bar.css", ALLOWED], 523 | [robots_txt_541230, "foobot", "http://example.com/x/y/z?ref=bar", ALLOWED], 524 | [robots_txt_541230, "foobot", "http://example.com/x/y/z", ALLOWED], 525 | [robots_txt_541230, "foobot", "http://example.com/status/x", ALLOWED], 526 | [robots_txt_541230, "foobot", "http://example.com/stat/perf", DISALLOWED], 527 | [robots_txt_541230, "foobot", "http://example.com/id/13579", DISALLOWED], 528 | [robots_txt_541230, "foobot", "http://example.com/id/24680", ALLOWED], 529 | [robots_txt_541230, "foobot", "http://example.com/search/stats", DISALLOWED], 530 | [robots_txt_541230, "foobot", "http://example.com/foo_bar_escaped_pattern_=123", DISALLOWED], 531 | [robots_txt_541230, "foobot", "http://example.com/foo-bar-vaz-qux-321", DISALLOWED], 532 | [robots_txt_541230, "foobot", "http://example.com/foo-bar-vaz-qux-3216", ALLOWED], 533 | [robots_txt_541230, "foobot", "http://example.com/-0-0-312", DISALLOWED], 534 | [robots_txt_541230, "foobot", "http://example.com/baz", ALLOWED], 535 | [robots_txt_541230, "foobot", "http://example.com/baz/user/123", DISALLOWED], 536 | ) 537 | 538 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_541230) 539 | def test_google_stress_541230(robots_txt, agent, path, allowed, can_fetch): 540 | assert can_fetch(robots_txt, agent, path) is allowed 541 | 542 | 543 | robots_txt_824664 = """ 544 | Sitemap: http://example.com/sitemap.xml 545 | Sitemap: http://example.com/news-sitemap.xml 546 | User-agent: * 547 | Disallow: /controller/ 548 | Allow: /controller/admin-ajax.php 549 | """ 550 | 551 | data_824664 = ( 552 | [robots_txt_824664, "foo-bot", "http://example.com/index.html", ALLOWED], 553 | [robots_txt_824664, "foo-bot", "http://example.com/controller/index.html", DISALLOWED], 554 | [robots_txt_824664, "foo_bot", "http://example.com/controller/foo/bar/index.htm", DISALLOWED], 555 | [robots_txt_824664, "foobot", "http://example.com/controller/admin-ajax.php", ALLOWED], 556 | [robots_txt_824664, "foobot", "http://example.com/log?id=234", ALLOWED], 557 | ) 558 | 559 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_824664) 560 | def test_google_stress_824664(robots_txt, agent, path, allowed, can_fetch): 561 | assert can_fetch(robots_txt, agent, path) is allowed 562 | 563 | 564 | robots_txt_327748 = """ 565 | User-agent: asdfbot 566 | Disallow: / 567 | User-agent: * 568 | Disallow: 569 | Crawl-delay: 15 570 | Sitemap: http://example.com/sitemap.xml 571 | """ 572 | 573 | data_327748 = ( 574 | [robots_txt_327748, "foobot", "http://m.example.com/", ALLOWED], 575 | [robots_txt_327748, "FooBot", 
"http://m.example.com/foo/bar/baz.php", ALLOWED], 576 | [robots_txt_327748, "Foo_Bot", "http://m.example.com/index.html", ALLOWED], 577 | [robots_txt_327748, "asdfbot", "http://m.example.com/", DISALLOWED], 578 | [robots_txt_327748, "asdfbot", "http://m.example.com/foo/bar/baz.js", DISALLOWED], 579 | [robots_txt_327748, "asdfbot", "http://m.example.com/robots.txt", DISALLOWED], 580 | ) 581 | 582 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_327748) 583 | def test_google_stress_327748(robots_txt, agent, path, allowed, can_fetch): 584 | assert can_fetch(robots_txt, agent, path) is allowed 585 | 586 | 587 | robots_txt_278501 = """ 588 | User-agent: * 589 | Disallow: /dump-* 590 | Disallow: /vlog/dump-* 591 | Disallow: /_pcms/preview/ 592 | Disallow: /tf/manage-roles/ 593 | 594 | Sitemap: https://www.example.com/sitemap.xml 595 | Disallow: /_pcms/preview/ 596 | Disallow: /tf/manage-roles/ 597 | """ 598 | 599 | data_278501 = ( 600 | [robots_txt_278501, "foobot", "http://www.example.com/index.html", ALLOWED], 601 | [robots_txt_278501, "foo-bot", "http://www.example.com/dump-", DISALLOWED], 602 | [robots_txt_278501, "foobot", "http://www.example.com/dump", ALLOWED], 603 | [robots_txt_278501, "foo_bot", "http://www.example.com/dump-786", DISALLOWED], 604 | [robots_txt_278501, "foo-bot", "http://www.example.com/vlog/123", ALLOWED], 605 | [robots_txt_278501, "foo-bot", "http://www.example.com/vlog/dump-123", DISALLOWED], 606 | [robots_txt_278501, "foobot", "http://www.example.com/_pcms/test.txt", ALLOWED], 607 | [robots_txt_278501, "foo_bot", "http://www.example.com/_pcms/preview/test.txt", DISALLOWED], 608 | [robots_txt_278501, "foo-bot", "http://www.example.com/pcms/preview/test.txt", ALLOWED], 609 | [robots_txt_278501, "foo_bot", "http://www.example.com/tf/manage-roles/foo/bar", DISALLOWED], 610 | [robots_txt_278501, "foobot", "http://www.example.com/tf/manage-roles/", DISALLOWED], 611 | [robots_txt_278501, "foo_bot", "http://www.example.com/tf/index.html", ALLOWED], 612 | ) 613 | 614 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_278501) 615 | def test_google_stress_278501(robots_txt, agent, path, allowed, can_fetch): 616 | assert can_fetch(robots_txt, agent, path) is allowed 617 | 618 | 619 | robots_txt_974982 = """ 620 | # Some Robots Txt 621 | 622 | 623 | User-agent: * 624 | Disallow: /data 625 | Disallow: /find 626 | Disallow: /stuff$ 627 | Disallow: /stuff/ 628 | Disallow: /contacts/ 629 | Disallow: /dynamic/ 630 | Disallow:/*?creator=* 631 | Disallow:/*&creator=* 632 | Disallow:/*?finder=* 633 | Disallow:/*&finder=* 634 | Disallow:/*?locator=* 635 | Disallow:/*&locator=* 636 | Disallow:/*?viewer=* 637 | Disallow:/*&viewer=* 638 | Disallow:/*?format=json 639 | Disallow:/*&format=json 640 | Disallow:/*?format=page-context 641 | Disallow:/*&format=page-context 642 | Disallow:/*?format=main-content 643 | Disallow:/*&format=main-content 644 | Disallow:/*?format=json-pretty 645 | Disallow:/*&format=json-pretty 646 | Disallow:/*?format=ical 647 | Disallow:/*&format=ical 648 | Disallow:/*?someStuff=* 649 | Disallow:/*&someStuff=* 650 | 651 | 652 | Sitemap: https://example.com/sitemap.xml 653 | """ 654 | 655 | data_974982 = ( 656 | [robots_txt_974982, "foobot", "http://www.example.com/", ALLOWED], 657 | [robots_txt_974982, "foobot", "http://www.example.com/robots.txt", ALLOWED], 658 | [robots_txt_974982, "foobot", "http://www.example.com/find", DISALLOWED], 659 | [robots_txt_974982, "foobot", "http://www.example.com/find/", DISALLOWED], 660 | [robots_txt_974982, 
"foobot", "http://www.example.com/find?id=123", DISALLOWED], 661 | [robots_txt_974982, "foobot", "http://www.example.com/stuff", DISALLOWED], 662 | [robots_txt_974982, "foobot", "http://www.example.com/stuffstats", ALLOWED], 663 | [robots_txt_974982, "foobot", "http://www.example.com/stuff/new", DISALLOWED], 664 | [robots_txt_974982, "foobot", "http://www.example.com/foo?creator=bar", DISALLOWED], 665 | [robots_txt_974982, "foobot", "http://www.example.com/foo?finder=baz", DISALLOWED], 666 | [robots_txt_974982, "foobot", "http://www.example.com/foo?creator=bar&finder=baz", DISALLOWED], 667 | [robots_txt_974982, "foobot", "http://www.example.com/foo?viewer=qux", DISALLOWED], 668 | [robots_txt_974982, "foobot", "http://www.example.com/foo?creator=bar&stuff=baz", DISALLOWED], 669 | [robots_txt_974982, "foobot", "http://www.example.com/contacts/index.html", DISALLOWED], 670 | ) 671 | 672 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_974982) 673 | def test_google_stress_974982(robots_txt, agent, path, allowed, can_fetch): 674 | assert can_fetch(robots_txt, agent, path) is allowed 675 | 676 | 677 | robots_txt_371856 = """ 678 | User-agent: Foobot 679 | User-agent: Barbot 680 | User-agent: Bazbot 681 | User-agent: Quxbot 682 | Crawl-delay: 10 683 | Disallow: 684 | 685 | User-agent: * 686 | Disallow: / 687 | """ 688 | 689 | data_371856 = ( 690 | [robots_txt_371856, "Foobot", "http://example.com/foo/bar", ALLOWED], 691 | [robots_txt_371856, "Barbot", "http://example.com/foo/bar", ALLOWED], 692 | [robots_txt_371856, "Bazbot", "http://example.com/foo/baz", ALLOWED], 693 | [robots_txt_371856, "Bazbot", "http://example.com/", ALLOWED], 694 | [robots_txt_371856, "Bazbot", "http://example.com/index.html", ALLOWED], 695 | [robots_txt_371856, "zazbot", "http://example.com/", DISALLOWED], 696 | [robots_txt_371856, "zazbot", "http://example.com/index.html", DISALLOWED], 697 | [robots_txt_371856, "zazbot", "http://example.com/foo/zaz", DISALLOWED], 698 | ) 699 | 700 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_371856) 701 | def test_google_stress_371856(robots_txt, agent, path, allowed, can_fetch): 702 | assert can_fetch(robots_txt, agent, path) is allowed 703 | 704 | 705 | robots_txt_923994 = """ 706 | User-agent: * 707 | Disallow: /resources/bazbaz/baz/more_stuff 708 | Disallow: /wha/some_dir/files 709 | Disallow: /lib 710 | Disallow: /sys 711 | Disallow: /foo 712 | Disallow: /bar 713 | Disallow: /baz 714 | Sitemap: http://www.example.com/wha/some_dir/resources/sitemap.xml 715 | 716 | User-agent: quxbot 717 | Disallow: /resources/bazbaz/baz/more_stuff 718 | Disallow: /wha/some_dir/files 719 | Disallow: /lib 720 | Disallow: /sys 721 | Disallow: /foo 722 | Disallow: /bar 723 | Disallow: /baz 724 | Disallow: /users/big_foo/some_stuff 725 | Disallow: /users/big_foo/other_stuff 726 | Disallow: /en/stuff/arr 727 | Disallow: /en/stuff/dep 728 | Disallow: /sk/stuff/pri 729 | Disallow: /sk/stuff/odl 730 | Disallow: /cz/stuff/pri 731 | Disallow: /cz/stuff/odl 732 | Disallow: /hu/stuff/rke 733 | Disallow: /hu/stuff/ind 734 | Disallow: /addfightyos 735 | Disallow: /addfightnope 736 | Crawl-delay: 29 737 | """ 738 | 739 | data_923994 = ( 740 | [robots_txt_923994, "foobot", "http://example.com/home", ALLOWED], 741 | [robots_txt_923994, "foobot", "http://example.com/foo?id=12", DISALLOWED], 742 | [robots_txt_923994, "foobot", "http://example.com/qux", ALLOWED], 743 | [robots_txt_923994, "foobot", "http://example.com/home/scripts/s.js", ALLOWED], 744 | [robots_txt_923994, "foobot", 
"http://example.com/baz/112", DISALLOWED], 745 | [robots_txt_923994, "foobot", "http://example.com/resources/index.html", ALLOWED], 746 | [robots_txt_923994, "foobot", "http://example.com/resources/bazbaz/baz/more_stuff", DISALLOWED], 747 | [robots_txt_923994, "quxbot", "http://example.com/resources/bazbaz/baz/more_stuff", DISALLOWED], 748 | [robots_txt_923994, "quxbot", "http://example.com/users/big_foo/some_stuff/new", DISALLOWED], 749 | [robots_txt_923994, "quxbot", "http://example.com/addfightyos", DISALLOWED], 750 | [robots_txt_923994, "foobot", "http://example.com/addfight/new", ALLOWED], 751 | [robots_txt_923994, "quxbot", "http://example.com/addfight/new", ALLOWED], 752 | [robots_txt_923994, "quxbot", "http://example.com/addfightnope?dest=ULLI", DISALLOWED], 753 | [robots_txt_923994, "quxbot", "http://example.com/cz", ALLOWED], 754 | ) 755 | 756 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_923994) 757 | def test_google_stress_923994(robots_txt, agent, path, allowed, can_fetch): 758 | assert can_fetch(robots_txt, agent, path) is allowed 759 | 760 | 761 | robots_txt_797409 = """ 762 | User-agent: quxbot 763 | Disallow: / 764 | User-agent: * 765 | Disallow: 766 | Sitemap: https://example.com/sitemap.xml 767 | """ 768 | 769 | data_797409 = ( 770 | [robots_txt_797409, "foobot", "http://example.com/foo/bar", ALLOWED], 771 | [robots_txt_797409, "foobot", "http://example.com/", ALLOWED], 772 | [robots_txt_797409, "foo_bot", "http://example.com/log?id=132", ALLOWED], 773 | [robots_txt_797409, "quxbot", "http://example.com/", DISALLOWED], 774 | [robots_txt_797409, "quxbot", "http://example.com/baz/baz", DISALLOWED], 775 | [robots_txt_797409, "quxbot", "http://example.com/index.htm", DISALLOWED], 776 | ) 777 | 778 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_797409) 779 | def test_google_stress_797409(robots_txt, agent, path, allowed, can_fetch): 780 | assert can_fetch(robots_txt, agent, path) is allowed 781 | 782 | 783 | robots_txt_715135 = """ 784 | User-agent: admin 785 | Disallow: 786 | 787 | User-agent: * 788 | Disallow: /buzz 789 | Allow: / 790 | 791 | Sitemap: http://example.com/sitemap.xml 792 | 793 | """ 794 | 795 | data_715135 = ( 796 | [robots_txt_715135, "foobot", "http://example.com/buzz/settings", DISALLOWED], 797 | [robots_txt_715135, "foobot", "http://example.com/buzz-lite", DISALLOWED], 798 | [robots_txt_715135, "barbot", "http://example.com/qux/bar", ALLOWED], 799 | [robots_txt_715135, "quxbot", "http://example.com/buzz", DISALLOWED], 800 | [robots_txt_715135, "bazbot", "http://example.com/prod/buzz", ALLOWED], 801 | [robots_txt_715135, "barbot", "http://example.com/anotherbuzz/x", ALLOWED], 802 | [robots_txt_715135, "foobot", "http://example.com/rebuzz/x", ALLOWED], 803 | [robots_txt_715135, "foobot", "http://example.com/buzz/buzz/buzz", DISALLOWED], 804 | [robots_txt_715135, "foo-bot", "http://example.com/searc/buzz", ALLOWED], 805 | [robots_txt_715135, "bar-bot", "http://example.com/buzz/searc", DISALLOWED], 806 | [robots_txt_715135, "admin", "http://example.com/buzz/ses", ALLOWED], 807 | [robots_txt_715135, "admin", "http://example.com/foo/bar", ALLOWED], 808 | [robots_txt_715135, "admin", "http://example.com/buzz", ALLOWED], 809 | ) 810 | 811 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_715135) 812 | def test_google_stress_715135(robots_txt, agent, path, allowed, can_fetch): 813 | assert can_fetch(robots_txt, agent, path) is allowed 814 | 815 | 816 | robots_txt_478151 = """ 817 | 818 | 819 | User-agent: 
Whoosh-Qux 820 | Allow: / 821 | 822 | User-agent: Baz-Qux 823 | Allow: / 824 | 825 | User-agent: barbot 826 | Allow: / 827 | Disallow: /braa 828 | 829 | User-agent: BeepBot 830 | Disallow: /braa 831 | 832 | User-agent: Sample-web-crawler 833 | Disallow: /braa 834 | 835 | User-agent: * 836 | Disallow: / 837 | 838 | User-agent: * 839 | Disallow: /braa 840 | 841 | Sitemap: /sitemap.xml 842 | """ 843 | 844 | data_478151 = ( 845 | [robots_txt_478151, "Whoosh-Qux", "http://example.com/robots.txt", ALLOWED], 846 | [robots_txt_478151, "Baz-Qux", "http://example.com/foo/bar", ALLOWED], 847 | [robots_txt_478151, "BeepBot", "http://example.com/braallaboration/index.htm", DISALLOWED], 848 | [robots_txt_478151, "BeepBot", "http://example.com/foo/bar", ALLOWED], 849 | [robots_txt_478151, "BeepBot", "http://example.com/", ALLOWED], 850 | [robots_txt_478151, "BeepBot", "http://example.com/braa/balt", DISALLOWED], 851 | [robots_txt_478151, "foobot", "http://example.com/index.htm", DISALLOWED], 852 | [robots_txt_478151, "foo_bot", "http://example.com/braabalt", DISALLOWED], 853 | [robots_txt_478151, "foo-bot", "http://example.com/", DISALLOWED], 854 | ) 855 | 856 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_478151) 857 | def test_google_stress_478151(robots_txt, agent, path, allowed, can_fetch): 858 | assert can_fetch(robots_txt, agent, path) is allowed 859 | 860 | 861 | robots_txt_369883 = """ 862 | User-agent: * 863 |
864 | Allow: / 865 |
866 | User-agent: BarBot 867 |
868 | Disallow: / 869 |
870 | User-agent: AB42bot 871 |
872 | Disallow: / 873 |
874 | sitemap: http://example.com/sitemap.xml 875 | """ 876 | 877 | data_369883 = ( 878 | [robots_txt_369883, "foobot", "http://example.com/", ALLOWED], 879 | [robots_txt_369883, "foo-bot", "http://example.com/foo/bar", ALLOWED], 880 | [robots_txt_369883, "foo_bot", "http://example.com/robots.txt", ALLOWED], 881 | [robots_txt_369883, "BarBot", "http://example.com/", DISALLOWED], 882 | [robots_txt_369883, "BarBot", "http://example.com/foo/bar/baz", DISALLOWED], 883 | [robots_txt_369883, "BarBot", "http://example.com/robots.txt", DISALLOWED], 884 | [robots_txt_369883, "AB42bot", "http://example.com/foo/bar", ALLOWED], 885 | [robots_txt_369883, "AB42bot", "http://example.com/", ALLOWED], 886 | [robots_txt_369883, "AB", "http://example.com/", DISALLOWED], 887 | [robots_txt_369883, "AB", "http://example.com/robots.txt", DISALLOWED], 888 | ) 889 | 890 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_369883) 891 | def test_google_stress_369883(robots_txt, agent, path, allowed, can_fetch): 892 | assert can_fetch(robots_txt, agent, path) is allowed 893 | 894 | 895 | robots_txt_434582 = """ 896 | # 897 | # robots.txt 898 | # 899 | # This is robots.txt 900 | # and it saves server resources 901 | # some more comment lines 902 | # and an empty one 903 | # 904 | # Don't forget to put robots.txt in root of your host 905 | # Used: http://example.com/robots.txt 906 | # Ignored: http://example.com/site/robots.txt 907 | # 908 | # For more information about the robots.txt standard, see: 909 | # http://www.robotstxt.org/robotstxt.html 910 | 911 | User-agent: * 912 | Crawl-delay: 15 913 | # Foo 914 | Allow: /stuff/*.css$ 915 | Allow: /stuff/*.css? 916 | Allow: /stuff/*.js$ 917 | Allow: /stuff/*.js? 918 | Allow: /stuff/*.gif 919 | Allow: /stuff/*.jpg 920 | Allow: /stuff/*.jpeg 921 | Allow: /stuff/*.png 922 | Allow: /things/*.css$ 923 | Allow: /things/*.css? 924 | Allow: /things/*.js$ 925 | Allow: /things/*.js? 926 | Allow: /things/*.gif 927 | Allow: /things/*.jpg 928 | Allow: /things/*.jpeg 929 | Allow: /things/*.png 930 | Allow: /data/*.css$ 931 | Allow: /data/*.css? 932 | Allow: /data/*.js$ 933 | Allow: /data/*.js? 934 | Allow: /data/*.gif 935 | Allow: /data/*.jpg 936 | Allow: /data/*.jpeg 937 | Allow: /data/*.png 938 | Allow: /more_data/*.css$ 939 | Allow: /more_data/*.css? 940 | Allow: /more_data/*.js$ 941 | Allow: /more_data/*.js? 
942 | Allow: /more_data/*.gif 943 | Allow: /more_data/*.jpg 944 | Allow: /more_data/*.jpeg 945 | Allow: /more_data/*.png 946 | # Bar 947 | Disallow: /something/ 948 | Disallow: /stuff/ 949 | Disallow: /things/ 950 | Disallow: /data/ 951 | Disallow: /scripts/ 952 | Disallow: /more_data/ 953 | # Baz 954 | Disallow: /SOME_TEXT.txt 955 | Disallow: /some_script.php 956 | Disallow: /INSTALL.foo.txt 957 | Disallow: /INSTALL.bar.txt 958 | Disallow: /INSTALL.baz.txt 959 | Disallow: /get.php 960 | Disallow: /GET.txt 961 | Disallow: /LICENSE.txt 962 | Disallow: /HELPERS.txt 963 | Disallow: /update.php 964 | Disallow: /UPGRADE.txt 965 | Disallow: /what.php 966 | # Some more stuff to disallow 967 | Disallow: /?q=main/ 968 | Disallow: /?q=comment/reply/ 969 | Disallow: /?q=filter/ads/ 970 | Disallow: /?q=data/add/ 971 | Disallow: /?q=find/ 972 | Disallow: /?q=baz/password/ 973 | Disallow: /?q=baz/register/ 974 | Disallow: /?q=baz/login/ 975 | Disallow: /?q=baz/logout/ 976 | """ 977 | 978 | data_434582 = ( 979 | [robots_txt_434582, "foobot", "https://www.example.com/", ALLOWED], 980 | [robots_txt_434582, "foobot", "https://www.example.com/help.html", ALLOWED], 981 | [robots_txt_434582, "foobot", "https://www.example.com/some.css", ALLOWED], 982 | [robots_txt_434582, "foobot", "https://www.example.com/foo/some.css", ALLOWED], 983 | [robots_txt_434582, "foobot", "https://www.example.com/stuff/some.css", ALLOWED], 984 | [robots_txt_434582, "foobot", "https://www.example.com/stuff/some.html", DISALLOWED], 985 | [robots_txt_434582, "foobot", "https://www.example.com/stuff/some.jpeg", ALLOWED], 986 | [robots_txt_434582, "foobot", "https://www.example.com/things/some.css?user=main", ALLOWED], 987 | [robots_txt_434582, "foobot", "https://www.example.com/things/some.jpeg?user=main", ALLOWED], 988 | [robots_txt_434582, "foobot", "https://www.example.com/something/foo.cpp", DISALLOWED], 989 | [robots_txt_434582, "foobot", "https://www.example.com/more_data/dark", DISALLOWED], 990 | [robots_txt_434582, "foobot", "https://www.example.com/some_script.php", DISALLOWED], 991 | [robots_txt_434582, "foobot", "https://www.example.com/upgrade.txt", ALLOWED], 992 | [robots_txt_434582, "foobot", "https://www.example.com/UPGRADE.txt", DISALLOWED], 993 | [robots_txt_434582, "foobot", "https://www.example.com/data/main", DISALLOWED], 994 | [robots_txt_434582, "foobot", "https://www.example.com/?q=baz/", ALLOWED], 995 | [robots_txt_434582, "foobot", "https://www.example.com/?q=baz/login", ALLOWED], 996 | [robots_txt_434582, "foobot", "https://www.example.com/?q=baz/login/", DISALLOWED], 997 | [robots_txt_434582, "foobot", "https://www.example.com/?q=data/discard/", ALLOWED], 998 | [robots_txt_434582, "foobot", "https://www.example.com/?q=data/add/", DISALLOWED], 999 | ) 1000 | 1001 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', data_434582) 1002 | def test_google_stress_434582(robots_txt, agent, path, allowed, can_fetch): 1003 | assert can_fetch(robots_txt, agent, path) is allowed 1004 | 1005 | -------------------------------------------------------------------------------- /tests/test_network.py: -------------------------------------------------------------------------------- 1 | """ 2 | NetworkTestCase from: 3 | https://github.com/python/cpython/blob/a796d8ef9dd1af65f7e4d7a857b56f35b7cb6e78/Lib/test/test_robotparser.py 4 | converted to PyTest 5 | """ 6 | 7 | import pytest 8 | import robots 9 | from .core import * 10 | 11 | BASE_URL = 'http://www.pythontest.net' 12 | 13 | 14 | @pytest.fixture(scope='module') 15 | 
def parser(): 16 | p = robots.RobotsParser.from_uri(f'{BASE_URL}/elsewhere/robots.txt') 17 | return p 18 | 19 | 20 | def test_basic_disallow_all(parser): 21 | assert not parser.disallow_all 22 | 23 | 24 | def test_basic_allow_all(parser): 25 | assert not parser.allow_all 26 | 27 | 28 | can_fetch_data = ( 29 | ['*', f'{BASE_URL}/elsewhere', ALLOWED], 30 | ['Nutch', f'{BASE_URL}/', DISALLOWED], 31 | ['Nutch', f'{BASE_URL}/brian', DISALLOWED], 32 | ['Nutch', f'{BASE_URL}/brian/', ALLOWED], 33 | ['Nutch', f'{BASE_URL}/webstats', DISALLOWED], 34 | ['Nutch', f'{BASE_URL}/webstats/', DISALLOWED], 35 | ['*', f'{BASE_URL}/webstats', ALLOWED], 36 | ['*', f'{BASE_URL}/webstats/', DISALLOWED], 37 | ['*', f'{BASE_URL}/', ALLOWED], 38 | ) 39 | 40 | 41 | @pytest.mark.parametrize('agent,path,allowed', can_fetch_data) 42 | def test_can_fetch(agent, path, allowed, parser): 43 | assert parser.can_fetch(agent, path) is allowed 44 | 45 | 46 | def test_404(): 47 | p = robots.RobotsParser.from_uri('https://robotspy.org/non_existing_robots.txt') 48 | assert p.allow_all # no robots file => allow access to all paths 49 | assert p.can_fetch('FooBot', '/admin') 50 | 51 | 52 | def test_utf16(): 53 | p = robots.RobotsParser.from_uri('https://robotspy.org/tests/robots_utf16.txt') 54 | assert p.allow_all # robots file with unexpected encoding (must be UTF-8) => allow access to all paths 55 | assert p.can_fetch('FooBot', '/admin') 56 | 57 | def test_short_timeout(): 58 | p = robots.RobotsParser.from_uri("https://robotspy.org/robots.txt", 0) 59 | assert p.errors 60 | assert p.disallow_all 61 | assert not p.can_fetch('FooBot', '/admin') 62 | 63 | def test_error_timeout(): 64 | p = robots.RobotsParser.from_uri("https://robotspy.org:555/robots.txt", 1) 65 | 66 | # The duration may be greater than the timeout because the urllib.request.urlopen timeout does not equate to a total timeout 67 | assert p.errors 68 | assert p.disallow_all 69 | assert not p.can_fetch('FooBot', '/admin') 70 | -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for robots.RobotsParser 3 | """ 4 | 5 | import pytest 6 | import robots 7 | 8 | url_data = ( 9 | ['https://example.com/index', 'example.com', '/index'], 10 | ['https://example.com/', 'example.com', '/'], 11 | ['https://example.com', 'example.com', '/'], 12 | ['http://example.com//%7Ejoe/index.html', 'example.com', '/~joe/index.html'] 13 | ) 14 | 15 | 16 | @pytest.mark.parametrize('url,host,path', url_data) 17 | def test_normalize_url(url, host, path): 18 | h, p = robots.RobotsParser.normalize_url(url) 19 | assert (h, p) == (host, path) 20 | 21 | 22 | dedup_data = ( 23 | ['///path///index.html', '/path/index.html'], 24 | ['/path/index.html', '/path/index.html'], 25 | ['//', '/'], 26 | ['/', '/'], 27 | ['/foo/bar?qux=taz&baz=http://foo.bar?tar&par', '/foo/bar?qux=taz&baz=http://foo.bar?tar&par'], 28 | ['///foo//bar?qux=taz&baz=http://foo.bar?tar&par', '/foo/bar?qux=taz&baz=http://foo.bar?tar&par'], 29 | ['///foo//bar?qux=taz&baz=https://foo.bar?tar&par', '/foo/bar?qux=taz&baz=https://foo.bar?tar&par'] 30 | ) 31 | 32 | 33 | @pytest.mark.parametrize('path,dedup', dedup_data) 34 | def test_dedup_slash(path, dedup): 35 | assert robots.RobotsParser.dedup_slash(path) == dedup 36 | 37 | 38 | path_pattern_data = ( 39 | ['/path/index.html', '/path/*.html', True], 40 | ['/path/index.html', '/path/*.html$', True], 41 | ['/path/index.html?test=1',
'/path/*.html$', False], 42 | ['/path/index.html?test=1', '/path*', True], 43 | ['/path/index.html?test=1', '/p*/i*', True], 44 | ['/path', '/p*/i*', False], 45 | ) 46 | 47 | 48 | @pytest.mark.parametrize('path,pattern,expected', path_pattern_data) 49 | def test_startswith_pattern(path, pattern, expected): 50 | assert robots.RobotsParser.startswith_pattern(path, pattern) is expected 51 | -------------------------------------------------------------------------------- /tests/test_robotparser.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import threading 4 | import unittest 5 | import robots 6 | from http.server import BaseHTTPRequestHandler, HTTPServer 7 | 8 | HOST = 'localhost' 9 | 10 | 11 | class BaseRobotTest(unittest.TestCase): 12 | robots_txt = '' 13 | agent = 'test_robotparser' 14 | good = [] 15 | bad = [] 16 | site_maps = None 17 | 18 | def setUp(self): 19 | lines = io.StringIO(self.robots_txt).readlines() 20 | self.parser = robots.RobotFileParser() 21 | self.parser.parse(lines) 22 | 23 | def get_agent_and_url(self, url): 24 | if isinstance(url, tuple): 25 | agent, url = url 26 | return agent, url 27 | return self.agent, url 28 | 29 | def test_good_urls(self): 30 | for url in self.good: 31 | agent, url = self.get_agent_and_url(url) 32 | with self.subTest(url=url, agent=agent): 33 | self.assertTrue(self.parser.can_fetch(agent, url)) 34 | 35 | def test_bad_urls(self): 36 | for url in self.bad: 37 | agent, url = self.get_agent_and_url(url) 38 | with self.subTest(url=url, agent=agent): 39 | self.assertFalse(self.parser.can_fetch(agent, url)) 40 | 41 | def test_site_maps(self): 42 | self.assertEqual(self.parser.site_maps(), self.site_maps) 43 | 44 | 45 | class UserAgentWildcardTest(BaseRobotTest): 46 | robots_txt = """\ 47 | User-agent: * 48 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 49 | Disallow: /tmp/ # these will soon disappear 50 | Disallow: /foo.html 51 | """ 52 | good = ['/', '/test.html'] 53 | bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html'] 54 | 55 | 56 | class CrawlDelayAndCustomAgentTest(BaseRobotTest): 57 | robots_txt = """\ 58 | # robots.txt for http://www.example.com/ 59 | 60 | User-agent: * 61 | Crawl-delay: 1 62 | Request-rate: 3/15 63 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 64 | 65 | # Cybermapper knows where to go. 
66 | User-agent: cybermapper 67 | Disallow: 68 | """ 69 | good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')] 70 | bad = ['/cyberworld/map/index.html'] 71 | 72 | 73 | class SitemapTest(BaseRobotTest): 74 | robots_txt = """\ 75 | # robots.txt for http://www.example.com/ 76 | 77 | User-agent: * 78 | Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml 79 | Sitemap: http://www.google.com/hostednews/sitemap_index.xml 80 | Request-rate: 3/15 81 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 82 | 83 | """ 84 | good = ['/', '/test.html'] 85 | bad = ['/cyberworld/map/index.html'] 86 | site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml', 87 | 'http://www.google.com/hostednews/sitemap_index.xml'] 88 | 89 | 90 | class RejectAllRobotsTest(BaseRobotTest): 91 | robots_txt = """\ 92 | # go away 93 | User-agent: * 94 | Disallow: / 95 | """ 96 | good = [] 97 | bad = ['/cyberworld/map/index.html', '/', '/tmp/'] 98 | 99 | 100 | class BaseRequestRateTest(BaseRobotTest): 101 | request_rate = None 102 | crawl_delay = None 103 | 104 | def test_request_rate(self): 105 | parser = self.parser 106 | for url in self.good + self.bad: 107 | agent, url = self.get_agent_and_url(url) 108 | with self.subTest(url=url, agent=agent): 109 | self.assertEqual(parser.crawl_delay(agent), self.crawl_delay) 110 | 111 | parsed_request_rate = parser.request_rate(agent) 112 | self.assertEqual(parsed_request_rate, self.request_rate) 113 | if self.request_rate is not None: 114 | self.assertIsInstance( 115 | parsed_request_rate, 116 | robots.RequestRate 117 | ) 118 | self.assertEqual( 119 | parsed_request_rate.requests, 120 | self.request_rate.requests 121 | ) 122 | self.assertEqual( 123 | parsed_request_rate.seconds, 124 | self.request_rate.seconds 125 | ) 126 | 127 | 128 | class EmptyFileTest(BaseRequestRateTest): 129 | robots_txt = '' 130 | good = ['/foo'] 131 | 132 | 133 | class CrawlDelayAndRequestRateTest(BaseRequestRateTest): 134 | robots_txt = """\ 135 | User-agent: figtree 136 | Crawl-delay: 3 137 | Request-rate: 9/30 138 | Disallow: /tmp 139 | Disallow: /a%3cd.html 140 | Disallow: /a%2fb.html 141 | Disallow: /%7ejoe/index.html 142 | """ 143 | agent = 'figtree' 144 | # request_rate = robots.RequestRate(9, 30) 145 | request_rate = None # BGD: crawl-delay ignored 146 | # crawl_delay = 3 147 | crawl_delay = None # BGD: crawl-delay ignored 148 | # good = [('figtree', '/foo.html')] 149 | good = ['/foo.html'] 150 | bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', 151 | '/a%2fb.html', '/~joe/index.html'] 152 | 153 | 154 | # Different behavior than urllib.robotparser that applies the same rule to 'figtree' and 155 | # 'FigTree Robot libwww-perl/5.04' 156 | class DifferentAgentTest(CrawlDelayAndRequestRateTest): 157 | agent = 'FigTree Robot libwww-perl/5.04' 158 | good = ['/foo.html', '/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', 159 | '/a%2fb.html', '/~joe/index.html'] 160 | bad = [] 161 | 162 | 163 | class InvalidRequestRateTest(BaseRobotTest): 164 | robots_txt = """\ 165 | User-agent: * 166 | Disallow: /tmp/ 167 | Disallow: /a%3Cd.html 168 | Disallow: /a/b.html 169 | Disallow: /%7ejoe/index.html 170 | Crawl-delay: 3 171 | Request-rate: 9/banana 172 | """ 173 | good = ['/tmp'] 174 | bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html', 175 | '/%7Ejoe/index.html'] 176 | crawl_delay = 3 177 | 178 | 179 | class InvalidCrawlDelayTest(BaseRobotTest): 180 | # From bug report #523041 181 | robots_txt = """\ 
182 | User-Agent: * 183 | Disallow: /. 184 | Crawl-delay: pears 185 | """ 186 | good = ['/foo.html'] 187 | # bug report says "/" should be denied, but that is not in the RFC 188 | bad = [] 189 | 190 | 191 | class AnotherInvalidRequestRateTest(BaseRobotTest): 192 | # also test that Allow and Diasallow works well with each other 193 | robots_txt = """\ 194 | User-agent: Googlebot 195 | Allow: /folder1/myfile.html 196 | Disallow: /folder1/ 197 | Request-rate: whale/banana 198 | """ 199 | agent = 'Googlebot' 200 | good = ['/folder1/myfile.html'] 201 | bad = ['/folder1/anotherfile.html'] 202 | 203 | 204 | class UserAgentOrderingTest(BaseRobotTest): 205 | # the order of User-agent should be correct. note 206 | # that this file is incorrect because "Googlebot" is a 207 | # substring of "Googlebot-Mobile" 208 | robots_txt = """\ 209 | User-agent: Googlebot 210 | Disallow: / 211 | 212 | User-agent: Googlebot-Mobile 213 | Allow: / 214 | """ 215 | agent = 'Googlebot' 216 | bad = ['/something.jpg'] 217 | 218 | 219 | # Different behavior than urllib.robotparser that applies the same rule for googlebot and 220 | # googlebot-mobile 221 | class UserAgentGoogleMobileTest(UserAgentOrderingTest): 222 | agent = 'Googlebot-Mobile' 223 | good = ['/something.jpg'] 224 | bad = [] 225 | 226 | 227 | class GoogleURLOrderingTest(BaseRobotTest): 228 | # Google also got the order wrong. You need 229 | # to specify the URLs from more specific to more general 230 | robots_txt = """\ 231 | User-agent: Googlebot 232 | Allow: /folder1/myfile.html 233 | Disallow: /folder1/ 234 | """ 235 | agent = 'googlebot' 236 | good = ['/folder1/myfile.html'] 237 | bad = ['/folder1/anotherfile.html'] 238 | 239 | 240 | class DisallowQueryStringTest(BaseRobotTest): 241 | # see issue #6325 for details 242 | robots_txt = """\ 243 | User-agent: * 244 | Disallow: /some/path?name=value 245 | """ 246 | good = ['/some/path'] 247 | bad = ['/some/path?name=value'] 248 | 249 | 250 | class UseFirstUserAgentWildcardTest(BaseRobotTest): 251 | # obey first * entry (#4108) 252 | robots_txt = """\ 253 | User-agent: * 254 | Disallow: /some/path 255 | 256 | User-agent: * 257 | Disallow: /another/path 258 | """ 259 | 260 | # urllib.robotparser does not 261 | # combine the rules for the same useragent 262 | 263 | # good = ['/another/path'] 264 | bad = ['/some/path', '/another/path'] 265 | 266 | 267 | class EmptyQueryStringTest(BaseRobotTest): 268 | # normalize the URL first (#17403) 269 | robots_txt = """\ 270 | User-agent: * 271 | Allow: /some/path? 272 | Disallow: /another/path? 273 | """ 274 | good = ['/some/path?'] 275 | bad = ['/another/path?'] 276 | 277 | 278 | class DefaultEntryTest(BaseRequestRateTest): 279 | robots_txt = """\ 280 | User-agent: * 281 | Crawl-delay: 1 282 | Request-rate: 3/15 283 | Disallow: /cyberworld/map/ 284 | """ 285 | # request_rate = robots.RequestRate(3, 15) 286 | request_rate = None # BGD: crawl-delay ignored 287 | # crawl_delay = 1 288 | crawl_delay = None # BGD: crawl-delay ignored 289 | good = ['/', '/test.html'] 290 | bad = ['/cyberworld/map/index.html'] 291 | 292 | 293 | class StringFormattingTest(BaseRobotTest): 294 | robots_txt = """ 295 | User-agent: * 296 | Crawl-delay: 1 297 | Request-rate: 3/15 298 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 299 | 300 | # Cybermapper knows where to go. 
301 | User-agent: cybermapper 302 | Disallow: /some/path 303 | """ 304 | 305 | expected_output = """User-agent: * 306 | Disallow: /cyberworld/map/ 307 | 308 | User-agent: cybermapper 309 | Disallow: /some/path 310 | """ 311 | 312 | # Intentionally, robotspy does not handle crawl-delay or request rate, hence those are not 313 | # printed out 314 | def test_string_formatting(self): 315 | self.assertEqual(str(self.parser), self.expected_output) 316 | 317 | 318 | class RobotHandler(BaseHTTPRequestHandler): 319 | 320 | def do_GET(self): 321 | self.send_error(403, "Forbidden access") 322 | 323 | def log_message(self, format, *args): 324 | pass 325 | 326 | 327 | class PasswordProtectedSiteTestCase(unittest.TestCase): 328 | 329 | def setUp(self): 330 | self.server = HTTPServer((HOST, 0), RobotHandler) 331 | 332 | self.t = threading.Thread( 333 | name='HTTPServer serving', 334 | target=self.server.serve_forever, 335 | # Short poll interval to make the test finish quickly. 336 | # Time between requests is short enough that we won't wake 337 | # up spuriously too many times. 338 | kwargs={'poll_interval': 0.01}) 339 | self.t.daemon = True # In case this function raises. 340 | self.t.start() 341 | 342 | def tearDown(self): 343 | self.server.shutdown() 344 | self.t.join() 345 | self.server.server_close() 346 | 347 | def testPasswordProtectedSite(self): 348 | addr = self.server.server_address 349 | url = 'http://' + HOST + ':' + str(addr[1]) 350 | robots_url = url + "/robots.txt" 351 | parser = robots.RobotFileParser() 352 | parser.set_url(url) 353 | parser.read() 354 | self.assertFalse(parser.can_fetch("*", robots_url)) 355 | 356 | 357 | class NetworkTestCase(unittest.TestCase): 358 | 359 | base_url = 'http://www.pythontest.net/' 360 | robots_txt = '{}elsewhere/robots.txt'.format(base_url) 361 | 362 | @classmethod 363 | def setUpClass(cls): 364 | cls.parser = robots.RobotFileParser(cls.robots_txt) 365 | cls.parser.read() 366 | 367 | def url(self, path): 368 | return '{}{}{}'.format( 369 | self.base_url, path, '/' if not os.path.splitext(path)[1] else '' 370 | ) 371 | 372 | def test_basic(self): 373 | self.assertFalse(self.parser.disallow_all) 374 | self.assertFalse(self.parser.allow_all) 375 | self.assertGreater(self.parser.mtime(), 0) 376 | self.assertFalse(self.parser.crawl_delay('*')) 377 | self.assertFalse(self.parser.request_rate('*')) 378 | 379 | def test_can_fetch(self): 380 | self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere'))) 381 | self.assertFalse(self.parser.can_fetch('Nutch', self.base_url)) 382 | self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian'))) # Different from urllib.robotparser 383 | self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats'))) 384 | self.assertFalse(self.parser.can_fetch('*', self.url('webstats'))) 385 | self.assertTrue(self.parser.can_fetch('*', self.base_url)) 386 | 387 | def test_read_404(self): 388 | parser = robots.RobotFileParser(self.url('i-robot.txt')) 389 | parser.read() 390 | self.assertTrue(parser.allow_all) 391 | self.assertFalse(parser.disallow_all) 392 | self.assertEqual(parser.mtime(), 0) 393 | self.assertIsNone(parser.crawl_delay('*')) 394 | self.assertIsNone(parser.request_rate('*')) 395 | 396 | 397 | if __name__ == '__main__': 398 | unittest.main() 399 | -------------------------------------------------------------------------------- /tests/test_robots.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mostly tests from: 3 | 
https://github.com/python/cpython/blob/a796d8ef9dd1af65f7e4d7a857b56f35b7cb6e78/Lib/test/test_robotparser.py 4 | converted to PyTest and intended to validate the compatibility with the Python standard library 5 | package: urllib.robotparser 6 | 7 | For each test a data row contains the following fields: 8 | robotstxt, useragent, url, allowed/disallowed 9 | 10 | allow/disallowed is expressed as a boolean, True/False 11 | """ 12 | 13 | import pytest 14 | import robots 15 | from .core import * 16 | 17 | 18 | # Same robots.txt as http://www.pythontest.net/elsewhere/robots.txt 19 | network = """ 20 | # NetworkTestCase 21 | 22 | User-agent: Nutch 23 | Disallow: / 24 | Allow: /brian/ 25 | 26 | User-agent: * 27 | Disallow: /webstats/ 28 | """ 29 | 30 | network_data = ( 31 | [network, '*', '/elsewhere/', ALLOWED], 32 | [network, 'Nutch', '/', DISALLOWED], 33 | [network, 'Nutch', '/brian', DISALLOWED], 34 | [network, 'Nutch', '/brian/', ALLOWED], 35 | [network, 'Nutch', '/webstats/', DISALLOWED], 36 | [network, '*', '/webstats/', DISALLOWED], 37 | [network, '*', '/', ALLOWED], 38 | ) 39 | 40 | 41 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', network_data) 42 | def test_py01_network(robots_txt, agent, path, allowed, can_fetch): 43 | assert can_fetch(robots_txt, agent, path) is allowed 44 | 45 | 46 | useragent_wild_card = """ 47 | # UserAgentWildcardTest 48 | 49 | User-agent: * 50 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 51 | Disallow: /tmp/ # these will soon disappear 52 | Disallow: /foo.html 53 | """ 54 | 55 | useragent_wild_card_data = ( 56 | [useragent_wild_card, DEFAULT_AGENT, '/', ALLOWED], 57 | [useragent_wild_card, DEFAULT_AGENT, '/test.html', ALLOWED], 58 | [useragent_wild_card, DEFAULT_AGENT, '/cyberworld/map/index.html', DISALLOWED], 59 | [useragent_wild_card, DEFAULT_AGENT, '/tmp/xxx', DISALLOWED], 60 | [useragent_wild_card, DEFAULT_AGENT, '/foo.html', DISALLOWED], 61 | ) 62 | 63 | 64 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', useragent_wild_card_data) 65 | def test_useragent_wild_card(robots_txt, agent, path, allowed, can_fetch): 66 | assert can_fetch(robots_txt, agent, path) is allowed 67 | 68 | 69 | # This test does not take into account crawl-delay. See crawl_delay_request_rate for that. 70 | crawl_delay_custom_agent = """ 71 | # CrawlDelayAndCustomAgentTest 72 | 73 | User-agent: * 74 | Crawl-delay: 1 75 | Request-rate: 3/15 76 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 77 | 78 | # Cybermapper knows where to go. 
79 | User-agent: cybermapper 80 | Disallow: 81 | """ 82 | 83 | crawl_delay_custom_agent_data = ( 84 | [crawl_delay_custom_agent, DEFAULT_AGENT, '/', ALLOWED], 85 | [crawl_delay_custom_agent, DEFAULT_AGENT, '/test.html', ALLOWED], 86 | [crawl_delay_custom_agent, 'cybermapper', '/cyberworld/map/index.html', ALLOWED], 87 | [crawl_delay_custom_agent, DEFAULT_AGENT, '/cyberworld/map/index.html', DISALLOWED], 88 | ) 89 | 90 | 91 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', crawl_delay_custom_agent_data) 92 | def test_crawl_delay_custom_agent(robots_txt, agent, path, allowed, can_fetch): 93 | assert can_fetch(robots_txt, agent, path) is allowed 94 | 95 | 96 | sitemap = """ 97 | # SitemapTest 98 | 99 | User-agent: * 100 | Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml 101 | Sitemap: http://www.google.com/hostednews/sitemap_index.xml 102 | Request-rate: 3/15 103 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 104 | """ 105 | 106 | sitemap_data = ( 107 | [sitemap, DEFAULT_AGENT, '/', ALLOWED], 108 | [sitemap, DEFAULT_AGENT, '/test.html', ALLOWED], 109 | [sitemap, DEFAULT_AGENT, '/cyberworld/map/index.html', DISALLOWED], 110 | ) 111 | 112 | 113 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', sitemap_data) 114 | def test_sitemap(robots_txt, agent, path, allowed, can_fetch): 115 | assert can_fetch(robots_txt, agent, path) is allowed 116 | 117 | 118 | reject_all = """ 119 | # RejectAllRobotsTest 120 | 121 | User-agent: * 122 | Disallow: / 123 | """ 124 | 125 | reject_all_data = ( 126 | [reject_all, DEFAULT_AGENT, '/cyberworld/map/index.html', DISALLOWED], 127 | [reject_all, DEFAULT_AGENT, '/', DISALLOWED], 128 | [reject_all, DEFAULT_AGENT, '/tmp/', DISALLOWED], 129 | ) 130 | 131 | 132 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', reject_all_data) 133 | def test_reject_all(robots_txt, agent, path, allowed, can_fetch): 134 | assert can_fetch(robots_txt, agent, path) is allowed 135 | 136 | 137 | # TODO: implement handling request-rate and crawl-delay 138 | # Following tests take into account crawl-delay and request-rate 139 | 140 | empty_data = ( 141 | ['# Empty', DEFAULT_AGENT, '/foo', ALLOWED], 142 | ['# Empty', '', '', ALLOWED], # No user agent, no path provided 143 | ) 144 | 145 | 146 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', empty_data) 147 | def test_empty(robots_txt, agent, path, allowed, can_fetch): 148 | assert can_fetch(robots_txt, agent, path) is allowed 149 | 150 | 151 | crawl_delay_request_rate = """ 152 | # CrawlDelayAndRequestRate 153 | 154 | User-agent: figtree 155 | Crawl-delay: 3 156 | Request-rate: 9/30 157 | Disallow: /tmp 158 | Disallow: /a%3cd.html 159 | Disallow: /a%2fb.html 160 | Disallow: /%7ejoe/index.html 161 | """ 162 | 163 | crawl_delay_request_rate_data = ( 164 | [crawl_delay_request_rate, 'figtree', '/foo.html', ALLOWED], 165 | [crawl_delay_request_rate, 'figtree', '/tmp', DISALLOWED], 166 | [crawl_delay_request_rate, 'figtree', '/tmp/a.html', DISALLOWED], 167 | [crawl_delay_request_rate, 'figtree', '/a%3cd.html', DISALLOWED], 168 | [crawl_delay_request_rate, 'figtree', '/a%3Cd.html', DISALLOWED], 169 | [crawl_delay_request_rate, 'figtree', '/a%2fb.html', DISALLOWED], 170 | [crawl_delay_request_rate, 'figtree', '/~joe/index.html', DISALLOWED], 171 | ) 172 | 173 | 174 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', crawl_delay_request_rate_data) 175 | def test_crawl_delay_request_rate(robots_txt, agent, path, allowed, can_fetch): 176 | assert can_fetch(robots_txt, 
agent, path) is allowed 177 | 178 | 179 | crawl_delay_request_rate_diff_agent_data = ( 180 | [crawl_delay_request_rate, '/foo.html', ALLOWED], 181 | [crawl_delay_request_rate, '/tmp', ALLOWED], 182 | [crawl_delay_request_rate, '/tmp/a.html', ALLOWED], 183 | [crawl_delay_request_rate, '/a%3cd.html', ALLOWED], 184 | [crawl_delay_request_rate, '/a%3Cd.html', ALLOWED], 185 | [crawl_delay_request_rate, '/a%2fb.html', ALLOWED], 186 | [crawl_delay_request_rate, '/~joe/index.html', ALLOWED], 187 | ) 188 | 189 | 190 | # The behavior is different than urllib.robotparser that applies 'figtree' and 191 | # 'FigTree Robot libwww-perl/5.04' with the same rules. 192 | @pytest.mark.parametrize('robots_txt,path,allowed', crawl_delay_request_rate_diff_agent_data) 193 | def test_different_agent(robots_txt, path, allowed, can_fetch): 194 | agent = 'FigTree Robot libwww-perl/5.04' 195 | assert can_fetch(robots_txt, agent, path) is allowed 196 | 197 | 198 | invalid_request_rate = """ 199 | # InvalidRequestRate 200 | 201 | User-agent: * 202 | Disallow: /tmp/ 203 | Disallow: /a%3Cd.html 204 | Disallow: /a/b.html 205 | Disallow: /%7ejoe/index.html 206 | Crawl-delay: 3 207 | Request-rate: 9/banana 208 | """ 209 | 210 | invalid_request_rate_data = ( 211 | [invalid_request_rate, DEFAULT_AGENT, '/tmp', ALLOWED], 212 | [invalid_request_rate, DEFAULT_AGENT, '/tmp/', DISALLOWED], 213 | [invalid_request_rate, DEFAULT_AGENT, '/tmp/a.html', DISALLOWED], 214 | [invalid_request_rate, DEFAULT_AGENT, '/a%3cd.html', DISALLOWED], 215 | [invalid_request_rate, DEFAULT_AGENT, '/a%3Cd.html', DISALLOWED], 216 | [invalid_request_rate, DEFAULT_AGENT, '/a/b.html', DISALLOWED], 217 | [invalid_request_rate, DEFAULT_AGENT, '/%7Ejoe/index.html', DISALLOWED], 218 | ) 219 | 220 | 221 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', invalid_request_rate_data) 222 | def test_invalid_request_rate(robots_txt, agent, path, allowed, can_fetch): 223 | assert can_fetch(robots_txt, agent, path) is allowed 224 | 225 | 226 | invalid_crawl_delay = """ 227 | # InvalidCrawlDelay 228 | 229 | User-Agent: * 230 | Disallow: /. 231 | Crawl-delay: pears 232 | """ 233 | 234 | 235 | def test_invalid_crawl_delay(can_fetch): 236 | assert can_fetch(invalid_crawl_delay, DEFAULT_AGENT, '/foo.html') is ALLOWED 237 | 238 | 239 | other_invalid_request_rate = """ 240 | # OtherInvalidCrawlDelay 241 | 242 | User-agent: Googlebot 243 | Allow: /folder1/myfile.html 244 | Disallow: /folder1/ 245 | Request-rate: whale/banana 246 | """ 247 | 248 | other_invalid_request_rate_data = ( 249 | [other_invalid_request_rate, 'Googlebot', '/folder1/myfile.html', ALLOWED], 250 | [other_invalid_request_rate, 'Googlebot', '/folder1/anotherfile.html', DISALLOWED], 251 | ) 252 | 253 | 254 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', other_invalid_request_rate_data) 255 | def test_other_invalid_request_rate(robots_txt, agent, path, allowed, can_fetch): 256 | assert can_fetch(robots_txt, agent, path) is allowed 257 | 258 | 259 | useragent_ordering = """ 260 | # UserAgentOrdering 261 | 262 | User-agent: Googlebot 263 | Disallow: / 264 | 265 | User-agent: Googlebot-Mobile 266 | Allow: / 267 | """ 268 | 269 | 270 | def test_useragent_ordering(can_fetch): 271 | assert can_fetch(useragent_ordering, 'Googlebot', '/something.jpg') is DISALLOWED 272 | 273 | 274 | # Different behavior than urllib.robotparser that applies the same rule to googlebot and 275 | # googlebot-mobile. 
It ends up validating if the ua saved by the parser is in the ua that 276 | # we want to validate (if 'googlebot' in 'googlebot-mobile') and disallow for google-mobile 277 | # Google robots respects Googlebot-Mobile as a different ua and allow. Same for robotspy. 278 | def test_useragent_google_mobile(can_fetch): 279 | assert can_fetch(useragent_ordering, 'Googlebot-Mobile', '/something.jpg') is ALLOWED 280 | 281 | 282 | google_url_ordering = """ 283 | # GoogleURLOrdering 284 | 285 | User-agent: Googlebot 286 | Allow: /folder1/myfile.html 287 | Disallow: /folder1/ 288 | """ 289 | 290 | google_url_ordering_data = ( 291 | [google_url_ordering, 'googlebot', '/folder1/myfile.html', ALLOWED], 292 | [google_url_ordering, 'googlebot', '/folder1/anotherfile.html', DISALLOWED], 293 | ) 294 | 295 | 296 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', google_url_ordering_data) 297 | def test_google_url_ordering(robots_txt, agent, path, allowed, can_fetch): 298 | assert can_fetch(robots_txt, agent, path) is allowed 299 | 300 | 301 | disallow_query_string = """ 302 | # DisallowQueryString 303 | 304 | User-agent: * 305 | Disallow: /some/path?name=value 306 | """ 307 | 308 | disallow_query_string_data = [ 309 | [disallow_query_string, DEFAULT_AGENT, '/some/path', ALLOWED], 310 | [disallow_query_string, DEFAULT_AGENT, '/some/path?name=value', DISALLOWED], 311 | ] 312 | 313 | 314 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', disallow_query_string_data) 315 | def test_disallow_query_string(robots_txt, agent, path, allowed, can_fetch): 316 | assert can_fetch(robots_txt, agent, path) is allowed 317 | 318 | 319 | use_first_useragent_wildcard = """ 320 | # UseFirstUserAgentWildcard 321 | 322 | User-agent: * 323 | Disallow: /some/path 324 | 325 | User-agent: * 326 | Disallow: /another/path 327 | """ 328 | 329 | test_use_first_useragent_wildcard = ( 330 | [use_first_useragent_wildcard, DEFAULT_AGENT, '/another/path', DISALLOWED], 331 | [use_first_useragent_wildcard, DEFAULT_AGENT, '/some/path', DISALLOWED], 332 | ) 333 | 334 | 335 | # The logic in robotspy is to combine the entries with the same useragent, as per the specs: 336 | # https://tools.ietf.org/html/draft-koster-rep-00#section-2.2.1 337 | # TODO: consider renaming this test combine_rules or something similar 338 | # Mark it as a difference with urllib.robotparser in the Differences section in the README 339 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', test_use_first_useragent_wildcard) 340 | def test_use_first_useragent_wildcard(robots_txt, agent, path, allowed, can_fetch): 341 | assert can_fetch(robots_txt, agent, path) is allowed 342 | 343 | 344 | empty_query_string = """ 345 | # EmptyQueryString 346 | 347 | User-agent: * 348 | Allow: /some/path? 349 | Disallow: /another/path? 
350 | """ 351 | 352 | empty_query_string_data = ( 353 | [empty_query_string, DEFAULT_AGENT, '/some/path?', ALLOWED], 354 | [empty_query_string, DEFAULT_AGENT, '/another/path?', DISALLOWED], 355 | ) 356 | 357 | 358 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', empty_query_string_data) 359 | def test_empty_query_string(robots_txt, agent, path, allowed, can_fetch): 360 | assert can_fetch(robots_txt, agent, path) is allowed 361 | 362 | 363 | default_entry = """ 364 | # DefaultEntry 365 | 366 | User-agent: * 367 | Crawl-delay: 1 368 | Request-rate: 3/15 369 | Disallow: /cyberworld/map/ 370 | """ 371 | 372 | default_entry_data = ( 373 | [default_entry, DEFAULT_AGENT, '/', ALLOWED], 374 | [default_entry, DEFAULT_AGENT, '/test.html', ALLOWED], 375 | [default_entry, DEFAULT_AGENT, '/cyberworld/map/index.html', DISALLOWED], 376 | ) 377 | 378 | 379 | @pytest.mark.parametrize('robots_txt,agent,path,allowed', default_entry_data) 380 | def test_default_entry(robots_txt, agent, path, allowed, can_fetch): 381 | assert can_fetch(robots_txt, agent, path) is allowed 382 | 383 | 384 | robots_input = """ 385 | # StringFormatting 386 | 387 | User-agent: * 388 | Crawl-delay: 1 389 | Request-rate: 3/15 390 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 391 | 392 | # Cybermapper knows where to go. 393 | User-agent: cybermapper 394 | Disallow: /some/path 395 | """ 396 | 397 | expected_robots_output = """User-agent: * 398 | Disallow: /cyberworld/map/ 399 | 400 | User-agent: cybermapper 401 | Disallow: /some/path 402 | """ 403 | 404 | 405 | def test_string_formatting(): 406 | parser = robots.RobotsParser().from_string(robots_input) 407 | print(str(parser)) 408 | assert str(parser) == expected_robots_output 409 | 410 | 411 | robots_sitemap_input = """ 412 | # StringFormatting 413 | 414 | User-agent: * 415 | Crawl-delay: 1 416 | Request-rate: 3/15 417 | Disallow: /cyberworld/map/ # This is an infinite virtual URL space 418 | 419 | # Cybermapper knows where to go. 420 | User-agent: cybermapper 421 | Disallow: /some/path 422 | 423 | Sitemap: https://www.example.com/sitemap1.xml 424 | Sitemap: https://www.example.com/sitemap2.xml 425 | """ 426 | 427 | expected_robots_sitemap_output = """User-agent: * 428 | Disallow: /cyberworld/map/ 429 | 430 | User-agent: cybermapper 431 | Disallow: /some/path 432 | 433 | Sitemap: https://www.example.com/sitemap1.xml 434 | Sitemap: https://www.example.com/sitemap2.xml 435 | """ 436 | 437 | 438 | def test_string_formatting_sitemaps(): 439 | parser = robots.RobotsParser().from_string(robots_sitemap_input) 440 | print(str(parser)) 441 | assert str(parser) == expected_robots_sitemap_output 442 | --------------------------------------------------------------------------------