├── .bandit.yml ├── .flake8 ├── .git-blame-ignore-revs ├── .github └── workflows │ ├── main.yml │ └── publish.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── LICENSE.txt ├── README.rst ├── docs ├── Makefile ├── conf.py ├── headers.rst ├── index.rst ├── news.rst ├── requirements.txt ├── settings.rst └── stats.rst ├── pyproject.toml ├── requirements.txt ├── scrapy_zyte_smartproxy ├── __init__.py ├── middleware.py └── utils.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── requirements.txt └── test_all.py └── tox.ini /.bandit.yml: -------------------------------------------------------------------------------- 1 | skips: 2 | - B101 # assert_used, needed for mypy 3 | - B311 4 | - B320 5 | - B410 6 | exclude_dirs: ['tests'] 7 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # applying pre-commit hooks to the project 2 | 05665a6fb1717ef513d7a8ac87b8eb499a64cdc9 3 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: [ master ] 5 | pull_request: 6 | branches: [ master ] 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | include: 14 | - python-version: "2.7" 15 | env: 16 | TOXENV: min 17 | - python-version: "2.7" 18 | env: 19 | TOXENV: py 20 | - python-version: "3.9" 21 | env: 22 | TOXENV: py 23 | - python-version: "3.10" 24 | env: 25 | TOXENV: py 26 | - python-version: "3.10" 27 | env: 28 | TOXENV: mypy 29 | - python-version: "3.11" 30 | env: 31 | TOXENV: py 32 | - python-version: "3.12" 33 | env: 34 | TOXENV: py 35 | - python-version: "3.13" 36 | env: 37 | TOXENV: py 38 | - python-version: "3.13" 39 | env: 40 | TOXENV: pre-commit 41 | - python-version: "3.12" # Keep in sync with .readthedocs.yml 42 | env: 43 | TOXENV: docs 44 | - python-version: "3.13" 45 | env: 46 | TOXENV: twinecheck 47 | steps: 48 | - uses: actions/checkout@v4 49 | - name: Set up Python ${{ matrix.python-version }} 50 | uses: MatteoH2O1999/setup-python@v4 51 | with: 52 | python-version: ${{ matrix.python-version }} 53 | allow-build: info 54 | cache-build: true 55 | cache: pip 56 | - name: Install dependencies 57 | run: | 58 | sudo apt update -y && sudo apt install -y gcc-9 59 | python -m pip install --upgrade pip 60 | pip install tox codecov 61 | - name: Run tests 62 | env: ${{ matrix.env }} 63 | run: tox 64 | - name: Upload coverage.xml to codecov 65 | uses: codecov/codecov-action@v5 66 | with: 67 | token: ${{ secrets.CODECOV_TOKEN }} 68 | pre-commit: 69 | runs-on: ubuntu-latest 70 | steps: 71 | - uses: actions/checkout@v4 72 | - uses: pre-commit/action@v3.0.0 73 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | push: 4 | tags: 5 | - 'v[0-9]+.[0-9]+.[0-9]+' 6 | jobs: 7 | publish: 8 | runs-on: ubuntu-latest 9 | environment: 10 | name: pypi 11 | url: https://pypi.org/p/${{ github.event.repository.name }} 12 | permissions: 
13 | id-token: write 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: 3.13 19 | - run: | 20 | python -m pip install --upgrade build 21 | python -m build 22 | - name: Publish to PyPI 23 | uses: pypa/gh-action-pypi-publish@release/v1 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### VirtualEnv template 2 | # Virtualenv 3 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 4 | venv/ 5 | .Python 6 | [Bb]in 7 | [Ii]nclude 8 | [Ll]ib 9 | pyvenv.cfg 10 | ### IPythonNotebook template 11 | # Temporary data 12 | .ipynb_checkpoints/ 13 | ### Python template 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # Distribution / packaging 20 | env/ 21 | build/ 22 | .build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *,cover 56 | .pytest_cache 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # IDEA 71 | .idea/ 72 | 73 | # Pipenv 74 | Pipfile* 75 | 76 | # Eclipse 77 | .pydevproject 78 | .project 79 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile = black -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/bandit 3 | rev: 1.7.9 4 | hooks: 5 | - id: bandit 6 | args: [-r, -c, .bandit.yml] 7 | - repo: https://github.com/psf/black.git 8 | rev: 24.8.0 9 | hooks: 10 | - id: black 11 | - repo: https://github.com/PyCQA/flake8 12 | rev: 7.1.1 13 | hooks: 14 | - id: flake8 15 | - repo: https://github.com/pycqa/isort 16 | rev: 5.13.2 17 | hooks: 18 | - id: isort -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: all 3 | sphinx: 4 | configuration: docs/conf.py 5 | fail_on_warning: true 6 | 7 | build: 8 | os: ubuntu-20.04 9 | tools: 10 | # For available versions, see: 11 | # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python 12 | python: "3.12" # Keep in sync with .github/workflows/main.yml 13 | 14 | python: 15 | install: 16 | - requirements: docs/requirements.txt 17 | - path: . 18 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) Zyte Group Ltd 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of Zyte nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | scrapy-zyte-smartproxy 3 | ====================== 4 | 5 | .. image:: https://img.shields.io/pypi/v/scrapy-zyte-smartproxy.svg 6 | :target: https://pypi.python.org/pypi/scrapy-zyte-smartproxy 7 | :alt: PyPI Version 8 | 9 | .. image:: https://travis-ci.org/scrapy-plugins/scrapy-zyte-smartproxy.svg?branch=master 10 | :target: http://travis-ci.org/scrapy-plugins/scrapy-zyte-smartproxy 11 | :alt: Build Status 12 | 13 | .. image:: http://codecov.io/github/scrapy-plugins/scrapy-zyte-smartproxy/coverage.svg?branch=master 14 | :target: http://codecov.io/github/scrapy-plugins/scrapy-zyte-smartproxy?branch=master 15 | :alt: Code Coverage 16 | 17 | scrapy-zyte-smartproxy is a `Scrapy downloader middleware`_ to use one of 18 | Zyte’s proxy services: either the `proxy mode`_ of `Zyte API`_ or `Zyte Smart 19 | Proxy Manager`_ (formerly Crawlera). 20 | 21 | .. _Scrapy downloader middleware: https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 22 | .. _proxy mode: https://docs.zyte.com/zyte-api/usage/proxy-mode.html 23 | .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html 24 | .. _Zyte Smart Proxy Manager: https://www.zyte.com/smart-proxy-manager/ 25 | 26 | Requirements 27 | ============ 28 | 29 | * Python 2.7 or Python 3.9+ 30 | * Scrapy 1.4+ 31 | 32 | Installation 33 | ============ 34 | 35 | You can install scrapy-zyte-smartproxy using pip:: 36 | 37 | pip install scrapy-zyte-smartproxy 38 | 39 | 40 | Documentation 41 | ============= 42 | 43 | Documentation is available online at 44 | https://scrapy-zyte-smartproxy.readthedocs.io/ and in the ``docs`` directory.
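For a quick start, the following minimal configuration sketch enables the middleware in a Scrapy project (the API key value is a placeholder; see the documentation for all available options)::

    # settings.py
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_zyte_smartproxy.ZyteSmartProxyMiddleware': 610,
    }
    ZYTE_SMARTPROXY_ENABLED = True
    ZYTE_SMARTPROXY_APIKEY = 'YOUR_API_KEY'  # placeholder, use your own key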
45 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = scrapy-zyte-smartproxy 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # scrapy-zyte-smartproxy documentation build configuration file, created by 4 | # sphinx-quickstart on Sat Jan 21 13:17:41 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | 17 | # If extensions (or modules to document with autodoc) are in another directory, 18 | # add these directories to sys.path here. If the directory is relative to the 19 | # documentation root, use os.path.abspath to make it absolute, like shown here. 20 | # 21 | from os import path 22 | 23 | sys.path.insert(0, path.dirname(path.dirname(__file__))) 24 | 25 | 26 | html_theme = "sphinx_rtd_theme" 27 | 28 | # -- General configuration ------------------------------------------------ 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # 32 | # needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | extensions = [ 38 | "sphinx.ext.autosectionlabel", 39 | ] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ["_templates"] 43 | 44 | # The suffix(es) of source filenames. 45 | # You can specify multiple suffix as a list of string: 46 | # 47 | # source_suffix = ['.rst', '.md'] 48 | source_suffix = {".rst": "restructuredtext"} 49 | 50 | # The master toctree document. 51 | master_doc = "index" 52 | 53 | # General information about the project. 54 | project = "scrapy-zyte-smartproxy" 55 | copyright = "2011-2021, Zyte Group Ltd" 56 | author = "Zyte" 57 | 58 | # The version info for the project you're documenting, acts as replacement for 59 | # |version| and |release|, also used in various other places throughout the 60 | # built documents. 61 | # 62 | 63 | try: 64 | import scrapy_zyte_smartproxy 65 | 66 | version = ".".join(scrapy_zyte_smartproxy.__version__.split(".")[:2]) 67 | release = scrapy_zyte_smartproxy.__version__ 68 | except ImportError: 69 | version = "" 70 | release = "" 71 | 72 | # List of patterns, relative to source directory, that match files and 73 | # directories to ignore when looking for source files. 
74 | # This patterns also effect to html_static_path and html_extra_path 75 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 76 | 77 | # The name of the Pygments (syntax highlighting) style to use. 78 | pygments_style = "sphinx" 79 | 80 | # If true, `todo` and `todoList` produce output, else they produce nothing. 81 | todo_include_todos = False 82 | 83 | 84 | # -- Options for HTML output ---------------------------------------------- 85 | 86 | # The theme to use for HTML and HTML Help pages. See the documentation for 87 | # a list of builtin themes. 88 | # 89 | 90 | # Theme options are theme-specific and customize the look and feel of a theme 91 | # further. For a list of options available for each theme, see the 92 | # documentation. 93 | # 94 | # html_theme_options = {} 95 | 96 | # Add any paths that contain custom static files (such as style sheets) here, 97 | # relative to this directory. They are copied after the builtin static files, 98 | # so a file named "default.css" will overwrite the builtin "default.css". 99 | # html_static_path = ['_static'] 100 | 101 | 102 | # -- Options for HTMLHelp output ------------------------------------------ 103 | 104 | # Output file base name for HTML help builder. 105 | htmlhelp_basename = "scrapy-zyte-smartproxydoc" 106 | 107 | 108 | # -- Options for LaTeX output --------------------------------------------- 109 | 110 | latex_elements = { 111 | # The paper size ('letterpaper' or 'a4paper'). 112 | # 113 | # 'papersize': 'letterpaper', 114 | # The font size ('10pt', '11pt' or '12pt'). 115 | # 116 | # 'pointsize': '10pt', 117 | # Additional stuff for the LaTeX preamble. 118 | # 119 | # 'preamble': '', 120 | # Latex figure (float) alignment 121 | # 122 | # 'figure_align': 'htbp', 123 | } 124 | 125 | # Grouping the document tree into LaTeX files. List of tuples 126 | # (source start file, target name, title, 127 | # author, documentclass [howto, manual, or own class]). 128 | latex_documents = [ 129 | ( 130 | master_doc, 131 | "scrapy-zyte-smartproxy.tex", 132 | "scrapy-zyte-smartproxy Documentation", 133 | "Zyte", 134 | "manual", 135 | ), 136 | ] 137 | 138 | 139 | # -- Options for manual page output --------------------------------------- 140 | 141 | # One entry per manual page. List of tuples 142 | # (source start file, name, description, authors, manual section). 143 | man_pages = [ 144 | ( 145 | master_doc, 146 | "scrapy-zyte-smartproxy", 147 | "scrapy-zyte-smartproxy Documentation", 148 | [author], 149 | 1, 150 | ) 151 | ] 152 | 153 | 154 | # -- Options for Texinfo output ------------------------------------------- 155 | 156 | # Grouping the document tree into Texinfo files. List of tuples 157 | # (source start file, target name, title, author, 158 | # dir menu entry, description, category) 159 | texinfo_documents = [ 160 | ( 161 | master_doc, 162 | "scrapy-zyte-smartproxy", 163 | "scrapy-zyte-smartproxy Documentation", 164 | author, 165 | "scrapy-zyte-smartproxy", 166 | "One line description of project.", 167 | "Miscellaneous", 168 | ), 169 | ] 170 | -------------------------------------------------------------------------------- /docs/headers.rst: -------------------------------------------------------------------------------- 1 | Headers 2 | ======= 3 | 4 | The Zyte proxy services that you can use with this downloader middleware each 5 | support a different set of HTTP request and response headers that give you 6 | access to additional features. 
You can find more information about those 7 | headers in the documentation of each service, `Zyte API’s `_ 8 | and `Zyte Smart Proxy Manager’s `_. 9 | 10 | .. _zyte-api-headers: https://docs.zyte.com/zyte-api/usage/proxy-mode.html 11 | .. _spm-headers: https://docs.zyte.com/smart-proxy-manager.html#request-headers 12 | 13 | If you try to use a header for one service while using the other service, this 14 | downloader middleware will try to translate your header into the right header 15 | for the target service and, regardless of whether or not translation was done, 16 | the original header will be dropped. 17 | 18 | Also, response headers that can be translated will be always translated, 19 | without dropping the original header, so code expecting a response header from 20 | one service can work even if a different service was used. 21 | 22 | Translation is supported for the following headers: 23 | 24 | ========================= =========================== 25 | Zyte API Zyte Smart Proxy Manager 26 | ========================= =========================== 27 | ``Zyte-Device`` ``X-Crawlera-Profile`` 28 | ``Zyte-Error-Type`` ``X-Crawlera-Error`` 29 | ``Zyte-Geolocation`` ``X-Crawlera-Region`` 30 | ``Zyte-JobId`` ``X-Crawlera-JobId`` 31 | ``Zyte-Override-Headers`` ``X-Crawlera-Profile-Pass`` 32 | ========================= =========================== 33 | 34 | Also, if a request is not being proxied and includes a header for any of these 35 | services, it will be dropped, to prevent leaking data to external websites. 36 | This downloader middleware assumes that a header prefixed with ``Zyte-`` is a 37 | Zyte API header, and that a header prefixed with ``X-Crawlera-`` is a Zyte 38 | Smart Proxy Manager header, even if they are not known headers otherwise. 39 | 40 | When dropping a header, be it as part of header translation or to avoid leaking 41 | data, a warning message with details will be logged. 42 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ============================================== 2 | scrapy-zyte-smartproxy |version| documentation 3 | ============================================== 4 | 5 | .. toctree:: 6 | :hidden: 7 | 8 | headers 9 | stats 10 | settings 11 | news 12 | 13 | scrapy-zyte-smartproxy is a `Scrapy downloader middleware`_ to use one of 14 | Zyte’s proxy services: either the `proxy mode`_ of `Zyte API`_ or `Zyte Smart 15 | Proxy Manager`_ (formerly Crawlera). 16 | 17 | .. _Scrapy downloader middleware: https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 18 | .. _proxy mode: https://docs.zyte.com/zyte-api/usage/proxy-mode.html 19 | .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html 20 | .. _Zyte Smart Proxy Manager: https://www.zyte.com/smart-proxy-manager/ 21 | 22 | Configuration 23 | ============= 24 | 25 | #. Add the downloader middleware to your ``DOWNLOADER_MIDDLEWARES`` Scrapy 26 | setting: 27 | 28 | .. code-block:: python 29 | :caption: settings.py 30 | 31 | DOWNLOADER_MIDDLEWARES = { 32 | ... 33 | 'scrapy_zyte_smartproxy.ZyteSmartProxyMiddleware': 610 34 | } 35 | 36 | #. Enable the middleware and configure your API key, either through Scrapy 37 | settings: 38 | 39 | .. code-block:: python 40 | :caption: settings.py 41 | 42 | ZYTE_SMARTPROXY_ENABLED = True 43 | ZYTE_SMARTPROXY_APIKEY = 'apikey' 44 | 45 | Or through spider attributes: 46 | 47 | .. 
code-block:: python 48 | 49 | class MySpider(scrapy.Spider): 50 | zyte_smartproxy_enabled = True 51 | zyte_smartproxy_apikey = 'apikey' 52 | 53 | .. _ZYTE_SMARTPROXY_URL: 54 | 55 | #. Set the ``ZYTE_SMARTPROXY_URL`` Scrapy setting as needed: 56 | 57 | - To use the `proxy mode`_ of `Zyte API`_, set it to 58 | ``http://api.zyte.com:8011``: 59 | 60 | .. code-block:: python 61 | :caption: settings.py 62 | 63 | ZYTE_SMARTPROXY_URL = "http://api.zyte.com:8011" 64 | 65 | .. tip:: This URL is logged, so that you can tell which value was used 66 | from crawl logs. 67 | 68 | - To use the default Zyte Smart Proxy Manager endpoint, leave it unset. 69 | 70 | - To use a custom Zyte Smart Proxy Manager endpoint, in case you have a 71 | dedicated or private instance, set it to your custom endpoint. For 72 | example: 73 | 74 | .. code-block:: python 75 | :caption: settings.py 76 | 77 | ZYTE_SMARTPROXY_URL = "http://myinstance.zyte.com:8011" 78 | 79 | 80 | Usage 81 | ===== 82 | 83 | Once the downloader middleware is properly configured, every request goes 84 | through the configured Zyte proxy service. 85 | 86 | .. _override: 87 | 88 | Although the plugin configuration only allows defining a single proxy endpoint 89 | and API key, it is possible to override them for specific requests, so that you 90 | can use different combinations for different requests within the same spider. 91 | 92 | To **override** which combination of endpoint and API key is used for a given 93 | request, set ``proxy`` in the request metadata to a URL indicating both the 94 | target endpoint and the API key to use. For example: 95 | 96 | .. code-block:: python 97 | 98 | scrapy.Request( 99 | "https://topscrape.com", 100 | meta={ 101 | "proxy": "http://YOUR_API_KEY@api.zyte.com:8011", 102 | ... 103 | }, 104 | ) 105 | 106 | .. TODO: Check that a colon after the API key is not needed in this case. 107 | 108 | To **disable** proxying altogether for a given request, set ``dont_proxy`` to 109 | ``True`` on the request metadata: 110 | 111 | .. code-block:: python 112 | 113 | scrapy.Request( 114 | "https://topscrape.com", 115 | meta={ 116 | "dont_proxy": True, 117 | ... 118 | }, 119 | ) 120 | 121 | You can set `Zyte API proxy headers`_ or `Zyte Smart Proxy Manager headers`_ as 122 | regular `Scrapy headers`_, e.g. using the ``headers`` parameter of ``Request`` 123 | or using the DEFAULT_REQUEST_HEADERS_ setting. For example: 124 | 125 | .. code-block:: python 126 | 127 | scrapy.Request( 128 | "https://topscrape.com", 129 | headers={ 130 | "Zyte-Geolocation": "FR", 131 | ... 132 | }, 133 | ) 134 | 135 | .. _Zyte API proxy headers: https://docs.zyte.com/zyte-api/usage/proxy-mode.html 136 | .. _Zyte Smart Proxy Manager headers: https://docs.zyte.com/smart-proxy-manager.html#request-headers 137 | .. _Scrapy headers: https://doc.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.headers 138 | .. _DEFAULT_REQUEST_HEADERS: https://doc.scrapy.org/en/latest/topics/settings.html#default-request-headers 139 | 140 | For information about proxy-specific header processing, see :doc:`headers`. 141 | 142 | See also :ref:`settings` for the complete list of settings that this downloader 143 | middleware supports. 144 | -------------------------------------------------------------------------------- /docs/news.rst: -------------------------------------------------------------------------------- 1 | .. 
_news: 2 | 3 | Changes 4 | ======= 5 | 6 | v2.4.1 (2025-03-24) 7 | ------------------- 8 | 9 | Stop expecting a ``Zyte-Error`` header in responses from `Zyte API`_ `proxy 10 | mode`_, it is named ``Zyte-Error-Type``. 11 | 12 | v2.4.0 (2024-12-30) 13 | ------------------- 14 | 15 | Removed official support for Python 3.4, 3.5, 3.6, 3.7 and 3.8. Added official 16 | Python 3.13 support. 17 | 18 | `Backward-compatible 19 | `__ 20 | ``X-Crawlera``-prefixed headers are no longer translated into their matching 21 | `Zyte API proxy mode headers 22 | `_, 23 | Zyte API now handles their translation on the server side. 24 | 25 | Added a new ``ZYTE_SMARTPROXY_KEEP_HEADERS`` setting that allows disabling 26 | header dropping and translation. 27 | 28 | v2.3.5 (2024-08-05) 29 | ------------------- 30 | 31 | Ban and throttling responses from `Zyte API`_ `proxy mode`_ are now handled in 32 | line with matching responses from Zyte Smart Proxy Manager. 33 | 34 | v2.3.4 (2024-05-09) 35 | ------------------- 36 | 37 | `Zyte API`_ `proxy mode`_ now has its own stat prefix. 38 | 39 | Some user-facing messages mentioning only Zyte Smart Proxy Manager have also 40 | been updated to reflect the fact that scrapy-zyte-smartproxy also supports Zyte 41 | API proxy mode. 42 | 43 | v2.3.3 (2024-02-22) 44 | ------------------- 45 | 46 | Fix response handling for `Zyte API`_ `proxy mode`_. Before, a single 47 | connection issue during a request would add a 90 second delay between requests 48 | until the end of the crawl, instead of removing the delay after the first 49 | successful response. 50 | 51 | v2.3.2 (2024-02-14) 52 | ------------------- 53 | 54 | Detect scenarios where the ``proxy`` ``Request.meta`` key has probably been 55 | accidentally copied from an earlier response, warn about it, and fix the value. 56 | 57 | The ``Zyte-Client`` header is again sent when using `Zyte API`_ `proxy mode`_, 58 | now that Zyte API supports it. 59 | 60 | v2.3.1 (2023-11-20) 61 | ------------------- 62 | 63 | Fixed `Zyte API`_ `proxy mode`_ support by removing the mapping of unsupported 64 | headers ``Zyte-Client`` and ``Zyte-No-Bancheck``. 65 | 66 | v2.3.0 (2023-10-20) 67 | ------------------- 68 | 69 | Added support for the upcoming `proxy mode`_ of `Zyte API`_. 70 | 71 | .. _proxy mode: https://docs.zyte.com/zyte-api/usage/proxy-mode.html 72 | .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html 73 | 74 | Added a BSD-3-Clause license file. 75 | 76 | v2.2.0 (2022-08-05) 77 | ------------------- 78 | 79 | Added support for Scrapy 2.6.2 and later. 80 | 81 | Scrapy 1.4 became the minimum supported Scrapy version. 82 | 83 | v2.1.0 (2021-06-16) 84 | ------------------- 85 | 86 | - Use a custom logger instead of the root one 87 | 88 | v2.0.0 (2021-05-12) 89 | ------------------- 90 | 91 | Following the upstream rebranding of Crawlera as Zyte Smart Proxy Manager, 92 | ``scrapy-crawlera`` has been renamed as ``scrapy-zyte-smartproxy``, with the 93 | following backward-incompatible changes: 94 | 95 | - The repository name and Python Package Index (PyPI) name are now 96 | ``scrapy-zyte-smartproxy``. 97 | 98 | - Setting prefixes have switched from ``CRAWLERA_`` to ``ZYTE_SMARTPROXY_``. 99 | 100 | - Spider attribute prefixes and request meta key prefixes have switched from 101 | ``crawlera_`` to ``zyte_smartproxy_``. 102 | 103 | - ``scrapy_crawlera`` is now ``scrapy_zyte_smartproxy``. 104 | 105 | - ``CrawleraMiddleware`` is now ``ZyteSmartProxyMiddleware``, and its default 106 | ``url`` is now ``http://proxy.zyte.com:8011``. 
107 | 108 | - Stat prefixes have switched from ``crawlera/`` to ``zyte_smartproxy/``. 109 | 110 | - The online documentation is moving to 111 | https://scrapy-zyte-smartproxy.readthedocs.io/ 112 | 113 | .. note:: Zyte Smart Proxy Manager headers continue to use the ``X-Crawlera-`` 114 | prefix. 115 | 116 | - In addition to that, the ``X-Crawlera-Client`` header is now automatically 117 | included in all requests. 118 | 119 | v1.7.2 (2020-12-01) 120 | ------------------- 121 | - Use request.meta than response.meta in the middleware 122 | 123 | v1.7.1 (2020-10-22) 124 | ------------------- 125 | - Consider Crawlera response if contains `X-Crawlera-Version` header 126 | - Build the documentation in Travis CI and fail on documentation issues 127 | - Update matrix of tests 128 | 129 | v1.7.0 (2020-04-01) 130 | ------------------- 131 | - Added more stats to better understanding the internal states. 132 | - Log warning when using `https://` protocol. 133 | - Add default `http://` protocol in case of none provided, and log warning about it. 134 | - Fix duplicated request when the response is not from crawlera, this was causing an 135 | infinite loop of retries when `dont_filter=True`. 136 | 137 | v1.6.0 (2019-05-27) 138 | ------------------- 139 | 140 | - Enable crawlera on demand by setting ``CRAWLERA_FORCE_ENABLE_ON_HTTP_CODES`` 141 | 142 | v1.5.1 (2019-05-21) 143 | ------------------- 144 | 145 | - Remove username and password from settings since it's removed from crawlera. 146 | - Include affected spider in logs. 147 | - Handle situations when crawlera is restarted and reply with 407's for a few minutes 148 | by retrying the requests with a exponential backoff system. 149 | 150 | v1.5.0 (2019-01-23) 151 | ------------------- 152 | 153 | - Correctly check for bans in crawlera (Jobs will not get banned on non ban 503's). 154 | - Exponential backoff when crawlera doesn't have proxies available. 155 | - Fix ``dont_proxy=False`` header disabling crawlera when it is enabled. 156 | 157 | v1.4.0 (2018-09-20) 158 | ------------------- 159 | 160 | - Remove X-Crawlera-* headers when Crawlera is disabled. 161 | - Introduction of DEFAULT_CRAWLERA_HEADERS settings. 162 | 163 | v1.3.0 (2018-01-10) 164 | ------------------- 165 | 166 | - Use CONNECT method to contact Crawlera proxy. 167 | 168 | v1.2.4 (2017-07-04) 169 | ------------------- 170 | 171 | - Trigger PYPI deployments after changes made to TOXENV in v1.2.3 172 | 173 | v1.2.3 (2017-06-29) 174 | ------------------- 175 | 176 | - Multiple documentation fixes 177 | - Test scrapy-crawlera on combinations of software used by scrapinghub stacks 178 | 179 | 180 | v1.2.2 (2017-01-19) 181 | ------------------- 182 | 183 | - Fix Crawlera error stats key in Python 3. 184 | - Add support for Python 3.6. 185 | 186 | 187 | v1.2.1 (2016-10-17) 188 | ------------------- 189 | 190 | - Fix release date in README. 191 | 192 | 193 | v1.2.0 (2016-10-17) 194 | ------------------- 195 | 196 | - Recommend middleware order to be ``610`` to run before ``RedirectMiddleware``. 197 | - Change default download timeout to 190s or 3 minutes 10 seconds 198 | (instead of 1800s or 30 minutes). 199 | - Test and advertize Python 3 compatiblity. 200 | - New ``crawlera/request`` and ``crawlera/request/method/*`` stats counts. 201 | - Clear Scrapy DNS cache for proxy URL in case of connection errors. 202 | - Distribute plugin as universal wheel. 
203 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx 2 | sphinx_rtd_theme 3 | -------------------------------------------------------------------------------- /docs/settings.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Settings 3 | ======== 4 | 5 | This Scrapy downloader middleware adds some settings to configure how to work 6 | with your Zyte proxy service. 7 | 8 | ZYTE_SMARTPROXY_APIKEY 9 | ---------------------- 10 | 11 | Default: ``None`` 12 | 13 | Default API key for your Zyte proxy service. 14 | 15 | Note that Zyte API and Zyte Smart Proxy Manager have different API keys. 16 | 17 | You can :ref:`override this value on specific requests <override>`. 18 | 19 | 20 | ZYTE_SMARTPROXY_URL 21 | ------------------- 22 | 23 | Default: ``'http://proxy.zyte.com:8011'`` 24 | 25 | Default endpoint for your Zyte proxy service. 26 | 27 | For guidelines on setting a value, see the :ref:`initial configuration 28 | instructions <ZYTE_SMARTPROXY_URL>`. 29 | 30 | You can :ref:`override this value on specific requests <override>`. 31 | 32 | ZYTE_SMARTPROXY_MAXBANS 33 | ----------------------- 34 | 35 | Default: ``400`` 36 | 37 | Number of consecutive bans necessary to stop the spider. 38 | 39 | ZYTE_SMARTPROXY_DOWNLOAD_TIMEOUT 40 | -------------------------------- 41 | 42 | Default: ``190`` 43 | 44 | Timeout for processing proxied requests. It overrides Scrapy's ``DOWNLOAD_TIMEOUT``. 45 | 46 | ZYTE_SMARTPROXY_PRESERVE_DELAY 47 | ------------------------------ 48 | 49 | Default: ``False`` 50 | 51 | If ``False``, sets Scrapy's ``DOWNLOAD_DELAY`` to ``0``, making the spider crawl faster. If set to ``True``, the 52 | ``DOWNLOAD_DELAY`` provided to Scrapy is respected. 53 | 54 | ZYTE_SMARTPROXY_DEFAULT_HEADERS 55 | ------------------------------- 56 | 57 | Default: ``{}`` 58 | 59 | Default headers added only to proxied requests. Headers defined in ``DEFAULT_REQUEST_HEADERS`` take precedence over these defaults as long as ``ZyteSmartProxyMiddleware`` is placed after ``DefaultHeadersMiddleware``. Headers set directly on a request take precedence over both settings. 60 | 61 | * This is the default behavior: ``DefaultHeadersMiddleware`` has a default priority of ``400``, and the recommended priority for ``ZyteSmartProxyMiddleware`` is ``610``. 62 | 63 | ZYTE_SMARTPROXY_BACKOFF_STEP 64 | ---------------------------- 65 | 66 | Default: ``15`` 67 | 68 | Step size used for calculating exponential backoff according to the formula: ``random.uniform(0, min(max, step * 2 ** attempt))``. 69 | 70 | ZYTE_SMARTPROXY_BACKOFF_MAX 71 | --------------------------- 72 | 73 | Default: ``180`` 74 | 75 | Maximum value for the exponential backoff, as shown in the formula above. 76 | 77 | ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES 78 | ------------------------------------------ 79 | 80 | Default: ``[]`` 81 | 82 | List of HTTP response status codes that warrant enabling your Zyte proxy 83 | service for the corresponding domain. 84 | 85 | When a response with one of these HTTP status codes is received after an 86 | unproxied request, the request is retried with your Zyte proxy service, and any 87 | new request to the same domain is also proxied. 88 | 89 | ZYTE_SMARTPROXY_KEEP_HEADERS 90 | ---------------------------- 91 | 92 | Default: ``False`` 93 | 94 | If ``True``, header dropping and translation is disabled.
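For reference, here is a minimal ``settings.py`` sketch combining several of the settings documented above. All values shown are the documented defaults, except for ``ZYTE_SMARTPROXY_ENABLED``, the placeholder API key, and the purely illustrative ``403`` entry in ``ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES``:

.. code-block:: python
    :caption: settings.py

    ZYTE_SMARTPROXY_ENABLED = True
    ZYTE_SMARTPROXY_APIKEY = "YOUR_API_KEY"  # placeholder, use your own key
    ZYTE_SMARTPROXY_URL = "http://proxy.zyte.com:8011"  # default endpoint
    ZYTE_SMARTPROXY_MAXBANS = 400
    ZYTE_SMARTPROXY_DOWNLOAD_TIMEOUT = 190
    ZYTE_SMARTPROXY_PRESERVE_DELAY = False
    ZYTE_SMARTPROXY_BACKOFF_STEP = 15
    ZYTE_SMARTPROXY_BACKOFF_MAX = 180
    ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES = [403]  # illustrative; the default is []
    ZYTE_SMARTPROXY_KEEP_HEADERS = False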
95 | -------------------------------------------------------------------------------- /docs/stats.rst: -------------------------------------------------------------------------------- 1 | Stats 2 | ===== 3 | 4 | This Scrapy plugin tracks some stats. 5 | 6 | Stats for the `proxy mode`_ of `Zyte API`_ and stats for `Zyte Smart 7 | Proxy Manager`_ (formerly Crawlera) have a different prefix, ``zyte_api_proxy`` 8 | and ``zyte_smartproxy`` respectively. 9 | 10 | .. _proxy mode: https://docs.zyte.com/zyte-api/usage/proxy-mode.html 11 | .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html 12 | .. _Zyte Smart Proxy Manager: https://www.zyte.com/smart-proxy-manager/ 13 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.bumpversion] 2 | current_version = "2.4.1" 3 | commit = true 4 | tag = true 5 | tag_name = "v{new_version}" 6 | 7 | [[tool.bumpversion.files]] 8 | filename = "docs/news.rst" 9 | search = "\\(unreleased\\)$" 10 | replace = "({now:%Y-%m-%d})" 11 | regex = true 12 | 13 | [[tool.bumpversion.files]] 14 | filename = "scrapy_zyte_smartproxy/__init__.py" 15 | 16 | [[tool.bumpversion.files]] 17 | filename = "setup.py" 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy>=1.4 2 | six 3 | w3lib 4 | -------------------------------------------------------------------------------- /scrapy_zyte_smartproxy/__init__.py: -------------------------------------------------------------------------------- 1 | from .middleware import ZyteSmartProxyMiddleware 2 | 3 | __version__ = "2.4.1" 4 | __all__ = ["ZyteSmartProxyMiddleware"] 5 | -------------------------------------------------------------------------------- /scrapy_zyte_smartproxy/middleware.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import warnings 4 | from base64 import urlsafe_b64decode 5 | from collections import defaultdict 6 | from typing import Dict, List # noqa 7 | 8 | try: 9 | from urllib.request import _parse_proxy # type: ignore 10 | except ImportError: 11 | from urllib2 import _parse_proxy # type: ignore 12 | 13 | from scrapy import signals 14 | from scrapy.exceptions import ScrapyDeprecationWarning 15 | from scrapy.resolver import dnscache 16 | from six.moves.urllib.parse import urlparse, urlunparse 17 | from twisted.internet.error import ConnectionDone, ConnectionRefusedError 18 | from w3lib.http import basic_auth_header 19 | 20 | from scrapy_zyte_smartproxy.utils import exp_backoff 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | def _remove_auth(auth_proxy_url): 26 | proxy_type, user, password, hostport = _parse_proxy(auth_proxy_url) 27 | return urlunparse((proxy_type, hostport, "", "", "", "")) 28 | 29 | 30 | class ZyteSmartProxyMiddleware(object): 31 | 32 | url = "http://proxy.zyte.com:8011" 33 | maxbans = 400 34 | ban_code = 503 35 | download_timeout = 190 36 | # Handle Zyte Smart Proxy Manager server failures 37 | connection_refused_delay = 90 38 | preserve_delay = False 39 | header_prefix = "X-Crawlera-" # Deprecated 40 | header_lowercase_prefixes = (b"zyte-", b"x-crawlera-") 41 | conflicting_headers = ("X-Crawlera-Profile", "X-Crawlera-UA") 42 | backoff_step = 15 43 | backoff_max = 180 44 | exp_backoff = None 45 | max_auth_retry_times = 10 46 | apikey 
= "" 47 | 48 | def __init__(self, crawler): 49 | self.crawler = crawler 50 | self.job_id = os.environ.get("SCRAPY_JOB") 51 | self.spider = None 52 | self._bans = defaultdict(int) 53 | self._saved_delays = defaultdict(lambda: None) 54 | self._auth_url = None 55 | self.enabled_for_domain = {} # type: Dict[str, bool] 56 | self.force_enable_on_http_codes = [] # type: List[int] 57 | self.zyte_api_to_spm_translations = { 58 | b"zyte-device": b"x-crawlera-profile", 59 | b"zyte-geolocation": b"x-crawlera-region", 60 | b"zyte-jobid": b"x-crawlera-jobid", 61 | b"zyte-override-headers": b"x-crawlera-profile-pass", 62 | } 63 | self._settings = [ 64 | ("apikey", str), 65 | ("url", str), 66 | ("maxbans", int), 67 | ("download_timeout", int), 68 | ("preserve_delay", bool), 69 | ("backoff_step", int), 70 | ("backoff_max", int), 71 | ("force_enable_on_http_codes", list), 72 | ] 73 | # Keys are proxy URLs, values are booleans (True means Zyte API, False 74 | # means Zyte Smart Proxy Manager). 75 | self._targets = {} 76 | # SPM headers that can be used with Zyte API proxy mode 77 | # https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html#parameter-mapping 78 | self.spm_bc_headers = [ 79 | b"x-crawlera-cookies", 80 | b"x-crawlera-jobid", 81 | b"x-crawlera-profile", 82 | b"x-crawlera-profile-pass", 83 | b"x-crawlera-region", 84 | b"x-crawlera-session", 85 | ] 86 | self._keep_headers = crawler.settings.getbool( 87 | "ZYTE_SMARTPROXY_KEEP_HEADERS", False 88 | ) 89 | 90 | @classmethod 91 | def from_crawler(cls, crawler): 92 | o = cls(crawler) 93 | crawler.signals.connect(o.open_spider, signals.spider_opened) 94 | return o 95 | 96 | def _make_auth_url(self, spider): 97 | parsed_url = urlparse(self.url) 98 | auth = self.get_proxyauth(spider) 99 | if not auth.startswith(b"Basic "): 100 | raise ValueError( 101 | "Zyte proxy services only support HTTP basic access " 102 | "authentication, but %s.%s.get_proxyauth() returned %r" 103 | % (self.__module__, self.__class__.__name__, auth) 104 | ) 105 | user_and_colon = urlsafe_b64decode(auth[6:].strip()).decode("utf-8") 106 | netloc = user_and_colon + "@" + parsed_url.netloc.split("@")[-1] 107 | parsed_url = parsed_url._replace(netloc=netloc) 108 | return urlunparse(parsed_url) 109 | 110 | def open_spider(self, spider): 111 | self.enabled = self.is_enabled(spider) 112 | self.spider = spider 113 | 114 | for k, type_ in self._settings: 115 | setattr(self, k, self._get_setting_value(spider, k, type_)) 116 | 117 | self._fix_url_protocol() 118 | self._headers = self.crawler.settings.get( 119 | "ZYTE_SMARTPROXY_DEFAULT_HEADERS", {} 120 | ).items() 121 | self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max) 122 | 123 | if not self.enabled and not self.force_enable_on_http_codes: 124 | return 125 | 126 | if not self.apikey: 127 | logger.warning( 128 | "Zyte proxy services cannot be used without an API key", 129 | extra={"spider": spider}, 130 | ) 131 | return 132 | 133 | self._auth_url = self._make_auth_url(spider) 134 | self._authless_url = _remove_auth(self._auth_url) 135 | 136 | logger.info( 137 | "Using Zyte proxy service %s with an API key ending in %s" 138 | % (self.url, self.apikey[:7]), 139 | extra={"spider": spider}, 140 | ) 141 | 142 | if not self.preserve_delay: 143 | # Setting spider download delay to 0 to get maximum crawl rate 144 | spider.download_delay = 0 145 | logger.info( 146 | "ZyteSmartProxyMiddleware: disabling download delays in " 147 | "Scrapy to optimize delays introduced by Zyte proxy services. 
" 148 | "To avoid this behaviour you can use the " 149 | "ZYTE_SMARTPROXY_PRESERVE_DELAY setting, but keep in mind " 150 | "that this may slow down the crawl significantly", 151 | extra={"spider": spider}, 152 | ) 153 | 154 | def _settings_get(self, type_, *a, **kw): 155 | if type_ is int: 156 | return self.crawler.settings.getint(*a, **kw) 157 | elif type_ is bool: 158 | return self.crawler.settings.getbool(*a, **kw) 159 | elif type_ is list: 160 | return self.crawler.settings.getlist(*a, **kw) 161 | elif type_ is dict: 162 | return self.crawler.settings.getdict(*a, **kw) 163 | else: 164 | return self.crawler.settings.get(*a, **kw) 165 | 166 | def _get_setting_value(self, spider, k, type_): 167 | if hasattr(spider, "hubproxy_" + k): 168 | warnings.warn( 169 | "hubproxy_%s attribute is deprecated, " 170 | "use zyte_smartproxy_%s instead." % (k, k), 171 | category=ScrapyDeprecationWarning, 172 | stacklevel=1, 173 | ) 174 | 175 | if self.crawler.settings.get("HUBPROXY_%s" % k.upper()) is not None: 176 | warnings.warn( 177 | "HUBPROXY_%s setting is deprecated, " 178 | "use ZYTE_SMARTPROXY_%s instead." % (k.upper(), k.upper()), 179 | category=ScrapyDeprecationWarning, 180 | stacklevel=1, 181 | ) 182 | 183 | o = getattr(self, k, None) 184 | s = self._settings_get( 185 | type_, 186 | "ZYTE_SMARTPROXY_" + k.upper(), 187 | self._settings_get(type_, "HUBPROXY_" + k.upper(), o), 188 | ) 189 | return getattr( 190 | spider, "zyte_smartproxy_" + k, getattr(spider, "hubproxy_" + k, s) 191 | ) 192 | 193 | def _fix_url_protocol(self): 194 | if self.url.startswith("https://"): 195 | logger.warning( 196 | 'ZYTE_SMARTPROXY_URL "%s" set with "https://" protocol.' % self.url 197 | ) 198 | elif not self.url.startswith("http://"): 199 | logger.warning('Adding "http://" to ZYTE_SMARTPROXY_URL %s' % self.url) 200 | self.url = "http://" + self.url 201 | 202 | def is_enabled(self, spider): 203 | """Hook to enable middleware by custom rules.""" 204 | if hasattr(spider, "use_hubproxy"): 205 | warnings.warn( 206 | "use_hubproxy attribute is deprecated, " 207 | "use zyte_smartproxy_enabled instead.", 208 | category=ScrapyDeprecationWarning, 209 | stacklevel=1, 210 | ) 211 | 212 | if self.crawler.settings.get("HUBPROXY_ENABLED") is not None: 213 | warnings.warn( 214 | "HUBPROXY_ENABLED setting is deprecated, " 215 | "use ZYTE_SMARTPROXY_ENABLED instead.", 216 | category=ScrapyDeprecationWarning, 217 | stacklevel=1, 218 | ) 219 | return getattr( 220 | spider, 221 | "zyte_smartproxy_enabled", 222 | self.crawler.settings.getbool("ZYTE_SMARTPROXY_ENABLED"), 223 | ) or getattr( 224 | spider, "use_hubproxy", self.crawler.settings.getbool("HUBPROXY_ENABLED") 225 | ) 226 | 227 | def get_proxyauth(self, spider): 228 | """Hook to compute Proxy-Authorization header by custom rules.""" 229 | return basic_auth_header(self.apikey, "") 230 | 231 | def _targets_zyte_api(self, request): 232 | if self._auth_url is None: 233 | return False 234 | auth_url = request.meta.get("proxy", self._auth_url) 235 | targets_zyte_api = self._targets.get(auth_url, None) 236 | if targets_zyte_api is None: 237 | targets_zyte_api = urlparse(auth_url).hostname == "api.zyte.com" 238 | self._targets[auth_url] = targets_zyte_api 239 | return targets_zyte_api 240 | 241 | def _translate_headers(self, request, targets_zyte_api): 242 | if targets_zyte_api: 243 | return 244 | for header, translation in self.zyte_api_to_spm_translations.items(): 245 | if header not in request.headers: 246 | continue 247 | values = request.headers.pop(header) 248 | value = 
b"".join(values) 249 | request.headers[translation] = value 250 | logger.warning( 251 | "Translating header %r (%r) to %r on request %r", 252 | header, 253 | value, 254 | translation, 255 | request, 256 | ) 257 | 258 | def _inc_stat(self, stat, targets_zyte_api, value=1): 259 | prefix = "zyte_api_proxy" if targets_zyte_api else "zyte_smartproxy" 260 | self.crawler.stats.inc_value("{}/{}".format(prefix, stat), value) 261 | 262 | def process_request(self, request, spider): 263 | if self._is_enabled_for_request(request): 264 | if "proxy" not in request.meta: 265 | request.meta["proxy"] = self._auth_url 266 | elif ( 267 | request.meta["proxy"] == self._authless_url 268 | and b"Proxy-Authorization" not in request.headers 269 | ): 270 | logger.warning( 271 | "The value of the 'proxy' meta key of request {request} " 272 | "has no API key. You seem to have copied the value of " 273 | "the 'proxy' request meta key from a response or from a " 274 | "different request. Copying request meta keys set by " 275 | "middlewares from one request to another is a bad " 276 | "practice that can cause issues.".format(request=request) 277 | ) 278 | request.meta["proxy"] = self._auth_url 279 | targets_zyte_api = self._targets_zyte_api(request) 280 | self._set_zyte_smartproxy_default_headers(request) 281 | request.meta["download_timeout"] = self.download_timeout 282 | if self.job_id: 283 | job_header = "Zyte-JobId" if targets_zyte_api else "X-Crawlera-JobId" 284 | request.headers[job_header] = self.job_id 285 | user_agent_header = ( 286 | "Zyte-Client" if targets_zyte_api else "X-Crawlera-Client" 287 | ) 288 | from scrapy_zyte_smartproxy import __version__ 289 | 290 | request.headers[user_agent_header] = ( 291 | "scrapy-zyte-smartproxy/%s" % __version__ 292 | ) 293 | self._inc_stat("request", targets_zyte_api=targets_zyte_api) 294 | self._inc_stat( 295 | "request/method/{}".format(request.method), 296 | targets_zyte_api=targets_zyte_api, 297 | ) 298 | if not self._keep_headers: 299 | self._translate_headers(request, targets_zyte_api=targets_zyte_api) 300 | self._clean_zyte_smartproxy_headers( 301 | request, targets_zyte_api=targets_zyte_api 302 | ) 303 | elif not self._keep_headers: 304 | self._clean_zyte_smartproxy_headers(request) 305 | 306 | def _is_banned(self, response): 307 | return ( 308 | response.status == self.ban_code 309 | and response.headers.get("X-Crawlera-Error") == b"banned" 310 | ) or (response.status in {520, 521} and response.headers.get("Zyte-Error-Type")) 311 | 312 | def _is_auth_error(self, response): 313 | return ( 314 | response.status == 407 315 | and response.headers.get("X-Crawlera-Error") == b"bad_proxy_auth" 316 | ) 317 | 318 | def _throttle_error(self, response): 319 | error = response.headers.get("Zyte-Error-Type") or response.headers.get( 320 | "X-Crawlera-Error" 321 | ) 322 | if response.status in {429, 503} and error and error != b"banned": 323 | return error.decode("utf-8") 324 | return None 325 | 326 | def _process_error(self, response): 327 | if "Zyte-Error-Type" in response.headers: 328 | value = response.headers.get("Zyte-Error-Type") 329 | response.headers["X-Crawlera-Error"] = value 330 | return value 331 | if "X-Crawlera-Error" in response.headers: 332 | value = response.headers.get("X-Crawlera-Error") 333 | response.headers["Zyte-Error-Type"] = value 334 | return value 335 | return None 336 | 337 | def process_response(self, request, response, spider): 338 | zyte_smartproxy_error = self._process_error(response) 339 | 340 | targets_zyte_api = 
self._targets_zyte_api(request) 341 | 342 | if not self._is_enabled_for_request(request): 343 | return self._handle_not_enabled_response( 344 | request, response, targets_zyte_api=targets_zyte_api 345 | ) 346 | 347 | if not self._is_zyte_smartproxy_or_zapi_response(response): 348 | return response 349 | 350 | key = self._get_slot_key(request) 351 | self._restore_original_delay(request) 352 | 353 | is_auth_error = self._is_auth_error(response) 354 | throttle_error = self._throttle_error(response) 355 | if is_auth_error or throttle_error: 356 | if is_auth_error: 357 | reason = "autherror" 358 | else: 359 | assert throttle_error 360 | reason = throttle_error.lstrip("/") 361 | self._set_custom_delay( 362 | request, 363 | next(self.exp_backoff), 364 | reason=reason, 365 | targets_zyte_api=targets_zyte_api, 366 | ) 367 | else: 368 | self._inc_stat("delay/reset_backoff", targets_zyte_api=targets_zyte_api) 369 | self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max) 370 | 371 | if is_auth_error: 372 | # When Zyte Smart Proxy Manager has issues it might not be able to 373 | # authenticate users we must retry 374 | retries = request.meta.get("zyte_smartproxy_auth_retry_times", 0) 375 | if retries < self.max_auth_retry_times: 376 | return self._retry_auth( 377 | response, request, spider, targets_zyte_api=targets_zyte_api 378 | ) 379 | else: 380 | self._inc_stat( 381 | "retries/auth/max_reached", targets_zyte_api=targets_zyte_api 382 | ) 383 | logger.warning( 384 | "Max retries for authentication issues reached," 385 | "please check auth information settings", 386 | extra={"spider": self.spider}, 387 | ) 388 | 389 | if self._is_banned(response): 390 | self._bans[key] += 1 391 | if self._bans[key] > self.maxbans: 392 | self.crawler.engine.close_spider(spider, "banned") 393 | else: 394 | after = response.headers.get("retry-after") 395 | if after: 396 | self._set_custom_delay( 397 | request, 398 | float(after), 399 | reason="banned", 400 | targets_zyte_api=targets_zyte_api, 401 | ) 402 | self._inc_stat("response/banned", targets_zyte_api=targets_zyte_api) 403 | else: 404 | self._bans[key] = 0 405 | # If placed behind `RedirectMiddleware`, 406 | # it would not count 3xx responses 407 | self._inc_stat("response", targets_zyte_api=targets_zyte_api) 408 | self._inc_stat( 409 | "response/status/{}".format(response.status), 410 | targets_zyte_api=targets_zyte_api, 411 | ) 412 | if zyte_smartproxy_error: 413 | self._inc_stat("response/error", targets_zyte_api=targets_zyte_api) 414 | error_msg = zyte_smartproxy_error.decode("utf8") 415 | self._inc_stat( 416 | "response/error/{}".format(error_msg), 417 | targets_zyte_api=targets_zyte_api, 418 | ) 419 | return response 420 | 421 | def process_exception(self, request, exception, spider): 422 | if not self._is_enabled_for_request(request): 423 | return 424 | if isinstance(exception, (ConnectionRefusedError, ConnectionDone)): 425 | # Handle Zyte Smart Proxy Manager downtime 426 | self._clear_dns_cache() 427 | targets_zyte_api = self._targets_zyte_api(request) 428 | self._set_custom_delay( 429 | request, 430 | self.connection_refused_delay, 431 | reason="conn_refused", 432 | targets_zyte_api=targets_zyte_api, 433 | ) 434 | 435 | def _handle_not_enabled_response(self, request, response, targets_zyte_api): 436 | if self._should_enable_for_response(response): 437 | domain = self._get_url_domain(request.url) 438 | self.enabled_for_domain[domain] = True 439 | 440 | retryreq = request.copy() 441 | retryreq.dont_filter = True 442 | self._inc_stat( 443 | 
"retries/should_have_been_enabled", 444 | targets_zyte_api=targets_zyte_api, 445 | ) 446 | return retryreq 447 | return response 448 | 449 | def _retry_auth(self, response, request, spider, targets_zyte_api): 450 | logger.warning( 451 | ( 452 | "Retrying a request due to an authentication issue with " 453 | "the configured Zyte proxy service" 454 | ), 455 | extra={"spider": self.spider}, 456 | ) 457 | retries = request.meta.get("zyte_smartproxy_auth_retry_times", 0) + 1 458 | retryreq = request.copy() 459 | retryreq.meta["zyte_smartproxy_auth_retry_times"] = retries 460 | retryreq.dont_filter = True 461 | self._inc_stat("retries/auth", targets_zyte_api=targets_zyte_api) 462 | return retryreq 463 | 464 | def _clear_dns_cache(self): 465 | # Scrapy doesn't expire dns records by default, so we force it here, 466 | # so client can reconnect trough DNS failover. 467 | dnscache.pop(urlparse(self.url).hostname, None) 468 | 469 | def _should_enable_for_response(self, response): 470 | return response.status in self.force_enable_on_http_codes 471 | 472 | def _is_enabled_for_request(self, request): 473 | domain = self._get_url_domain(request.url) 474 | domain_enabled = self.enabled_for_domain.get(domain, False) 475 | dont_proxy = request.meta.get("dont_proxy", False) 476 | return (domain_enabled or self.enabled) and not dont_proxy 477 | 478 | def _get_url_domain(self, url): 479 | parsed = urlparse(url) 480 | return parsed.netloc 481 | 482 | def _is_zyte_smartproxy_or_zapi_response(self, response): 483 | """Check if is Smart Proxy Manager or Zyte API proxy mode response""" 484 | return ( 485 | "X-Crawlera-Version" in response.headers 486 | or "Zyte-Request-Id" in response.headers 487 | ) 488 | 489 | def _get_slot_key(self, request): 490 | return request.meta.get("download_slot") 491 | 492 | def _get_slot(self, request): 493 | key = self._get_slot_key(request) 494 | return key, self.crawler.engine.downloader.slots.get(key) 495 | 496 | def _set_custom_delay(self, request, delay, targets_zyte_api, reason=None): 497 | """Set custom delay for slot and save original one.""" 498 | key, slot = self._get_slot(request) 499 | if not slot: 500 | return 501 | if self._saved_delays[key] is None: 502 | self._saved_delays[key] = slot.delay 503 | slot.delay = delay 504 | if reason is not None: 505 | self._inc_stat("delay/{}".format(reason), targets_zyte_api=targets_zyte_api) 506 | self._inc_stat( 507 | "delay/{}/total".format(reason), 508 | value=delay, 509 | targets_zyte_api=targets_zyte_api, 510 | ) 511 | 512 | def _restore_original_delay(self, request): 513 | """Restore original delay for slot if it was changed.""" 514 | key, slot = self._get_slot(request) 515 | if not slot: 516 | return 517 | if self._saved_delays[key] is not None: 518 | slot.delay, self._saved_delays[key] = self._saved_delays[key], None 519 | 520 | def _clean_zyte_smartproxy_headers(self, request, targets_zyte_api=None): 521 | """Remove X-Crawlera-* headers from the request.""" 522 | if targets_zyte_api is None: 523 | prefixes = self.header_lowercase_prefixes 524 | elif targets_zyte_api: 525 | prefixes = (b"x-crawlera-",) 526 | else: 527 | prefixes = (b"zyte-",) 528 | targets = [ 529 | header for header in request.headers if self._drop_header(header, prefixes) 530 | ] 531 | for header in targets: 532 | values = request.headers.pop(header, None) 533 | value = b"".join(values) 534 | if targets_zyte_api is not None: 535 | actual_target, header_target = ( 536 | ("Zyte API", "Zyte Smart Proxy Manager") 537 | if targets_zyte_api 538 | else ("Zyte Smart 
Proxy Manager", "Zyte API") 539 | ) 540 | logger.warning( 541 | ( 542 | "Dropping header %r (%r) from request %r, as this " 543 | "request is proxied with %s and not with %s, and " 544 | "automatic translation is not supported for this " 545 | "header. See " 546 | "https://docs.zyte.com/zyte-api/migration/zyte/" 547 | "smartproxy.html#parameter-mapping" 548 | " to learn the right way to translate this header " 549 | "manually." 550 | ), 551 | header, 552 | value, 553 | request, 554 | actual_target, 555 | header_target, 556 | ) 557 | else: 558 | logger.warning( 559 | ( 560 | "Dropping header {header!r} ({value!r}) from request " 561 | "{request!r}, as this request is not handled by " 562 | "scrapy-zyte-smartproxy. If you are sure that you need " 563 | "to send this header in a request not handled by " 564 | "scrapy-zyte-smartproxy, use the " 565 | "ZYTE_SMARTPROXY_KEEP_HEADERS setting." 566 | ).format( 567 | header=header, 568 | value=value, 569 | request=request, 570 | ) 571 | ) 572 | 573 | def _drop_header(self, header_name, prefixes): 574 | if not header_name: 575 | return False 576 | header_name_lowercase = header_name.lower() 577 | has_drop_prefix = any( 578 | header_name_lowercase.startswith(prefix) for prefix in prefixes 579 | ) 580 | if ( 581 | has_drop_prefix 582 | # When dropping all prefixes, always drop matching headers, i.e. 583 | # ignore self.spm_bc_headers. 584 | and len(prefixes) <= 1 585 | and header_name_lowercase in self.spm_bc_headers 586 | ): 587 | logger.warning( 588 | "Keeping deprecated header {header_name!r}.".format( 589 | header_name=header_name 590 | ) 591 | ) 592 | return False 593 | return has_drop_prefix 594 | 595 | def _set_zyte_smartproxy_default_headers(self, request): 596 | for header, value in self._headers: 597 | if value is None: 598 | continue 599 | request.headers.setdefault(header, value) 600 | lower_case_headers = [ 601 | header.decode("utf-8").lower() for header in request.headers 602 | ] 603 | if all(h.lower() in lower_case_headers for h in self.conflicting_headers): 604 | # Send a general warning once, 605 | # and specific urls if LOG_LEVEL = DEBUG 606 | warnings.warn( 607 | "The headers %s are conflicting on some of your requests. " 608 | "Please check " 609 | "https://docs.zyte.com/smart-proxy-manager.html" 610 | "#request-headers " 611 | "for more information. You can set LOG_LEVEL=DEBUG to see the " 612 | "urls with problems." % str(self.conflicting_headers) 613 | ) 614 | logger.debug( 615 | "The headers %s are conflicting on request %s. X-Crawlera-UA " 616 | "will be ignored. 
Please check " 617 | "https://docs.zyte.com/smart-proxy-manager.html#request-headers " 618 | "for more information" % (str(self.conflicting_headers), request.url), 619 | extra={"spider": self.spider}, 620 | ) 621 | -------------------------------------------------------------------------------- /scrapy_zyte_smartproxy/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | from itertools import count 4 | 5 | 6 | def exp_backoff(step, max): 7 | """Exponential backoff time with Full Jitter""" 8 | # this is a numerically stable version of 9 | # random.uniform(0, min(max, step * 2 ** attempt)) 10 | max_attempts = math.log(max / step, 2) 11 | for attempt in count(0, 1): 12 | if attempt <= max_attempts: 13 | yield random.uniform(0, step * 2**attempt) # nosec 14 | else: 15 | yield random.uniform(0, max) # nosec 16 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | 4 | [mypy] 5 | 6 | [mypy-pytest.*] 7 | ignore_missing_imports = True 8 | 9 | [mypy-scrapy.*] 10 | ignore_missing_imports = True 11 | 12 | [mypy-twisted.*] 13 | ignore_missing_imports = True 14 | 15 | [mypy-w3lib.*] 16 | ignore_missing_imports = True 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("README.rst", "rb") as f: 4 | readme = f.read().decode("utf-8") 5 | 6 | 7 | setup( 8 | name="scrapy-zyte-smartproxy", 9 | version="2.4.1", 10 | license="BSD", 11 | description="Scrapy middleware for Zyte Smart Proxy Manager", 12 | long_description=readme, 13 | long_description_content_type="text/x-rst", 14 | maintainer="Raul Gallegos", 15 | maintainer_email="raul.ogh@gmail.com", 16 | author="Zyte", 17 | author_email="opensource@zyte.com", 18 | url="https://github.com/scrapy-plugins/scrapy-zyte-smartproxy", 19 | packages=["scrapy_zyte_smartproxy"], 20 | platforms=["Any"], 21 | classifiers=[ 22 | "Development Status :: 5 - Production/Stable", 23 | "License :: OSI Approved :: BSD License", 24 | "Operating System :: OS Independent", 25 | "Programming Language :: Python", 26 | "Programming Language :: Python :: 2.7", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: 3.12", 31 | "Programming Language :: Python :: 3.13", 32 | "Framework :: Scrapy", 33 | "Intended Audience :: Developers", 34 | "Topic :: Internet :: WWW/HTTP", 35 | "Topic :: Internet :: Proxy Servers", 36 | "Topic :: Software Development :: Libraries :: Application Frameworks", 37 | "Topic :: Software Development :: Libraries :: Python Modules", 38 | ], 39 | install_requires=["scrapy>=1.4.0", "six", "w3lib"], 40 | ) 41 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-zyte-smartproxy/a136ad4f83465db25dddaa128a23a5d97873070a/tests/__init__.py -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-cov 3 | mock; 
python_version == '2.7' 4 | -------------------------------------------------------------------------------- /tests/test_all.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import binascii 4 | import os 5 | from copy import copy 6 | from random import choice 7 | from unittest import TestCase 8 | 9 | import pytest 10 | 11 | try: 12 | from unittest.mock import call, patch # type: ignore 13 | except ImportError: 14 | from mock import call, patch # type: ignore 15 | 16 | from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware 17 | from scrapy.exceptions import ScrapyDeprecationWarning 18 | from scrapy.http import Request, Response 19 | from scrapy.resolver import dnscache 20 | from scrapy.spiders import Spider 21 | from scrapy.utils.test import get_crawler 22 | from twisted.internet.error import ConnectionDone, ConnectionRefusedError 23 | from w3lib.http import basic_auth_header 24 | 25 | from scrapy_zyte_smartproxy import ZyteSmartProxyMiddleware, __version__ 26 | 27 | RESPONSE_IDENTIFYING_HEADERS = ( 28 | ("X-Crawlera-Version", None), 29 | ("X-Crawlera-Version", ""), 30 | ("X-Crawlera-Version", "1.36.3-cd5e44"), 31 | ("Zyte-Request-Id", "123456789"), 32 | ) 33 | 34 | 35 | class MockedSlot(object): 36 | 37 | def __init__(self, delay=0.0): 38 | self.delay = delay 39 | 40 | 41 | class ZyteSmartProxyMiddlewareTestCase(TestCase): 42 | 43 | mwcls = ZyteSmartProxyMiddleware 44 | bancode = 503 45 | auth_error_code = 407 46 | 47 | def setUp(self): 48 | self.spider = Spider("foo") 49 | self.settings = {"ZYTE_SMARTPROXY_APIKEY": "apikey"} 50 | Response_init_orig = Response.__init__ 51 | 52 | def Response_init_new(self, *args, **kwargs): 53 | assert not kwargs.get( 54 | "request" 55 | ), "response objects at this stage shall not be pinned" 56 | return Response_init_orig(self, *args, **kwargs) 57 | 58 | Response.__init__ = Response_init_new 59 | 60 | def _mock_zyte_smartproxy_response(self, url, headers=None, **kwargs): 61 | headers = headers or {} 62 | k, v = choice(RESPONSE_IDENTIFYING_HEADERS) 63 | headers[k] = v 64 | return Response(url, headers=headers, **kwargs) 65 | 66 | def _mock_crawler(self, spider, settings=None): 67 | 68 | class MockedDownloader(object): 69 | slots = {} 70 | 71 | class MockedEngine(object): 72 | downloader = MockedDownloader() 73 | fake_spider_closed_result = None 74 | 75 | def close_spider(self, spider, reason): 76 | self.fake_spider_closed_result = (spider, reason) 77 | 78 | # with `spider` instead of `type(spider)` raises an exception 79 | crawler = get_crawler(type(spider), settings) 80 | crawler.engine = MockedEngine() 81 | return crawler 82 | 83 | def _assert_disabled(self, spider, settings=None): 84 | crawler = self._mock_crawler(spider, settings) 85 | mw = self.mwcls.from_crawler(crawler) 86 | mw.open_spider(spider) 87 | req = Request("http://example.com") 88 | out = mw.process_request(req, spider) 89 | self.assertEqual(out, None) 90 | self.assertEqual(req.meta.get("proxy"), None) 91 | self.assertEqual(req.meta.get("download_timeout"), None) 92 | self.assertEqual(req.headers.get("Proxy-Authorization"), None) 93 | res = Response(req.url) 94 | assert mw.process_response(req, res, spider) is res 95 | res = Response(req.url, status=mw.ban_code) 96 | assert mw.process_response(req, res, spider) is res 97 | 98 | def _assert_enabled( 99 | self, 100 | spider, 101 | settings=None, 102 | proxyurl="http://proxy.zyte.com:8011", 103 | proxyurlcreds="http://apikey:@proxy.zyte.com:8011", 104 | 
proxyauth=basic_auth_header("apikey", ""), 105 | maxbans=400, 106 | download_timeout=190, 107 | ): 108 | crawler = self._mock_crawler(spider, settings) 109 | mw = self.mwcls.from_crawler(crawler) 110 | mw.open_spider(spider) 111 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 112 | assert mw.url == proxyurl 113 | req = Request("http://example.com") 114 | assert mw.process_request(req, spider) is None 115 | self.assertEqual(req.meta.get("proxy"), proxyurlcreds) 116 | self.assertEqual(req.meta.get("download_timeout"), download_timeout) 117 | self.assertNotIn(b"Proxy-Authorization", req.headers) 118 | 119 | res = self._mock_zyte_smartproxy_response(req.url) 120 | assert mw.process_response(req, res, spider) is res 121 | 122 | # disabled if 'dont_proxy=True' is set 123 | req = Request("http://example.com") 124 | req.meta["dont_proxy"] = True 125 | assert mw.process_request(req, spider) is None 126 | assert httpproxy.process_request(req, spider) is None 127 | self.assertEqual(req.meta.get("proxy"), None) 128 | self.assertEqual(req.meta.get("download_timeout"), None) 129 | self.assertNotIn(b"Proxy-Authorization", req.headers) 130 | res = self._mock_zyte_smartproxy_response(req.url) 131 | assert mw.process_response(req, res, spider) is res 132 | 133 | del req.meta["dont_proxy"] 134 | assert mw.process_request(req, spider) is None 135 | assert httpproxy.process_request(req, spider) is None 136 | self.assertEqual(req.meta.get("proxy"), proxyurl) 137 | self.assertEqual(req.meta.get("download_timeout"), download_timeout) 138 | self.assertEqual(req.headers.get("Proxy-Authorization"), proxyauth) 139 | 140 | if maxbans > 0: 141 | # assert ban count is reset after a successful response 142 | res = self._mock_zyte_smartproxy_response( 143 | "http://banned.example", status=self.bancode 144 | ) 145 | assert mw.process_response(req, res, spider) is res 146 | self.assertEqual(crawler.engine.fake_spider_closed_result, None) 147 | res = self._mock_zyte_smartproxy_response("http://unbanned.example") 148 | assert mw.process_response(req, res, spider) is res 149 | self.assertEqual(crawler.engine.fake_spider_closed_result, None) 150 | self.assertEqual(mw._bans[None], 0) 151 | 152 | # check for not banning before maxbans for bancode 153 | for x in range(maxbans + 1): 154 | self.assertEqual(crawler.engine.fake_spider_closed_result, None) 155 | res = self._mock_zyte_smartproxy_response( 156 | "http://banned.example/%d" % x, 157 | status=self.bancode, 158 | headers={"X-Crawlera-Error": "banned"}, 159 | ) 160 | assert mw.process_response(req, res, spider) is res 161 | assert res.headers["X-Crawlera-Error"] == b"banned" 162 | assert res.headers["Zyte-Error-Type"] == b"banned" 163 | 164 | # max bans reached and close_spider called 165 | self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, "banned")) 166 | 167 | def test_disabled_by_lack_of_zyte_smartproxy_settings(self): 168 | self._assert_disabled(self.spider, settings={}) 169 | 170 | def test_spider_zyte_smartproxy_enabled(self): 171 | self.assertFalse(hasattr(self.spider, "zyte_smartproxy_enabled")) 172 | self._assert_disabled(self.spider, self.settings) 173 | self.spider.zyte_smartproxy_enabled = True 174 | self._assert_enabled(self.spider, self.settings) 175 | self.spider.zyte_smartproxy_enabled = False 176 | self._assert_disabled(self.spider, self.settings) 177 | 178 | def test_enabled(self): 179 | self._assert_disabled(self.spider, self.settings) 180 | self.settings["ZYTE_SMARTPROXY_ENABLED"] = True 181 | self._assert_enabled(self.spider, 
self.settings) 182 | 183 | def test_spider_zyte_smartproxy_enabled_priority(self): 184 | self.spider.zyte_smartproxy_enabled = False 185 | self.settings["ZYTE_SMARTPROXY_ENABLED"] = True 186 | self._assert_disabled(self.spider, self.settings) 187 | 188 | self.spider.zyte_smartproxy_enabled = True 189 | self.settings["ZYTE_SMARTPROXY_ENABLED"] = False 190 | self._assert_enabled(self.spider, self.settings) 191 | 192 | self.spider.zyte_smartproxy_enabled = True 193 | self.settings["ZYTE_SMARTPROXY_ENABLED"] = True 194 | self._assert_enabled(self.spider, self.settings) 195 | 196 | self.spider.zyte_smartproxy_enabled = False 197 | self.settings["ZYTE_SMARTPROXY_ENABLED"] = False 198 | self._assert_disabled(self.spider, self.settings) 199 | 200 | def test_apikey(self): 201 | self.spider.zyte_smartproxy_enabled = True 202 | self.settings["ZYTE_SMARTPROXY_APIKEY"] = apikey = "apikey" 203 | proxyauth = basic_auth_header(apikey, "") 204 | self._assert_enabled( 205 | self.spider, 206 | self.settings, 207 | proxyauth=proxyauth, 208 | proxyurlcreds="http://apikey:@proxy.zyte.com:8011", 209 | ) 210 | 211 | apikey = "notfromsettings" 212 | proxyauth = basic_auth_header(apikey, "") 213 | self.spider.zyte_smartproxy_apikey = apikey 214 | self._assert_enabled( 215 | self.spider, 216 | self.settings, 217 | proxyauth=proxyauth, 218 | proxyurlcreds="http://notfromsettings:@proxy.zyte.com:8011", 219 | ) 220 | 221 | def test_proxyurl(self): 222 | self.spider.zyte_smartproxy_enabled = True 223 | self.settings["ZYTE_SMARTPROXY_URL"] = "http://localhost:8011" 224 | self._assert_enabled( 225 | self.spider, 226 | self.settings, 227 | proxyurl="http://localhost:8011", 228 | proxyurlcreds="http://apikey:@localhost:8011", 229 | ) 230 | 231 | def test_proxyurl_no_protocol(self): 232 | self.spider.zyte_smartproxy_enabled = True 233 | self.settings["ZYTE_SMARTPROXY_URL"] = "localhost:8011" 234 | self._assert_enabled( 235 | self.spider, 236 | self.settings, 237 | proxyurl="http://localhost:8011", 238 | proxyurlcreds="http://apikey:@localhost:8011", 239 | ) 240 | 241 | def test_proxyurl_https(self): 242 | self.spider.zyte_smartproxy_enabled = True 243 | self.settings["ZYTE_SMARTPROXY_URL"] = "https://localhost:8011" 244 | self._assert_enabled( 245 | self.spider, 246 | self.settings, 247 | proxyurl="https://localhost:8011", 248 | proxyurlcreds="https://apikey:@localhost:8011", 249 | ) 250 | 251 | def test_proxyurl_including_noconnect(self): 252 | self.spider.zyte_smartproxy_enabled = True 253 | self.settings["ZYTE_SMARTPROXY_URL"] = "http://localhost:8011?noconnect" 254 | self._assert_enabled( 255 | self.spider, 256 | self.settings, 257 | proxyurl="http://localhost:8011?noconnect", 258 | proxyurlcreds="http://apikey:@localhost:8011?noconnect", 259 | ) 260 | 261 | def test_maxbans(self): 262 | self.spider.zyte_smartproxy_enabled = True 263 | self.settings["ZYTE_SMARTPROXY_MAXBANS"] = maxbans = 0 264 | self._assert_enabled(self.spider, self.settings, maxbans=maxbans) 265 | self.settings["ZYTE_SMARTPROXY_MAXBANS"] = maxbans = 100 266 | self._assert_enabled(self.spider, self.settings, maxbans=maxbans) 267 | # Assert setting is coerced into correct type 268 | self.settings["ZYTE_SMARTPROXY_MAXBANS"] = "123" 269 | self._assert_enabled(self.spider, self.settings, maxbans=123) 270 | self.spider.zyte_smartproxy_maxbans = 99 271 | self._assert_enabled(self.spider, self.settings, maxbans=99) 272 | 273 | def test_download_timeout(self): 274 | self.spider.zyte_smartproxy_enabled = True 275 | 
self.settings["ZYTE_SMARTPROXY_DOWNLOAD_TIMEOUT"] = 60 276 | self._assert_enabled(self.spider, self.settings, download_timeout=60) 277 | # Assert setting is coerced into correct type 278 | self.settings["ZYTE_SMARTPROXY_DOWNLOAD_TIMEOUT"] = "42" 279 | self._assert_enabled(self.spider, self.settings, download_timeout=42) 280 | self.spider.zyte_smartproxy_download_timeout = 120 281 | self._assert_enabled(self.spider, self.settings, download_timeout=120) 282 | 283 | def test_hooks(self): 284 | proxyauth = basic_auth_header("foo", "") 285 | 286 | class _ECLS(self.mwcls): 287 | def is_enabled(self, spider): 288 | wascalled.append("is_enabled") 289 | return enabled 290 | 291 | def get_proxyauth(self, spider): 292 | wascalled.append("get_proxyauth") 293 | return proxyauth 294 | 295 | wascalled = [] 296 | self.mwcls = _ECLS 297 | 298 | # test is_enabled returns False 299 | enabled = False 300 | self.spider.zyte_smartproxy_enabled = True 301 | self._assert_disabled(self.spider, self.settings) 302 | self.assertEqual(wascalled, ["is_enabled"]) 303 | 304 | wascalled[:] = [] # reset 305 | enabled = True 306 | self.spider.zyte_smartproxy_enabled = False 307 | self._assert_enabled( 308 | self.spider, 309 | self.settings, 310 | proxyauth=proxyauth, 311 | proxyurlcreds="http://foo:@proxy.zyte.com:8011", 312 | ) 313 | self.assertEqual(wascalled, ["is_enabled", "get_proxyauth"]) 314 | 315 | def test_delay_adjustment(self): 316 | delay = 0.5 317 | slot_key = "example.com" 318 | url = "http://example.com" 319 | ban_url = "http://banned.example" 320 | 321 | self.spider.zyte_smartproxy_enabled = True 322 | 323 | crawler = self._mock_crawler(self.spider, self.settings) 324 | # ignore spider delay by default 325 | self.spider.download_delay = delay 326 | mw = self.mwcls.from_crawler(crawler) 327 | mw.open_spider(self.spider) 328 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 329 | self.assertEqual(self.spider.download_delay, 0) 330 | 331 | # preserve original delay 332 | self.spider.download_delay = delay 333 | self.spider.zyte_smartproxy_preserve_delay = True 334 | mw = self.mwcls.from_crawler(crawler) 335 | mw.open_spider(self.spider) 336 | self.assertEqual(self.spider.download_delay, delay) 337 | 338 | slot = MockedSlot(self.spider.download_delay) 339 | crawler.engine.downloader.slots[slot_key] = slot 340 | 341 | # ban without retry-after 342 | req = Request(url, meta={"download_slot": slot_key}) 343 | assert mw.process_request(req, self.spider) is None 344 | assert httpproxy.process_request(req, self.spider) is None 345 | headers = {"X-Crawlera-Error": "banned"} 346 | res = self._mock_zyte_smartproxy_response( 347 | ban_url, 348 | status=self.bancode, 349 | headers=headers, 350 | ) 351 | mw.process_response(req, res, self.spider) 352 | self.assertEqual(slot.delay, delay) 353 | self.assertEqual(self.spider.download_delay, delay) 354 | 355 | # ban with retry-after 356 | retry_after = 1.5 357 | headers = {"retry-after": str(retry_after), "X-Crawlera-Error": "banned"} 358 | res = self._mock_zyte_smartproxy_response( 359 | ban_url, 360 | status=self.bancode, 361 | headers=headers, 362 | ) 363 | mw.process_response(req, res, self.spider) 364 | self.assertEqual(slot.delay, retry_after) 365 | self.assertEqual(self.spider.download_delay, delay) 366 | 367 | # DNS cache should be cleared in case of errors 368 | dnscache["proxy.zyte.com"] = "1.1.1.1" 369 | 370 | res = self._mock_zyte_smartproxy_response(url) 371 | mw.process_response(req, res, self.spider) 372 | self.assertEqual(slot.delay, delay) 373 | 
self.assertEqual(self.spider.download_delay, delay) 374 | self.assertIn("proxy.zyte.com", dnscache) 375 | 376 | # server failures 377 | mw.process_exception(req, ConnectionRefusedError(), self.spider) 378 | self.assertEqual(slot.delay, mw.connection_refused_delay) 379 | self.assertEqual(self.spider.download_delay, delay) 380 | self.assertNotIn("proxy.zyte.com", dnscache) 381 | 382 | dnscache["proxy.zyte.com"] = "1.1.1.1" 383 | res = self._mock_zyte_smartproxy_response(ban_url) 384 | mw.process_response(req, res, self.spider) 385 | self.assertEqual(slot.delay, delay) 386 | self.assertEqual(self.spider.download_delay, delay) 387 | self.assertIn("proxy.zyte.com", dnscache) 388 | 389 | mw.process_exception(req, ConnectionRefusedError(), self.spider) 390 | self.assertEqual(slot.delay, mw.connection_refused_delay) 391 | self.assertEqual(self.spider.download_delay, delay) 392 | self.assertNotIn("proxy.zyte.com", dnscache) 393 | 394 | dnscache["proxy.zyte.com"] = "1.1.1.1" 395 | res = self._mock_zyte_smartproxy_response(ban_url, status=self.bancode) 396 | mw.process_response(req, res, self.spider) 397 | self.assertEqual(slot.delay, delay) 398 | self.assertEqual(self.spider.download_delay, delay) 399 | self.assertIn("proxy.zyte.com", dnscache) 400 | 401 | mw.process_exception(req, ConnectionDone(), self.spider) 402 | self.assertEqual(slot.delay, mw.connection_refused_delay) 403 | self.assertEqual(self.spider.download_delay, delay) 404 | self.assertNotIn("proxy.zyte.com", dnscache) 405 | 406 | def test_process_exception_outside_zyte_smartproxy(self): 407 | self.spider.zyte_smartproxy_enabled = False 408 | crawler = self._mock_crawler(self.spider, self.settings) 409 | mw = self.mwcls.from_crawler(crawler) 410 | mw.open_spider(self.spider) 411 | 412 | req = Request("https://scrapy.org") 413 | assert mw.process_exception(req, ConnectionDone(), self.spider) is None 414 | 415 | def test_jobid_header(self): 416 | # test without the environment variable 'SCRAPY_JOB' 417 | self.spider.zyte_smartproxy_enabled = True 418 | crawler = self._mock_crawler(self.spider, self.settings) 419 | mw1 = self.mwcls.from_crawler(crawler) 420 | mw1.open_spider(self.spider) 421 | req1 = Request("http://example.com") 422 | self.assertEqual(mw1.process_request(req1, self.spider), None) 423 | self.assertEqual(req1.headers.get("X-Crawlera-Jobid"), None) 424 | self.assertEqual(req1.headers.get("Zyte-JobId"), None) 425 | 426 | # test with the environment variable 'SCRAPY_JOB' 427 | os.environ["SCRAPY_JOB"] = "2816" 428 | self.spider.zyte_smartproxy_enabled = True 429 | mw2 = self.mwcls.from_crawler(crawler) 430 | mw2.open_spider(self.spider) 431 | req2 = Request("http://example.com") 432 | self.assertEqual(mw2.process_request(req2, self.spider), None) 433 | self.assertEqual(req2.headers.get("X-Crawlera-Jobid"), b"2816") 434 | self.assertEqual(req2.headers.get("Zyte-JobId"), None) 435 | 436 | # Zyte API 437 | mw3 = self.mwcls.from_crawler(crawler) 438 | mw3.open_spider(self.spider) 439 | req3 = Request( 440 | "http://example.com", 441 | meta={ 442 | "proxy": "http://apikey:@api.zyte.com:8011", 443 | }, 444 | ) 445 | self.assertEqual(mw3.process_request(req3, self.spider), None) 446 | self.assertEqual(req3.headers.get("X-Crawlera-Jobid"), None) 447 | self.assertEqual(req3.headers.get("Zyte-JobId"), b"2816") 448 | del os.environ["SCRAPY_JOB"] 449 | 450 | def _test_stats(self, settings, prefix): 451 | self.spider.zyte_smartproxy_enabled = True 452 | spider = self.spider 453 | settings = copy(settings) 454 | 
settings["ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES"] = [555] 455 | crawler = self._mock_crawler(spider, settings) 456 | mw = self.mwcls.from_crawler(crawler) 457 | mw.open_spider(spider) 458 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 459 | 460 | req = Request("http://example.com") 461 | assert mw.process_request(req, spider) is None 462 | assert httpproxy.process_request(req, spider) is None 463 | self.assertEqual(crawler.stats.get_value("{}/request".format(prefix)), 1) 464 | self.assertEqual( 465 | crawler.stats.get_value("{}/request/method/GET".format(prefix)), 1 466 | ) 467 | 468 | res = self._mock_zyte_smartproxy_response(req.url) 469 | assert mw.process_response(req, res, spider) is res 470 | self.assertEqual(crawler.stats.get_value("{}/response".format(prefix)), 1) 471 | self.assertEqual( 472 | crawler.stats.get_value("{}/response/status/200".format(prefix)), 1 473 | ) 474 | 475 | req = Request("http://example.com/other", method="POST") 476 | assert mw.process_request(req, spider) is None 477 | assert httpproxy.process_request(req, spider) is None 478 | self.assertEqual(crawler.stats.get_value("{}/request".format(prefix)), 2) 479 | self.assertEqual( 480 | crawler.stats.get_value("{}/request/method/POST".format(prefix)), 1 481 | ) 482 | 483 | res = self._mock_zyte_smartproxy_response( 484 | req.url, status=mw.ban_code, headers={"Zyte-Error-Type": "somethingbad"} 485 | ) 486 | assert mw.process_response(req, res, spider) is res 487 | self.assertEqual(crawler.stats.get_value("{}/response".format(prefix)), 2) 488 | self.assertEqual( 489 | crawler.stats.get_value( 490 | "{}/response/status/{}".format(prefix, mw.ban_code) 491 | ), 492 | 1, 493 | ) 494 | self.assertEqual(crawler.stats.get_value("{}/response/error".format(prefix)), 1) 495 | self.assertEqual( 496 | crawler.stats.get_value("{}/response/error/somethingbad".format(prefix)), 1 497 | ) 498 | self.assertEqual(res.headers["X-Crawlera-Error"], b"somethingbad") 499 | self.assertEqual(res.headers["Zyte-Error-Type"], b"somethingbad") 500 | 501 | res = self._mock_zyte_smartproxy_response( 502 | req.url, 503 | status=mw.ban_code, 504 | headers={"X-Crawlera-Error": "banned", "Retry-After": "1"}, 505 | ) 506 | assert mw.process_response(req, res, spider) is res 507 | self.assertEqual(crawler.stats.get_value("{}/response".format(prefix)), 3) 508 | self.assertEqual( 509 | crawler.stats.get_value( 510 | "{}/response/status/{}".format(prefix, mw.ban_code) 511 | ), 512 | 2, 513 | ) 514 | self.assertEqual( 515 | crawler.stats.get_value("{}/response/banned".format(prefix)), 1 516 | ) 517 | self.assertEqual(res.headers["X-Crawlera-Error"], b"banned") 518 | self.assertEqual(res.headers["Zyte-Error-Type"], b"banned") 519 | 520 | res = self._mock_zyte_smartproxy_response( 521 | req.url, 522 | status=mw.ban_code, 523 | headers={"X-Crawlera-Error": "banned", "Retry-After": "1"}, 524 | ) 525 | slot_key = "example.com" 526 | crawler.engine.downloader.slots[slot_key] = MockedSlot() 527 | req.meta["download_slot"] = "example.com" 528 | assert mw.process_response(req, res, spider) is res 529 | del req.meta["download_slot"] 530 | self.assertEqual(crawler.stats.get_value("{}/delay/banned".format(prefix)), 1) 531 | self.assertEqual( 532 | crawler.stats.get_value("{}/delay/banned/total".format(prefix)), 1 533 | ) 534 | 535 | res = self._mock_zyte_smartproxy_response( 536 | req.url, 537 | status=407, 538 | headers={"X-Crawlera-Error": "bad_proxy_auth"}, 539 | ) 540 | assert isinstance(mw.process_response(req, res, spider), Request) 541 | 
self.assertEqual(crawler.stats.get_value("{}/retries/auth".format(prefix)), 1) 542 | 543 | res = self._mock_zyte_smartproxy_response( 544 | req.url, 545 | status=407, 546 | headers={"X-Crawlera-Error": "bad_proxy_auth"}, 547 | ) 548 | req.meta["zyte_smartproxy_auth_retry_times"] = 11 549 | assert mw.process_response(req, res, spider) is res 550 | del req.meta["zyte_smartproxy_auth_retry_times"] 551 | self.assertEqual(crawler.stats.get_value("{}/retries/auth".format(prefix)), 1) 552 | self.assertEqual( 553 | crawler.stats.get_value("{}/retries/auth/max_reached".format(prefix)), 1 554 | ) 555 | 556 | res = self._mock_zyte_smartproxy_response( 557 | req.url, 558 | status=555, 559 | ) 560 | req.meta["dont_proxy"] = True 561 | assert isinstance(mw.process_response(req, res, spider), Request) 562 | del req.meta["dont_proxy"] 563 | self.assertEqual( 564 | crawler.stats.get_value( 565 | "{}/retries/should_have_been_enabled".format(prefix) 566 | ), 567 | 1, 568 | ) 569 | 570 | def test_stats_spm(self): 571 | self._test_stats(self.settings, "zyte_smartproxy") 572 | 573 | def test_stats_zapi(self): 574 | settings = copy(self.settings) 575 | settings["ZYTE_SMARTPROXY_URL"] = "http://api.zyte.com:8011" 576 | self._test_stats(settings, "zyte_api_proxy") 577 | 578 | def _make_fake_request(self, spider, zyte_smartproxy_enabled, **kwargs): 579 | spider.zyte_smartproxy_enabled = zyte_smartproxy_enabled 580 | crawler = self._mock_crawler(spider, self.settings) 581 | mw = self.mwcls.from_crawler(crawler) 582 | mw.open_spider(spider) 583 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 584 | headers = { 585 | "X-Crawlera-Debug": True, 586 | "X-Crawlera-Foo": "foo", 587 | "X-Crawlera-Profile": "desktop", 588 | "User-Agent": "Scrapy", 589 | "": None, 590 | "Zyte-Bar": "bar", 591 | "Zyte-BrowserHtml": True, 592 | "Zyte-Geolocation": "foo", 593 | } 594 | req = Request("http://example.com", headers=headers, **kwargs) 595 | mw.process_request(req, spider) 596 | httpproxy.process_request(req, spider) 597 | return req 598 | 599 | @patch("scrapy_zyte_smartproxy.middleware.warnings") 600 | @patch("scrapy_zyte_smartproxy.middleware.logger") 601 | def test_zyte_smartproxy_default_headers_conflicting_headers( 602 | self, mock_logger, mock_warnings 603 | ): 604 | spider = self.spider 605 | self.spider.zyte_smartproxy_enabled = True 606 | 607 | self.settings["ZYTE_SMARTPROXY_DEFAULT_HEADERS"] = { 608 | "X-Crawlera-Profile": "desktop", 609 | } 610 | crawler = self._mock_crawler(spider, self.settings) 611 | mw = self.mwcls.from_crawler(crawler) 612 | mw.open_spider(spider) 613 | 614 | req = Request("http://example.com/other", headers={"X-Crawlera-UA": "desktop"}) 615 | assert mw.process_request(req, spider) is None 616 | self.assertEqual(req.headers["X-Crawlera-UA"], b"desktop") 617 | self.assertEqual(req.headers["X-Crawlera-Profile"], b"desktop") 618 | some_requests_warning = ( 619 | "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are " 620 | "conflicting on some of your requests. Please check " 621 | "https://docs.zyte.com/smart-proxy-manager.html#request-headers " 622 | "for more information. You can set LOG_LEVEL=DEBUG to see the " 623 | "urls with problems." 624 | ) 625 | mock_warnings.warn.assert_called_with(some_requests_warning) 626 | other_request_warning = ( 627 | "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are " 628 | "conflicting on request http://example.com/other. " 629 | "X-Crawlera-UA will be ignored. 
Please check " 630 | "https://docs.zyte.com/smart-proxy-manager.html#request-headers " 631 | "for more information" 632 | ) 633 | mock_logger.debug.assert_called_with( 634 | other_request_warning, extra={"spider": spider} 635 | ) 636 | 637 | # test it ignores case 638 | req = Request("http://example.com/other", headers={"x-crawlera-ua": "desktop"}) 639 | assert mw.process_request(req, spider) is None 640 | self.assertEqual(req.headers["X-Crawlera-UA"], b"desktop") 641 | self.assertEqual(req.headers["X-Crawlera-Profile"], b"desktop") 642 | mock_warnings.warn.assert_called_with(some_requests_warning) 643 | mock_logger.debug.assert_called_with( 644 | other_request_warning, extra={"spider": spider} 645 | ) 646 | 647 | def test_dont_proxy_false_does_nothing(self): 648 | spider = self.spider 649 | spider.zyte_smartproxy_enabled = True 650 | crawler = self._mock_crawler(spider, self.settings) 651 | mw = self.mwcls.from_crawler(crawler) 652 | mw.open_spider(spider) 653 | req = Request("http://example.com/other") 654 | req.meta["dont_proxy"] = False 655 | assert mw.process_request(req, spider) is None 656 | self.assertIsNotNone(req.meta.get("proxy")) 657 | 658 | def test_is_banned(self): 659 | self.spider.zyte_smartproxy_enabled = True 660 | crawler = self._mock_crawler(self.spider, self.settings) 661 | mw = self.mwcls.from_crawler(crawler) 662 | mw.open_spider(self.spider) 663 | req = self._make_fake_request(self.spider, zyte_smartproxy_enabled=True) 664 | 665 | res = Response(req.url, status=200) 666 | res = mw.process_response(req, res, self.spider) 667 | self.assertFalse(mw._is_banned(res)) 668 | res = Response(req.url, status=503, headers={"X-Crawlera-Error": "noslaves"}) 669 | res = mw.process_response(req, res, self.spider) 670 | self.assertFalse(mw._is_banned(res)) 671 | res = Response( 672 | req.url, 673 | status=503, 674 | headers={ 675 | "Zyte-Request-Id": "123456789", 676 | "Zyte-Error-Type": "/limits/over-global-limit", 677 | }, 678 | ) 679 | res = mw.process_response(req, res, self.spider) 680 | self.assertFalse(mw._is_banned(res)) 681 | 682 | res = Response(req.url, status=503, headers={"X-Crawlera-Error": "banned"}) 683 | res = mw.process_response(req, res, self.spider) 684 | self.assertTrue(mw._is_banned(res)) 685 | res = Response( 686 | req.url, 687 | status=520, 688 | headers={ 689 | "Zyte-Request-Id": "123456789", 690 | "Zyte-Error-Type": "/download/temporary-error", 691 | }, 692 | ) 693 | res = mw.process_response(req, res, self.spider) 694 | assert mw.crawler.stats.get_value("zyte_smartproxy/response/banned") == 1 695 | self.assertTrue(mw._is_banned(res)) 696 | res = Response( 697 | req.url, 698 | status=521, 699 | headers={ 700 | "Zyte-Request-Id": "123456789", 701 | "Zyte-Error-Type": "/download/internal-error", 702 | }, 703 | ) 704 | res = mw.process_response(req, res, self.spider) 705 | assert mw.crawler.stats.get_value("zyte_smartproxy/response/banned") == 2 706 | self.assertTrue(mw._is_banned(res)) 707 | 708 | @patch("random.uniform") 709 | def test_noslaves_delays(self, random_uniform_patch): 710 | # mock random.uniform to just return the max delay 711 | random_uniform_patch.side_effect = lambda x, y: y 712 | 713 | slot_key = "example.com" 714 | url = "http://example.com" 715 | ban_url = "http://banned.example" 716 | max_delay = 70 717 | backoff_step = 15 718 | default_delay = 0 719 | 720 | self.settings["ZYTE_SMARTPROXY_BACKOFF_STEP"] = backoff_step 721 | self.settings["ZYTE_SMARTPROXY_BACKOFF_MAX"] = max_delay 722 | 723 | self.spider.zyte_smartproxy_enabled = True 724 
| crawler = self._mock_crawler(self.spider, self.settings) 725 | mw = self.mwcls.from_crawler(crawler) 726 | mw.open_spider(self.spider) 727 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 728 | 729 | slot = MockedSlot() 730 | crawler.engine.downloader.slots[slot_key] = slot 731 | 732 | noslaves_req = Request(url, meta={"download_slot": slot_key}) 733 | assert mw.process_request(noslaves_req, self.spider) is None 734 | assert httpproxy.process_request(noslaves_req, self.spider) is None 735 | 736 | # delays grow exponentially with any throttling error 737 | noslaves_response = self._mock_zyte_smartproxy_response( 738 | ban_url, 739 | status=503, 740 | headers={"X-Crawlera-Error": "noslaves"}, 741 | ) 742 | mw.process_response(noslaves_req, noslaves_response, self.spider) 743 | self.assertEqual(slot.delay, backoff_step) 744 | 745 | over_use_limit_response = self._mock_zyte_smartproxy_response( 746 | ban_url, 747 | status=429, 748 | headers={"Zyte-Error-Type": "/limits/over-user-limit"}, 749 | ) 750 | mw.process_response(noslaves_req, over_use_limit_response, self.spider) 751 | self.assertEqual(slot.delay, backoff_step * 2**1) 752 | 753 | over_domain_limit_response = self._mock_zyte_smartproxy_response( 754 | ban_url, 755 | status=429, 756 | headers={"Zyte-Error-Type": "/limits/over-domain-limit"}, 757 | ) 758 | mw.process_response(noslaves_req, over_domain_limit_response, self.spider) 759 | self.assertEqual(slot.delay, backoff_step * 2**2) 760 | 761 | over_global_limit_response = self._mock_zyte_smartproxy_response( 762 | ban_url, 763 | status=503, 764 | headers={"Zyte-Error-Type": "/limits/over-global-limit"}, 765 | ) 766 | mw.process_response(noslaves_req, over_global_limit_response, self.spider) 767 | self.assertEqual(slot.delay, max_delay) 768 | 769 | # other responses reset delay 770 | ban_req = Request(url, meta={"download_slot": slot_key}) 771 | assert mw.process_request(ban_req, self.spider) is None 772 | assert httpproxy.process_request(ban_req, self.spider) is None 773 | ban_headers = {"X-Crawlera-Error": "banned"} 774 | ban_res = self._mock_zyte_smartproxy_response( 775 | ban_url, 776 | status=self.bancode, 777 | headers=ban_headers, 778 | ) 779 | mw.process_response(ban_req, ban_res, self.spider) 780 | self.assertEqual(slot.delay, default_delay) 781 | 782 | mw.process_response(noslaves_req, noslaves_response, self.spider) 783 | self.assertEqual(slot.delay, backoff_step) 784 | 785 | good_req = Request(url, meta={"download_slot": slot_key}) 786 | assert mw.process_request(good_req, self.spider) is None 787 | assert httpproxy.process_request(good_req, self.spider) is None 788 | good_res = self._mock_zyte_smartproxy_response( 789 | url, 790 | status=200, 791 | ) 792 | mw.process_response(good_req, good_res, self.spider) 793 | self.assertEqual(slot.delay, default_delay) 794 | 795 | @patch("random.uniform") 796 | def test_auth_error_retries(self, random_uniform_patch): 797 | # mock random.uniform to just return the max delay 798 | random_uniform_patch.side_effect = lambda x, y: y 799 | 800 | slot_key = "example.com" 801 | url = "http://example.com" 802 | ban_url = "http://auth.error" 803 | max_delay = 70 804 | backoff_step = 15 805 | 806 | self.settings["ZYTE_SMARTPROXY_BACKOFF_STEP"] = backoff_step 807 | self.settings["ZYTE_SMARTPROXY_BACKOFF_MAX"] = max_delay 808 | 809 | self.spider.zyte_smartproxy_enabled = True 810 | crawler = self._mock_crawler(self.spider, self.settings) 811 | mw = self.mwcls.from_crawler(crawler) 812 | mw.open_spider(self.spider) 813 | 
mw.max_auth_retry_times = 4 814 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 815 | 816 | slot = MockedSlot() 817 | crawler.engine.downloader.slots[slot_key] = slot 818 | 819 | auth_error_req = Request(url, meta={"download_slot": slot_key}) 820 | assert mw.process_request(auth_error_req, self.spider) is None 821 | assert httpproxy.process_request(auth_error_req, self.spider) is None 822 | auth_error_headers = {"X-Crawlera-Error": "bad_proxy_auth"} 823 | auth_error_response = self._mock_zyte_smartproxy_response( 824 | ban_url, status=self.auth_error_code, headers=auth_error_headers 825 | ) 826 | 827 | # delays grow exponentially, retry times increase accordingly 828 | req = mw.process_response(auth_error_req, auth_error_response, self.spider) 829 | self.assertEqual(slot.delay, backoff_step) 830 | retry_times = req.meta["zyte_smartproxy_auth_retry_times"] 831 | self.assertEqual(retry_times, 1) 832 | 833 | auth_error_req.meta["zyte_smartproxy_auth_retry_times"] = retry_times 834 | req = mw.process_response(auth_error_req, auth_error_response, self.spider) 835 | self.assertEqual(slot.delay, backoff_step * 2**1) 836 | retry_times = req.meta["zyte_smartproxy_auth_retry_times"] 837 | self.assertEqual(retry_times, 2) 838 | 839 | auth_error_req.meta["zyte_smartproxy_auth_retry_times"] = retry_times 840 | req = mw.process_response(auth_error_req, auth_error_response, self.spider) 841 | self.assertEqual(slot.delay, backoff_step * 2**2) 842 | retry_times = req.meta["zyte_smartproxy_auth_retry_times"] 843 | self.assertEqual(retry_times, 3) 844 | 845 | auth_error_req.meta["zyte_smartproxy_auth_retry_times"] = retry_times 846 | req = mw.process_response(auth_error_req, auth_error_response, self.spider) 847 | self.assertEqual(slot.delay, max_delay) 848 | retry_times = req.meta["zyte_smartproxy_auth_retry_times"] 849 | self.assertEqual(retry_times, 4) 850 | 851 | # Should return a response after the max number of retries 852 | auth_error_req.meta["zyte_smartproxy_auth_retry_times"] = retry_times 853 | res = mw.process_response(auth_error_req, auth_error_response, self.spider) 854 | self.assertIsInstance(res, Response) 855 | 856 | # A 407 response not coming directly from Zyte Smart Proxy Manager is 857 | # not retried 858 | non_zyte_smartproxy_407_response = self._mock_zyte_smartproxy_response( 859 | ban_url, 860 | status=self.auth_error_code, 861 | ) 862 | res = mw.process_response( 863 | auth_error_req, non_zyte_smartproxy_407_response, self.spider 864 | ) 865 | self.assertIsInstance(res, Response) 866 | 867 | @patch("scrapy_zyte_smartproxy.middleware.logger") 868 | def test_open_spider_logging(self, mock_logger): 869 | spider = self.spider 870 | self.spider.zyte_smartproxy_enabled = True 871 | crawler = self._mock_crawler(spider, self.settings) 872 | mw = self.mwcls.from_crawler(crawler) 873 | mw.open_spider(spider) 874 | expected_calls = [ 875 | call( 876 | "Using Zyte proxy service %s with an API key ending in %s" 877 | % (self.mwcls.url, "apikey"), 878 | extra={"spider": spider}, 879 | ), 880 | call( 881 | "ZyteSmartProxyMiddleware: disabling download delays in " 882 | "Scrapy to optimize delays introduced by Zyte proxy services. 
" 883 | "To avoid this behaviour you can use the " 884 | "ZYTE_SMARTPROXY_PRESERVE_DELAY setting, but keep in mind " 885 | "that this may slow down the crawl significantly", 886 | extra={"spider": spider}, 887 | ), 888 | ] 889 | assert mock_logger.info.call_args_list == expected_calls 890 | 891 | def test_process_response_enables_zyte_smartproxy(self): 892 | url = "https://scrapy.org" 893 | 894 | self.spider.zyte_smartproxy_enabled = False 895 | self.settings["ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES"] = [403] 896 | crawler = self._mock_crawler(self.spider, self.settings) 897 | mw = self.mwcls.from_crawler(crawler) 898 | mw.open_spider(self.spider) 899 | 900 | # A good code response should not enable it 901 | req = Request(url) 902 | res = Response(url, status=200) 903 | mw.process_request(req, self.spider) 904 | out = mw.process_response(req, res, self.spider) 905 | self.assertIsInstance(out, Response) 906 | self.assertEqual(mw.enabled_for_domain, {}) 907 | self.assertEqual(mw.enabled, False) 908 | self.assertEqual(mw.crawler.stats.get_stats(), {}) 909 | 910 | # A bad code response should enable it 911 | res = Response(url, status=403) 912 | mw.process_request(req, self.spider) 913 | out = mw.process_response(req, res, self.spider) 914 | self.assertIsInstance(out, Request) 915 | self.assertEqual(mw.enabled, False) 916 | self.assertEqual(mw.enabled_for_domain["scrapy.org"], True) 917 | self.assertEqual( 918 | mw.crawler.stats.get_stats(), 919 | { 920 | "zyte_smartproxy/retries/should_have_been_enabled": 1, 921 | }, 922 | ) 923 | 924 | # Another regular response with bad code should be done on Zyte Smart 925 | # Proxy Manager and not be retried 926 | res = Response(url, status=403) 927 | mw.process_request(req, self.spider) 928 | out = mw.process_response(req, res, self.spider) 929 | self.assertIsInstance(out, Response) 930 | self.assertEqual(mw.enabled, False) 931 | self.assertEqual(mw.enabled_for_domain["scrapy.org"], True) 932 | self.assertEqual(mw.crawler.stats.get_value("zyte_smartproxy/request"), 1) 933 | 934 | # A Zyte Smart Proxy Manager response with bad code should not be 935 | # retried as well 936 | mw.process_request(req, self.spider) 937 | res = self._mock_zyte_smartproxy_response(url, status=403) 938 | out = mw.process_response(req, res, self.spider) 939 | self.assertIsInstance(out, Response) 940 | self.assertEqual(mw.enabled, False) 941 | self.assertEqual(mw.enabled_for_domain["scrapy.org"], True) 942 | self.assertEqual(mw.crawler.stats.get_value("zyte_smartproxy/request"), 2) 943 | 944 | def test_process_response_from_file_scheme(self): 945 | url = "file:///tmp/foobar.txt" 946 | 947 | self.spider.zyte_smartproxy_enabled = False 948 | self.settings["ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES"] = [403] 949 | crawler = self._mock_crawler(self.spider, self.settings) 950 | mw = self.mwcls.from_crawler(crawler) 951 | mw.enabled_for_domain = {} 952 | mw.open_spider(self.spider) 953 | 954 | # A good code response should not enable it 955 | req = Request(url) 956 | res = Response(url, status=200) 957 | mw.process_request(req, self.spider) 958 | out = mw.process_response(req, res, self.spider) 959 | self.assertIsInstance(out, Response) 960 | self.assertEqual(mw.enabled_for_domain, {}) 961 | self.assertEqual(mw.enabled, False) 962 | self.assertEqual(mw.crawler.stats.get_stats(), {}) 963 | self.assertEqual(out.status, 200) 964 | 965 | @patch("scrapy_zyte_smartproxy.middleware.logger") 966 | def test_apikey_warning_zyte_smartproxy_disabled(self, mock_logger): 967 | 
self.spider.zyte_smartproxy_enabled = False 968 | settings = {} 969 | crawler = self._mock_crawler(self.spider, settings) 970 | mw = self.mwcls.from_crawler(crawler) 971 | mw.open_spider(self.spider) 972 | self.assertFalse(mw.enabled) 973 | mock_logger.warning.assert_not_called() 974 | 975 | @patch("scrapy_zyte_smartproxy.middleware.logger") 976 | def test_no_apikey_warning_zyte_smartproxy_enabled(self, mock_logger): 977 | self.spider.zyte_smartproxy_enabled = True 978 | settings = {} 979 | crawler = self._mock_crawler(self.spider, settings) 980 | mw = self.mwcls.from_crawler(crawler) 981 | mw.open_spider(self.spider) 982 | self.assertTrue(mw.enabled) 983 | mock_logger.warning.assert_called_with( 984 | "Zyte proxy services cannot be used without an API key", 985 | extra={"spider": self.spider}, 986 | ) 987 | 988 | @patch("scrapy_zyte_smartproxy.middleware.logger") 989 | def test_no_apikey_warning_force_enable(self, mock_logger): 990 | self.spider.zyte_smartproxy_enabled = False 991 | settings = {"ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES": [403]} 992 | crawler = self._mock_crawler(self.spider, settings) 993 | mw = self.mwcls.from_crawler(crawler) 994 | mw.open_spider(self.spider) 995 | self.assertFalse(mw.enabled) 996 | mock_logger.warning.assert_called_with( 997 | "Zyte proxy services cannot be used without an API key", 998 | extra={"spider": self.spider}, 999 | ) 1000 | 1001 | @patch("scrapy_zyte_smartproxy.middleware.logger") 1002 | def test_apikey_warning_force_enable(self, mock_logger): 1003 | self.spider.zyte_smartproxy_enabled = False 1004 | settings = { 1005 | "ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES": [403], 1006 | "ZYTE_SMARTPROXY_APIKEY": "apikey", 1007 | } 1008 | crawler = self._mock_crawler(self.spider, settings) 1009 | mw = self.mwcls.from_crawler(crawler) 1010 | mw.open_spider(self.spider) 1011 | self.assertFalse(mw.enabled) 1012 | mock_logger.warning.assert_not_called() 1013 | 1014 | def test_is_enabled_warnings(self): 1015 | self._assert_disabled(self.spider, self.settings) 1016 | self.settings["HUBPROXY_ENABLED"] = True 1017 | with pytest.warns(ScrapyDeprecationWarning) as record: 1018 | self._assert_enabled(self.spider, self.settings) 1019 | assert len(record) == 1 1020 | assert "HUBPROXY_ENABLED setting is deprecated" in str(record[0].message) 1021 | 1022 | del self.settings["HUBPROXY_ENABLED"] 1023 | self.spider.use_hubproxy = False 1024 | with pytest.warns(ScrapyDeprecationWarning) as record: 1025 | self._assert_disabled(self.spider, self.settings) 1026 | assert len(record) == 1 1027 | assert "use_hubproxy attribute is deprecated" in str(record[0].message) 1028 | 1029 | def test_settings_warnings(self): 1030 | self.spider.hubproxy_maxbans = 10 1031 | crawler = self._mock_crawler(self.spider, self.settings) 1032 | mw = self.mwcls.from_crawler(crawler) 1033 | with pytest.warns(ScrapyDeprecationWarning) as record: 1034 | mw.open_spider(self.spider) 1035 | assert len(record) == 1 1036 | assert "hubproxy_maxbans attribute is deprecated" in str(record[0].message) 1037 | del self.spider.hubproxy_maxbans 1038 | 1039 | self.settings["HUBPROXY_BACKOFF_MAX"] = 10 1040 | crawler = self._mock_crawler(self.spider, self.settings) 1041 | mw = self.mwcls.from_crawler(crawler) 1042 | with pytest.warns(ScrapyDeprecationWarning) as record: 1043 | mw.open_spider(self.spider) 1044 | assert len(record) == 1 1045 | assert "HUBPROXY_BACKOFF_MAX setting is deprecated" in str( 1046 | record[0].message 1047 | ) 1048 | 1049 | def test_no_slot(self): 1050 | url = "http://example.com" 1051 | 
ban_url = "http://banned.example" 1052 | 1053 | self.spider.zyte_smartproxy_enabled = True 1054 | crawler = self._mock_crawler(self.spider, self.settings) 1055 | mw = self.mwcls.from_crawler(crawler) 1056 | mw.open_spider(self.spider) 1057 | 1058 | # there are no slot named 'example.com' 1059 | noslaves_req = Request(url, meta={"download_slot": "example.com"}) 1060 | assert mw.process_request(noslaves_req, self.spider) is None 1061 | 1062 | headers = {"X-Crawlera-Error": "noslaves"} 1063 | noslaves_res = self._mock_zyte_smartproxy_response( 1064 | ban_url, 1065 | status=self.bancode, 1066 | headers=headers, 1067 | ) 1068 | # checking that response was processed 1069 | response = mw.process_response(noslaves_req, noslaves_res, self.spider) 1070 | assert response.status == 503 1071 | 1072 | def test_settings_dict(self): 1073 | self.spider.zyte_smartproxy_enabled = True 1074 | self.settings["ZYTE_SMARTPROXY_DEFAULT_HEADERS"] = { 1075 | "X-Crawlera-Profile": "desktop", 1076 | } 1077 | crawler = self._mock_crawler(self.spider, self.settings) 1078 | mw = self.mwcls.from_crawler(crawler) 1079 | # we don't have a dict settings yet, have to mess with protected 1080 | # property 1081 | mw._settings.append(("default_headers", dict)) 1082 | mw.open_spider(self.spider) 1083 | req = Request("http://example.com/other") 1084 | mw.process_request(req, self.spider) 1085 | assert mw.process_request(req, self.spider) is None 1086 | self.assertEqual(req.headers["X-Crawlera-Profile"], b"desktop") 1087 | 1088 | def test_client_header(self): 1089 | self.spider.zyte_smartproxy_enabled = True 1090 | crawler = self._mock_crawler(self.spider, self.settings) 1091 | mw = self.mwcls.from_crawler(crawler) 1092 | mw.open_spider(self.spider) 1093 | req1 = Request("http://example.com") 1094 | self.assertEqual(mw.process_request(req1, self.spider), None) 1095 | client = "scrapy-zyte-smartproxy/{}".format(__version__).encode() 1096 | self.assertEqual(req1.headers.get("X-Crawlera-Client"), client) 1097 | self.assertEqual(req1.headers.get("Zyte-Client"), None) 1098 | 1099 | req2 = Request( 1100 | "http://example.com", 1101 | meta={ 1102 | "proxy": "http://apikey:@api.zyte.com:8011", 1103 | }, 1104 | ) 1105 | self.assertEqual(mw.process_request(req2, self.spider), None) 1106 | self.assertEqual(req2.headers.get("X-Crawlera-Client"), None) 1107 | self.assertEqual(req2.headers.get("Zyte-Client"), client) 1108 | 1109 | def test_scrapy_httpproxy_integration(self): 1110 | self.spider.zyte_smartproxy_enabled = True 1111 | crawler = self._mock_crawler(self.spider, self.settings) 1112 | smartproxy = self.mwcls.from_crawler(crawler) 1113 | smartproxy.open_spider(self.spider) 1114 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 1115 | request = Request("https://example.com") 1116 | auth_header = basic_auth_header("apikey", "") 1117 | 1118 | # 1st pass 1119 | self.assertEqual(smartproxy.process_request(request, self.spider), None) 1120 | self.assertEqual(httpproxy.process_request(request, self.spider), None) 1121 | self.assertEqual(request.meta["proxy"], "http://proxy.zyte.com:8011") 1122 | self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header) 1123 | 1124 | # 2nd pass (e.g. 
retry or redirect) 1125 | self.assertEqual(smartproxy.process_request(request, self.spider), None) 1126 | self.assertEqual(httpproxy.process_request(request, self.spider), None) 1127 | self.assertEqual(request.meta["proxy"], "http://proxy.zyte.com:8011") 1128 | self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header) 1129 | 1130 | def test_subclass_non_basic_header(self): 1131 | 1132 | class Subclass(self.mwcls): 1133 | def get_proxyauth(self, spider): 1134 | return b"Non-Basic foo" 1135 | 1136 | self.spider.zyte_smartproxy_enabled = True 1137 | crawler = self._mock_crawler(self.spider, self.settings) 1138 | smartproxy = Subclass.from_crawler(crawler) 1139 | with pytest.raises(ValueError): 1140 | smartproxy.open_spider(self.spider) 1141 | 1142 | def test_subclass_basic_header_non_base64(self): 1143 | 1144 | class Subclass(self.mwcls): 1145 | def get_proxyauth(self, spider): 1146 | return b"Basic foo" 1147 | 1148 | self.spider.zyte_smartproxy_enabled = True 1149 | crawler = self._mock_crawler(self.spider, self.settings) 1150 | smartproxy = Subclass.from_crawler(crawler) 1151 | with pytest.raises((TypeError, binascii.Error)): 1152 | smartproxy.open_spider(self.spider) 1153 | 1154 | def test_subclass_basic_header_nonurlsafe_base64(self): 1155 | 1156 | class Subclass(self.mwcls): 1157 | def get_proxyauth(self, spider): 1158 | return b"Basic YWF+Og==" 1159 | 1160 | self.spider.zyte_smartproxy_enabled = True 1161 | crawler = self._mock_crawler(self.spider, self.settings) 1162 | smartproxy = Subclass.from_crawler(crawler) 1163 | smartproxy.open_spider(self.spider) 1164 | self.assertEqual(smartproxy._auth_url, "http://aa~:@proxy.zyte.com:8011") 1165 | 1166 | def test_subclass_basic_header_urlsafe_base64(self): 1167 | 1168 | class Subclass(self.mwcls): 1169 | def get_proxyauth(self, spider): 1170 | return b"Basic YWF-Og==" 1171 | 1172 | self.spider.zyte_smartproxy_enabled = True 1173 | crawler = self._mock_crawler(self.spider, self.settings) 1174 | smartproxy = Subclass.from_crawler(crawler) 1175 | smartproxy.open_spider(self.spider) 1176 | self.assertEqual(smartproxy._auth_url, "http://aa~:@proxy.zyte.com:8011") 1177 | 1178 | def test_response_headers(self): 1179 | self.spider.zyte_smartproxy_enabled = True 1180 | spider = self.spider 1181 | crawler = self._mock_crawler(spider, self.settings) 1182 | mw = self.mwcls.from_crawler(crawler) 1183 | mw.open_spider(spider) 1184 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 1185 | 1186 | req = Request("http://example.com") 1187 | assert mw.process_request(req, spider) is None 1188 | assert httpproxy.process_request(req, spider) is None 1189 | 1190 | count = 0 1191 | res = Response(req.url) 1192 | assert mw.process_response(req, res, spider) is res 1193 | self.assertEqual(crawler.stats.get_value("zyte_smartproxy/response"), None) 1194 | 1195 | for k, v in RESPONSE_IDENTIFYING_HEADERS: 1196 | count += 1 1197 | res = Response(req.url, headers={k: v}) 1198 | assert mw.process_response(req, res, spider) is res 1199 | self.assertEqual(crawler.stats.get_value("zyte_smartproxy/response"), count) 1200 | 1201 | def test_meta_copy(self): 1202 | """Warn when users copy the proxy key from one response to the next.""" 1203 | self.spider.zyte_smartproxy_enabled = True 1204 | crawler = self._mock_crawler(self.spider, self.settings) 1205 | smartproxy = self.mwcls.from_crawler(crawler) 1206 | smartproxy.open_spider(self.spider) 1207 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 1208 | auth_header = basic_auth_header("apikey", "") 1209 | 
1210 | request1 = Request("https://example.com/a") 1211 | self.assertEqual(smartproxy.process_request(request1, self.spider), None) 1212 | self.assertEqual(httpproxy.process_request(request1, self.spider), None) 1213 | self.assertEqual(request1.meta["proxy"], "http://proxy.zyte.com:8011") 1214 | self.assertEqual(request1.headers[b"Proxy-Authorization"], auth_header) 1215 | 1216 | request2 = Request("https://example.com/b", meta=dict(request1.meta)) 1217 | with patch("scrapy_zyte_smartproxy.middleware.logger") as logger: 1218 | self.assertEqual(smartproxy.process_request(request2, self.spider), None) 1219 | self.assertEqual(httpproxy.process_request(request2, self.spider), None) 1220 | self.assertEqual(request2.meta["proxy"], "http://proxy.zyte.com:8011") 1221 | self.assertEqual(request2.headers[b"Proxy-Authorization"], auth_header) 1222 | expected_calls = [ 1223 | call( 1224 | "The value of the 'proxy' meta key of request {request2} " 1225 | "has no API key. You seem to have copied the value of " 1226 | "the 'proxy' request meta key from a response or from a " 1227 | "different request. Copying request meta keys set by " 1228 | "middlewares from one request to another is a bad " 1229 | "practice that can cause issues.".format(request2=request2) 1230 | ), 1231 | ] 1232 | self.assertEqual(logger.warning.call_args_list, expected_calls) 1233 | 1234 | def test_manual_proxy_same(self): 1235 | """Defining the 'proxy' request meta key with the right URL has the 1236 | same effect as not defining it.""" 1237 | self.spider.zyte_smartproxy_enabled = True 1238 | crawler = self._mock_crawler(self.spider, self.settings) 1239 | smartproxy = self.mwcls.from_crawler(crawler) 1240 | smartproxy.open_spider(self.spider) 1241 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 1242 | auth_header = basic_auth_header("apikey", "") 1243 | 1244 | meta = {"proxy": "http://apikey:@proxy.zyte.com:8011"} 1245 | request = Request("https://example.com", meta=meta) 1246 | self.assertEqual(smartproxy.process_request(request, self.spider), None) 1247 | self.assertEqual(httpproxy.process_request(request, self.spider), None) 1248 | self.assertEqual(request.meta["proxy"], "http://proxy.zyte.com:8011") 1249 | self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header) 1250 | 1251 | def test_manual_proxy_without_api_key(self): 1252 | """Defining the 'proxy' request meta key with the right URL but missing 1253 | the API key triggers a warning, and causes the API key to be added.""" 1254 | self.spider.zyte_smartproxy_enabled = True 1255 | crawler = self._mock_crawler(self.spider, self.settings) 1256 | smartproxy = self.mwcls.from_crawler(crawler) 1257 | smartproxy.open_spider(self.spider) 1258 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 1259 | auth_header = basic_auth_header("apikey", "") 1260 | 1261 | meta = {"proxy": "http://proxy.zyte.com:8011"} 1262 | request = Request("https://example.com", meta=meta) 1263 | with patch("scrapy_zyte_smartproxy.middleware.logger") as logger: 1264 | self.assertEqual(smartproxy.process_request(request, self.spider), None) 1265 | self.assertEqual(httpproxy.process_request(request, self.spider), None) 1266 | self.assertEqual(request.meta["proxy"], "http://proxy.zyte.com:8011") 1267 | self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header) 1268 | expected_calls = [ 1269 | call( 1270 | "The value of the 'proxy' meta key of request {request} " 1271 | "has no API key. 
You seem to have copied the value of " 1272 | "the 'proxy' request meta key from a response or from a " 1273 | "different request. Copying request meta keys set by " 1274 | "middlewares from one request to another is a bad " 1275 | "practice that can cause issues.".format(request=request) 1276 | ), 1277 | ] 1278 | self.assertEqual(logger.warning.call_args_list, expected_calls) 1279 | 1280 | def test_manual_proxy_different(self): 1281 | """Setting a custom 'proxy' request meta with an unrelated proxy URL 1282 | prevents the middleware from making changes.""" 1283 | self.spider.zyte_smartproxy_enabled = True 1284 | crawler = self._mock_crawler(self.spider, self.settings) 1285 | smartproxy = self.mwcls.from_crawler(crawler) 1286 | smartproxy.open_spider(self.spider) 1287 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 1288 | 1289 | meta = {"proxy": "http://proxy.example.com:8011"} 1290 | request = Request("https://example.com", meta=meta) 1291 | self.assertEqual(smartproxy.process_request(request, self.spider), None) 1292 | self.assertEqual(httpproxy.process_request(request, self.spider), None) 1293 | self.assertEqual(request.meta["proxy"], "http://proxy.example.com:8011") 1294 | self.assertNotIn(b"Proxy-Authorization", request.headers) 1295 | 1296 | def test_manual_proxy_different_auth(self): 1297 | """Setting a custom 'proxy' request meta with a matching proxy URL 1298 | but a different key prevents the middleware from making changes.""" 1299 | self.spider.zyte_smartproxy_enabled = True 1300 | crawler = self._mock_crawler(self.spider, self.settings) 1301 | smartproxy = self.mwcls.from_crawler(crawler) 1302 | smartproxy.open_spider(self.spider) 1303 | httpproxy = HttpProxyMiddleware.from_crawler(crawler) 1304 | auth_header = basic_auth_header("altkey", "") 1305 | 1306 | meta = {"proxy": "http://altkey:@proxy.example.com:8011"} 1307 | request = Request("https://example.com", meta=meta) 1308 | self.assertEqual(smartproxy.process_request(request, self.spider), None) 1309 | self.assertEqual(httpproxy.process_request(request, self.spider), None) 1310 | self.assertEqual(request.meta["proxy"], "http://proxy.example.com:8011") 1311 | self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header) 1312 | 1313 | 1314 | def _merge_dicts(d1, d2): 1315 | d1.update(d2) 1316 | return d1 1317 | 1318 | 1319 | @pytest.mark.parametrize( 1320 | ("settings", "input_headers", "output_headers", "warnings"), 1321 | # Baseline 1322 | tuple( 1323 | ( 1324 | settings, 1325 | {b"Foo": b"Bar"}, 1326 | {b"Foo": b"Bar"}, 1327 | [], 1328 | ) 1329 | for settings in ( 1330 | {"ZYTE_SMARTPROXY_ENABLED": False}, # Plugin disabled 1331 | {}, # SPM 1332 | {"ZYTE_SMARTPROXY_URL": "http://api.zyte.com:8011"}, # Zyte API 1333 | ) 1334 | ) 1335 | # Plugin disabled 1336 | # 1337 | # When the plugin is disabled, by default all headers prefixed with 1338 | # X-Crawlera- or Zyte-, regardless of whether or not they are 1339 | # recognized, are dropped. 
1340 | + tuple( 1341 | ( 1342 | {"ZYTE_SMARTPROXY_ENABLED": False}, 1343 | {header: value}, 1344 | {}, 1345 | [ 1346 | "Dropping header {header!r} ({value!r})".format( 1347 | header=header, value=value 1348 | ) 1349 | ], 1350 | ) 1351 | for header in ( 1352 | b"X-Crawlera-Foo", 1353 | b"X-Crawlera-Client", 1354 | b"Zyte-Foo", 1355 | b"Zyte-Client", 1356 | ) 1357 | for value in (b"Bar",) 1358 | ) 1359 | # SPM → ZAPI 1360 | # 1361 | # Backward-compatible headers are kept as is, to let Zyte API do the 1362 | # best translation possible, which is especially important in cases 1363 | # where translation may not be 1:1 (X-Crawlera-Cookies, 1364 | # X-Crawlera-Session). 1365 | + tuple( 1366 | ( 1367 | {"ZYTE_SMARTPROXY_URL": "http://api.zyte.com:8011"}, 1368 | {header: value}, 1369 | {header: value}, 1370 | ["Keeping deprecated header {header!r}".format(header=header)], 1371 | ) 1372 | for header, value in ( 1373 | (b"X-Crawlera-Cookies", b"enable"), 1374 | (b"X-Crawlera-Jobid", b"00000/0/0"), 1375 | (b"X-Crawlera-Profile", b"desktop"), 1376 | (b"X-Crawlera-Profile-Pass", b"User-Agent"), 1377 | (b"X-Crawlera-Region", b"US"), 1378 | (b"X-Crawlera-Session", b"create"), 1379 | ) 1380 | ) 1381 | # Other headers, known or made up, are dropped with a warning. 1382 | + tuple( 1383 | ( 1384 | {"ZYTE_SMARTPROXY_URL": "http://api.zyte.com:8011"}, 1385 | {header: value}, 1386 | {}, 1387 | [ 1388 | "Dropping header {header!r} ({value!r})".format( 1389 | header=header, value=value 1390 | ) 1391 | ], 1392 | ) 1393 | for header, value in ( 1394 | (b"X-Crawlera-Timeout", b"40000"), 1395 | (b"X-Crawlera-Foo", b"Bar"), 1396 | ) 1397 | ) 1398 | # ZAPI → SPM 1399 | # 1400 | # We support some ZAPI → SPM translations, just because it was trivial 1401 | # to implement them originally. But there are no plans to extend them 1402 | # with more translations. There is no good reason for someone to send 1403 | # Zyte API proxy mode headers to SPM. 1404 | + tuple( 1405 | ( 1406 | {}, 1407 | {zyte_header: value}, 1408 | {spm_header: value}, 1409 | [ 1410 | ( 1411 | "Translating header {zyte_header!r} ({value!r}) " 1412 | "to {spm_header!r}" 1413 | ).format( 1414 | zyte_header=zyte_header.lower(), 1415 | value=value, 1416 | spm_header=spm_header.lower(), 1417 | ) 1418 | ], 1419 | ) 1420 | for zyte_header, spm_header, value in ( 1421 | (b"Zyte-Device", b"X-Crawlera-Profile", b"desktop"), 1422 | (b"Zyte-Geolocation", b"X-Crawlera-Region", b"US"), 1423 | (b"Zyte-Jobid", b"X-Crawlera-Jobid", b"00000/0/0"), 1424 | (b"Zyte-Override-Headers", b"X-Crawlera-Profile-Pass", b"User-Agent"), 1425 | ) 1426 | ) 1427 | # Other headers, known or made up, are dropped with a warning.
1428 | + tuple( 1429 | ( 1430 | {}, 1431 | {header: value}, 1432 | {}, 1433 | [ 1434 | "Dropping header {header!r} ({value!r})".format( 1435 | header=header, value=value 1436 | ) 1437 | ], 1438 | ) 1439 | for header, value in ( 1440 | (b"Zyte-Cookie-Management", b"enable"), 1441 | (b"Zyte-Foo", b"Bar"), 1442 | ) 1443 | ) 1444 | # ZYTE_SMARTPROXY_KEEP_HEADERS 1445 | + tuple( 1446 | ( 1447 | _merge_dicts({"ZYTE_SMARTPROXY_KEEP_HEADERS": True}, settings), 1448 | {header: value}, 1449 | {header: value}, 1450 | [], 1451 | ) 1452 | for header in ( 1453 | b"X-Crawlera-Foo", 1454 | b"X-Crawlera-Device", 1455 | b"Zyte-Foo", 1456 | b"Zyte-Device", 1457 | ) 1458 | for value in (b"mobile",) 1459 | for settings in ( 1460 | {"ZYTE_SMARTPROXY_ENABLED": False}, 1461 | {"ZYTE_SMARTPROXY_URL": "http://api.zyte.com:8011"}, 1462 | {}, 1463 | ) 1464 | ) 1465 | # ZYTE_SMARTPROXY_DEFAULT_HEADERS 1466 | + ( 1467 | ( 1468 | {"ZYTE_SMARTPROXY_DEFAULT_HEADERS": {"X-Crawlera-Profile": "desktop"}}, 1469 | {}, 1470 | {b"X-Crawlera-Profile": b"desktop"}, 1471 | [], 1472 | ), 1473 | ( 1474 | { 1475 | "ZYTE_SMARTPROXY_DEFAULT_HEADERS": {"X-Crawlera-Profile": "desktop"}, 1476 | "ZYTE_SMARTPROXY_URL": "http://apikey:@api.zyte.com:8011", 1477 | }, 1478 | {}, 1479 | {b"X-Crawlera-Profile": b"desktop"}, # Not translated to Zyte-Device 1480 | ["Keeping deprecated header"], 1481 | ), 1482 | ( 1483 | { 1484 | "ZYTE_SMARTPROXY_DEFAULT_HEADERS": { 1485 | "X-Crawlera-Profile": None, # ignored 1486 | "X-Crawlera-Cookies": "disable", 1487 | }, 1488 | }, 1489 | {}, 1490 | {b"X-Crawlera-Cookies": b"disable"}, 1491 | [], 1492 | ), 1493 | ( 1494 | { 1495 | "ZYTE_SMARTPROXY_DEFAULT_HEADERS": { 1496 | "X-Crawlera-Profile": None, # ignored 1497 | "X-Crawlera-Cookies": "disable", 1498 | }, 1499 | "ZYTE_SMARTPROXY_URL": "http://apikey:@api.zyte.com:8011", 1500 | }, 1501 | {}, 1502 | { 1503 | b"X-Crawlera-Cookies": b"disable" 1504 | }, # Not translated to Zyte-Cookie-Management 1505 | ["Keeping deprecated header"], 1506 | ), 1507 | ), 1508 | ) 1509 | def test_request_headers(settings, input_headers, output_headers, warnings, caplog): 1510 | settings = _merge_dicts( 1511 | { 1512 | "ZYTE_SMARTPROXY_APIKEY": "apikey", 1513 | "ZYTE_SMARTPROXY_ENABLED": True, 1514 | }, 1515 | settings, 1516 | ) 1517 | crawler = get_crawler(settings_dict=settings) 1518 | mw = ZyteSmartProxyMiddleware.from_crawler(crawler) 1519 | spider = Spider("foo") 1520 | mw.open_spider(spider) 1521 | 1522 | request = Request(url="https://example.com", headers=input_headers) 1523 | caplog.clear() 1524 | with caplog.at_level("WARNING"): 1525 | assert mw.process_request(request, spider) is None 1526 | actual_headers = { 1527 | k: b"".join(vs) 1528 | for k, vs in request.headers.items() 1529 | if k not in {b"X-Crawlera-Client", b"Zyte-Client"} 1530 | } 1531 | assert actual_headers == output_headers 1532 | 1533 | if warnings: 1534 | for warning in warnings: 1535 | assert warning in caplog.text 1536 | else: 1537 | assert not caplog.records 1538 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # tox.ini 2 | [tox] 3 | envlist = pre-commit,mypy,min,py27,py39,py310,py311,py312,py313,docs 4 | 5 | [testenv] 6 | deps = 7 | -rrequirements.txt 8 | -rtests/requirements.txt 9 | commands = 10 | py.test --doctest-modules --cov=scrapy_zyte_smartproxy --cov-report term-missing {posargs:scrapy_zyte_smartproxy tests} 11 | 12 | [testenv:pre-commit] 13 | deps = pre-commit 14 | 
commands = pre-commit run --all-files --show-diff-on-failure 15 | skip_install = true 16 | 17 | [testenv:mypy] 18 | basepython = python3.10 19 | deps = 20 | mypy[python2]<0.980 21 | pytest<4.7 22 | twisted<=20.3.0 23 | types-six<1.16.12 24 | Scrapy<2 25 | w3lib<2 26 | commands = 27 | mypy --py2 {posargs:scrapy_zyte_smartproxy tests} 28 | 29 | [testenv:min] 30 | basepython = python2.7 31 | deps = 32 | Scrapy==1.4.0 33 | six==1.10.0 34 | # https://github.com/scrapy/scrapy/blob/1.4.0/setup.py#L45 35 | Twisted==13.1.0 36 | w3lib==1.17.0 37 | -rtests/requirements.txt 38 | 39 | [testenv:security] 40 | deps = 41 | bandit 42 | commands = 43 | bandit -r {posargs:scrapy_zyte_smartproxy setup.py} 44 | 45 | [docs] 46 | changedir = docs 47 | deps = 48 | -rdocs/requirements.txt 49 | 50 | [testenv:docs] 51 | changedir = {[docs]changedir} 52 | deps = {[docs]deps} 53 | commands = 54 | sphinx-build -nW -b html . {envtmpdir}/html 55 | 56 | [testenv:twinecheck] 57 | basepython = python3 58 | deps = 59 | twine==6.1.0 60 | build==1.2.2.post1 61 | commands = 62 | python -m build --sdist 63 | twine check dist/* 64 | --------------------------------------------------------------------------------