├── .github └── workflows │ ├── publish.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGES.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── codecov.yml ├── docs ├── Makefile ├── _ext │ └── __init__.py ├── changelog.rst ├── conf.py ├── contributing.rst ├── index.rst ├── intro │ ├── basic.rst │ └── install.rst ├── ref │ ├── api.rst │ └── cli.rst ├── requirements.txt └── use │ ├── api.rst │ ├── cli.rst │ └── key.rst ├── pyproject.toml ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── mockserver.py ├── test_async.py ├── test_main.py ├── test_retry.py ├── test_sync.py └── test_utils.py ├── tox.ini └── zyte_api ├── __init__.py ├── __main__.py ├── __version__.py ├── _async.py ├── _errors.py ├── _retry.py ├── _sync.py ├── _utils.py ├── aio ├── __init__.py ├── client.py ├── errors.py └── retry.py ├── apikey.py ├── constants.py ├── errors.py ├── stats.py └── utils.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | push: 4 | tags: 5 | - '[0-9]+.[0-9]+.[0-9]+' 6 | jobs: 7 | publish: 8 | runs-on: ubuntu-latest 9 | environment: 10 | name: pypi 11 | url: https://pypi.org/p/zyte-api 12 | permissions: 13 | id-token: write 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: 3.13 19 | - run: | 20 | python -m pip install --upgrade build 21 | python -m build 22 | - name: Publish to PyPI 23 | uses: pypa/gh-action-pypi-publish@release/v1 24 | with: 25 | password: ${{ secrets.PYPI_TOKEN }} 26 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: 
https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: tox 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | test: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install tox 31 | - name: tox 32 | run: | 33 | tox -e py 34 | - name: coverage 35 | if: ${{ success() }} 36 | uses: codecov/codecov-action@v4.0.1 37 | with: 38 | token: ${{ secrets.CODECOV_TOKEN }} 39 | 40 | check: 41 | runs-on: ubuntu-latest 42 | strategy: 43 | fail-fast: false 44 | matrix: 45 | python-version: ['3.12'] # Keep in sync with .readthedocs.yml 46 | tox-job: ["mypy", "docs"] 47 | 48 | steps: 49 | - uses: actions/checkout@v4 50 | - name: Set up Python ${{ matrix.python-version }} 51 | uses: actions/setup-python@v5 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | - name: Install dependencies 55 | run: | 56 | python -m pip install --upgrade pip 57 | python -m pip install tox 58 | - name: tox 59 | run: | 60 | tox -e ${{ matrix.tox-job }} 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | *.pyc 3 | *.pyo 4 | /build/ 5 | /dist/ 6 | *.egg-info 7 | 8 | # Mac OS 9 | *.DS_Store 10 | 11 | # IDE 12 | /.idea/ 13 | 14 | .mypy_cache/ 15 | .cache/ 16 | .tox/ 17 | .pytest_cache/ 18 | .coverage 19 | .ipynb_checkpoints/ 20 | htmlcov/ 21 | notebooks/ 22 | coverage.xml 23 | _generated 24 | 25 | docs/_build/ 26 | docs/_autosummary/ 27 | 
-------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.9.6 4 | hooks: 5 | - id: ruff 6 | args: [ --fix ] 7 | - id: ruff-format 8 | - repo: https://github.com/adamchainz/blacken-docs 9 | rev: 1.19.0 10 | hooks: 11 | - id: blacken-docs 12 | additional_dependencies: 13 | - black==25.1.0 14 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: all 3 | sphinx: 4 | configuration: docs/conf.py 5 | build: 6 | os: ubuntu-22.04 7 | tools: 8 | # For available versions, see: 9 | # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python 10 | python: "3.12" # Keep in sync with .github/workflows/test.yml 11 | python: 12 | install: 13 | - requirements: docs/requirements.txt 14 | - path: . 15 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | Changes 2 | ======= 3 | 4 | 0.7.1 (2025-06-05) 5 | ------------------ 6 | 7 | * Restored and deprecated the ``temporary_download_error_stop()`` and 8 | ``temporary_download_error_wait()`` methods of :class:`~.RetryFactory` for 9 | backwards compatibility. 10 | 11 | 0.7.0 (2025-02-17) 12 | ------------------ 13 | 14 | * Dropped support for Python 3.8, added support for Python 3.12 and 3.13. 
15 | 16 | * Renamed some methods of :class:`~.RetryFactory` for consistency, since they 17 | now handle both temporary and permanent download errors: 18 | 19 | * ``temporary_download_error_stop()`` → 20 | :meth:`~.RetryFactory.download_error_stop` 21 | 22 | * ``temporary_download_error_wait()`` → 23 | :meth:`~.RetryFactory.download_error_wait` 24 | 25 | * Made the :ref:`default retry policy ` behave like the 26 | :ref:`aggressive retry policy `, but with half the 27 | retry attempts: 28 | 29 | * :ref:`Permanent download errors ` now also 30 | count towards the retry limit of :ref:`temporary download errors 31 | `. 32 | 33 | * Permanent download errors are now retried once. 34 | 35 | * Error responses with an HTTP status code in the 500-599 range (503, 520 and 36 | 521 excluded) are now retried once. 37 | 38 | * Fixed the session example of the :ref:`async API `. 39 | 40 | 0.6.0 (2024-05-29) 41 | ------------------ 42 | 43 | * Improved how the :ref:`default retry policy ` handles 44 | :ref:`temporary download errors `. 45 | Before, 3 HTTP 429 responses followed by a single HTTP 520 response would 46 | have prevented a retry. Now, unrelated responses and errors do not count 47 | towards the HTTP 520 retry limit. 48 | 49 | * Improved how the :ref:`default retry policy ` handles 50 | network errors. Before, after 15 minutes of unsuccessful responses (e.g. HTTP 51 | 429), any network error would prevent a retry. Now, network errors must happen 52 | 15 minutes in a row, without different errors in between, to stop retries. 53 | 54 | * Implemented an optional :ref:`aggressive retry policy 55 | `, which retries more errors more often, and could 56 | be useful for long crawls or websites with a low success rate. 57 | 58 | * Improved the exception that is raised when passing an invalid retrying policy 59 | object to a :ref:`Python client `. 
60 | 61 | 0.5.2 (2024-05-10) 62 | ------------------ 63 | 64 | * :class:`~zyte_api.RequestError` now has a :data:`~zyte_api.RequestError.query` 65 | attribute with the Zyte API request parameters that caused the error. 66 | 67 | 0.5.1 (2024-04-16) 68 | ------------------ 69 | 70 | * :class:`~zyte_api.ZyteAPI` and :class:`~zyte_api.AsyncZyteAPI` sessions no 71 | longer need to be used as context managers, and can instead be closed with a 72 | ``close()`` method. 73 | 74 | 0.5.0 (2024-04-05) 75 | ------------------ 76 | 77 | * Removed Python 3.7 support. 78 | 79 | * Added :class:`~zyte_api.ZyteAPI` and :class:`~zyte_api.AsyncZyteAPI` to 80 | provide both sync and async Python interfaces with a cleaner API. 81 | 82 | * Deprecated ``zyte_api.aio``: 83 | 84 | * Replace ``zyte_api.aio.client.AsyncClient`` with the new 85 | :class:`~zyte_api.AsyncZyteAPI` class. 86 | 87 | * Replace ``zyte_api.aio.client.create_session`` with the new 88 | :meth:`AsyncZyteAPI.session ` method. 89 | 90 | * Import ``zyte_api.aio.errors.RequestError``, 91 | ``zyte_api.aio.retry.RetryFactory`` and 92 | ``zyte_api.aio.retry.zyte_api_retrying`` directly from ``zyte_api`` now. 93 | 94 | * When using the command-line interface, you can now use ``--store-errors`` to 95 | have error responses be stored alongside successful responses. 96 | 97 | * Improved the documentation. 98 | 99 | 0.4.8 (2023-11-02) 100 | ------------------ 101 | 102 | * Include the Zyte API request ID value in a new ``.request_id`` attribute 103 | in ``zyte_api.aio.errors.RequestError``. 104 | 105 | 0.4.7 (2023-09-26) 106 | ------------------ 107 | 108 | * ``AsyncClient`` now lets you set a custom user agent to send to Zyte API. 109 | 110 | 0.4.6 (2023-09-26) 111 | ------------------ 112 | 113 | * Increased the client timeout to match the server’s. 114 | * Mentioned the ``api_key`` parameter of ``AsyncClient`` in the docs example. 
115 | 116 | 0.4.5 (2023-01-03) 117 | ------------------ 118 | 119 | * w3lib >= 2.1.1 is required in install_requires, to ensure that URLs 120 | are escaped properly. 121 | * unnecessary ``requests`` library is removed from install_requires 122 | * fixed tox 4 support 123 | 124 | 0.4.4 (2022-12-01) 125 | ------------------ 126 | 127 | * Fixed an issue with submitting URLs which contain unescaped symbols 128 | * New "retrying" argument for AsyncClient.__init__, which allows to set 129 | custom retrying policy for the client 130 | * ``--dont-retry-errors`` argument in the CLI tool 131 | 132 | 0.4.3 (2022-11-10) 133 | ------------------ 134 | 135 | * Connections are no longer reused between requests. 136 | This reduces the amount of ``ServerDisconnectedError`` exceptions. 137 | 138 | 0.4.2 (2022-10-28) 139 | ------------------ 140 | * Bump minimum ``aiohttp`` version to 3.8.0, as earlier versions don't support 141 | brotli decompression of responses 142 | * Declared Python 3.11 support 143 | 144 | 0.4.1 (2022-10-16) 145 | ------------------ 146 | 147 | * Network errors, like server timeouts or disconnections, are now retried for 148 | up to 15 minutes, instead of 5 minutes. 149 | 150 | 0.4.0 (2022-09-20) 151 | ------------------ 152 | 153 | * Require to install ``Brotli`` as a dependency. This changes the requests to 154 | have ``Accept-Encoding: br`` and automatically decompress brotli responses. 155 | 156 | 0.3.0 (2022-07-29) 157 | ------------------ 158 | 159 | Internal AggStats class is cleaned up: 160 | 161 | * ``AggStats.n_extracted_queries`` attribute is removed, as it was a duplicate 162 | of ``AggStats.n_results`` 163 | * ``AggStats.n_results`` is renamed to ``AggStats.n_success`` 164 | * ``AggStats.n_input_queries`` is removed as redundant and misleading; 165 | AggStats got a new ``AggStats.n_processed`` property instead. 166 | 167 | This change is backwards incompatible if you used stats directly. 
168 | 169 | 0.2.1 (2022-07-29) 170 | ------------------ 171 | 172 | * ``aiohttp.client_exceptions.ClientConnectorError`` is now treated as a 173 | network error and retried accordingly. 174 | * Removed the unused ``zyte_api.sync`` module. 175 | 176 | 0.2.0 (2022-07-14) 177 | ------------------ 178 | 179 | * Temporary download errors are now retried 3 times by default. 180 | They were not retried in previous releases. 181 | 182 | 0.1.4 (2022-05-21) 183 | ------------------ 184 | This release contains usability improvements to the command-line script: 185 | 186 | * Instead of ``python -m zyte_api`` you can now run it as ``zyte-api``; 187 | * the type of the input file (``--intype`` argument) is guessed now, 188 | based on file extension and content; .jl, .jsonl and .txt 189 | files are supported. 190 | 191 | 0.1.3 (2022-02-03) 192 | ------------------ 193 | 194 | * Minor documentation fix 195 | * Remove support for Python 3.6 196 | * Added support for Python 3.10 197 | 198 | 0.1.2 (2021-11-10) 199 | ------------------ 200 | 201 | * Default timeouts changed 202 | 203 | 204 | 0.1.1 (2021-11-01) 205 | ------------------ 206 | 207 | * CHANGES.rst updated properly 208 | 209 | 210 | 0.1.0 (2021-11-01) 211 | ------------------ 212 | 213 | * Initial release. 214 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Zyte Group Ltd 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. 
Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of Zyte nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGES.rst 2 | include LICENSE 3 | include README.rst 4 | 5 | recursive-include tests * 6 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | python-zyte-api 3 | =============== 4 | 5 | .. image:: https://img.shields.io/pypi/v/zyte-api.svg 6 | :target: https://pypi.python.org/pypi/zyte-api 7 | :alt: PyPI Version 8 | 9 | .. 
image:: https://img.shields.io/pypi/pyversions/zyte-api.svg 10 | :target: https://pypi.python.org/pypi/zyte-api 11 | :alt: Supported Python Versions 12 | 13 | .. image:: https://github.com/zytedata/python-zyte-api/actions/workflows/test.yml/badge.svg 14 | :target: https://github.com/zytedata/python-zyte-api/actions/workflows/test.yml 15 | :alt: Build Status 16 | 17 | .. image:: https://codecov.io/github/zytedata/zyte-api/coverage.svg?branch=master 18 | :target: https://codecov.io/gh/zytedata/zyte-api 19 | :alt: Coverage report 20 | 21 | .. description-start 22 | 23 | Command-line client and Python client library for `Zyte API`_. 24 | 25 | .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html 26 | 27 | .. description-end 28 | 29 | Installation 30 | ============ 31 | 32 | .. install-start 33 | 34 | .. code-block:: shell 35 | 36 | pip install zyte-api 37 | 38 | .. note:: Python 3.9+ is required. 39 | 40 | .. install-end 41 | 42 | Basic usage 43 | =========== 44 | 45 | .. basic-start 46 | 47 | Set your API key 48 | ---------------- 49 | 50 | .. key-get-start 51 | 52 | After you `sign up for a Zyte API account 53 | `_, copy `your API key 54 | `_. 55 | 56 | .. key-get-end 57 | 58 | 59 | Use the command-line client 60 | --------------------------- 61 | 62 | Then you can use the zyte-api command-line client to send Zyte API requests. 63 | First create a text file with a list of URLs: 64 | 65 | .. code-block:: none 66 | 67 | https://books.toscrape.com 68 | https://quotes.toscrape.com 69 | 70 | And then call ``zyte-api`` from your shell: 71 | 72 | .. code-block:: shell 73 | 74 | zyte-api url-list.txt --api-key YOUR_API_KEY --output results.jsonl 75 | 76 | 77 | Use the Python sync API 78 | ----------------------- 79 | 80 | For very basic Python scripts, use the sync API: 81 | 82 | .. 
code-block:: python 83 | 84 | from zyte_api import ZyteAPI 85 | 86 | client = ZyteAPI(api_key="YOUR_API_KEY") 87 | response = client.get({"url": "https://toscrape.com", "httpResponseBody": True}) 88 | 89 | 90 | Use the Python async API 91 | ------------------------ 92 | 93 | For asyncio code, use the async API: 94 | 95 | .. code-block:: python 96 | 97 | import asyncio 98 | 99 | from zyte_api import AsyncZyteAPI 100 | 101 | 102 | async def main(): 103 | client = AsyncZyteAPI(api_key="YOUR_API_KEY") 104 | response = await client.get( 105 | {"url": "https://toscrape.com", "httpResponseBody": True} 106 | ) 107 | 108 | 109 | asyncio.run(main()) 110 | 111 | .. basic-end 112 | 113 | Read the `documentation <https://python-zyte-api.readthedocs.io>`_ for more 114 | information. 115 | 116 | * Documentation: https://python-zyte-api.readthedocs.io 117 | * Source code: https://github.com/zytedata/python-zyte-api 118 | * Issue tracker: https://github.com/zytedata/python-zyte-api/issues 119 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: 2 | layout: "header, diff, tree" 3 | 4 | coverage: 5 | status: 6 | project: false 7 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_ext/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from docutils import nodes 4 | from docutils.parsers.rst.roles import set_classes 5 | 6 | 7 | def http_api_reference_role( 8 | name, rawtext, text, lineno, inliner, options={}, content=[] 9 | ): 10 | match = re.search( 11 | r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text 12 | ) 13 | if match: 14 | display_text = match[1] 15 | reference = match[2] 16 | else: 17 | display_text = None 18 | reference = text 19 | if reference.startswith("request:"): 20 | request_or_response = "request" 21 | elif reference.startswith("response:"): 22 | request_or_response = "response/200" 23 | else: 24 | raise ValueError( 25 | f":http: directive reference must start with request: or " 26 | f"response:, got {reference} from {text!r}." 27 | ) 28 | 29 | field = reference.split(":", maxsplit=1)[1] 30 | if not display_text: 31 | display_text = field 32 | refuri = ( 33 | f"https://docs.zyte.com/zyte-api/usage/reference.html" 34 | f"#operation/extract/{request_or_response}/{field}" 35 | ) 36 | set_classes(options) 37 | node = nodes.reference(rawtext, display_text, refuri=refuri, **options) 38 | return [node], [] 39 | 40 | 41 | def setup(app): 42 | # https://github.com/scrapy-plugins/scrapy-zyte-api/blob/2bfb2bef2e43293a62f47781914331bc4fa08f06/docs/_ext/__init__.py#L42 43 | app.add_role("http", http_api_reference_role) 44 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CHANGES.rst 2 | 3 | .. 
toctree:: 4 | :hidden: 5 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file does only contain a selection of the most common options. For a 4 | # full list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import sys 14 | from pathlib import Path 15 | 16 | # -- Project information ----------------------------------------------------- 17 | 18 | project = "python-zyte-api" 19 | copyright = "2021, Zyte Group Ltd" 20 | author = "Zyte Group Ltd" 21 | 22 | # The short X.Y version 23 | version = "" 24 | # The full version, including alpha/beta/rc tags 25 | release = "0.7.1" 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # 32 | # needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext 38 | extensions = [ 39 | "_ext", 40 | "sphinx.ext.autodoc", 41 | "sphinx.ext.intersphinx", 42 | "sphinx.ext.ifconfig", 43 | "sphinx.ext.viewcode", 44 | "sphinx.ext.githubpages", 45 | "sphinxarg.ext", 46 | ] 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | # templates_path = ["_templates"] 50 | 51 | # The suffix(es) of source filenames. 
52 | # You can specify multiple suffix as a list of string: 53 | # 54 | # source_suffix = ['.rst', '.md'] 55 | source_suffix = {".rst": "restructuredtext"} 56 | 57 | # The master toctree document. 58 | master_doc = "index" 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | # 63 | # This is also used if you do content translation via gettext catalogs. 64 | # Usually you set "language" from the command line for these cases. 65 | language = "en" 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | # This pattern also affects html_static_path and html_extra_path. 70 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 71 | 72 | # The name of the Pygments (syntax highlighting) style to use. 73 | pygments_style = None 74 | 75 | 76 | # -- Options for HTML output ------------------------------------------------- 77 | 78 | # The theme to use for HTML and HTML Help pages. See the documentation for 79 | # a list of builtin themes. 80 | # 81 | html_theme = "sphinx_rtd_theme" 82 | 83 | # Theme options are theme-specific and customize the look and feel of a theme 84 | # further. For a list of options available for each theme, see the 85 | # documentation. 86 | # 87 | # html_theme_options = {} 88 | 89 | # Add any paths that contain custom static files (such as style sheets) here, 90 | # relative to this directory. They are copied after the builtin static files, 91 | # so a file named "default.css" will overwrite the builtin "default.css". 92 | # html_static_path = ['_static'] 93 | 94 | # Custom sidebar templates, must be a dictionary that maps document names 95 | # to template names. 96 | # 97 | # The default sidebars (for documents that don't match any pattern) are 98 | # defined by theme itself. 
Builtin themes are using these templates by 99 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 100 | # 'searchbox.html']``. 101 | # 102 | # html_sidebars = {} 103 | 104 | 105 | # -- Options for HTMLHelp output --------------------------------------------- 106 | 107 | # Output file base name for HTML help builder. 108 | htmlhelp_basename = "python-zyte-apidoc" 109 | 110 | 111 | # -- Options for LaTeX output ------------------------------------------------ 112 | 113 | latex_elements = { 114 | # The paper size ('letterpaper' or 'a4paper'). 115 | # 116 | # 'papersize': 'letterpaper', 117 | # The font size ('10pt', '11pt' or '12pt'). 118 | # 119 | # 'pointsize': '10pt', 120 | # Additional stuff for the LaTeX preamble. 121 | # 122 | # 'preamble': '', 123 | # Latex figure (float) alignment 124 | # 125 | # 'figure_align': 'htbp', 126 | } 127 | 128 | # Grouping the document tree into LaTeX files. List of tuples 129 | # (source start file, target name, title, 130 | # author, documentclass [howto, manual, or own class]). 131 | latex_documents = [ 132 | ( 133 | master_doc, 134 | "python-zyte-api.tex", 135 | "python-zyte-api Documentation", 136 | "Zyte Group Ltd", 137 | "manual", 138 | ), 139 | ] 140 | 141 | 142 | # -- Options for manual page output ------------------------------------------ 143 | 144 | # One entry per manual page. List of tuples 145 | # (source start file, name, description, authors, manual section). 146 | man_pages = [ 147 | (master_doc, "python-zyte-api", "python-zyte-api Documentation", [author], 1) 148 | ] 149 | 150 | 151 | # -- Options for Texinfo output ---------------------------------------------- 152 | 153 | # Grouping the document tree into Texinfo files. 
List of tuples 154 | # (source start file, target name, title, author, 155 | # dir menu entry, description, category) 156 | texinfo_documents = [ 157 | ( 158 | master_doc, 159 | "python-zyte-api", 160 | "python-zyte-api Documentation", 161 | author, 162 | "python-zyte-api", 163 | "One line description of project.", 164 | "Miscellaneous", 165 | ), 166 | ] 167 | 168 | 169 | # -- Options for Epub output ------------------------------------------------- 170 | 171 | # Bibliographic Dublin Core info. 172 | epub_title = project 173 | 174 | # The unique identifier of the text. This can be a ISBN number 175 | # or the project homepage. 176 | # 177 | # epub_identifier = '' 178 | 179 | # A unique identification for the text. 180 | # 181 | # epub_uid = '' 182 | 183 | # A list of files that should not be packed into the epub file. 184 | epub_exclude_files = ["search.html"] 185 | 186 | 187 | # -- Extension configuration ------------------------------------------------- 188 | 189 | # -- Options for intersphinx extension --------------------------------------- 190 | intersphinx_mapping = { 191 | "python": ( 192 | "https://docs.python.org/3", 193 | None, 194 | ), 195 | "aiohttp": ( 196 | "https://docs.aiohttp.org/en/stable/", 197 | None, 198 | ), 199 | "tenacity": ( 200 | "https://tenacity.readthedocs.io/en/latest/", 201 | None, 202 | ), 203 | "zyte": ( 204 | "https://docs.zyte.com", 205 | None, 206 | ), 207 | } 208 | 209 | autodoc_default_options = { 210 | # 'special-members': '__init__,__call__', 211 | # 'undoc-members': True, 212 | "exclude-members": "__weakref__" 213 | } 214 | 215 | add_module_names = False 216 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | python-zyte-api is an open-source project. Your contribution is very welcome! 
6 | 7 | Issue Tracker 8 | ============= 9 | 10 | If you have a bug report, a new feature proposal or simply would like to ask 11 | a question, please check our issue tracker on GitHub: https://github.com/zytedata/python-zyte-api/issues 12 | 13 | Source code 14 | =========== 15 | 16 | Our source code is hosted on GitHub: https://github.com/zytedata/python-zyte-api 17 | 18 | Before opening a pull request, it might be worth checking current and previous 19 | issues. Some code changes might also require some discussion before being 20 | accepted so it might be worth opening a new issue before implementing huge or 21 | breaking changes. 22 | 23 | Testing 24 | ======= 25 | 26 | We use tox_ to run tests with different Python versions:: 27 | 28 | tox 29 | 30 | The command above also runs type checks; we use mypy. 31 | 32 | .. toctree:: 33 | :hidden: 34 | 35 | .. _tox: https://tox.readthedocs.io 36 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | python-zyte-api 3 | =============== 4 | 5 | .. include:: ../README.rst 6 | :start-after: description-start 7 | :end-before: description-end 8 | 9 | .. toctree:: 10 | :caption: Getting started 11 | :maxdepth: 1 12 | 13 | intro/install 14 | intro/basic 15 | 16 | .. toctree:: 17 | :caption: Usage 18 | :maxdepth: 1 19 | 20 | use/key 21 | use/cli 22 | use/api 23 | 24 | .. toctree:: 25 | :caption: Reference 26 | :maxdepth: 1 27 | 28 | ref/cli 29 | ref/api 30 | 31 | .. toctree:: 32 | :caption: All the rest 33 | :maxdepth: 1 34 | 35 | contributing 36 | changelog 37 | -------------------------------------------------------------------------------- /docs/intro/basic.rst: -------------------------------------------------------------------------------- 1 | .. _basic: 2 | 3 | =========== 4 | Basic usage 5 | =========== 6 | 7 | .. 
include:: /../README.rst 8 | :start-after: basic-start 9 | :end-before: basic-end 10 | -------------------------------------------------------------------------------- /docs/intro/install.rst: -------------------------------------------------------------------------------- 1 | .. _install: 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | .. include:: /../README.rst 8 | :start-after: install-start 9 | :end-before: install-end 10 | -------------------------------------------------------------------------------- /docs/ref/api.rst: -------------------------------------------------------------------------------- 1 | .. _api-ref: 2 | 3 | ============= 4 | API reference 5 | ============= 6 | 7 | .. module:: zyte_api 8 | 9 | Sync API 10 | ======== 11 | 12 | .. autoclass:: ZyteAPI 13 | :members: 14 | 15 | 16 | Async API 17 | ========= 18 | 19 | .. autoclass:: AsyncZyteAPI 20 | :members: 21 | 22 | 23 | Retries 24 | ======= 25 | 26 | .. autodata:: zyte_api_retrying 27 | :no-value: 28 | 29 | .. autodata:: aggressive_retrying 30 | :no-value: 31 | 32 | .. autoclass:: RetryFactory 33 | 34 | .. autoclass:: AggressiveRetryFactory 35 | 36 | 37 | Errors 38 | ====== 39 | 40 | .. autoexception:: RequestError 41 | :members: 42 | 43 | .. autoclass:: ParsedError 44 | :members: 45 | -------------------------------------------------------------------------------- /docs/ref/cli.rst: -------------------------------------------------------------------------------- 1 | .. _cli-ref: 2 | 3 | ============= 4 | CLI reference 5 | ============= 6 | 7 | zyte-api 8 | ======== 9 | 10 | .. 
argparse:: 11 | :ref: zyte_api.__main__._get_argument_parser 12 | :prog: zyte-api 13 | :nodefault: 14 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp >= 3.6.0 2 | Sphinx >= 4.2.0 3 | sphinx-argparse 4 | sphinx-rtd-theme >= 0.4 5 | tenacity 6 | -------------------------------------------------------------------------------- /docs/use/api.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | .. currentmodule:: zyte_api 4 | 5 | ===================== 6 | Python client library 7 | ===================== 8 | 9 | Once you have :ref:`installed python-zyte-api <install>` and :ref:`configured 10 | your API key `, you can use one of its APIs from Python code: 11 | 12 | - The :ref:`sync API <sync>` can be used to build simple, proof-of-concept or 13 | debugging Python scripts. 14 | 15 | - The :ref:`async API <asyncio_api>` can be used from :ref:`coroutines 16 | `, and is meant for production usage, as well as for asyncio 17 | environments like `Jupyter notebooks`_. 18 | 19 | .. _Jupyter notebooks: https://jupyter.org/ 20 | 21 | .. _sync: 22 | 23 | Sync API 24 | ======== 25 | 26 | Create a :class:`ZyteAPI` object, and use its 27 | :meth:`~ZyteAPI.get` method to perform a single request: 28 | 29 | .. code-block:: python 30 | 31 | from zyte_api import ZyteAPI 32 | 33 | client = ZyteAPI() 34 | result = client.get({"url": "https://toscrape.com", "httpResponseBody": True}) 35 | 36 | To perform multiple requests, use a :meth:`~ZyteAPI.session` for 37 | better performance, and use :meth:`~ZyteAPI.iter` to send multiple 38 | requests in parallel: 39 | 40 | .. 
code-block:: python 41 | 42 | from zyte_api import ZyteAPI, RequestError 43 | 44 | client = ZyteAPI() 45 | with client.session() as session: 46 | queries = [ 47 | {"url": "https://toscrape.com", "httpResponseBody": True}, 48 | {"url": "https://books.toscrape.com", "httpResponseBody": True}, 49 | ] 50 | for result_or_exception in session.iter(queries): 51 | if isinstance(result_or_exception, dict): 52 | ... 53 | elif isinstance(result_or_exception, RequestError): 54 | ... 55 | else: 56 | assert isinstance(result_or_exception, Exception) 57 | ... 58 | 59 | .. tip:: :meth:`~ZyteAPI.iter` yields results as they come, not 60 | necessarily in their original order. Use :http:`request:echoData` to track 61 | the source request. 62 | 63 | .. _asyncio_api: 64 | 65 | Async API 66 | ========= 67 | 68 | Create an :class:`AsyncZyteAPI` object, and use its 69 | :meth:`~AsyncZyteAPI.get` method to perform a single request: 70 | 71 | .. code-block:: python 72 | 73 | import asyncio 74 | 75 | from zyte_api import AsyncZyteAPI 76 | 77 | 78 | async def main(): 79 | client = AsyncZyteAPI() 80 | result = await client.get({"url": "https://toscrape.com", "httpResponseBody": True}) 81 | 82 | 83 | asyncio.run(main()) 84 | 85 | To perform multiple requests, use a :meth:`~AsyncZyteAPI.session` for 86 | better performance, and use :meth:`~AsyncZyteAPI.iter` to send 87 | multiple requests in parallel: 88 | 89 | .. code-block:: python 90 | 91 | import asyncio 92 | 93 | from zyte_api import AsyncZyteAPI, RequestError 94 | 95 | 96 | async def main(): 97 | client = AsyncZyteAPI() 98 | async with client.session() as session: 99 | queries = [ 100 | {"url": "https://toscrape.com", "httpResponseBody": True}, 101 | {"url": "https://books.toscrape.com", "httpResponseBody": True}, 102 | ] 103 | for future in session.iter(queries): 104 | try: 105 | result = await future 106 | except RequestError as e: 107 | ... 108 | except Exception as e: 109 | ... 110 | 111 | 112 | asyncio.run(main()) 113 | 114 | .. 
tip:: :meth:`~AsyncZyteAPI.iter` yields results as they come, not 115 | necessarily in their original order. Use :http:`request:echoData` to track 116 | the source request. 117 | 118 | 119 | .. _api-optimize: 120 | 121 | Optimization 122 | ============ 123 | 124 | :class:`ZyteAPI` and :class:`AsyncZyteAPI` use 15 125 | concurrent connections by default. 126 | 127 | To change that, use the ``n_conn`` parameter when creating your client object: 128 | 129 | .. code-block:: python 130 | 131 | client = ZyteAPI(n_conn=30) 132 | 133 | The number of concurrent connections is enforced across all method calls, 134 | including different sessions of the same client. 135 | 136 | For guidelines on how to choose the optimal value for you, and other 137 | optimization tips, see :ref:`zapi-optimize`. 138 | 139 | 140 | Errors and retries 141 | ================== 142 | 143 | Methods of :class:`ZyteAPI` and :class:`AsyncZyteAPI` automatically handle 144 | retries for :ref:`rate-limiting ` and :ref:`unsuccessful 145 | ` responses, as well as network errors. 146 | 147 | .. _retry-policy: 148 | .. _default-retry-policy: 149 | 150 | The default retry policy, :data:`~zyte_api.zyte_api_retrying`, does the 151 | following for each request: 152 | 153 | - Retries :ref:`rate-limiting responses ` forever. 154 | 155 | - Retries :ref:`temporary download errors ` 156 | up to 3 times. :ref:`Permanent download errors 157 | ` also count towards this retry limit. 158 | 159 | - Retries permanent download errors once. 160 | 161 | - Retries network errors until they have happened for 15 minutes straight. 162 | 163 | - Retries error responses with an HTTP status code in the 500-599 range (503, 164 | 520 and 521 excluded) once. 165 | 166 | All retries are done with an exponential backoff algorithm. 167 | 168 | .. 
_aggressive-retry-policy: 169 | 170 | If some :ref:`unsuccessful responses ` exceed 171 | maximum retries with the default retry policy, try using 172 | :data:`~zyte_api.aggressive_retrying` instead, which doubles attempts for 173 | all retry scenarios. 174 | 175 | Alternatively, the reference documentation of :class:`~zyte_api.RetryFactory` 176 | and :class:`~zyte_api.AggressiveRetryFactory` features some examples of custom 177 | retry policies, and you can always build your own 178 | :class:`~tenacity.AsyncRetrying` object from scratch. 179 | 180 | To use :data:`~zyte_api.aggressive_retrying` or a custom retry policy, pass an 181 | instance of your :class:`~tenacity.AsyncRetrying` subclass when creating your 182 | client object: 183 | 184 | .. code-block:: python 185 | 186 | from zyte_api import ZyteAPI, aggressive_retrying 187 | 188 | client = ZyteAPI(retrying=aggressive_retrying) 189 | 190 | When retries are exceeded for a given request, an exception is raised. Except 191 | for the :meth:`~ZyteAPI.iter` method of the :ref:`sync API `, which 192 | yields exceptions instead of raising them, to prevent exceptions from 193 | interrupting the entire iteration. 194 | 195 | The type of exception depends on the issue that caused the final request 196 | attempt to fail. Unsuccessful responses trigger a :exc:`RequestError` and 197 | network errors trigger :ref:`aiohttp exceptions `. 198 | Other exceptions could be raised; for example, from a custom retry policy. 199 | 200 | 201 | .. seealso:: :ref:`api-ref` 202 | -------------------------------------------------------------------------------- /docs/use/cli.rst: -------------------------------------------------------------------------------- 1 | .. _command_line: 2 | 3 | =================== 4 | Command-line client 5 | =================== 6 | 7 | Once you have :ref:`installed python-zyte-api ` and :ref:`configured 8 | your API key `, you can use the ``zyte-api`` command-line client. 
9 | 10 | To use ``zyte-api``, pass an :ref:`input file <input-file>` as the first 11 | parameter and specify an :ref:`output file <output-file>` with ``--output``. 12 | For example: 13 | 14 | .. code-block:: shell 15 | 16 | zyte-api urls.txt --output result.jsonl 17 | 18 | .. _input-file: 19 | 20 | Input file 21 | ========== 22 | 23 | The input file can be either of the following: 24 | 25 | - A plain-text file with a list of target URLs, one per line. For example: 26 | 27 | .. code-block:: none 28 | 29 | https://books.toscrape.com 30 | https://quotes.toscrape.com 31 | 32 | For each URL, a Zyte API request will be sent with 33 | :http:`request:browserHtml` set to ``True``. 34 | 35 | - A `JSON Lines `_ file with an object of :ref:`Zyte 36 | API request parameters ` per line. For example: 37 | 38 | .. code-block:: json 39 | 40 | {"url": "https://a.example", "browserHtml": true, "geolocation": "GB"} 41 | {"url": "https://b.example", "httpResponseBody": true} 42 | {"url": "https://books.toscrape.com", "productNavigation": true} 43 | 44 | 45 | .. _output-file: 46 | 47 | Output file 48 | =========== 49 | 50 | You can specify the path to an output file with the ``--output``/``-o`` switch. 51 | If not specified, the output is printed on the standard output. 52 | 53 | .. warning:: The output path is overwritten. 54 | 55 | The output file is in `JSON Lines`_ format. Each line contains a JSON object 56 | with a response from Zyte API. 57 | 58 | By default, ``zyte-api`` uses multiple concurrent connections for 59 | :ref:`performance reasons ` and, as a result, the order of 60 | responses will probably not match the order of the source requests from the 61 | :ref:`input file <input-file>`. If you need to match the output results to the 62 | input requests, the best way is to use :http:`request:echoData`. By default, 63 | ``zyte-api`` fills :http:`request:echoData` with the input URL. 64 | 65 | 66 | .. 
_cli-optimize: 67 | 68 | Optimization 69 | ============ 70 | 71 | By default, ``zyte-api`` uses 20 concurrent connections for requests. Use the 72 | ``--n-conn`` switch to change that: 73 | 74 | .. code-block:: shell 75 | 76 | zyte-api --n-conn 40 … 77 | 78 | The ``--shuffle`` option can be useful if you target multiple websites and your 79 | :ref:`input file <input-file>` is sorted by website, to randomize the request 80 | order and hence distribute the load somewhat evenly: 81 | 82 | .. code-block:: shell 83 | 84 | zyte-api urls.txt --shuffle … 85 | 86 | For guidelines on how to choose the optimal ``--n-conn`` value for you, and 87 | other optimization tips, see :ref:`zapi-optimize`. 88 | 89 | 90 | Errors and retries 91 | ================== 92 | 93 | ``zyte-api`` automatically handles retries for :ref:`rate-limiting 94 | ` and :ref:`unsuccessful 95 | ` responses, as well as network errors, 96 | following the :ref:`default retry policy <default-retry-policy>`. 97 | 98 | Use ``--dont-retry-errors`` to disable the retrying of error responses, and 99 | retry only :ref:`rate-limiting responses `: 100 | 101 | .. code-block:: shell 102 | 103 | zyte-api --dont-retry-errors … 104 | 105 | By default, errors are only logged in the standard error output (``stderr``). 106 | If you want to include error responses in the output file, use 107 | ``--store-errors``: 108 | 109 | .. code-block:: shell 110 | 111 | zyte-api --store-errors … 112 | 113 | 114 | .. seealso:: :ref:`cli-ref` 115 | -------------------------------------------------------------------------------- /docs/use/key.rst: -------------------------------------------------------------------------------- 1 | .. _api-key: 2 | 3 | ======= 4 | API key 5 | ======= 6 | 7 | .. 
include:: /../README.rst 8 | :start-after: key-get-start 9 | :end-before: key-get-end 10 | 11 | It is recommended to configure your API key through an environment variable, so 12 | that it can be picked up by both the :ref:`command-line client <command_line>` and 13 | the :ref:`Python client library <api>`: 14 | 15 | - On Windows: 16 | 17 | .. code-block:: shell 18 | 19 | > set ZYTE_API_KEY=YOUR_API_KEY 20 | 21 | - On macOS and Linux: 22 | 23 | .. code-block:: shell 24 | 25 | $ export ZYTE_API_KEY=YOUR_API_KEY 26 | 27 | Alternatively, you may pass your API key to the clients directly: 28 | 29 | - To pass your API key directly to the command-line client, use the 30 | ``--api-key`` switch: 31 | 32 | .. code-block:: shell 33 | 34 | zyte-api --api-key YOUR_API_KEY … 35 | 36 | - To pass your API key directly to the Python client classes, use the 37 | ``api_key`` parameter when creating a client object: 38 | 39 | .. code-block:: python 40 | 41 | from zyte_api import ZyteAPI 42 | 43 | client = ZyteAPI(api_key="YOUR_API_KEY") 44 | 45 | .. 
code-block:: python 46 | 47 | from zyte_api import AsyncZyteAPI 48 | 49 | client = AsyncZyteAPI(api_key="YOUR_API_KEY") 50 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.bumpversion] 2 | current_version = "0.7.1" 3 | commit = true 4 | tag = true 5 | tag_name = "{new_version}" 6 | 7 | [[tool.bumpversion.files]] 8 | filename = 'CHANGES.rst' 9 | search = "\\(unreleased\\)$" 10 | replace = "({now:%Y-%m-%d})" 11 | regex = true 12 | 13 | [[tool.bumpversion.files]] 14 | filename = "docs/conf.py" 15 | 16 | [[tool.bumpversion.files]] 17 | filename = "setup.py" 18 | 19 | [[tool.bumpversion.files]] 20 | filename = "zyte_api/__version__.py" 21 | 22 | [tool.coverage.run] 23 | branch = true 24 | 25 | [tool.coverage.report] 26 | exclude_also = [ 27 | "if TYPE_CHECKING:", 28 | ] 29 | 30 | [tool.pytest.ini_options] 31 | filterwarnings = [ 32 | "ignore:The zyte_api\\.aio module is deprecated:DeprecationWarning" 33 | ] 34 | 35 | [tool.ruff.lint] 36 | extend-select = [ 37 | # flake8-bugbear 38 | "B", 39 | # flake8-comprehensions 40 | "C4", 41 | # pydocstyle 42 | "D", 43 | # flake8-future-annotations 44 | "FA", 45 | # flynt 46 | "FLY", 47 | # refurb 48 | "FURB", 49 | # isort 50 | "I", 51 | # flake8-implicit-str-concat 52 | "ISC", 53 | # flake8-logging 54 | "LOG", 55 | # Perflint 56 | "PERF", 57 | # pygrep-hooks 58 | "PGH", 59 | # flake8-pie 60 | "PIE", 61 | # pylint 62 | "PL", 63 | # flake8-pytest-style 64 | "PT", 65 | # flake8-use-pathlib 66 | "PTH", 67 | # flake8-pyi 68 | "PYI", 69 | # flake8-quotes 70 | "Q", 71 | # flake8-return 72 | "RET", 73 | # flake8-raise 74 | "RSE", 75 | # Ruff-specific rules 76 | "RUF", 77 | # flake8-bandit 78 | "S", 79 | # flake8-simplify 80 | "SIM", 81 | # flake8-slots 82 | "SLOT", 83 | # flake8-debugger 84 | "T10", 85 | # flake8-type-checking 86 | "TC", 87 | # pyupgrade 88 | "UP", 89 | # pycodestyle warnings 90 | 
"W", 91 | # flake8-2020 92 | "YTT", 93 | ] 94 | ignore = [ 95 | # Missing docstring in public module 96 | "D100", 97 | # Missing docstring in public class 98 | "D101", 99 | # Missing docstring in public method 100 | "D102", 101 | # Missing docstring in public function 102 | "D103", 103 | # Missing docstring in public package 104 | "D104", 105 | # Missing docstring in magic method 106 | "D105", 107 | # Missing docstring in __init__ 108 | "D107", 109 | # One-line docstring should fit on one line with quotes 110 | "D200", 111 | # No blank lines allowed after function docstring 112 | "D202", 113 | # 1 blank line required between summary line and description 114 | "D205", 115 | # Multi-line docstring closing quotes should be on a separate line 116 | "D209", 117 | # First line should end with a period 118 | "D400", 119 | # First line should be in imperative mood; try rephrasing 120 | "D401", 121 | # First line should not be the function's "signature" 122 | "D402", 123 | # `try`-`except` within a loop incurs performance overhead 124 | "PERF203", 125 | # Too many return statements 126 | "PLR0911", 127 | # Too many branches 128 | "PLR0912", 129 | # Too many arguments in function definition 130 | "PLR0913", 131 | # Too many statements 132 | "PLR0915", 133 | # Magic value used in comparison 134 | "PLR2004", 135 | # String contains ambiguous {}. 136 | "RUF001", 137 | # Docstring contains ambiguous {}. 138 | "RUF002", 139 | # Comment contains ambiguous {}. 
140 | "RUF003", 141 | # Mutable class attributes should be annotated with `typing.ClassVar` 142 | "RUF012", 143 | # Use of `assert` detected 144 | "S101", 145 | ] 146 | 147 | [tool.ruff.lint.per-file-ignores] 148 | "zyte_api/__init__.py" = ["F401"] 149 | "zyte_api/aio/errors.py" = ["F401"] 150 | "zyte_api/aio/retry.py" = ["F401"] 151 | "tests/*" = ["S"] 152 | "docs/**" = ["B006"] 153 | # Skip PEP 604 suggestions for files with attr classes 154 | "zyte_api/errors.py" = ["UP007"] 155 | "zyte_api/stats.py" = ["UP007"] 156 | 157 | [tool.ruff.lint.flake8-type-checking] 158 | runtime-evaluated-decorators = ["attr.s"] 159 | 160 | [tool.ruff.lint.pydocstyle] 161 | convention = "pep257" 162 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from setuptools import find_packages, setup 4 | 5 | setup( 6 | name="zyte-api", 7 | version="0.7.1", 8 | description="Python interface to Zyte API", 9 | long_description=Path("README.rst").read_text(encoding="utf-8"), 10 | long_description_content_type="text/x-rst", 11 | author="Zyte Group Ltd", 12 | author_email="opensource@zyte.com", 13 | url="https://github.com/zytedata/python-zyte-api", 14 | packages=find_packages(exclude=["tests", "examples"]), 15 | entry_points={ 16 | "console_scripts": ["zyte-api=zyte_api.__main__:_main"], 17 | }, 18 | install_requires=[ 19 | "aiohttp >= 3.8.0", 20 | "attrs", 21 | "brotli", 22 | "runstats", 23 | "tenacity", 24 | "tqdm", 25 | "w3lib >= 2.1.1", 26 | ], 27 | classifiers=[ 28 | "Development Status :: 3 - Alpha", 29 | "Intended Audience :: Developers", 30 | "License :: OSI Approved :: BSD License", 31 | "Natural Language :: English", 32 | "Operating System :: OS Independent", 33 | "Programming Language :: Python :: 3", 34 | "Programming Language :: Python :: 3.9", 35 | "Programming Language :: Python :: 3.10", 36 | "Programming Language :: 
Python :: 3.11", 37 | "Programming Language :: Python :: 3.12", 38 | "Programming Language :: Python :: 3.13", 39 | ], 40 | ) 41 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zytedata/python-zyte-api/0cb282388f413fb6f5d3b3f4ecbb2461cfa2ead3/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture(scope="session") 5 | def mockserver(): 6 | from .mockserver import MockServer 7 | 8 | with MockServer() as server: 9 | yield server 10 | -------------------------------------------------------------------------------- /tests/mockserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import socket 4 | import sys 5 | import time 6 | from base64 import b64encode 7 | from importlib import import_module 8 | from subprocess import PIPE, Popen 9 | from typing import Any 10 | from urllib.parse import urlparse 11 | 12 | from twisted.internet import reactor 13 | from twisted.internet.task import deferLater 14 | from twisted.web.resource import Resource 15 | from twisted.web.server import NOT_DONE_YET, Site 16 | 17 | 18 | # https://github.com/scrapy/scrapy/blob/02b97f98e74a994ad3e4d74e7ed55207e508a576/tests/mockserver.py#L27C1-L33C19 19 | def getarg(request, name, default=None, type=None): 20 | if name in request.args: 21 | value = request.args[name][0] 22 | if type is not None: 23 | value = type(value) 24 | return value 25 | return default 26 | 27 | 28 | def get_ephemeral_port(): 29 | s = socket.socket() 30 | s.bind(("", 0)) 31 | return s.getsockname()[1] 32 | 33 | 34 | class DropResource(Resource): 35 | isLeaf = True 36 | 37 | def deferRequest(self, 
request, delay, f, *a, **kw): 38 | def _cancelrequest(_): 39 | # silence CancelledError 40 | d.addErrback(lambda _: None) 41 | d.cancel() 42 | 43 | d = deferLater(reactor, delay, f, *a, **kw) 44 | request.notifyFinish().addErrback(_cancelrequest) 45 | return d 46 | 47 | def render_POST(self, request): 48 | request.setHeader(b"Content-Length", b"1024") 49 | self.deferRequest(request, 0, self._delayedRender, request) 50 | return NOT_DONE_YET 51 | 52 | def _delayedRender(self, request): 53 | abort = getarg(request, b"abort", 0, type=int) 54 | request.write(b"this connection will be dropped\n") 55 | tr = request.channel.transport 56 | try: 57 | if abort and hasattr(tr, "abortConnection"): 58 | tr.abortConnection() 59 | else: 60 | tr.loseConnection() 61 | finally: 62 | request.finish() 63 | 64 | 65 | class DefaultResource(Resource): 66 | request_count = 0 67 | 68 | def getChild(self, path, request): 69 | return self 70 | 71 | def render_POST(self, request): 72 | request_data = json.loads(request.content.read()) 73 | 74 | request.responseHeaders.setRawHeaders( 75 | b"Content-Type", 76 | [b"application/json"], 77 | ) 78 | request.responseHeaders.setRawHeaders( 79 | b"request-id", 80 | [b"abcd1234"], 81 | ) 82 | 83 | url = request_data["url"] 84 | domain = urlparse(url).netloc 85 | if domain == "e429.example": 86 | request.setResponseCode(429) 87 | response_data = {"status": 429, "type": "/limits/over-user-limit"} 88 | return json.dumps(response_data).encode() 89 | if domain == "e500.example": 90 | request.setResponseCode(500) 91 | return "" 92 | if domain == "e520.example": 93 | request.setResponseCode(520) 94 | response_data = {"status": 520, "type": "/download/temporary-error"} 95 | return json.dumps(response_data).encode() 96 | if domain == "e521.example": 97 | request.setResponseCode(521) 98 | response_data = {"status": 521, "type": "/download/internal-error"} 99 | return json.dumps(response_data).encode() 100 | if domain == "exception.example": 101 | 
request.setResponseCode(401) 102 | response_data = { 103 | "status": 401, 104 | "type": "/auth/key-not-found", 105 | "title": "Authentication Key Not Found", 106 | "detail": "The authentication key is not valid or can't be matched.", 107 | } 108 | return json.dumps(response_data).encode() 109 | if domain == "empty-body-exception.example": 110 | request.setResponseCode(500) 111 | return b"" 112 | if domain == "nonjson.example": 113 | request.setResponseCode(200) 114 | return b"foo" 115 | if domain == "nonjson-exception.example": 116 | request.setResponseCode(500) 117 | return b"foo" 118 | if domain == "array-exception.example": 119 | request.setResponseCode(500) 120 | return b'["foo"]' 121 | 122 | response_data: dict[str, Any] = { 123 | "url": url, 124 | } 125 | 126 | html = "Hello

World!

" 127 | if "httpResponseBody" in request_data: 128 | body = b64encode(html.encode()).decode() 129 | response_data["httpResponseBody"] = body 130 | else: 131 | assert "browserHtml" in request_data 132 | response_data["browserHtml"] = html 133 | 134 | return json.dumps(response_data).encode() 135 | 136 | 137 | class MockServer: 138 | def __init__(self, resource=None, port=None): 139 | resource = resource or DefaultResource 140 | self.resource = f"{resource.__module__}.{resource.__name__}" 141 | self.proc = None 142 | self.host = socket.gethostbyname(socket.gethostname()) 143 | self.port = port or get_ephemeral_port() 144 | self.root_url = f"http://{self.host}:{self.port}" 145 | 146 | def __enter__(self): 147 | self.proc = Popen( 148 | [ 149 | sys.executable, 150 | "-u", 151 | "-m", 152 | "tests.mockserver", 153 | self.resource, 154 | "--port", 155 | str(self.port), 156 | ], 157 | stdout=PIPE, 158 | ) 159 | assert self.proc.stdout is not None 160 | self.proc.stdout.readline() 161 | return self 162 | 163 | def __exit__(self, exc_type, exc_value, traceback): 164 | assert self.proc is not None 165 | self.proc.kill() 166 | self.proc.wait() 167 | time.sleep(0.2) 168 | 169 | def urljoin(self, path): 170 | return self.root_url + path 171 | 172 | 173 | def main(): 174 | parser = argparse.ArgumentParser() 175 | parser.add_argument("resource") 176 | parser.add_argument("--port", type=int) 177 | args = parser.parse_args() 178 | module_name, name = args.resource.rsplit(".", 1) 179 | sys.path.append(".") 180 | resource = getattr(import_module(module_name), name)() 181 | # Typing issue: https://github.com/twisted/twisted/issues/9909 182 | http_port = reactor.listenTCP(args.port, Site(resource)) # type: ignore[attr-defined] 183 | 184 | def print_listening(): 185 | host = http_port.getHost() 186 | print(f"Mock server {resource} running at http://{host.host}:{host.port}") 187 | 188 | # Typing issue: https://github.com/twisted/twisted/issues/9909 189 | 
reactor.callWhenRunning(print_listening) # type: ignore[attr-defined] 190 | reactor.run() # type: ignore[attr-defined] 191 | 192 | 193 | if __name__ == "__main__": 194 | main() 195 | -------------------------------------------------------------------------------- /tests/test_async.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from unittest.mock import AsyncMock 3 | 4 | import pytest 5 | 6 | from zyte_api import AggressiveRetryFactory, AsyncZyteAPI, RequestError 7 | from zyte_api.aio.client import AsyncClient 8 | from zyte_api.apikey import NoApiKey 9 | from zyte_api.errors import ParsedError 10 | from zyte_api.utils import USER_AGENT 11 | 12 | 13 | @pytest.mark.parametrize( 14 | "client_cls", 15 | [ 16 | AsyncZyteAPI, 17 | AsyncClient, 18 | ], 19 | ) 20 | @pytest.mark.parametrize( 21 | ("user_agent", "expected"), 22 | [ 23 | ( 24 | None, 25 | USER_AGENT, 26 | ), 27 | ( 28 | f"scrapy-zyte-api/0.11.1 {USER_AGENT}", 29 | f"scrapy-zyte-api/0.11.1 {USER_AGENT}", 30 | ), 31 | ], 32 | ) 33 | def test_user_agent(client_cls, user_agent, expected): 34 | client = client_cls(api_key="123", api_url="http:\\test", user_agent=user_agent) 35 | assert client.user_agent == expected 36 | 37 | 38 | @pytest.mark.parametrize( 39 | "client_cls", 40 | [ 41 | AsyncZyteAPI, 42 | AsyncClient, 43 | ], 44 | ) 45 | def test_api_key(client_cls): 46 | client_cls(api_key="a") 47 | with pytest.raises(NoApiKey): 48 | client_cls() 49 | 50 | 51 | @pytest.mark.parametrize( 52 | ("client_cls", "get_method"), 53 | [ 54 | (AsyncZyteAPI, "get"), 55 | (AsyncClient, "request_raw"), 56 | ], 57 | ) 58 | @pytest.mark.asyncio 59 | async def test_get(client_cls, get_method, mockserver): 60 | client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) 61 | expected_result = { 62 | "url": "https://a.example", 63 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 64 | } 65 | actual_result = await 
getattr(client, get_method)( 66 | {"url": "https://a.example", "httpResponseBody": True} 67 | ) 68 | assert actual_result == expected_result 69 | 70 | 71 | @pytest.mark.parametrize( 72 | ("client_cls", "get_method"), 73 | [ 74 | (AsyncZyteAPI, "get"), 75 | (AsyncClient, "request_raw"), 76 | ], 77 | ) 78 | @pytest.mark.asyncio 79 | async def test_get_request_error(client_cls, get_method, mockserver): 80 | client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) 81 | with pytest.raises(RequestError) as request_error_info: 82 | await getattr(client, get_method)( 83 | {"url": "https://exception.example", "browserHtml": True}, 84 | ) 85 | parsed_error = request_error_info.value.parsed 86 | assert isinstance(parsed_error, ParsedError) 87 | assert parsed_error.data == { 88 | "detail": "The authentication key is not valid or can't be matched.", 89 | "status": 401, 90 | "title": "Authentication Key Not Found", 91 | "type": "/auth/key-not-found", 92 | } 93 | 94 | 95 | @pytest.mark.parametrize( 96 | ("client_cls", "get_method"), 97 | [ 98 | (AsyncZyteAPI, "get"), 99 | (AsyncClient, "request_raw"), 100 | ], 101 | ) 102 | @pytest.mark.asyncio 103 | async def test_get_request_error_empty_body(client_cls, get_method, mockserver): 104 | client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) 105 | with pytest.raises(RequestError) as request_error_info: 106 | await getattr(client, get_method)( 107 | {"url": "https://empty-body-exception.example", "browserHtml": True}, 108 | ) 109 | parsed_error = request_error_info.value.parsed 110 | assert isinstance(parsed_error, ParsedError) 111 | assert parsed_error.data is None 112 | 113 | 114 | @pytest.mark.parametrize( 115 | ("client_cls", "get_method"), 116 | [ 117 | (AsyncZyteAPI, "get"), 118 | (AsyncClient, "request_raw"), 119 | ], 120 | ) 121 | @pytest.mark.asyncio 122 | async def test_get_request_error_non_json(client_cls, get_method, mockserver): 123 | client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) 
124 | with pytest.raises(RequestError) as request_error_info: 125 | await getattr(client, get_method)( 126 | {"url": "https://nonjson-exception.example", "browserHtml": True}, 127 | ) 128 | parsed_error = request_error_info.value.parsed 129 | assert isinstance(parsed_error, ParsedError) 130 | assert parsed_error.data is None 131 | 132 | 133 | @pytest.mark.parametrize( 134 | ("client_cls", "get_method"), 135 | [ 136 | (AsyncZyteAPI, "get"), 137 | (AsyncClient, "request_raw"), 138 | ], 139 | ) 140 | @pytest.mark.asyncio 141 | async def test_get_request_error_unexpected_json(client_cls, get_method, mockserver): 142 | client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) 143 | with pytest.raises(RequestError) as request_error_info: 144 | await getattr(client, get_method)( 145 | {"url": "https://array-exception.example", "browserHtml": True}, 146 | ) 147 | parsed_error = request_error_info.value.parsed 148 | assert isinstance(parsed_error, ParsedError) 149 | assert parsed_error.data is None 150 | 151 | 152 | @pytest.mark.parametrize( 153 | ("client_cls", "iter_method"), 154 | [ 155 | (AsyncZyteAPI, "iter"), 156 | (AsyncClient, "request_parallel_as_completed"), 157 | ], 158 | ) 159 | @pytest.mark.asyncio 160 | async def test_iter(client_cls, iter_method, mockserver): 161 | client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) 162 | queries = [ 163 | {"url": "https://a.example", "httpResponseBody": True}, 164 | {"url": "https://exception.example", "httpResponseBody": True}, 165 | {"url": "https://b.example", "httpResponseBody": True}, 166 | ] 167 | expected_results = [ 168 | { 169 | "url": "https://a.example", 170 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 171 | }, 172 | Exception, 173 | { 174 | "url": "https://b.example", 175 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 176 | }, 177 | ] 178 | actual_results = [] 179 | for future in getattr(client, 
iter_method)(queries): 180 | try: 181 | actual_result = await future 182 | except Exception as exception: 183 | actual_result = exception 184 | actual_results.append(actual_result) 185 | assert len(actual_results) == len(expected_results) 186 | for actual_result in actual_results: 187 | if isinstance(actual_result, Exception): 188 | assert Exception in expected_results 189 | else: 190 | assert actual_result in expected_results 191 | 192 | 193 | @pytest.mark.parametrize( 194 | ("client_cls", "get_method", "iter_method"), 195 | [ 196 | (AsyncZyteAPI, "get", "iter"), 197 | (AsyncClient, "request_raw", "request_parallel_as_completed"), 198 | ], 199 | ) 200 | @pytest.mark.asyncio 201 | async def test_semaphore(client_cls, get_method, iter_method, mockserver): 202 | client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) 203 | client._semaphore = AsyncMock(wraps=client._semaphore) 204 | queries = [ 205 | {"url": "https://a.example", "httpResponseBody": True}, 206 | {"url": "https://b.example", "httpResponseBody": True}, 207 | {"url": "https://c.example", "httpResponseBody": True}, 208 | ] 209 | futures = [ 210 | getattr(client, get_method)(queries[0]), 211 | next(iter(getattr(client, iter_method)(queries[1:2]))), 212 | getattr(client, get_method)(queries[2]), 213 | ] 214 | for future in asyncio.as_completed(futures): 215 | await future 216 | assert client._semaphore.__aenter__.call_count == len(queries) 217 | assert client._semaphore.__aexit__.call_count == len(queries) 218 | 219 | 220 | @pytest.mark.asyncio 221 | async def test_session_context_manager(mockserver): 222 | client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) 223 | queries = [ 224 | {"url": "https://a.example", "httpResponseBody": True}, 225 | {"url": "https://exception.example", "httpResponseBody": True}, 226 | {"url": "https://b.example", "httpResponseBody": True}, 227 | ] 228 | expected_results = [ 229 | { 230 | "url": "https://a.example", 231 | "httpResponseBody": 
"PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 232 | }, 233 | Exception, 234 | { 235 | "url": "https://b.example", 236 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 237 | }, 238 | ] 239 | actual_results = [] 240 | async with client.session() as session: 241 | assert session._session.connector.limit == client.n_conn 242 | actual_results.append(await session.get(queries[0])) 243 | for future in session.iter(queries[1:]): 244 | try: 245 | result = await future 246 | except Exception as e: 247 | result = e 248 | actual_results.append(result) 249 | aiohttp_session = session._session 250 | assert not aiohttp_session.closed 251 | assert aiohttp_session.closed 252 | 253 | with pytest.raises(RuntimeError): 254 | await session.get(queries[0]) 255 | 256 | future = next(iter(session.iter(queries[1:]))) 257 | with pytest.raises(RuntimeError): 258 | await future 259 | 260 | assert len(actual_results) == len(expected_results) 261 | for actual_result in actual_results: 262 | if isinstance(actual_result, Exception): 263 | assert Exception in expected_results 264 | else: 265 | assert actual_result in expected_results 266 | 267 | 268 | @pytest.mark.asyncio 269 | async def test_session_no_context_manager(mockserver): 270 | client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) 271 | queries = [ 272 | {"url": "https://a.example", "httpResponseBody": True}, 273 | {"url": "https://exception.example", "httpResponseBody": True}, 274 | {"url": "https://b.example", "httpResponseBody": True}, 275 | ] 276 | expected_results = [ 277 | { 278 | "url": "https://a.example", 279 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 280 | }, 281 | Exception, 282 | { 283 | "url": "https://b.example", 284 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 285 | }, 286 | ] 287 | actual_results = [] 288 | session = client.session() 289 | assert 
session._session.connector.limit == client.n_conn 290 | actual_results.append(await session.get(queries[0])) 291 | for future in session.iter(queries[1:]): 292 | try: 293 | result = await future 294 | except Exception as e: 295 | result = e 296 | actual_results.append(result) 297 | aiohttp_session = session._session 298 | assert not aiohttp_session.closed 299 | await session.close() 300 | assert aiohttp_session.closed 301 | 302 | with pytest.raises(RuntimeError): 303 | await session.get(queries[0]) 304 | 305 | future = next(iter(session.iter(queries[1:]))) 306 | with pytest.raises(RuntimeError): 307 | await future 308 | 309 | assert len(actual_results) == len(expected_results) 310 | for actual_result in actual_results: 311 | if isinstance(actual_result, Exception): 312 | assert Exception in expected_results 313 | else: 314 | assert actual_result in expected_results 315 | 316 | 317 | def test_retrying_class(): 318 | """A descriptive exception is raised when creating a client with an 319 | AsyncRetrying subclass or similar instead of an instance of it.""" 320 | with pytest.raises(ValueError, match="must be an instance of AsyncRetrying"): 321 | AsyncZyteAPI(api_key="foo", retrying=AggressiveRetryFactory) 322 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | from json import JSONDecodeError 4 | from pathlib import Path 5 | from tempfile import NamedTemporaryFile 6 | from unittest.mock import AsyncMock, Mock, patch 7 | 8 | import pytest 9 | 10 | from zyte_api.__main__ import run 11 | from zyte_api.aio.errors import RequestError 12 | 13 | 14 | class MockRequestError(Exception): 15 | @property 16 | def parsed(self): 17 | return Mock( 18 | response_body=Mock(decode=Mock(return_value=forbidden_domain_response())) 19 | ) 20 | 21 | 22 | def get_json_content(file_object): 23 | if not file_object: 24 | 
return None 25 | 26 | file_path = file_object.name 27 | try: 28 | with Path(file_path).open() as file: 29 | return json.load(file) 30 | except JSONDecodeError: 31 | pass 32 | 33 | 34 | def forbidden_domain_response(): 35 | return { 36 | "type": "/download/temporary-error", 37 | "title": "Temporary Downloading Error", 38 | "status": 520, 39 | "detail": "There is a downloading problem which might be temporary. Retry in N seconds from 'Retry-After' header or open a support ticket from https://support.zyte.com/support/tickets/new if it fails consistently.", 40 | } 41 | 42 | 43 | async def fake_exception(value=True): 44 | # Simulating an error condition 45 | if value: 46 | raise MockRequestError 47 | 48 | create_session_mock = AsyncMock() 49 | return await create_session_mock.coroutine() 50 | 51 | 52 | @pytest.mark.parametrize( 53 | ("queries", "expected_response", "store_errors", "exception"), 54 | ( 55 | [ 56 | # test if it stores the error(s) also by adding flag 57 | ( 58 | [ 59 | { 60 | "url": "https://forbidden.example", 61 | "browserHtml": True, 62 | "echoData": "https://forbidden.example", 63 | } 64 | ], 65 | forbidden_domain_response(), 66 | True, 67 | fake_exception, 68 | ), 69 | # test with store_errors=False 70 | ( 71 | [ 72 | { 73 | "url": "https://forbidden.example", 74 | "browserHtml": True, 75 | "echoData": "https://forbidden.example", 76 | } 77 | ], 78 | None, # expected response should be None 79 | False, 80 | fake_exception, 81 | ), 82 | ] 83 | ), 84 | ) 85 | @pytest.mark.asyncio 86 | async def test_run(queries, expected_response, store_errors, exception): 87 | tmp_path = Path("temporary_file.jsonl") 88 | temporary_file = tmp_path.open("w") 89 | n_conn = 5 90 | api_url = "https://example.com" 91 | api_key = "fake_key" 92 | retry_errors = True 93 | 94 | # Create a mock for AsyncZyteAPI 95 | async_client_mock = Mock() 96 | 97 | # Create a mock for the iter method 98 | request_parallel_mock = Mock() 99 | async_client_mock.return_value.iter = 
request_parallel_mock 100 | 101 | # Patch the AsyncZyteAPI class in __main__ with the mock 102 | with ( 103 | patch("zyte_api.__main__.AsyncZyteAPI", async_client_mock), 104 | patch("zyte_api.__main__.create_session") as create_session_mock, 105 | ): 106 | # Mock create_session to return an AsyncMock 107 | create_session_mock.return_value = AsyncMock() 108 | 109 | # Set up the AsyncZyteAPI instance to return the mocked iterator 110 | async_client_mock.return_value.iter.return_value = [ 111 | exception(), 112 | ] 113 | 114 | # Call the run function with the mocked AsyncZyteAPI 115 | await run( 116 | queries=queries, 117 | out=temporary_file, 118 | n_conn=n_conn, 119 | api_url=api_url, 120 | api_key=api_key, 121 | retry_errors=retry_errors, 122 | store_errors=store_errors, 123 | ) 124 | 125 | assert get_json_content(temporary_file) == expected_response 126 | tmp_path.unlink() 127 | 128 | 129 | @pytest.mark.asyncio 130 | async def test_run_stop_on_errors_false(mockserver): 131 | queries = [{"url": "https://exception.example", "httpResponseBody": True}] 132 | with ( 133 | NamedTemporaryFile("w") as output_file, 134 | pytest.warns( 135 | DeprecationWarning, match=r"^The stop_on_errors parameter is deprecated\.$" 136 | ), 137 | ): 138 | await run( 139 | queries=queries, 140 | out=output_file, 141 | n_conn=1, 142 | api_url=mockserver.urljoin("/"), 143 | api_key="a", 144 | stop_on_errors=False, 145 | ) 146 | 147 | 148 | @pytest.mark.asyncio 149 | async def test_run_stop_on_errors_true(mockserver): 150 | query = {"url": "https://exception.example", "httpResponseBody": True} 151 | queries = [query] 152 | with ( 153 | NamedTemporaryFile("w") as output_file, 154 | pytest.warns( 155 | DeprecationWarning, match=r"^The stop_on_errors parameter is deprecated\.$" 156 | ), 157 | pytest.raises(RequestError) as exc_info, 158 | ): 159 | await run( 160 | queries=queries, 161 | out=output_file, 162 | n_conn=1, 163 | api_url=mockserver.urljoin("/"), 164 | api_key="a", 165 | 
stop_on_errors=True, 166 | ) 167 | assert exc_info.value.query == query 168 | 169 | 170 | def _run(*, input, mockserver, cli_params=None): 171 | cli_params = cli_params or () 172 | with NamedTemporaryFile("w") as url_list: 173 | url_list.write(input) 174 | url_list.flush() 175 | # Note: Using “python -m zyte_api” instead of “zyte-api” enables 176 | # coverage tracking to work. 177 | return subprocess.run( 178 | [ 179 | "python", 180 | "-m", 181 | "zyte_api", 182 | "--api-key", 183 | "a", 184 | "--api-url", 185 | mockserver.urljoin("/"), 186 | url_list.name, 187 | *cli_params, 188 | ], 189 | capture_output=True, 190 | check=False, 191 | ) 192 | 193 | 194 | def test_empty_input(mockserver): 195 | result = _run(input="", mockserver=mockserver) 196 | assert result.returncode 197 | assert result.stdout == b"" 198 | assert result.stderr == b"No input queries found. Is the input file empty?\n" 199 | 200 | 201 | def test_intype_txt_implicit(mockserver): 202 | result = _run(input="https://a.example", mockserver=mockserver) 203 | assert not result.returncode 204 | assert ( 205 | result.stdout 206 | == b'{"url": "https://a.example", "browserHtml": "Hello

World!

"}\n' 207 | ) 208 | 209 | 210 | def test_intype_txt_explicit(mockserver): 211 | result = _run( 212 | input="https://a.example", mockserver=mockserver, cli_params=["--intype", "txt"] 213 | ) 214 | assert not result.returncode 215 | assert ( 216 | result.stdout 217 | == b'{"url": "https://a.example", "browserHtml": "Hello

World!

"}\n' 218 | ) 219 | 220 | 221 | def test_intype_jsonl_implicit(mockserver): 222 | result = _run( 223 | input='{"url": "https://a.example", "browserHtml": true}', mockserver=mockserver 224 | ) 225 | assert not result.returncode 226 | assert ( 227 | result.stdout 228 | == b'{"url": "https://a.example", "browserHtml": "Hello

World!

"}\n' 229 | ) 230 | 231 | 232 | def test_intype_jsonl_explicit(mockserver): 233 | result = _run( 234 | input='{"url": "https://a.example", "browserHtml": true}', 235 | mockserver=mockserver, 236 | cli_params=["--intype", "jl"], 237 | ) 238 | assert not result.returncode 239 | assert ( 240 | result.stdout 241 | == b'{"url": "https://a.example", "browserHtml": "Hello

World!

"}\n' 242 | ) 243 | 244 | 245 | @pytest.mark.flaky(reruns=16) 246 | def test_limit_and_shuffle(mockserver): 247 | result = _run( 248 | input="https://a.example\nhttps://b.example", 249 | mockserver=mockserver, 250 | cli_params=["--limit", "1", "--shuffle"], 251 | ) 252 | assert not result.returncode 253 | assert ( 254 | result.stdout 255 | == b'{"url": "https://b.example", "browserHtml": "Hello

World!

"}\n' 256 | ) 257 | 258 | 259 | def test_run_non_json_response(mockserver): 260 | result = _run( 261 | input="https://nonjson.example", 262 | mockserver=mockserver, 263 | ) 264 | assert not result.returncode 265 | assert result.stdout == b"" 266 | assert b"json.decoder.JSONDecodeError" in result.stderr 267 | -------------------------------------------------------------------------------- /tests/test_retry.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | from copy import copy 3 | from unittest.mock import patch 4 | 5 | import pytest 6 | from aiohttp.client_exceptions import ServerConnectionError 7 | from tenacity import AsyncRetrying, RetryCallState 8 | 9 | from zyte_api import ( 10 | AggressiveRetryFactory, 11 | AsyncZyteAPI, 12 | RequestError, 13 | RetryFactory, 14 | aggressive_retrying, 15 | zyte_api_retrying, 16 | ) 17 | 18 | from .mockserver import DropResource, MockServer 19 | 20 | 21 | def test_deprecated_imports(): 22 | from zyte_api import RetryFactory, zyte_api_retrying 23 | from zyte_api.aio.retry import RetryFactory as DeprecatedRetryFactory 24 | from zyte_api.aio.retry import zyte_api_retrying as deprecated_zyte_api_retrying 25 | 26 | assert RetryFactory is DeprecatedRetryFactory 27 | assert zyte_api_retrying is deprecated_zyte_api_retrying 28 | 29 | 30 | UNSET = object() 31 | 32 | 33 | class OutlierException(RuntimeError): 34 | pass 35 | 36 | 37 | @pytest.mark.parametrize( 38 | ("value", "exception"), 39 | [ 40 | (UNSET, OutlierException), 41 | (True, OutlierException), 42 | (False, RequestError), 43 | ], 44 | ) 45 | @pytest.mark.asyncio 46 | async def test_get_handle_retries(value, exception, mockserver): 47 | kwargs = {} 48 | if value is not UNSET: 49 | kwargs["handle_retries"] = value 50 | 51 | def broken_stop(_): 52 | raise OutlierException 53 | 54 | retrying = AsyncRetrying(stop=broken_stop) 55 | client = AsyncZyteAPI( 56 | api_key="a", api_url=mockserver.urljoin("/"), 
retrying=retrying 57 | ) 58 | with pytest.raises(exception): 59 | await client.get( 60 | {"url": "https://exception.example", "browserHtml": True}, 61 | **kwargs, 62 | ) 63 | 64 | 65 | @pytest.mark.parametrize( 66 | ("retry_factory", "status", "waiter"), 67 | [ 68 | (RetryFactory, 429, "throttling"), 69 | (RetryFactory, 520, "download_error"), 70 | (AggressiveRetryFactory, 429, "throttling"), 71 | (AggressiveRetryFactory, 500, "undocumented_error"), 72 | (AggressiveRetryFactory, 520, "download_error"), 73 | ], 74 | ) 75 | @pytest.mark.asyncio 76 | async def test_retry_wait(retry_factory, status, waiter, mockserver): 77 | def broken_wait(self, retry_state): 78 | raise OutlierException 79 | 80 | class CustomRetryFactory(retry_factory): 81 | pass 82 | 83 | setattr(CustomRetryFactory, f"{waiter}_wait", broken_wait) 84 | retrying = CustomRetryFactory().build() 85 | client = AsyncZyteAPI( 86 | api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying 87 | ) 88 | with pytest.raises(OutlierException): 89 | await client.get( 90 | {"url": f"https://e{status}.example", "browserHtml": True}, 91 | ) 92 | 93 | 94 | @pytest.mark.parametrize( 95 | "retry_factory", 96 | [ 97 | RetryFactory, 98 | AggressiveRetryFactory, 99 | ], 100 | ) 101 | @pytest.mark.asyncio 102 | async def test_retry_wait_network_error(retry_factory): 103 | waiter = "network_error" 104 | 105 | def broken_wait(self, retry_state): 106 | raise OutlierException 107 | 108 | class CustomRetryFactory(retry_factory): 109 | pass 110 | 111 | setattr(CustomRetryFactory, f"{waiter}_wait", broken_wait) 112 | 113 | retrying = CustomRetryFactory().build() 114 | with MockServer(resource=DropResource) as mockserver: 115 | client = AsyncZyteAPI( 116 | api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying 117 | ) 118 | with pytest.raises(OutlierException): 119 | await client.get( 120 | {"url": "https://example.com", "browserHtml": True}, 121 | ) 122 | 123 | 124 | def mock_request_error(*, status=200): 125 | return 
RequestError( 126 | history=None, 127 | request_info=None, 128 | response_content=None, 129 | status=status, 130 | query={}, 131 | ) 132 | 133 | 134 | # Number of times to test request errors that must be retried forever. 135 | FOREVER_TIMES = 100 136 | 137 | 138 | class fast_forward: 139 | def __init__(self, time): 140 | self.time = time 141 | 142 | 143 | class scale: 144 | def __init__(self, factor): 145 | self.factor = factor 146 | 147 | def __call__(self, number, add=0): 148 | return int(number * self.factor) + add 149 | 150 | 151 | @pytest.mark.parametrize( 152 | ("retrying", "outcomes", "exhausted"), 153 | [ 154 | # Shared behaviors of all retry policies 155 | *( 156 | (retrying, outcomes, exhausted) 157 | for retrying in (zyte_api_retrying, aggressive_retrying) 158 | for outcomes, exhausted in ( 159 | # Rate limiting is retried forever. 160 | ( 161 | (mock_request_error(status=429),) * FOREVER_TIMES, 162 | False, 163 | ), 164 | ( 165 | (mock_request_error(status=503),) * FOREVER_TIMES, 166 | False, 167 | ), 168 | # Network errors are retried until there have only been network 169 | # errors (of any kind) for 15 minutes straight or more. 
170 | ( 171 | ( 172 | ServerConnectionError(), 173 | fast_forward(15 * 60 - 1), 174 | ServerConnectionError(), 175 | ), 176 | False, 177 | ), 178 | ( 179 | ( 180 | ServerConnectionError(), 181 | fast_forward(15 * 60), 182 | ServerConnectionError(), 183 | ), 184 | True, 185 | ), 186 | ( 187 | ( 188 | mock_request_error(status=429), 189 | fast_forward(15 * 60 - 1), 190 | ServerConnectionError(), 191 | ), 192 | False, 193 | ), 194 | ( 195 | ( 196 | mock_request_error(status=429), 197 | fast_forward(15 * 60), 198 | ServerConnectionError(), 199 | ), 200 | False, 201 | ), 202 | ( 203 | ( 204 | ServerConnectionError(), 205 | fast_forward(7 * 60), 206 | mock_request_error(status=429), 207 | fast_forward(8 * 60 - 1), 208 | ServerConnectionError(), 209 | ), 210 | False, 211 | ), 212 | ( 213 | ( 214 | ServerConnectionError(), 215 | fast_forward(7 * 60), 216 | mock_request_error(status=429), 217 | fast_forward(8 * 60), 218 | ServerConnectionError(), 219 | ), 220 | False, 221 | ), 222 | ( 223 | ( 224 | ServerConnectionError(), 225 | fast_forward(7 * 60), 226 | mock_request_error(status=429), 227 | fast_forward(8 * 60), 228 | ServerConnectionError(), 229 | fast_forward(15 * 60 - 1), 230 | ServerConnectionError(), 231 | ), 232 | False, 233 | ), 234 | ( 235 | ( 236 | ServerConnectionError(), 237 | fast_forward(7 * 60), 238 | mock_request_error(status=429), 239 | fast_forward(8 * 60), 240 | ServerConnectionError(), 241 | fast_forward(15 * 60), 242 | ServerConnectionError(), 243 | ), 244 | True, 245 | ), 246 | ) 247 | ), 248 | # Scaled behaviors, where the default retry policy uses half as many 249 | # attempts as the aggressive retry policy. 250 | *( 251 | (retrying, outcomes, exhausted) 252 | for retrying, scaled in ( 253 | (zyte_api_retrying, scale(0.5)), 254 | (aggressive_retrying, scale(1)), 255 | ) 256 | for outcomes, exhausted in ( 257 | # Temporary download errors are retried until they have 258 | # happened 8*factor times in total. 
Permanent download errors 259 | # also count towards that limit. 260 | ( 261 | (mock_request_error(status=520),) * scaled(8, -1), 262 | False, 263 | ), 264 | ( 265 | (mock_request_error(status=520),) * scaled(8), 266 | True, 267 | ), 268 | ( 269 | ( 270 | *(mock_request_error(status=429),) * scaled(8, -2), 271 | mock_request_error(status=520), 272 | ), 273 | False, 274 | ), 275 | ( 276 | ( 277 | *(mock_request_error(status=429),) * scaled(8, -1), 278 | mock_request_error(status=520), 279 | ), 280 | False, 281 | ), 282 | ( 283 | ( 284 | *( 285 | mock_request_error(status=429), 286 | mock_request_error(status=520), 287 | ) 288 | * scaled(8, -1), 289 | ), 290 | False, 291 | ), 292 | ( 293 | ( 294 | *( 295 | mock_request_error(status=429), 296 | mock_request_error(status=520), 297 | ) 298 | * scaled(8), 299 | ), 300 | True, 301 | ), 302 | ( 303 | ( 304 | *(mock_request_error(status=520),) * scaled(8, -3), 305 | *(mock_request_error(status=521),) * 1, 306 | *(mock_request_error(status=520),) * 1, 307 | ), 308 | False, 309 | ), 310 | ( 311 | ( 312 | *(mock_request_error(status=520),) * scaled(8, -2), 313 | *(mock_request_error(status=521),) * 1, 314 | *(mock_request_error(status=520),) * 1, 315 | ), 316 | True, 317 | ), 318 | ( 319 | ( 320 | *(mock_request_error(status=520),) * scaled(8, -2), 321 | *(mock_request_error(status=521),) * 1, 322 | ), 323 | False, 324 | ), 325 | ( 326 | ( 327 | *(mock_request_error(status=520),) * scaled(8, -1), 328 | *(mock_request_error(status=521),) * 1, 329 | ), 330 | True, 331 | ), 332 | # Permanent download errors are retried until they have 333 | # happened 4*factor times in total. 334 | ( 335 | (*(mock_request_error(status=521),) * scaled(4, -1),), 336 | False, 337 | ), 338 | ( 339 | (*(mock_request_error(status=521),) * scaled(4),), 340 | True, 341 | ), 342 | # Undocumented 5xx errors are retried until they have happened 343 | # 4*factor times. 
344 | *( 345 | scenario 346 | for status in ( 347 | 500, 348 | 502, 349 | 504, 350 | ) 351 | for scenario in ( 352 | ( 353 | (*(mock_request_error(status=status),) * scaled(4, -1),), 354 | False, 355 | ), 356 | ( 357 | (*(mock_request_error(status=status),) * scaled(4),), 358 | True, 359 | ), 360 | ( 361 | ( 362 | *(mock_request_error(status=status),) * scaled(4, -2), 363 | mock_request_error(status=429), 364 | mock_request_error(status=503), 365 | ServerConnectionError(), 366 | mock_request_error(status=status), 367 | ), 368 | False, 369 | ), 370 | ( 371 | ( 372 | *(mock_request_error(status=status),) * scaled(4, -1), 373 | mock_request_error(status=429), 374 | mock_request_error(status=503), 375 | ServerConnectionError(), 376 | mock_request_error(status=status), 377 | ), 378 | True, 379 | ), 380 | ( 381 | ( 382 | mock_request_error(status=555), 383 | *(mock_request_error(status=status),) * scaled(4, -2), 384 | ), 385 | False, 386 | ), 387 | ( 388 | ( 389 | mock_request_error(status=555), 390 | *(mock_request_error(status=status),) * scaled(4, -1), 391 | ), 392 | True, 393 | ), 394 | ) 395 | ), 396 | ) 397 | ), 398 | ], 399 | ) 400 | @pytest.mark.asyncio 401 | @patch("time.monotonic") 402 | async def test_retry_stop(monotonic_mock, retrying, outcomes, exhausted): 403 | monotonic_mock.return_value = 0 404 | last_outcome = outcomes[-1] 405 | outcomes = deque(outcomes) 406 | 407 | def wait(retry_state): 408 | return 0.0 409 | 410 | retrying = copy(retrying) 411 | retrying.wait = wait 412 | 413 | async def run(): 414 | while True: 415 | try: 416 | outcome = outcomes.popleft() 417 | except IndexError: 418 | return 419 | else: 420 | if isinstance(outcome, fast_forward): 421 | monotonic_mock.return_value += outcome.time 422 | continue 423 | raise outcome 424 | 425 | run = retrying.wraps(run) 426 | try: 427 | await run() 428 | except Exception as outcome: 429 | assert exhausted, outcome # noqa: PT017 430 | assert outcome is last_outcome # noqa: PT017 431 | else: 432 | 
assert not exhausted 433 | 434 | 435 | @pytest.mark.asyncio 436 | async def test_deprecated_temporary_download_error(): 437 | class CustomRetryFactory(RetryFactory): 438 | def wait(self, retry_state: RetryCallState) -> float: 439 | self.temporary_download_error_wait(retry_state=retry_state) 440 | return 0.0 441 | 442 | def stop(self, retry_state: RetryCallState) -> bool: 443 | self.temporary_download_error_stop(retry_state) 444 | return super().stop(retry_state) 445 | 446 | retrying = CustomRetryFactory().build() 447 | 448 | outcomes = deque((mock_request_error(status=520), None)) 449 | 450 | async def run(): 451 | outcome = outcomes.popleft() 452 | if isinstance(outcome, Exception): 453 | raise outcome 454 | return outcome 455 | 456 | run = retrying.wraps(run) 457 | with ( 458 | pytest.warns(DeprecationWarning, match="temporary_download_error_stop"), 459 | pytest.warns(DeprecationWarning, match="temporary_download_error_wait"), 460 | ): 461 | await run() 462 | assert not outcomes 463 | -------------------------------------------------------------------------------- /tests/test_sync.py: -------------------------------------------------------------------------------- 1 | from types import GeneratorType 2 | from unittest.mock import AsyncMock 3 | 4 | import pytest 5 | 6 | from zyte_api import ZyteAPI 7 | from zyte_api.apikey import NoApiKey 8 | 9 | 10 | def test_api_key(): 11 | ZyteAPI(api_key="a") 12 | with pytest.raises(NoApiKey): 13 | ZyteAPI() 14 | 15 | 16 | def test_get(mockserver): 17 | client = ZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) 18 | expected_result = { 19 | "url": "https://a.example", 20 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 21 | } 22 | actual_result = client.get({"url": "https://a.example", "httpResponseBody": True}) 23 | assert actual_result == expected_result 24 | 25 | 26 | def test_iter(mockserver): 27 | client = ZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) 28 | queries = [ 29 
| {"url": "https://a.example", "httpResponseBody": True}, 30 | {"url": "https://exception.example", "httpResponseBody": True}, 31 | {"url": "https://b.example", "httpResponseBody": True}, 32 | ] 33 | expected_results = [ 34 | { 35 | "url": "https://a.example", 36 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 37 | }, 38 | Exception, 39 | { 40 | "url": "https://b.example", 41 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 42 | }, 43 | ] 44 | actual_results = client.iter(queries) 45 | assert isinstance(actual_results, GeneratorType) 46 | actual_results_list = list(actual_results) 47 | assert len(actual_results_list) == len(expected_results) 48 | for actual_result in actual_results_list: 49 | if isinstance(actual_result, Exception): 50 | assert Exception in expected_results 51 | else: 52 | assert actual_result in expected_results 53 | 54 | 55 | def test_semaphore(mockserver): 56 | client = ZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) 57 | client._async_client._semaphore = AsyncMock(wraps=client._async_client._semaphore) 58 | queries = [ 59 | {"url": "https://a.example", "httpResponseBody": True}, 60 | {"url": "https://b.example", "httpResponseBody": True}, 61 | {"url": "https://c.example", "httpResponseBody": True}, 62 | ] 63 | client.get(queries[0]) 64 | next(iter(client.iter(queries[1:2]))) 65 | client.get(queries[2]) 66 | assert client._async_client._semaphore.__aenter__.call_count == len(queries) 67 | assert client._async_client._semaphore.__aexit__.call_count == len(queries) 68 | 69 | 70 | def test_session_context_manager(mockserver): 71 | client = ZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) 72 | queries = [ 73 | {"url": "https://a.example", "httpResponseBody": True}, 74 | {"url": "https://exception.example", "httpResponseBody": True}, 75 | {"url": "https://b.example", "httpResponseBody": True}, 76 | ] 77 | expected_results = [ 78 | { 79 | "url": 
"https://a.example", 80 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 81 | }, 82 | Exception, 83 | { 84 | "url": "https://b.example", 85 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 86 | }, 87 | ] 88 | actual_results = [] 89 | with client.session() as session: 90 | assert session._session.connector.limit == client._async_client.n_conn 91 | actual_results.append(session.get(queries[0])) 92 | actual_results.extend(session.iter(queries[1:])) 93 | aiohttp_session = session._session 94 | assert not aiohttp_session.closed 95 | assert aiohttp_session.closed 96 | 97 | with pytest.raises(RuntimeError): 98 | session.get(queries[0]) 99 | 100 | assert isinstance(next(iter(session.iter(queries[1:]))), RuntimeError) 101 | 102 | assert len(actual_results) == len(expected_results) 103 | for actual_result in actual_results: 104 | if isinstance(actual_result, Exception): 105 | assert Exception in expected_results 106 | else: 107 | assert actual_result in expected_results 108 | 109 | 110 | def test_session_no_context_manager(mockserver): 111 | client = ZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) 112 | queries = [ 113 | {"url": "https://a.example", "httpResponseBody": True}, 114 | {"url": "https://exception.example", "httpResponseBody": True}, 115 | {"url": "https://b.example", "httpResponseBody": True}, 116 | ] 117 | expected_results = [ 118 | { 119 | "url": "https://a.example", 120 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 121 | }, 122 | Exception, 123 | { 124 | "url": "https://b.example", 125 | "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", 126 | }, 127 | ] 128 | actual_results = [] 129 | session = client.session() 130 | assert session._session.connector.limit == client._async_client.n_conn 131 | actual_results.append(session.get(queries[0])) 132 | actual_results.extend(session.iter(queries[1:])) 
133 | aiohttp_session = session._session 134 | assert not aiohttp_session.closed 135 | session.close() 136 | assert aiohttp_session.closed 137 | 138 | with pytest.raises(RuntimeError): 139 | session.get(queries[0]) 140 | 141 | assert isinstance(next(iter(session.iter(queries[1:]))), RuntimeError) 142 | 143 | assert len(actual_results) == len(expected_results) 144 | for actual_result in actual_results: 145 | if isinstance(actual_result, Exception): 146 | assert Exception in expected_results 147 | else: 148 | assert actual_result in expected_results 149 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from aiohttp import TCPConnector 3 | 4 | from zyte_api._utils import create_session 5 | from zyte_api.utils import _guess_intype, _process_query 6 | 7 | 8 | @pytest.mark.asyncio 9 | async def test_create_session_custom_connector(): 10 | # Declare a connector with a random parameter to avoid it matching the 11 | # default one. 
12 | custom_connector = TCPConnector(limit=1850) 13 | session = create_session(connector=custom_connector) 14 | assert session.connector == custom_connector 15 | 16 | 17 | @pytest.mark.parametrize( 18 | ("file_name", "first_line", "expected"), 19 | [ 20 | ( 21 | "", 22 | "https://toscrape.com", 23 | "txt", 24 | ), 25 | ( 26 | "", 27 | '{"url": "https://toscrape.com"}', 28 | "jl", 29 | ), 30 | ( 31 | "", 32 | ' {"url": "https://toscrape.com"}', 33 | "jl", 34 | ), 35 | ( 36 | "urls.txt", 37 | "https://toscrape.com", 38 | "txt", 39 | ), 40 | ( 41 | "urls.txt", 42 | '{"url": "https://toscrape.com"}', 43 | "txt", 44 | ), 45 | ( 46 | "urls.jl", 47 | "https://toscrape.com", 48 | "jl", 49 | ), 50 | ( 51 | "urls.jl", 52 | '{"url": "https://toscrape.com"}', 53 | "jl", 54 | ), 55 | ( 56 | "urls.jsonl", 57 | "https://toscrape.com", 58 | "jl", 59 | ), 60 | ( 61 | "urls.jsonl", 62 | '{"url": "https://toscrape.com"}', 63 | "jl", 64 | ), 65 | ], 66 | ) 67 | def test_guess_intype(file_name, first_line, expected): 68 | assert _guess_intype(file_name, [first_line]) == expected 69 | 70 | 71 | @pytest.mark.parametrize( 72 | ("input", "output"), 73 | [ 74 | # Unsafe URLs in the url field are modified, while left untouched on 75 | # other fields. 76 | ( 77 | { 78 | "a": {"b", "c"}, 79 | "d": "https://example.com/ a", 80 | "url": "https://example.com/ a", 81 | }, 82 | { 83 | "a": {"b", "c"}, 84 | "d": "https://example.com/ a", 85 | "url": "https://example.com/%20a", 86 | }, 87 | ), 88 | # Safe URLs are returned unmodified. 89 | ( 90 | {"url": "https://example.com"}, 91 | {"url": "https://example.com"}, 92 | ), 93 | # URL fragments are kept. 94 | ( 95 | {"url": "https://example.com#a"}, 96 | {"url": "https://example.com#a"}, 97 | ), 98 | # If no URL is passed, nothing is done. 99 | ( 100 | {"a": "b"}, 101 | {"a": "b"}, 102 | ), 103 | # NOTE: We use w3lib.url.safe_url_string for escaping. Tests covering 104 | # the URL escaping logic exist upstream. 
105 | ], 106 | ) 107 | def test_process_query(input, output): 108 | assert _process_query(input) == output 109 | 110 | 111 | def test_process_query_bytes(): 112 | with pytest.raises(ValueError, match="Expected a str URL parameter"): 113 | _process_query({"url": b"https://example.com"}) 114 | 115 | 116 | @pytest.mark.asyncio # https://github.com/aio-libs/aiohttp/pull/1468 117 | async def test_deprecated_create_session(): 118 | from zyte_api.aio.client import create_session as _create_session 119 | 120 | with pytest.warns( 121 | DeprecationWarning, 122 | match=r"^zyte_api\.aio\.client\.create_session is deprecated", 123 | ): 124 | _create_session() 125 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py39,py310,py311,py312,py313,mypy,docs,twine 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | pytest-asyncio 8 | pytest-cov 9 | pytest-rerunfailures 10 | pytest-twisted 11 | responses 12 | twisted 13 | 14 | commands = 15 | py.test \ 16 | --cov-report=term-missing --cov-report=html --cov-report=xml --cov=zyte_api \ 17 | --doctest-modules \ 18 | {posargs:zyte_api tests} 19 | 20 | [testenv:mypy] 21 | deps = 22 | mypy==1.12.0 23 | pytest==8.3.3 24 | Twisted==24.7.0 25 | types-tqdm==4.66.0.20240417 26 | 27 | commands = mypy --ignore-missing-imports \ 28 | zyte_api \ 29 | tests 30 | 31 | [testenv:docs] 32 | changedir = docs 33 | deps = 34 | -rdocs/requirements.txt 35 | basepython = python3 36 | commands = 37 | sphinx-build -W -b html . 
{envtmpdir}/html 38 | 39 | [testenv:pre-commit] 40 | deps = pre-commit 41 | commands = pre-commit run --all-files --show-diff-on-failure 42 | 43 | [testenv:twine] 44 | deps = 45 | twine==5.1.1 46 | build==1.2.2 47 | commands = 48 | python -m build --sdist 49 | twine check dist/* 50 | -------------------------------------------------------------------------------- /zyte_api/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python client libraries and command line utilities for Zyte API 3 | """ 4 | 5 | from ._async import AsyncZyteAPI 6 | from ._errors import RequestError 7 | from ._retry import ( 8 | AggressiveRetryFactory, 9 | RetryFactory, 10 | stop_after_uninterrupted_delay, 11 | stop_on_count, 12 | stop_on_download_error, 13 | ) 14 | from ._retry import aggressive_retrying as _aggressive_retrying 15 | from ._retry import zyte_api_retrying as _zyte_api_retrying 16 | from ._sync import ZyteAPI 17 | from .errors import ParsedError 18 | 19 | # We re-define the variables here for Sphinx to pick the documentation. 20 | 21 | #: :ref:`Default retry policy `. 22 | zyte_api_retrying = _zyte_api_retrying 23 | 24 | #: :ref:`Aggressive retry policy `. 
25 | aggressive_retrying = _aggressive_retrying 26 | -------------------------------------------------------------------------------- /zyte_api/__main__.py: -------------------------------------------------------------------------------- 1 | """Basic command-line interface for Zyte API.""" 2 | 3 | import argparse 4 | import asyncio 5 | import json 6 | import logging 7 | import random 8 | import sys 9 | from warnings import warn 10 | 11 | import tqdm 12 | from tenacity import retry_if_exception 13 | 14 | from zyte_api._async import AsyncZyteAPI 15 | from zyte_api._retry import RetryFactory, _is_throttling_error 16 | from zyte_api._utils import create_session 17 | from zyte_api.constants import API_URL 18 | from zyte_api.utils import _guess_intype 19 | 20 | 21 | class DontRetryErrorsFactory(RetryFactory): 22 | retry_condition = retry_if_exception(_is_throttling_error) 23 | 24 | 25 | logger = logging.getLogger("zyte_api") 26 | 27 | _UNSET = object() 28 | 29 | 30 | async def run( 31 | queries, 32 | out, 33 | *, 34 | n_conn, 35 | stop_on_errors=_UNSET, 36 | api_url, 37 | api_key=None, 38 | retry_errors=True, 39 | store_errors=None, 40 | ): 41 | if stop_on_errors is not _UNSET: 42 | warn( 43 | "The stop_on_errors parameter is deprecated.", 44 | DeprecationWarning, 45 | stacklevel=2, 46 | ) 47 | else: 48 | stop_on_errors = False 49 | 50 | def write_output(content): 51 | json.dump(content, out, ensure_ascii=False) 52 | out.write("\n") 53 | out.flush() 54 | pbar.update() 55 | 56 | retrying = None if retry_errors else DontRetryErrorsFactory().build() 57 | client = AsyncZyteAPI( 58 | n_conn=n_conn, api_key=api_key, api_url=api_url, retrying=retrying 59 | ) 60 | async with create_session(connection_pool_size=n_conn) as session: 61 | result_iter = client.iter( 62 | queries=queries, 63 | session=session, 64 | ) 65 | pbar = tqdm.tqdm( 66 | smoothing=0, leave=True, total=len(queries), miniters=1, unit="url" 67 | ) 68 | pbar.set_postfix_str(str(client.agg_stats)) 69 | try: 70 | for 
fut in result_iter: 71 | try: 72 | result = await fut 73 | except Exception as e: 74 | if store_errors: 75 | write_output(e.parsed.response_body.decode()) 76 | 77 | if stop_on_errors: 78 | raise 79 | 80 | logger.error(str(e)) 81 | else: 82 | write_output(result) 83 | finally: 84 | pbar.set_postfix_str(str(client.agg_stats)) 85 | finally: 86 | pbar.close() 87 | logger.info(client.agg_stats.summary()) 88 | logger.info(f"\nAPI error types:\n{client.agg_stats.api_error_types.most_common()}") 89 | logger.info(f"\nStatus codes:\n{client.agg_stats.status_codes.most_common()}") 90 | logger.info(f"\nException types:\n{client.agg_stats.exception_types.most_common()}") 91 | 92 | 93 | def read_input(input_fp, intype): 94 | assert intype in {"txt", "jl", _UNSET} 95 | lines = input_fp.readlines() 96 | if not lines: 97 | return [] 98 | if intype is _UNSET: 99 | intype = _guess_intype(input_fp.name, lines) 100 | if intype == "txt": 101 | urls = [u.strip() for u in lines if u.strip()] 102 | records = [{"url": url, "browserHtml": True} for url in urls] 103 | else: 104 | records = [json.loads(line.strip()) for line in lines if line.strip()] 105 | # Automatically replicate the url in echoData to be able to 106 | # match URLs with content in the responses 107 | for record in records: 108 | record.setdefault("echoData", record.get("url")) 109 | return records 110 | 111 | 112 | def _get_argument_parser(program_name="zyte-api"): 113 | p = argparse.ArgumentParser( 114 | prog=program_name, 115 | description="Send Zyte API requests.", 116 | ) 117 | p.add_argument( 118 | "INPUT", 119 | type=argparse.FileType("r", encoding="utf8"), 120 | help=( 121 | "Path to an input file (see 'Command-line client > Input file' in " 122 | "the docs for details)."
123 | ), 124 | ) 125 | p.add_argument( 126 | "--intype", 127 | default=_UNSET, 128 | choices=["txt", "jl"], 129 | help=( 130 | "Type of the input file, either 'txt' (plain text) or 'jl' (JSON " 131 | "Lines).\n" 132 | "\n" 133 | "If not specified, the input type is guessed based on the input " 134 | "file extension ('.jl', '.jsonl', or '.txt'), or in its content, " 135 | "with 'txt' as fallback." 136 | ), 137 | ) 138 | p.add_argument("--limit", type=int, help="Maximum number of requests to send.") 139 | p.add_argument( 140 | "--output", 141 | "-o", 142 | default=sys.stdout, 143 | type=argparse.FileType("w", encoding="utf8"), 144 | help=( 145 | "Path for the output file. Results are written into the output " 146 | "file in JSON Lines format.\n" 147 | "\n" 148 | "If not specified, results are printed to the standard output." 149 | ), 150 | ) 151 | p.add_argument( 152 | "--n-conn", 153 | type=int, 154 | default=20, 155 | help=("Number of concurrent connections to use (default: %(default)s)."), 156 | ) 157 | p.add_argument( 158 | "--api-key", 159 | help="Zyte API key.", 160 | ) 161 | p.add_argument( 162 | "--api-url", help="Zyte API endpoint (default: %(default)s).", default=API_URL 163 | ) 164 | p.add_argument( 165 | "--loglevel", 166 | "-L", 167 | default="INFO", 168 | choices=["DEBUG", "INFO", "WARNING", "ERROR"], 169 | help="Log level (default: %(default)s).", 170 | ) 171 | p.add_argument( 172 | "--shuffle", 173 | help="Shuffle request order.", 174 | action="store_true", 175 | ) 176 | p.add_argument( 177 | "--dont-retry-errors", 178 | help="Do not retry unsuccessful responses and network errors, only rate-limiting responses.", 179 | action="store_true", 180 | ) 181 | p.add_argument( 182 | "--store-errors", 183 | help=( 184 | "Store error responses in the output file.\n" 185 | "\n" 186 | "If omitted, only successful responses are stored." 
187 | ), 188 | action="store_true", 189 | ) 190 | return p 191 | 192 | 193 | def _main(program_name="zyte-api"): 194 | """Process urls from input file through Zyte API""" 195 | p = _get_argument_parser(program_name=program_name) 196 | args = p.parse_args() 197 | logging.basicConfig(stream=sys.stderr, level=getattr(logging, args.loglevel)) 198 | 199 | queries = read_input(args.INPUT, args.intype) 200 | if not queries: 201 | print("No input queries found. Is the input file empty?", file=sys.stderr) 202 | sys.exit(-1) 203 | 204 | if args.shuffle: 205 | random.shuffle(queries) 206 | if args.limit: 207 | queries = queries[: args.limit] 208 | 209 | logger.info( 210 | f"Loaded {len(queries)} urls from {args.INPUT.name}; shuffled: {args.shuffle}" 211 | ) 212 | logger.info(f"Running Zyte API (connections: {args.n_conn})") 213 | 214 | loop = asyncio.get_event_loop() 215 | coro = run( 216 | queries, 217 | out=args.output, 218 | n_conn=args.n_conn, 219 | api_url=args.api_url, 220 | api_key=args.api_key, 221 | retry_errors=not args.dont_retry_errors, 222 | store_errors=args.store_errors, 223 | ) 224 | loop.run_until_complete(coro) 225 | loop.close() 226 | 227 | 228 | if __name__ == "__main__": 229 | _main(program_name="python -m zyte_api") 230 | -------------------------------------------------------------------------------- /zyte_api/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.7.1" 2 | -------------------------------------------------------------------------------- /zyte_api/_async.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import time 5 | from asyncio import Future 6 | from functools import partial 7 | from typing import TYPE_CHECKING, Any 8 | 9 | import aiohttp 10 | from tenacity import AsyncRetrying 11 | 12 | from ._errors import RequestError 13 | from ._retry import zyte_api_retrying 14 | from 
._utils import _AIO_API_TIMEOUT, create_session 15 | from .apikey import get_apikey 16 | from .constants import API_URL 17 | from .stats import AggStats, ResponseStats 18 | from .utils import USER_AGENT, _process_query 19 | 20 | if TYPE_CHECKING: 21 | from collections.abc import Iterator 22 | 23 | _ResponseFuture = Future[dict[str, Any]] 24 | 25 | 26 | def _post_func(session): 27 | """Return a function to send a POST request""" 28 | if session is None: 29 | return partial(aiohttp.request, method="POST", timeout=_AIO_API_TIMEOUT) 30 | return session.post 31 | 32 | 33 | class _AsyncSession: 34 | def __init__(self, client, **session_kwargs): 35 | self._client = client 36 | self._session = create_session(client.n_conn, **session_kwargs) 37 | 38 | async def __aenter__(self): 39 | return self 40 | 41 | async def __aexit__(self, *exc_info): 42 | await self._session.close() 43 | 44 | async def close(self): 45 | await self._session.close() 46 | 47 | async def get( 48 | self, 49 | query: dict, 50 | *, 51 | endpoint: str = "extract", 52 | handle_retries=True, 53 | retrying: AsyncRetrying | None = None, 54 | ): 55 | return await self._client.get( 56 | query=query, 57 | endpoint=endpoint, 58 | handle_retries=handle_retries, 59 | retrying=retrying, 60 | session=self._session, 61 | ) 62 | 63 | def iter( 64 | self, 65 | queries: list[dict], 66 | *, 67 | endpoint: str = "extract", 68 | handle_retries=True, 69 | retrying: AsyncRetrying | None = None, 70 | ) -> Iterator[Future]: 71 | return self._client.iter( 72 | queries=queries, 73 | endpoint=endpoint, 74 | session=self._session, 75 | handle_retries=handle_retries, 76 | retrying=retrying, 77 | ) 78 | 79 | 80 | class AsyncZyteAPI: 81 | """:ref:`Asynchronous Zyte API client `. 82 | 83 | Parameters work the same as for :class:`ZyteAPI`. 
84 | """ 85 | 86 | def __init__( 87 | self, 88 | *, 89 | api_key=None, 90 | api_url=API_URL, 91 | n_conn=15, 92 | retrying: AsyncRetrying | None = None, 93 | user_agent: str | None = None, 94 | ): 95 | if retrying is not None and not isinstance(retrying, AsyncRetrying): 96 | raise ValueError( 97 | "The retrying parameter, if defined, must be an instance of " 98 | "AsyncRetrying." 99 | ) 100 | self.api_key = get_apikey(api_key) 101 | self.api_url = api_url 102 | self.n_conn = n_conn 103 | self.agg_stats = AggStats() 104 | self.retrying = retrying or zyte_api_retrying 105 | self.user_agent = user_agent or USER_AGENT 106 | self._semaphore = asyncio.Semaphore(n_conn) 107 | 108 | async def get( 109 | self, 110 | query: dict, 111 | *, 112 | endpoint: str = "extract", 113 | session=None, 114 | handle_retries=True, 115 | retrying: AsyncRetrying | None = None, 116 | ) -> _ResponseFuture: 117 | """Asynchronous equivalent to :meth:`ZyteAPI.get`.""" 118 | retrying = retrying or self.retrying 119 | post = _post_func(session) 120 | auth = aiohttp.BasicAuth(self.api_key) 121 | headers = {"User-Agent": self.user_agent, "Accept-Encoding": "br"} 122 | 123 | response_stats = [] 124 | start_global = time.perf_counter() 125 | 126 | async def request(): 127 | stats = ResponseStats.create(start_global) 128 | self.agg_stats.n_attempts += 1 129 | 130 | safe_query = _process_query(query) 131 | post_kwargs = { 132 | "url": self.api_url + endpoint, 133 | "json": safe_query, 134 | "auth": auth, 135 | "headers": headers, 136 | } 137 | 138 | try: 139 | async with self._semaphore, post(**post_kwargs) as resp: 140 | stats.record_connected(resp.status, self.agg_stats) 141 | if resp.status >= 400: 142 | content = await resp.read() 143 | resp.release() 144 | stats.record_read() 145 | stats.record_request_error(content, self.agg_stats) 146 | 147 | raise RequestError( 148 | request_info=resp.request_info, 149 | history=resp.history, 150 | status=resp.status, 151 | message=resp.reason, 152 | 
headers=resp.headers, 153 | response_content=content, 154 | query=safe_query, 155 | ) 156 | 157 | response = await resp.json() 158 | stats.record_read(self.agg_stats) 159 | return response 160 | except Exception as e: 161 | if not isinstance(e, RequestError): 162 | self.agg_stats.n_errors += 1 163 | stats.record_exception(e, agg_stats=self.agg_stats) 164 | raise 165 | finally: 166 | response_stats.append(stats) 167 | 168 | if handle_retries: 169 | request = retrying.wraps(request) 170 | 171 | try: 172 | # Try to make a request 173 | result = await request() 174 | self.agg_stats.n_success += 1 175 | except Exception: 176 | self.agg_stats.n_fatal_errors += 1 177 | raise 178 | 179 | return result 180 | 181 | def iter( 182 | self, 183 | queries: list[dict], 184 | *, 185 | endpoint: str = "extract", 186 | session: aiohttp.ClientSession | None = None, 187 | handle_retries=True, 188 | retrying: AsyncRetrying | None = None, 189 | ) -> Iterator[_ResponseFuture]: 190 | """Asynchronous equivalent to :meth:`ZyteAPI.iter`. 191 | 192 | .. note:: Yielded futures, when awaited, do raise their exceptions, 193 | instead of only returning them. 194 | """ 195 | 196 | def _request(query): 197 | return self.get( 198 | query, 199 | endpoint=endpoint, 200 | session=session, 201 | handle_retries=handle_retries, 202 | retrying=retrying, 203 | ) 204 | 205 | return asyncio.as_completed([_request(query) for query in queries]) 206 | 207 | def session(self, **kwargs): 208 | """Asynchronous equivalent to :meth:`ZyteAPI.session`. 209 | 210 | You do not need to use :meth:`~AsyncZyteAPI.session` as an async 211 | context manager as long as you await ``close()`` on the object it 212 | returns when you are done: 213 | 214 | .. code-block:: python 215 | 216 | session = client.session() 217 | try: 218 | ... 
219 | finally: 220 | await session.close() 221 | """ 222 | return _AsyncSession(client=self, **kwargs) 223 | -------------------------------------------------------------------------------- /zyte_api/_errors.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | from typing import Any 5 | 6 | from aiohttp import ClientResponseError 7 | 8 | from zyte_api.errors import ParsedError 9 | 10 | logger = logging.getLogger("zyte_api") 11 | 12 | 13 | class RequestError(ClientResponseError): 14 | """Exception raised upon receiving a :ref:`rate-limiting 15 | ` or :ref:`unsuccessful 16 | ` response from Zyte API.""" 17 | 18 | def __init__(self, *args, **kwargs): 19 | #: Query sent to Zyte API. 20 | #: 21 | #: May be slightly different from the input query due to 22 | #: pre-processing logic on the client side. 23 | self.query: dict[str, Any] = kwargs.pop("query") 24 | 25 | #: Request ID. 26 | self.request_id: str | None = kwargs.get("headers", {}).get("request-id") 27 | 28 | #: Response body. 
29 | self.response_content: bytes | None = kwargs.pop("response_content") 30 | 31 | super().__init__(*args, **kwargs) 32 | 33 | @property 34 | def parsed(self): 35 | """Response as a :class:`ParsedError` object.""" 36 | return ParsedError.from_body(self.response_content) 37 | 38 | def __str__(self): 39 | return ( 40 | f"RequestError: {self.status}, message={self.message}, " 41 | f"headers={self.headers}, body={self.response_content}, " 42 | f"request_id={self.request_id}" 43 | ) 44 | -------------------------------------------------------------------------------- /zyte_api/_retry.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import logging 5 | from collections import Counter 6 | from datetime import timedelta 7 | from itertools import count 8 | from typing import Callable, Union 9 | from warnings import warn 10 | 11 | from aiohttp import client_exceptions 12 | from tenacity import ( 13 | AsyncRetrying, 14 | RetryCallState, 15 | after_log, 16 | before_log, 17 | before_sleep_log, 18 | retry_base, 19 | retry_if_exception, 20 | wait_chain, 21 | wait_fixed, 22 | wait_random, 23 | wait_random_exponential, 24 | ) 25 | from tenacity.stop import stop_base, stop_never 26 | 27 | from ._errors import RequestError 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | _IDS = count() 32 | 33 | 34 | _NETWORK_ERRORS = ( 35 | asyncio.TimeoutError, # could happen while reading the response body 36 | client_exceptions.ClientResponseError, 37 | client_exceptions.ClientOSError, 38 | client_exceptions.ServerConnectionError, 39 | client_exceptions.ServerDisconnectedError, 40 | client_exceptions.ServerTimeoutError, 41 | client_exceptions.ClientPayloadError, 42 | client_exceptions.ClientConnectorSSLError, 43 | client_exceptions.ClientConnectorError, 44 | ) 45 | 46 | 47 | def _is_network_error(exc: BaseException) -> bool: 48 | if isinstance(exc, RequestError): 49 | # RequestError is 
ClientResponseError, which is in the 50 | # _NETWORK_ERRORS list, but it should be handled 51 | # separately. 52 | return False 53 | return isinstance(exc, _NETWORK_ERRORS) 54 | 55 | 56 | def _is_throttling_error(exc: BaseException) -> bool: 57 | return isinstance(exc, RequestError) and exc.status in (429, 503) 58 | 59 | 60 | class stop_on_count(stop_base): 61 | """Keep a call count with the specified counter name, and stop after the 62 | specified number of calls. 63 | 64 | Unlike stop_after_attempt, this callable does not take into account 65 | attempts for which a different stop callable was used. 66 | """ 67 | 68 | def __init__(self, max_count: int) -> None: 69 | self._max_count = max_count 70 | self._counter_id = next(_IDS) 71 | 72 | def __call__(self, retry_state: RetryCallState) -> bool: 73 | if not hasattr(retry_state, "counter"): 74 | retry_state.counter = Counter() # type: ignore[attr-defined] 75 | retry_state.counter[self._counter_id] += 1 # type: ignore[attr-defined] 76 | return retry_state.counter[self._counter_id] >= self._max_count # type: ignore[attr-defined] 77 | 78 | 79 | time_unit_type = Union[int, float, timedelta] 80 | 81 | 82 | def to_seconds(time_unit: time_unit_type) -> float: 83 | return float( 84 | time_unit.total_seconds() if isinstance(time_unit, timedelta) else time_unit 85 | ) 86 | 87 | 88 | class stop_after_uninterrupted_delay(stop_base): 89 | """Stop when this stop callable has been called for the specified time 90 | uninterrupted, i.e. without calls to different stop callables. 91 | 92 | Unlike stop_after_delay, this callable resets its timer after any attempt 93 | for which a different stop callable was used.
94 | """ 95 | 96 | def __init__(self, max_delay: time_unit_type) -> None: 97 | self._max_delay = to_seconds(max_delay) 98 | self._timer_id = next(_IDS) 99 | 100 | def __call__(self, retry_state: RetryCallState) -> bool: 101 | if not hasattr(retry_state, "uninterrupted_start_times"): 102 | retry_state.uninterrupted_start_times = {} # type: ignore[attr-defined] 103 | if self._timer_id not in retry_state.uninterrupted_start_times: # type: ignore[attr-defined] 104 | # First time. 105 | retry_state.uninterrupted_start_times[self._timer_id] = [ # type: ignore[attr-defined] 106 | retry_state.attempt_number, 107 | retry_state.outcome_timestamp, 108 | ] 109 | return False 110 | attempt_number, start_time = retry_state.uninterrupted_start_times[ # type: ignore[attr-defined] 111 | self._timer_id 112 | ] 113 | if retry_state.attempt_number - attempt_number > 1: 114 | # There was a different stop reason since the last attempt, 115 | # resetting the timer. 116 | retry_state.uninterrupted_start_times[self._timer_id] = [ # type: ignore[attr-defined] 117 | retry_state.attempt_number, 118 | retry_state.outcome_timestamp, 119 | ] 120 | return False 121 | if retry_state.outcome_timestamp - start_time < self._max_delay: 122 | # Within time, do not stop, only increase the attempt count. 
123 | retry_state.uninterrupted_start_times[self._timer_id][0] += 1 # type: ignore[attr-defined] 124 | return False 125 | return True 126 | 127 | 128 | class stop_on_download_error(stop_base): 129 | """Stop after the specified max numbers of total or permanent download 130 | errors.""" 131 | 132 | def __init__(self, max_total: int, max_permanent: int) -> None: 133 | self._max_total = max_total 134 | self._max_permanent = max_permanent 135 | 136 | def __call__(self, retry_state: RetryCallState) -> bool: 137 | if not hasattr(retry_state, "counter"): 138 | retry_state.counter = Counter() # type: ignore[attr-defined] 139 | assert retry_state.outcome, "Unexpected empty outcome" 140 | exc = retry_state.outcome.exception() 141 | assert exc, "Unexpected empty exception" 142 | if exc.status == 521: # type: ignore[attr-defined] 143 | retry_state.counter["permanent_download_error"] += 1 # type: ignore[attr-defined] 144 | if retry_state.counter["permanent_download_error"] >= self._max_permanent: # type: ignore[attr-defined] 145 | return True 146 | retry_state.counter["download_error"] += 1 # type: ignore[attr-defined] 147 | return retry_state.counter["download_error"] >= self._max_total # type: ignore[attr-defined] 148 | 149 | 150 | def _download_error(exc: BaseException) -> bool: 151 | return isinstance(exc, RequestError) and exc.status in {520, 521} 152 | 153 | 154 | def _undocumented_error(exc: BaseException) -> bool: 155 | return ( 156 | isinstance(exc, RequestError) 157 | and exc.status >= 500 158 | and exc.status not in {503, 520, 521} 159 | ) 160 | 161 | 162 | def _deprecated(message: str, callable: Callable) -> Callable: 163 | def wrapper(factory, retry_state: RetryCallState): 164 | warn(message, DeprecationWarning, stacklevel=3) 165 | return callable(retry_state=retry_state) 166 | 167 | return wrapper 168 | 169 | 170 | class RetryFactory: 171 | """Factory class that builds the :class:`tenacity.AsyncRetrying` object 172 | that defines the :ref:`default retry policy `. 
173 | 174 | To create a custom retry policy, you can subclass this factory class, 175 | modify it as needed, and then call :meth:`build` on your subclass to get 176 | the corresponding :class:`tenacity.AsyncRetrying` object. 177 | 178 | For example, to double the number of attempts for download errors and the 179 | time network errors are retried: 180 | 181 | .. code-block:: python 182 | 183 | from zyte_api import ( 184 | RetryFactory, 185 | stop_after_uninterrupted_delay, 186 | stop_on_download_error, 187 | ) 188 | 189 | 190 | class CustomRetryFactory(RetryFactory): 191 | network_error_stop = stop_after_uninterrupted_delay(30 * 60) 192 | download_error_stop = stop_on_download_error(max_total=8, max_permanent=4) 193 | 194 | 195 | CUSTOM_RETRY_POLICY = CustomRetryFactory().build() 196 | """ 197 | 198 | retry_condition: retry_base = ( 199 | retry_if_exception(_is_throttling_error) 200 | | retry_if_exception(_is_network_error) 201 | | retry_if_exception(_download_error) 202 | | retry_if_exception(_undocumented_error) 203 | ) 204 | # throttling 205 | throttling_wait = wait_chain( 206 | # always wait 20-40s first 207 | wait_fixed(20) + wait_random(0, 20), 208 | # wait 20-40s again 209 | wait_fixed(20) + wait_random(0, 20), 210 | # wait from 30 to 630s, with full jitter and exponentially 211 | # increasing max wait time 212 | wait_fixed(30) + wait_random_exponential(multiplier=1, max=600), 213 | ) 214 | 215 | # connection errors, other client and server failures 216 | network_error_stop = stop_after_uninterrupted_delay(15 * 60) 217 | network_error_wait = ( 218 | # wait from 3s to ~1m 219 | wait_random(3, 7) + wait_random_exponential(multiplier=1, max=55) 220 | ) 221 | 222 | download_error_stop = stop_on_download_error(max_total=4, max_permanent=2) 223 | download_error_wait = network_error_wait 224 | 225 | temporary_download_error_stop = _deprecated( 226 | ( 227 | "The zyte_api.RetryFactory.temporary_download_error_stop() method " 228 | "is deprecated and will be removed 
in a future version. Use " 229 | "download_error_stop() instead." 230 | ), 231 | download_error_stop, 232 | ) 233 | temporary_download_error_wait = _deprecated( 234 | ( 235 | "The zyte_api.RetryFactory.temporary_download_error_wait() method " 236 | "is deprecated and will be removed in a future version. Use " 237 | "download_error_wait() instead." 238 | ), 239 | download_error_wait, 240 | ) 241 | 242 | throttling_stop = stop_never 243 | 244 | undocumented_error_stop = stop_on_count(2) 245 | undocumented_error_wait = network_error_wait 246 | 247 | def wait(self, retry_state: RetryCallState) -> float: 248 | assert retry_state.outcome, "Unexpected empty outcome" 249 | exc = retry_state.outcome.exception() 250 | assert exc, "Unexpected empty exception" 251 | if _is_throttling_error(exc): 252 | return self.throttling_wait(retry_state=retry_state) 253 | if _is_network_error(exc): 254 | return self.network_error_wait(retry_state=retry_state) 255 | if _undocumented_error(exc): 256 | return self.undocumented_error_wait(retry_state=retry_state) 257 | assert _download_error(exc) # See retry_condition 258 | return self.download_error_wait(retry_state=retry_state) 259 | 260 | def stop(self, retry_state: RetryCallState) -> bool: 261 | assert retry_state.outcome, "Unexpected empty outcome" 262 | exc = retry_state.outcome.exception() 263 | assert exc, "Unexpected empty exception" 264 | if _is_throttling_error(exc): 265 | return self.throttling_stop(retry_state) 266 | if _is_network_error(exc): 267 | return self.network_error_stop(retry_state) 268 | if _undocumented_error(exc): 269 | return self.undocumented_error_stop(retry_state) 270 | assert _download_error(exc) # See retry_condition 271 | return self.download_error_stop(retry_state) 272 | 273 | def reraise(self) -> bool: 274 | return True 275 | 276 | def build(self) -> AsyncRetrying: 277 | return AsyncRetrying( 278 | wait=self.wait, 279 | retry=self.retry_condition, 280 | stop=self.stop, 281 | reraise=self.reraise(), 282 | 
before=before_log(logger, logging.DEBUG), 283 | after=after_log(logger, logging.DEBUG), 284 | before_sleep=before_sleep_log(logger, logging.DEBUG), 285 | ) 286 | 287 | 288 | zyte_api_retrying: AsyncRetrying = RetryFactory().build() 289 | 290 | 291 | class AggressiveRetryFactory(RetryFactory): 292 | """Factory class that builds the :class:`tenacity.AsyncRetrying` object 293 | that defines the :ref:`aggressive retry policy `. 294 | 295 | To create a custom retry policy, you can subclass this factory class, 296 | modify it as needed, and then call :meth:`build` on your subclass to get 297 | the corresponding :class:`tenacity.AsyncRetrying` object. 298 | 299 | For example, to double the maximum number of attempts for all error 300 | responses and double the time network errors are retried: 301 | 302 | .. code-block:: python 303 | 304 | from zyte_api import ( 305 | AggressiveRetryFactory, 306 | stop_after_uninterrupted_delay, 307 | stop_on_count, 308 | stop_on_download_error, 309 | ) 310 | 311 | 312 | class CustomRetryFactory(AggressiveRetryFactory): 313 | download_error_stop = stop_on_download_error(max_total=16, max_permanent=8) 314 | network_error_stop = stop_after_uninterrupted_delay(30 * 60) 315 | undocumented_error_stop = stop_on_count(8) 316 | 317 | 318 | CUSTOM_RETRY_POLICY = CustomRetryFactory().build() 319 | """ 320 | 321 | download_error_stop = stop_on_download_error(max_total=8, max_permanent=4) 322 | undocumented_error_stop = stop_on_count(4) 323 | 324 | 325 | aggressive_retrying = AggressiveRetryFactory().build() 326 | -------------------------------------------------------------------------------- /zyte_api/_sync.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | from typing import TYPE_CHECKING 5 | 6 | from ._async import AsyncZyteAPI 7 | from .constants import API_URL 8 | 9 | if TYPE_CHECKING: 10 | from collections.abc import Generator 11 | 12 | from 
aiohttp import ClientSession 13 | from tenacity import AsyncRetrying 14 | 15 | 16 | def _get_loop(): 17 | try: 18 | return asyncio.get_event_loop() 19 | except RuntimeError: # pragma: no cover (tests always have a running loop) 20 | loop = asyncio.new_event_loop() 21 | asyncio.set_event_loop(loop) 22 | return loop 23 | 24 | 25 | class _Session: 26 | def __init__(self, client, **session_kwargs): 27 | self._client = client 28 | 29 | # https://github.com/aio-libs/aiohttp/pull/1468 30 | async def create_session(): 31 | return client._async_client.session(**session_kwargs)._session 32 | 33 | loop = _get_loop() 34 | self._session = loop.run_until_complete(create_session()) 35 | 36 | def __enter__(self): 37 | return self 38 | 39 | def __exit__(self, *exc_info): 40 | loop = _get_loop() 41 | loop.run_until_complete(self._session.close()) 42 | 43 | def close(self): 44 | loop = _get_loop() 45 | loop.run_until_complete(self._session.close()) 46 | 47 | def get( 48 | self, 49 | query: dict, 50 | *, 51 | endpoint: str = "extract", 52 | handle_retries=True, 53 | retrying: AsyncRetrying | None = None, 54 | ): 55 | return self._client.get( 56 | query=query, 57 | endpoint=endpoint, 58 | handle_retries=handle_retries, 59 | retrying=retrying, 60 | session=self._session, 61 | ) 62 | 63 | def iter( 64 | self, 65 | queries: list[dict], 66 | *, 67 | endpoint: str = "extract", 68 | handle_retries=True, 69 | retrying: AsyncRetrying | None = None, 70 | ) -> Generator[dict | Exception, None, None]: 71 | return self._client.iter( 72 | queries=queries, 73 | endpoint=endpoint, 74 | session=self._session, 75 | handle_retries=handle_retries, 76 | retrying=retrying, 77 | ) 78 | 79 | 80 | class ZyteAPI: 81 | """:ref:`Synchronous Zyte API client `. 82 | 83 | *api_key* is your Zyte API key. If not specified, it is read from the 84 | ``ZYTE_API_KEY`` environment variable. See :ref:`api-key`. 85 | 86 | *api_url* is the Zyte API base URL. 
87 | 88 | *n_conn* is the maximum number of concurrent requests to use. See 89 | :ref:`api-optimize`. 90 | 91 | *retrying* is the retry policy for requests. Defaults to 92 | :data:`~zyte_api.zyte_api_retrying`. 93 | 94 | *user_agent* is the user agent string reported to Zyte API. Defaults to 95 | ``python-zyte-api/``. 96 | 97 | .. tip:: To change the ``User-Agent`` header sent to a target website, use 98 | :http:`request:customHttpRequestHeaders` instead. 99 | """ 100 | 101 | def __init__( 102 | self, 103 | *, 104 | api_key=None, 105 | api_url=API_URL, 106 | n_conn=15, 107 | retrying: AsyncRetrying | None = None, 108 | user_agent: str | None = None, 109 | ): 110 | self._async_client = AsyncZyteAPI( 111 | api_key=api_key, 112 | api_url=api_url, 113 | n_conn=n_conn, 114 | retrying=retrying, 115 | user_agent=user_agent, 116 | ) 117 | 118 | def get( 119 | self, 120 | query: dict, 121 | *, 122 | endpoint: str = "extract", 123 | session: ClientSession | None = None, 124 | handle_retries: bool = True, 125 | retrying: AsyncRetrying | None = None, 126 | ) -> dict: 127 | """Send *query* to Zyte API and return the result. 128 | 129 | *endpoint* is the Zyte API endpoint path relative to the client object 130 | *api_url*. 131 | 132 | *session* is the network session to use. Consider using 133 | :meth:`session` instead of this parameter. 134 | 135 | *handle_retries* determines whether or not a :ref:`retry policy 136 | ` should be used. 137 | 138 | *retrying* is the :ref:`retry policy ` to use, provided 139 | *handle_retries* is ``True``. If not specified, the :ref:`default retry 140 | policy ` is used. 
141 | """ 142 | loop = _get_loop() 143 | future = self._async_client.get( 144 | query=query, 145 | endpoint=endpoint, 146 | session=session, 147 | handle_retries=handle_retries, 148 | retrying=retrying, 149 | ) 150 | return loop.run_until_complete(future) 151 | 152 | def iter( 153 | self, 154 | queries: list[dict], 155 | *, 156 | endpoint: str = "extract", 157 | session: ClientSession | None = None, 158 | handle_retries: bool = True, 159 | retrying: AsyncRetrying | None = None, 160 | ) -> Generator[dict | Exception, None, None]: 161 | """Send multiple *queries* to Zyte API in parallel and iterate over 162 | their results as they come. 163 | 164 | The number of *queries* can exceed the *n_conn* parameter set on the 165 | client object. Extra queries will be queued, there will be only up to 166 | *n_conn* requests being processed in parallel at a time. 167 | 168 | Results may come in a different order from the original list of 169 | *queries*. You can use :http:`request:echoData` to attach metadata to 170 | queries, and later use that metadata to restore their original order. 171 | 172 | When exceptions occur, they are yielded, not raised. 173 | 174 | The remaining parameters work the same as in :meth:`get`. 175 | """ 176 | loop = _get_loop() 177 | for future in self._async_client.iter( 178 | queries=queries, 179 | endpoint=endpoint, 180 | session=session, 181 | handle_retries=handle_retries, 182 | retrying=retrying, 183 | ): 184 | try: 185 | yield loop.run_until_complete(future) 186 | except Exception as exception: 187 | yield exception 188 | 189 | def session(self, **kwargs): 190 | """:ref:`Context manager ` to create a session. 191 | 192 | A session is an object that has the same API as the client object, 193 | except: 194 | 195 | - :meth:`get` and :meth:`iter` do not have a *session* parameter, 196 | the session creates an :class:`aiohttp.ClientSession` object and 197 | passes it to :meth:`get` and :meth:`iter` automatically.
def deprecated_create_session(
    connection_pool_size=100, **kwargs
) -> aiohttp.ClientSession:
    """Deprecated entry point that forwards to :func:`create_session`.

    Emits a :class:`DeprecationWarning` (``stacklevel=2``, so it is
    attributed to the caller) and then returns the result of calling
    :func:`create_session` with the same arguments.
    """
    message = (
        "zyte_api.aio.client.create_session is deprecated, use "
        "ZyteAPI.session or AsyncZyteAPI.session instead."
    )
    warn(message, DeprecationWarning, stacklevel=2)
    return create_session(connection_pool_size=connection_pool_size, **kwargs)


def create_session(connection_pool_size=100, **kwargs) -> aiohttp.ClientSession:
    """Create a session with parameters suited for Zyte API.

    *kwargs* are passed to :class:`aiohttp.ClientSession`. Unless the caller
    provides them, ``timeout`` defaults to ``_AIO_API_TIMEOUT`` and
    ``connector`` defaults to a :class:`aiohttp.TCPConnector` limited to
    *connection_pool_size* connections with ``force_close=True``.
    """
    session_options = {"timeout": _AIO_API_TIMEOUT, **kwargs}
    # Only build a connector when none was supplied, to avoid creating one
    # that would never be used.
    if "connector" not in session_options:
        session_options["connector"] = TCPConnector(
            limit=connection_pool_size, force_close=True
        )
    return aiohttp.ClientSession(**session_options)
class NoApiKey(Exception):
    """Raised by :func:`get_apikey` when no API key can be determined."""


def get_apikey(key: str | None = None) -> str:
    """Return the API key to use.

    An explicit *key* (anything other than ``None``) is returned as is.
    Otherwise the key is read from the environment variable named by
    ``ENV_VARIABLE``.

    Raises :exc:`NoApiKey` if *key* is ``None`` and the environment variable
    is not set.
    """
    if key is None:
        try:
            key = os.environ[ENV_VARIABLE]
        except KeyError:
            # "from None" hides the KeyError from the reported traceback.
            raise NoApiKey(
                f"API key not found. Please set {ENV_VARIABLE} environment variable."
            ) from None
    return key
24 | parse_error: Optional[str] 25 | 26 | @classmethod 27 | def from_body(cls, response_body: bytes) -> ParsedError: 28 | """Return a :class:`ParsedError` object built out of the specified 29 | error response body.""" 30 | data = None 31 | parse_error = None 32 | 33 | if response_body: 34 | try: 35 | data = json.loads(response_body.decode("utf-8")) 36 | if not isinstance(data, dict): 37 | parse_error = "bad_format" 38 | data = None 39 | except (json.JSONDecodeError, UnicodeDecodeError) as _: 40 | parse_error = "bad_json" 41 | 42 | return cls(response_body=response_body, data=data, parse_error=parse_error) 43 | 44 | @property 45 | def type(self) -> Optional[str]: 46 | """ID of the error type, e.g. ``"/limits/over-user-limit"`` or 47 | ``"/download/temporary-error"``.""" 48 | return (self.data or {}).get("type", None) 49 | -------------------------------------------------------------------------------- /zyte_api/stats.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | import time 5 | from collections import Counter 6 | from typing import Optional 7 | 8 | import attr 9 | from runstats import Statistics 10 | 11 | from zyte_api.errors import ParsedError 12 | 13 | 14 | def zero_on_division_error(meth): 15 | @functools.wraps(meth) 16 | def wrapper(*args, **kwargs): 17 | try: 18 | return meth(*args, **kwargs) 19 | except ZeroDivisionError: 20 | return 0 21 | 22 | return wrapper 23 | 24 | 25 | class AggStats: 26 | def __init__(self): 27 | self.time_connect_stats = Statistics() 28 | self.time_total_stats = Statistics() 29 | 30 | self.n_success = 0 # number of successful results returned to the user 31 | self.n_fatal_errors = ( 32 | 0 # number of errors returned to the user, after all retries 33 | ) 34 | 35 | self.n_attempts = ( 36 | 0 # total amount of requests made to Zyte API, including retries 37 | ) 38 | self.n_429 = 0 # number of 429 (throttling) responses 39 | 
self.n_errors = 0 # number of errors, including errors which were retried 40 | 41 | self.status_codes = Counter() 42 | self.exception_types = Counter() 43 | self.api_error_types = Counter() 44 | 45 | def __str__(self): 46 | return ( 47 | f"conn:{self.time_connect_stats.mean():0.2f}s, " 48 | f"resp:{self.time_total_stats.mean():0.2f}s, " 49 | f"throttle:{self.throttle_ratio():.1%}, " 50 | f"err:{self.n_errors - self.n_fatal_errors}+{self.n_fatal_errors}({self.error_ratio():.1%}) | " 51 | f"success:{self.n_success}/{self.n_processed}({self.success_ratio():.1%})" 52 | ) 53 | 54 | def summary(self): 55 | return ( 56 | "\n" 57 | "Summary\n" 58 | "-------\n" 59 | f"Mean connection time: {self.time_connect_stats.mean():0.2f}\n" 60 | f"Mean response time: {self.time_total_stats.mean():0.2f}\n" 61 | f"Throttle ratio: {self.throttle_ratio():0.1%}\n" 62 | f"Attempts: {self.n_attempts}\n" 63 | f"Errors: {self.error_ratio():0.1%}, fatal: {self.n_fatal_errors}, non fatal: {self.n_errors - self.n_fatal_errors}\n" 64 | f"Successful URLs: {self.n_success} of {self.n_processed}\n" 65 | f"Success ratio: {self.success_ratio():0.1%}\n" 66 | ) 67 | 68 | @zero_on_division_error 69 | def throttle_ratio(self): 70 | return self.n_429 / self.n_attempts 71 | 72 | @zero_on_division_error 73 | def error_ratio(self): 74 | return self.n_errors / self.n_attempts 75 | 76 | @zero_on_division_error 77 | def success_ratio(self): 78 | return self.n_success / self.n_processed 79 | 80 | @property 81 | def n_processed(self): 82 | """Total number of processed URLs""" 83 | return self.n_success + self.n_fatal_errors 84 | 85 | 86 | @attr.s 87 | class ResponseStats: 88 | _start: float = attr.ib(repr=False) 89 | 90 | # Wait time, before this request is sent. Can be large in case of retries. 
91 | time_delayed: Optional[float] = attr.ib(default=None) 92 | 93 | # Time between sending a request and having a connection established 94 | time_connect: Optional[float] = attr.ib(default=None) 95 | 96 | # Time to read & decode the response 97 | time_read: Optional[float] = attr.ib(default=None) 98 | 99 | # time to get an exception (usually, a network error) 100 | time_exception: Optional[float] = attr.ib(default=None) 101 | 102 | # Total time to process the response, excluding the wait time caused 103 | # by retries. 104 | time_total: Optional[float] = attr.ib(default=None) 105 | 106 | # HTTP status code 107 | status: Optional[int] = attr.ib(default=None) 108 | 109 | # error (parsed), in case of error response 110 | error: Optional[ParsedError] = attr.ib(default=None) 111 | 112 | # exception raised 113 | exception: Optional[Exception] = attr.ib(default=None) 114 | 115 | @classmethod 116 | def create(cls, start_global): 117 | start = time.perf_counter() 118 | return cls( 119 | start=start, 120 | time_delayed=start - start_global, 121 | ) 122 | 123 | def record_connected(self, status: int, agg_stats: AggStats): 124 | self.status = status 125 | self.time_connect = time.perf_counter() - self._start 126 | agg_stats.time_connect_stats.push(self.time_connect) 127 | agg_stats.status_codes[self.status] += 1 128 | 129 | def record_read(self, agg_stats: AggStats | None = None): 130 | now = time.perf_counter() 131 | self.time_total = now - self._start 132 | self.time_read = self.time_total - (self.time_connect or 0) 133 | if agg_stats: 134 | agg_stats.time_total_stats.push(self.time_total) 135 | 136 | def record_exception(self, exception: Exception, agg_stats: AggStats): 137 | self.time_exception = time.perf_counter() - self._start 138 | self.exception = exception 139 | agg_stats.status_codes[0] += 1 140 | agg_stats.exception_types[exception.__class__] += 1 141 | 142 | def record_request_error(self, error_body: bytes, agg_stats: AggStats): 143 | self.error = 
ParsedError.from_body(error_body) 144 | 145 | if self.status == 429: # XXX: status must be set already! 146 | agg_stats.n_429 += 1 147 | else: 148 | agg_stats.n_errors += 1 149 | 150 | agg_stats.api_error_types[self.error.type] += 1 151 | -------------------------------------------------------------------------------- /zyte_api/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pathlib import Path 3 | 4 | from w3lib.url import safe_url_string 5 | 6 | from .__version__ import __version__ 7 | 8 | USER_AGENT = f"python-zyte-api/{__version__}" 9 | 10 | 11 | def _guess_intype(file_name, lines): 12 | extension = Path(file_name).suffix[1:] 13 | if extension in {"jl", "jsonl"}: 14 | return "jl" 15 | if extension == "txt": 16 | return "txt" 17 | 18 | if re.search(r"^\s*\{", lines[0]): 19 | return "jl" 20 | 21 | return "txt" 22 | 23 | 24 | def _process_query(query): 25 | """Given a query to be sent to Zyte API, return a functionally-equivalent 26 | query that fixes any known issue. 27 | 28 | Specifically, unsafe characters in the query URL are escaped to make sure 29 | they are safe not only for the end server, but also for Zyte API, which 30 | requires URLs compatible with RFC 2396. 31 | 32 | *query* is never modified in place, but the returned object is not 33 | guaranteed to be a copy of *query*: it could be *query* itself if no 34 | changes where needed, or a shallow copy of *query* with some common nested 35 | objects (e.g. shared ``actions`` list). 36 | """ 37 | url = query.get("url", None) 38 | if url is None: 39 | return query 40 | if not isinstance(url, str): 41 | raise ValueError(f"Expected a str URL parameter, got {type(url)}") 42 | safe_url = safe_url_string(url) 43 | if url == safe_url: 44 | return query 45 | return {**query, "url": safe_url} 46 | --------------------------------------------------------------------------------