├── .bumpversion.cfg ├── .codecov.yml ├── .github └── workflows │ ├── publish.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGES.rst ├── LICENSE ├── README.rst ├── docs ├── Makefile ├── _ext │ └── __init__.py ├── changes.rst ├── conf.py ├── customization │ ├── index.rst │ ├── pages.rst │ └── spiders.rst ├── features │ └── search.rst ├── index.rst ├── make.bat ├── reference │ ├── api.rst │ ├── reqmeta.rst │ └── settings.rst ├── requirements.txt ├── setup.rst └── templates │ ├── article.rst │ ├── e-commerce.rst │ ├── google-search.rst │ ├── index.rst │ └── job-posting.rst ├── pyproject.toml ├── requirements-dev.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── incremental │ ├── test_collection_fp_manager.py │ ├── test_incremental_manager.py │ └── test_middleware.py ├── mockserver.py ├── pages │ ├── __init__.py │ ├── test_article_navigation_heuristics.py │ └── test_product_navigation_heuristics.py ├── test_addon.py ├── test_article.py ├── test_base.py ├── test_ecommerce.py ├── test_feeds.py ├── test_heuristics.py ├── test_job_posting.py ├── test_middlewares.py ├── test_params.py ├── test_params_location_param.py ├── test_search.py ├── test_serp.py ├── test_utils.py └── utils.py ├── tox.ini ├── utils ├── google-gl-updater │ ├── requirements.in │ ├── requirements.txt │ ├── template.py │ └── update.py └── google-hl-updater │ ├── requirements.in │ ├── requirements.txt │ ├── template.py │ └── update.py └── zyte_spider_templates ├── __init__.py ├── _addon.py ├── _geolocations.py ├── _incremental ├── __init__.py ├── manager.py └── middleware.py ├── _lang_codes.py ├── documentation.py ├── feeds.py ├── heuristics.py ├── middlewares.py ├── page_objects ├── __init__.py └── product_navigation_heuristics.py ├── pages ├── __init__.py ├── article_heuristics.py ├── product_navigation_heuristics.py └── search_request_template.py ├── params.py ├── spiders ├── __init__.py ├── _google_domains.py ├── _google_gl.py ├── _google_hl.py ├── article.py ├── base.py ├── ecommerce.py ├── job_posting.py └── serp.py └── utils.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.12.0 3 | commit = True 4 | tag = True 5 | tag_name = {new_version} 6 | 7 | [bumpversion:file:setup.py] 8 | search = version="{current_version}" 9 | replace = version="{new_version}" 10 | 11 | [bumpversion:file:docs/conf.py] 12 | -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | comment: 2 | layout: "header, diff, tree" 3 | 4 | coverage: 5 | status: 6 | project: false 7 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: publish 5 | 6 | on: 7 | push: 8 | tags: 9 | - "[0-9]+.[0-9]+.[0-9]+" 10 | 11 | jobs: 12 | deploy: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.13' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip 
install --upgrade pip 24 | pip install build 25 | - name: Build package 26 | run: python -m build 27 | - name: Publish package 28 | if: startsWith(github.ref, 'refs/tags') 29 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 30 | with: 31 | user: __token__ 32 | password: ${{ secrets.PYPI_API_TOKEN }} 33 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: tox 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | 11 | jobs: 12 | test: 13 | 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | include: 19 | - python-version: '3.9' 20 | toxenv: min 21 | - python-version: '3.9' 22 | - python-version: '3.10' 23 | - python-version: '3.11' 24 | - python-version: '3.12' 25 | - python-version: '3.13' 26 | 27 | steps: 28 | - uses: actions/checkout@v4 29 | - name: Set up Python ${{ matrix.python-version }} 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip 36 | python -m pip install tox 37 | - name: tox 38 | run: | 39 | tox -e ${{ matrix.toxenv || 'py' }} 40 | - name: coverage 41 | if: ${{ success() }} 42 | uses: codecov/codecov-action@v4.0.1 43 | with: 44 | token: ${{ secrets.CODECOV_TOKEN }} 45 | 46 | check: 47 | runs-on: ubuntu-latest 48 | strategy: 49 | fail-fast: false 50 | matrix: 51 | python-version: ["3.13"] 52 | tox-job: ["mypy", "linters", "twine", "docs"] 53 | 54 | steps: 55 | - uses: actions/checkout@v4 56 | - name: Set up Python ${{ matrix.python-version }} 57 | uses: actions/setup-python@v5 58 | with: 59 | python-version: ${{ matrix.python-version }} 60 | - name: Install dependencies 61 | run: | 62 | python -m pip install --upgrade pip 63 | python -m pip install tox 64 | - name: tox 65 | run: | 66 | tox -e ${{ matrix.tox-job }} 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .coverage 2 | .mypy_cache/ 3 | .tox/ 4 | dist/ 5 | htmlcov/ 6 | coverage.xml 7 | docs/_build 8 | *.egg-info/ 9 | __pycache__/ 10 | coverage-html/ 11 | build/ 12 | .idea/ 13 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | - repo: https://github.com/psf/black 7 | rev: 23.10.1 8 | hooks: 9 | - id: black 10 | - repo: https://github.com/pycqa/flake8 11 | rev: 6.1.0 12 | hooks: 13 | - id: flake8 14 | - repo: https://github.com/adamchainz/blacken-docs 15 | rev: 1.16.0 16 | hooks: 17 | - id: blacken-docs 18 | additional_dependencies: 19 | - black==23.10.1 20 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: all 3 | sphinx: 4 | configuration: docs/conf.py 5 | build: 6 | os: ubuntu-22.04 7 | tools: 8 | python: "3.12" 
# Keep in sync with .github/workflows/test.yml 9 | python: 10 | install: 11 | - requirements: docs/requirements.txt 12 | - path: . 13 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | Changes 2 | ======= 3 | 4 | 0.12.0 (2025-03-31) 5 | ------------------- 6 | 7 | * :ref:`Search queries ` support is added to the :ref:`job 8 | posting spider template `. 9 | * Fixed support for POST requests in search queries. 10 | * Improved validation in the :ref:`Google search spider template 11 | `. 12 | 13 | 0.11.2 (2024-12-30) 14 | ------------------- 15 | 16 | * Do not log warning about disabled components. 17 | 18 | 0.11.1 (2024-12-26) 19 | ------------------- 20 | 21 | * The :ref:`e-commerce ` and :ref:`job posting ` 22 | spider templates no longer ignore item requests for a different domain. 23 | 24 | 0.11.0 (2024-12-16) 25 | ------------------- 26 | 27 | * New :ref:`Articles spider template
`, built on top of 28 | Zyte API’s :http:`request:article` and :http:`request:articleNavigation`. 29 | 30 | * New :ref:`Job Posting spider template `, built on top of 31 | Zyte API’s :http:`request:jobPosting` and :http:`request:jobPostingNavigation`. 32 | 33 | * :ref:`Search queries ` support is added to the 34 | :ref:`e-commerce spider template `. 35 | This allows to provide a list of search queries to the 36 | spider; the spider finds a search form on the target webpage, and submits all the queries. 37 | 38 | * ProductList extraction support is added to the 39 | :ref:`e-commerce spider template `. This allows spiders to 40 | extract basic product information without going into product detail pages. 41 | 42 | * New features are added to the :ref:`Google Search spider template `: 43 | 44 | * An option to follow the result links and extract data 45 | from the target pages (via the ``extract`` argument) 46 | * Content Languages (lr) parameter 47 | * Content Countries (cr) parameter 48 | * User Country (gl) parameter 49 | * User Language (hl) parameter 50 | * results_per_page parameter 51 | 52 | * Added a Scrapy add-on. This allows to greatly simplify the initial 53 | zyte-spider-templates configuration. 54 | 55 | * Bug fix: incorrectly extracted URLs no longer make spiders drop 56 | other requests. 57 | 58 | * Cleaned up the CI; improved the testing suite; cleaned up the documentation. 59 | 60 | 0.10.0 (2024-11-22) 61 | ------------------- 62 | 63 | * Dropped Python 3.8 support, added Python 3.13 support. 64 | 65 | * Increased the minimum required versions of some dependencies: 66 | 67 | * ``pydantic``: ``2`` → ``2.1`` 68 | 69 | * ``scrapy-poet``: ``0.21.0`` → ``0.24.0`` 70 | 71 | * ``scrapy-spider-metadata``: ``0.1.2`` → ``0.2.0`` 72 | 73 | * ``scrapy-zyte-api[provider]``: ``0.16.0`` → ``0.23.0`` 74 | 75 | * ``zyte-common-items``: ``0.22.0`` → ``0.23.0`` 76 | 77 | * Added :ref:`custom attributes ` support to the 78 | :ref:`e-commerce spider template ` through its new 79 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.custom_attrs_input` 80 | and 81 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.custom_attrs_method` 82 | parameters. 83 | 84 | * The 85 | :class:`~zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams.max_pages` 86 | parameter of the :ref:`Google Search spider template ` can no 87 | longer be 0 or lower. 88 | 89 | * The :ref:`Google Search spider template ` now follows 90 | pagination for the results of each query page by page, instead of sending a 91 | request for every page in parallel. It stops once it reaches a page without 92 | organic results. 93 | 94 | * Improved the description of 95 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy` 96 | values. 97 | 98 | * Fixed type hint issues related to Scrapy. 99 | 100 | 101 | 0.9.0 (2024-09-17) 102 | ------------------ 103 | 104 | * Now requires ``zyte-common-items >= 0.22.0``. 105 | 106 | * New :ref:`Google Search spider template `, built on top of 107 | Zyte API’s :http:`request:serp`. 108 | 109 | * The heuristics of the :ref:`e-commerce spider template ` to 110 | ignore certain URLs when following category links now also handles 111 | subdomains. For example, before https://example.com/blog was ignored, now 112 | https://blog.example.com is also ignored. 
113 | 114 | * In the :ref:`spider parameters JSON schema `, the 115 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.crawl_strategy` 116 | parameter of the :ref:`e-commerce spider template ` switches 117 | position, from being the last parameter to being between 118 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls_file` 119 | and 120 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.geolocation`. 121 | 122 | * Removed the ``valid_page_types`` attribute of 123 | :class:`zyte_spider_templates.middlewares.CrawlingLogsMiddleware`. 124 | 125 | 126 | 0.8.0 (2024-08-21) 127 | ------------------ 128 | 129 | * Added new input parameters: 130 | 131 | * ``urls`` accepts a newline-delimited list of URLs. 132 | 133 | * ``urls_file`` accepts a URL that points to a plain-text file with a 134 | newline-delimited list of URLs. 135 | 136 | Only one of ``url``, ``urls`` and ``urls_file`` should be used at a time. 137 | 138 | * Added new crawling strategies: 139 | 140 | * ``automatic`` - uses heuristics to see if an input URL is a homepage, for 141 | which it uses a modified ``full`` strategy where other links are discovered 142 | only in the homepage. Otherwise, it assumes it's a navigation page and uses 143 | the existing ``navigation`` strategy. 144 | 145 | * ``direct_item`` - input URLs are directly extracted as products. 146 | 147 | * Added new parameters classes: ``LocationParam`` and ``PostalAddress``. Note 148 | that these are available for use when customizing the templates and are not 149 | currently being utilized by any template. 150 | 151 | * Backward incompatible changes: 152 | 153 | * ``automatic`` becomes the new default crawling strategy instead of ``full``. 154 | 155 | * CI test improvements. 156 | 157 | 158 | 0.7.2 (2024-05-07) 159 | ------------------ 160 | 161 | * Implemented :ref:`mixin classes for spider parameters `, to 162 | improve reuse. 163 | 164 | * Improved docs, providing an example about overriding existing parameters when 165 | :ref:`customizing parameters `, and featuring 166 | :class:`~web_poet.AnyResponse` in the :ref:`example about overriding parsing 167 | `. 168 | 169 | 170 | 0.7.1 (2024-02-22) 171 | ------------------ 172 | 173 | * The 174 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.crawl_strategy` 175 | parameter of 176 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpider` 177 | now defaults to 178 | :attr:`~zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy.full` 179 | instead of 180 | :attr:`~zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy.navigation`. 181 | We also reworded some descriptions of :enum:`~.EcommerceCrawlStrategy` values 182 | for clarification. 183 | 184 | 0.7.0 (2024-02-09) 185 | ------------------ 186 | 187 | * Updated requirement versions: 188 | 189 | * :doc:`scrapy-poet ` >= 0.21.0 190 | * :doc:`scrapy-zyte-api ` >= 0.16.0 191 | 192 | * With the updated dependencies above, this fixes the issue of having 2 separate 193 | Zyte API Requests (*productNavigation* and *httpResponseBody*) for the same URL. Note 194 | that this issue only occurs when requesting product navigation pages. 195 | 196 | * Moved :class:`zyte_spider_templates.spiders.ecommerce.ExtractFrom` into 197 | :class:`zyte_spider_templates.spiders.base.ExtractFrom`. 198 | 199 | 200 | 0.6.1 (2024-02-02) 201 | ------------------ 202 | 203 | * Improved the :attr:`zyte_spider_templates.spiders.base.BaseSpiderParams.url` 204 | description. 
205 | 206 | 0.6.0 (2024-01-31) 207 | ------------------ 208 | 209 | * Fixed the ``extract_from`` spider parameter that wasn't working. 210 | 211 | * The *"www."* prefix is now removed when setting the spider's 212 | :attr:`~scrapy.Spider.allowed_domains`. 213 | 214 | * The :attr:`zyte_common_items.ProductNavigation.nextPage` link won't be crawled 215 | if :attr:`zyte_common_items.ProductNavigation.items` is empty. 216 | 217 | * :class:`zyte_common_items.Product` items that are dropped due to low probability 218 | *(below 0.1)* are now logged in stats: ``drop_item/product/low_probability``. 219 | 220 | * :class:`zyte_spider_templates.pages.HeuristicsProductNavigationPage` now 221 | inherits from :class:`zyte_common_items.AutoProductNavigationPage` instead of 222 | :class:`zyte_common_items.BaseProductNavigationPage`. 223 | 224 | * Moved e-commerce code from :class:`zyte_spider_templates.spiders.base.BaseSpider` 225 | to :class:`zyte_spider_templates.spiders.ecommerce.EcommerceSpider`. 226 | 227 | * Documentation improvements. 228 | 229 | 0.5.0 (2023-12-18) 230 | ------------------ 231 | 232 | * The ``zyte_spider_templates.page_objects`` module is now deprecated in favor 233 | of ``zyte_spider_templates.pages``, in line with ``web_poet.pages``. 234 | 235 | 0.4.0 (2023-12-14) 236 | ------------------ 237 | 238 | * Products outside of the target domain can now be crawled using 239 | :class:`zyte_spider_templates.middlewares.AllowOffsiteMiddleware`. 240 | 241 | * Updated the documentation to also set up ``zyte_common_items.ZyteItemAdapter``. 242 | 243 | * The ``max_requests`` spider parameter has now a default value of 100. Previously, 244 | it was ``None`` which was unlimited. 245 | 246 | * Improved the description of the ``max_requests`` spider parameter. 247 | 248 | * Official support for Python 3.12. 249 | 250 | * Misc documentation improvements. 251 | 252 | 0.3.0 (2023-11-03) 253 | ------------------ 254 | 255 | * Added documentation. 256 | 257 | * Added a middleware that logs information about the crawl in JSON format, 258 | :class:`zyte_spider_templates.middlewares.CrawlingLogsMiddleware`. This 259 | replaces the old crawling information that was difficult to parse using 260 | regular expressions. 261 | 262 | 0.2.0 (2023-10-30) 263 | ------------------ 264 | 265 | * Now requires ``zyte-common-items >= 0.12.0``. 266 | 267 | * Added a new crawl strategy, "Pagination Only". 268 | 269 | * Improved the request priority calculation based on the metadata probability 270 | value. 271 | 272 | * CI improvements. 273 | 274 | 275 | 0.1.0 (2023-10-24) 276 | ------------------ 277 | 278 | Initial release. 279 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Zyte Group Ltd 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. 
Neither the name of Zyte nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | zyte-spider-templates 3 | ===================== 4 | 5 | .. image:: https://img.shields.io/pypi/v/zyte-spider-templates.svg 6 | :target: https://pypi.python.org/pypi/zyte-spider-templates 7 | :alt: PyPI Version 8 | 9 | .. image:: https://img.shields.io/pypi/pyversions/zyte-spider-templates.svg 10 | :target: https://pypi.python.org/pypi/zyte-spider-templates 11 | :alt: Supported Python Versions 12 | 13 | .. image:: https://github.com/zytedata/zyte-spider-templates/actions/workflows/test.yml/badge.svg 14 | :target: https://github.com/zytedata/zyte-spider-templates/actions/workflows/test.yml 15 | :alt: Automated tests 16 | 17 | .. image:: https://codecov.io/github/zytedata/zyte-spider-templates/coverage.svg?branch=main 18 | :target: https://codecov.io/gh/zytedata/zyte-spider-templates 19 | :alt: Coverage report 20 | 21 | 22 | .. description starts 23 | 24 | Spider templates for automatic crawlers. 25 | 26 | This library contains Scrapy_ spider templates. They can be used out of the box 27 | with the Zyte features such as `Zyte API`_ or modified to be used standalone. 28 | There is a `sample Scrapy project`_ for this library that you can use as a 29 | starting point for your own projects. 30 | 31 | .. _Scrapy: https://docs.scrapy.org/ 32 | .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html 33 | .. _sample Scrapy project: https://github.com/zytedata/zyte-spider-templates-project 34 | 35 | .. description ends 36 | 37 | * Documentation: https://zyte-spider-templates.readthedocs.io/en/latest/ 38 | * License: BSD 3-clause 39 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_ext/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from docutils import nodes 4 | from docutils.parsers.rst.roles import set_classes 5 | 6 | 7 | def http_api_reference_role( 8 | name, rawtext, text, lineno, inliner, options={}, content=[] 9 | ): 10 | match = re.search( 11 | r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text 12 | ) 13 | if match: 14 | display_text = match[1] 15 | reference = match[2] 16 | else: 17 | display_text = None 18 | reference = text 19 | if reference.startswith("request:"): 20 | request_or_response = "request" 21 | elif reference.startswith("response:"): 22 | request_or_response = "response/200" 23 | else: 24 | raise ValueError( 25 | f":http: directive reference must start with request: or " 26 | f"response:, got {reference} from {text!r}." 27 | ) 28 | 29 | field = reference.split(":", maxsplit=1)[1] 30 | if not display_text: 31 | display_text = field 32 | refuri = ( 33 | f"https://docs.zyte.com/zyte-api/usage/reference.html" 34 | f"#operation/extract/{request_or_response}/{field}" 35 | ) 36 | set_classes(options) 37 | node = nodes.reference(rawtext, display_text, refuri=refuri, **options) 38 | return [node], [] 39 | 40 | 41 | def setup(app): 42 | app.add_role("http", http_api_reference_role) 43 | # https://stackoverflow.com/a/13663325 44 | # 45 | # Scrapy’s 46 | # https://github.com/scrapy/scrapy/blob/dba37674e6eaa6c2030c8eb35ebf8127cd488062/docs/_ext/scrapydocs.py#L90C16-L110C6 47 | app.add_crossref_type( 48 | directivename="setting", 49 | rolename="setting", 50 | indextemplate="pair: %s; setting", 51 | ) 52 | app.add_crossref_type( 53 | directivename="signal", 54 | rolename="signal", 55 | indextemplate="pair: %s; signal", 56 | ) 57 | app.add_crossref_type( 58 | directivename="command", 59 | rolename="command", 60 | indextemplate="pair: %s; command", 61 | ) 62 | app.add_crossref_type( 63 | directivename="reqmeta", 64 | rolename="reqmeta", 65 | indextemplate="pair: %s; reqmeta", 66 | ) 67 | -------------------------------------------------------------------------------- /docs/changes.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CHANGES.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | project = "zyte-spider-templates" 5 | copyright = "2023, Zyte Group Ltd" 6 | author = "Zyte Group Ltd" 7 | release = "0.12.0" 8 | 9 | sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext 10 | extensions = [ 11 | "_ext", 12 | "enum_tools.autoenum", 13 | "sphinx.ext.autodoc", 14 | "sphinx.ext.intersphinx", 15 | "sphinx.ext.viewcode", 16 | "sphinx_reredirects", 17 | "sphinxcontrib.autodoc_pydantic", 18 | ] 19 | 20 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 21 | 22 | html_theme = "sphinx_rtd_theme" 23 | 24 | intersphinx_mapping = { 25 | "form2request": ( 26 | "https://form2request.readthedocs.io/en/latest", 27 | None, 28 | ), 29 | "formasaurus": ( 30 | "https://formasaurus.readthedocs.io/en/latest", 31 | None, 32 | ), 33 | "python": ( 34 | "https://docs.python.org/3", 35 | None, 36 | ), 37 | "scrapy": ( 38 | "https://docs.scrapy.org/en/latest", 39 | None, 40 | ), 41 | "scrapy-poet": ( 42 | "https://scrapy-poet.readthedocs.io/en/stable", 43 | None, 44 | ), 45 | "scrapy-spider-metadata": ( 46 | "https://scrapy-spider-metadata.readthedocs.io/en/latest", 47 | None, 48 | ), 49 | "scrapy-zyte-api": ( 50 | "https://scrapy-zyte-api.readthedocs.io/en/stable", 51 | None, 52 | ), 53 | "web-poet": ( 54 | "https://web-poet.readthedocs.io/en/stable", 55 | None, 56 | ), 57 | "zyte": ( 58 | "https://docs.zyte.com", 59 | None, 60 | ), 61 | "zyte-common-items": ( 62 | "https://zyte-common-items.readthedocs.io/en/latest", 63 | None, 64 | ), 65 | } 66 | 67 | autodoc_pydantic_model_show_config_summary = False 68 | autodoc_pydantic_model_show_field_summary = False 69 | autodoc_pydantic_model_show_json = False 70 | autodoc_pydantic_model_show_validator_members = False 71 | autodoc_pydantic_model_show_validator_summary = False 72 | autodoc_pydantic_field_list_validators = False 73 | autodoc_pydantic_field_show_constraints = False 74 | 75 | # sphinx-reredirects 76 | redirects = { 77 | "customization/page-objects": "pages.html", 78 | } 79 | 80 | # workaround for https://github.com/pydantic/pydantic/discussions/7763 81 | import zyte_spider_templates.spiders.job_posting # noqa: F401, E402 82 | -------------------------------------------------------------------------------- /docs/customization/index.rst: -------------------------------------------------------------------------------- 1 | .. _customization: 2 | 3 | ============= 4 | Customization 5 | ============= 6 | 7 | :ref:`Built-in spider templates ` can be highly customized: 8 | 9 | - :ref:`Subclass spider templates ` to customize metadata, 10 | parameters, and crawling logic. 11 | 12 | - :ref:`Implement page objects ` to override parsing 13 | logic for all or some websites, both for navigation and item detail data. 14 | -------------------------------------------------------------------------------- /docs/customization/pages.rst: -------------------------------------------------------------------------------- 1 | .. _custom-page-objects: 2 | 3 | ======================== 4 | Customizing page objects 5 | ======================== 6 | 7 | All parsing is implemented using :ref:`web-poet page objects ` 8 | that use `Zyte API automatic extraction`_ to extract :ref:`standard items 9 | `: for navigation, for item details, and even for :ref:`search 10 | request generation `. 11 | 12 | .. 
_Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html 13 | 14 | You can implement your own page object classes to override how extraction works 15 | for any given combination of URL and item type. 16 | 17 | .. tip:: Make sure the import path of your page objects module is in the 18 | :ref:`SCRAPY_POET_DISCOVER ` setting, otherwise your 19 | page objects might be ignored. 20 | 21 | .. _configured scrapy-poet: https://scrapy-poet.readthedocs.io/en/stable/intro/install.html#configuring-the-project 22 | 23 | .. _override-parsing: 24 | 25 | Overriding parsing 26 | ================== 27 | 28 | To change or fix how a given field is extracted, overriding the value from 29 | `Zyte API automatic extraction`_, create a page object class, configured to run 30 | on some given URLs (:func:`web_poet.handle_urls`), that defines the logic to 31 | extract that field. For example: 32 | 33 | .. code-block:: python 34 | :caption: pages/books_toscrape_com.py 35 | 36 | import attrs 37 | from number_parser import parse_number 38 | from web_poet import AnyResponse, field, handle_urls 39 | from zyte_common_items import AggregateRating, AutoProductPage 40 | 41 | 42 | @handle_urls("books.toscrape.com") 43 | @attrs.define 44 | class BooksToScrapeComProductPage(AutoProductPage): 45 | response: AnyResponse 46 | 47 | @field 48 | async def aggregateRating(self): 49 | element_class = self.response.css(".star-rating::attr(class)").get() 50 | if not element_class: 51 | return None 52 | rating_str = element_class.split(" ")[-1] 53 | rating = parse_number(rating_str) 54 | if not rating: 55 | return None 56 | return AggregateRating(ratingValue=rating, bestRating=5) 57 | 58 | ``AutoProductPage`` and other page objects from `zyte-common-items`_ 59 | prefixed with ``Auto`` define fields for all standard items that return 60 | the value from `Zyte API automatic extraction`_, so that you only need 61 | to define your new field. 62 | 63 | .. _zyte-common-items: https://zyte-common-items.readthedocs.io/en/latest/ 64 | 65 | The page object above is decorated with ``@attrs.define`` so that it can 66 | declare a dependency on :class:`~web_poet.page_inputs.response.AnyResponse` and 67 | use that to implement custom parsing logic. You could alternatively use 68 | :class:`~web_poet.page_inputs.browser.BrowserHtml` if needed. 69 | 70 | 71 | .. _add-field: 72 | 73 | Parsing a new field 74 | =================== 75 | 76 | To extract a new field for one or more websites: 77 | 78 | #. Declare a new item type that extends a :ref:`standard item ` with 79 | your new field. For example: 80 | 81 | .. code-block:: python 82 | :caption: items.py 83 | 84 | from typing import Optional 85 | 86 | import attrs 87 | from zyte_common_items import Product 88 | 89 | 90 | @attrs.define 91 | class CustomProduct(Product): 92 | stock: Optional[int] 93 | 94 | #. Create a page object class, configured to run for your new item type 95 | (:class:`web_poet.pages.Returns`) on some given URLs 96 | (:func:`web_poet.handle_urls`), that defines the logic to extract your new 97 | field. For example: 98 | 99 | .. 
code-block:: python 100 | :caption: pages/books_toscrape_com.py 101 | 102 | import re 103 | 104 | from web_poet import Returns, field, handle_urls 105 | from zyte_common_items import AutoProductPage 106 | 107 | from ..items import CustomProduct 108 | 109 | 110 | @handle_urls("books.toscrape.com") 111 | class BookPage(AutoProductPage, Returns[CustomProduct]): 112 | @field 113 | async def stock(self): 114 | for entry in await self.additionalProperties: 115 | if entry.name == "availability": 116 | match = re.search(r"\d([.,\s]*\d+)*(?=\s+available\b)", entry.value) 117 | if not match: 118 | return None 119 | stock_str = re.sub(r"[.,\s]", "", match[0]) 120 | return int(stock_str) 121 | return None 122 | 123 | #. Create a spider template subclass that requests your new item type instead 124 | of the standard one. For example: 125 | 126 | .. code-block:: python 127 | :caption: spiders/books_toscrape_com.py 128 | 129 | from scrapy_poet import DummyResponse 130 | from zyte_spider_templates import EcommerceSpider 131 | 132 | from ..items import CustomProduct 133 | 134 | 135 | class BooksToScrapeComSpider(EcommerceSpider): 136 | name = "books_toscrape_com" 137 | metadata = { 138 | **EcommerceSpider.metadata, 139 | "title": "Books to Scrape", 140 | "description": "Spider template for books.toscrape.com", 141 | } 142 | 143 | def parse_product(self, response: DummyResponse, product: CustomProduct): 144 | yield from super().parse_product(response, product) 145 | 146 | .. _fix-search: 147 | 148 | Fixing search support 149 | ===================== 150 | 151 | If the default implementation to build a request out of :ref:`search queries 152 | ` does not work on a given website, you can implement your 153 | own search request page object to fix that. See 154 | :ref:`custom-request-template-page`. 155 | 156 | For example: 157 | 158 | .. code-block:: python 159 | 160 | from web_poet import handle_urls 161 | from zyte_common_items import BaseSearchRequestTemplatePage 162 | 163 | 164 | @handle_urls("example.com") 165 | class ExampleComSearchRequestTemplatePage(BaseSearchRequestTemplatePage): 166 | @field 167 | def url(self): 168 | return "https://example.com/search?q={{ query|quote_plus }}" 169 | -------------------------------------------------------------------------------- /docs/customization/spiders.rst: -------------------------------------------------------------------------------- 1 | .. _custom-spiders: 2 | 3 | ============================ 4 | Customizing spider templates 5 | ============================ 6 | 7 | Subclass a :ref:`built-in spider template ` to customize its 8 | :ref:`metadata `, :ref:`parameters `, and 9 | :ref:`crawling logic `. 10 | 11 | .. _custom-metadata: 12 | 13 | Customizing metadata 14 | ==================== 15 | 16 | Spider template metadata is defined using `scrapy-spider-metadata`_, and can be 17 | `redefined or customized in a subclass`_. 18 | 19 | For example, to keep the upstream ``title`` but change the ``description``: 20 | 21 | .. _redefined or customized in a subclass: https://scrapy-spider-metadata.readthedocs.io/en/latest/metadata.html#defining-spider-metadata 22 | 23 | .. code-block:: python 24 | 25 | from zyte_spider_templates import EcommerceSpider 26 | 27 | 28 | class MySpider(EcommerceSpider): 29 | name = "my_spider" 30 | metadata = { 31 | **EcommerceSpider.metadata, 32 | "description": "Custom e-commerce spider template.", 33 | } 34 | 35 | 36 | .. 
_custom-params: 37 | 38 | Customizing parameters 39 | ====================== 40 | 41 | Spider template parameters are also defined using `scrapy-spider-metadata`_, 42 | and can be `redefined or customized in a subclass as well`_. 43 | 44 | For example, to add a ``min_price`` parameter and filter out products with a 45 | lower price: 46 | 47 | .. _redefined or customized in a subclass as well: https://scrapy-spider-metadata.readthedocs.io/en/latest/params.html 48 | 49 | .. code-block:: python 50 | 51 | from decimal import Decimal 52 | from typing import Iterable 53 | 54 | from scrapy_poet import DummyResponse 55 | from scrapy_spider_metadata import Args 56 | from zyte_common_items import Product 57 | from zyte_spider_templates import EcommerceSpider 58 | from zyte_spider_templates.spiders.ecommerce import EcommerceSpiderParams 59 | 60 | 61 | class MyParams(EcommerceSpiderParams): 62 | min_price: str = "0.00" 63 | 64 | 65 | class MySpider(EcommerceSpider, Args[MyParams]): 66 | name = "my_spider" 67 | 68 | def parse_product( 69 | self, response: DummyResponse, product: Product 70 | ) -> Iterable[Product]: 71 | for product in super().parse_product(response, product): 72 | if Decimal(product.price) >= Decimal(self.args.min_price): 73 | yield product 74 | 75 | 76 | You can also override existing parameters. For example, to hard-code the start 77 | URL: 78 | 79 | .. code-block:: python 80 | 81 | from scrapy_spider_metadata import Args 82 | from zyte_spider_templates import EcommerceSpider 83 | from zyte_spider_templates.spiders.ecommerce import EcommerceSpiderParams 84 | 85 | 86 | class MyParams(EcommerceSpiderParams): 87 | url: str = "https://books.toscrape.com" 88 | 89 | 90 | class MySpider(EcommerceSpider, Args[MyParams]): 91 | name = "my_spider" 92 | 93 | A mixin class exists for every spider parameter (see :ref:`parameter-mixins`), 94 | so you can use any combination of them in any order you like in your custom 95 | classes, while enjoying future improvements to validation, documentation or 96 | UI integration for Scrapy Cloud: 97 | 98 | .. code-block:: python 99 | 100 | from scrapy_spider_metadata import Args 101 | from zyte_spider_templates.params import GeolocationParam, UrlParam 102 | 103 | 104 | class MyParams(GeolocationParam, UrlParam): 105 | pass 106 | 107 | 108 | class MySpider(Args[MyParams]): 109 | name = "my_spider" 110 | 111 | 112 | .. _custom-crawl: 113 | 114 | Customizing the crawling logic 115 | ============================== 116 | 117 | The crawling logic of spider templates can be customized as any other 118 | :ref:`Scrapy spider `. 119 | 120 | For example, you can make a spider that expects a product details URL and does 121 | not follow navigation at all: 122 | 123 | .. code-block:: python 124 | 125 | from typing import Iterable 126 | 127 | from scrapy import Request 128 | from zyte_spider_templates import EcommerceSpider 129 | 130 | 131 | class MySpider(EcommerceSpider): 132 | name = "my_spider" 133 | 134 | def start_requests(self) -> Iterable[Request]: 135 | for request in super().start_requests(): 136 | yield request.replace(callback=self.parse_product) 137 | 138 | All parsing logic is implemented separately in :ref:`page objects 139 | `, making it easier to read the code of :ref:`built-in 140 | spider templates ` to modify them as desired. 141 | 142 | .. 
_scrapy-spider-metadata: https://scrapy-spider-metadata.readthedocs.io/en/latest 143 | -------------------------------------------------------------------------------- /docs/features/search.rst: -------------------------------------------------------------------------------- 1 | .. _search-queries: 2 | 3 | ============== 4 | Search queries 5 | ============== 6 | 7 | The :ref:`e-commerce spider template <e-commerce>` supports a spider argument, 8 | :data:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.search_queries`, 9 | that allows you to define a different search query per line, and 10 | turns the input URLs into search requests for those queries. 11 | 12 | For example, given the following input URLs: 13 | 14 | .. code-block:: none 15 | 16 | https://a.example 17 | https://b.example 18 | 19 | And the following list of search queries: 20 | 21 | .. code-block:: none 22 | 23 | foo bar 24 | baz 25 | 26 | By default, the spider would send 2 initial requests to those 2 input URLs, 27 | to try and find out how to build a search request for them, and if it succeeds, 28 | it will then send 4 search requests, 1 per combination of input URL and search 29 | query. For example: 30 | 31 | .. code-block:: none 32 | 33 | https://a.example/search?q=foo+bar 34 | https://a.example/search?q=baz 35 | https://b.example/s/foo%20bar 36 | https://b.example/s/baz 37 | 38 | The default implementation uses a combination of HTML metadata, AI-based HTML 39 | form inspection and heuristics to find the most likely way to build a search 40 | request for a given website. 41 | 42 | If this default implementation does not work as expected on a given website, 43 | you can :ref:`write a page object to fix that <fix-search>`. 44 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | =================================== 2 | zyte-spider-templates documentation 3 | =================================== 4 | 5 | .. include:: ../README.rst 6 | :start-after: .. description starts 7 | :end-before: .. description ends 8 | 9 | .. toctree:: 10 | :caption: First steps 11 | :hidden: 12 | 13 | setup 14 | 15 | .. toctree:: 16 | :caption: Templates 17 | :hidden: 18 | 19 | templates/index 20 | E-commerce <templates/e-commerce> 21 | Article <templates/article> 22 | Google search <templates/google-search> 23 | Job posting <templates/job-posting> 24 | 25 | .. toctree:: 26 | :caption: Features 27 | :hidden: 28 | 29 | Search queries <features/search> 30 | 31 | .. toctree:: 32 | :caption: Customization 33 | :hidden: 34 | 35 | customization/index 36 | customization/spiders 37 | customization/pages 38 | 39 | .. toctree:: 40 | :caption: Reference 41 | :hidden: 42 | 43 | reference/settings 44 | reference/reqmeta 45 | reference/api 46 | 47 | .. toctree:: 48 | :caption: All the rest 49 | :hidden: 50 | 51 | changes 52 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH.
20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/reference/api.rst: -------------------------------------------------------------------------------- 1 | === 2 | API 3 | === 4 | 5 | Spiders 6 | ======= 7 | 8 | .. autoclass:: zyte_spider_templates.ArticleSpider 9 | 10 | .. autoclass:: zyte_spider_templates.BaseSpider 11 | 12 | .. autoclass:: zyte_spider_templates.EcommerceSpider 13 | 14 | .. autoclass:: zyte_spider_templates.GoogleSearchSpider 15 | 16 | .. autoclass:: zyte_spider_templates.JobPostingSpider 17 | 18 | 19 | Pages 20 | ===== 21 | 22 | .. autoclass:: zyte_spider_templates.pages.DefaultSearchRequestTemplatePage 23 | 24 | .. autoclass:: zyte_spider_templates.pages.HeuristicsArticleNavigationPage 25 | 26 | .. autoclass:: zyte_spider_templates.pages.HeuristicsProductNavigationPage 27 | 28 | 29 | .. _parameter-mixins: 30 | 31 | Parameter mixins 32 | ================ 33 | 34 | .. autopydantic_model:: zyte_spider_templates.params.CustomAttrsInputParam 35 | :exclude-members: model_computed_fields 36 | 37 | .. autopydantic_model:: zyte_spider_templates.params.CustomAttrsMethodParam 38 | :exclude-members: model_computed_fields 39 | 40 | .. autoenum:: zyte_spider_templates.params.CustomAttrsMethod 41 | 42 | .. autopydantic_model:: zyte_spider_templates.params.ExtractFromParam 43 | :exclude-members: model_computed_fields 44 | 45 | .. autoenum:: zyte_spider_templates.params.ExtractFrom 46 | 47 | .. autopydantic_model:: zyte_spider_templates.params.GeolocationParam 48 | :exclude-members: model_computed_fields 49 | 50 | .. autoenum:: zyte_spider_templates.params.Geolocation 51 | 52 | .. autopydantic_model:: zyte_spider_templates.params.MaxRequestsParam 53 | :exclude-members: model_computed_fields 54 | 55 | .. autopydantic_model:: zyte_spider_templates.params.UrlParam 56 | :exclude-members: model_computed_fields 57 | 58 | .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategyParam 59 | :exclude-members: model_computed_fields 60 | 61 | .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy 62 | 63 | .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceExtractParam 64 | :exclude-members: model_computed_fields 65 | 66 | .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceExtract 67 | 68 | .. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam 69 | :exclude-members: model_computed_fields 70 | 71 | .. autoenum:: zyte_spider_templates.spiders.serp.SerpItemType 72 | 73 | .. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam 74 | :exclude-members: model_computed_fields 75 | 76 | .. autopydantic_model:: zyte_spider_templates.spiders.article.ArticleCrawlStrategyParam 77 | :exclude-members: model_computed_fields 78 | 79 | .. autoenum:: zyte_spider_templates.spiders.article.ArticleCrawlStrategy 80 | 81 | .. autopydantic_model:: zyte_spider_templates.spiders.job_posting.JobPostingCrawlStrategyParam 82 | :exclude-members: model_computed_fields 83 | 84 | .. autoenum:: zyte_spider_templates.spiders.job_posting.JobPostingCrawlStrategy 85 | 86 | 87 | .. 
_middlewares: 88 | 89 | Middlewares 90 | =========== 91 | 92 | .. autoclass:: zyte_spider_templates.CrawlingLogsMiddleware 93 | .. autoclass:: zyte_spider_templates.TrackNavigationDepthSpiderMiddleware 94 | .. autoclass:: zyte_spider_templates.MaxRequestsPerSeedDownloaderMiddleware 95 | .. autoclass:: zyte_spider_templates.OffsiteRequestsPerSeedMiddleware 96 | .. autoclass:: zyte_spider_templates.OnlyFeedsMiddleware 97 | .. autoclass:: zyte_spider_templates.TrackSeedsSpiderMiddleware 98 | .. autoclass:: zyte_spider_templates.IncrementalCrawlMiddleware 99 | -------------------------------------------------------------------------------- /docs/reference/reqmeta.rst: -------------------------------------------------------------------------------- 1 | .. _meta: 2 | 3 | ================= 4 | Request.meta keys 5 | ================= 6 | 7 | Keys that can be defined in :attr:`Request.meta ` for 8 | zyte-spider-templates. 9 | 10 | .. reqmeta:: seed 11 | 12 | seed 13 | ==== 14 | 15 | Default: ``The seed URL (or value) from which the request originated.`` 16 | 17 | The key is used for :class:`~zyte_spider_templates.OffsiteRequestsPerSeedMiddleware` and 18 | :class:`~zyte_spider_templates.MaxRequestsPerSeedDownloaderMiddleware`. 19 | 20 | The `seed` meta key is used to track and identify the origin of a request. It 21 | is initially set for each request that originates from the start request and 22 | can be used to manage domain constraints for subsequent requests. This key can 23 | also be set to an arbitrary value by the user to identify the seed source. 24 | 25 | Here's an example: 26 | 27 | .. code-block:: python 28 | 29 | meta = { 30 | "seed": "http://example.com", 31 | } 32 | 33 | .. reqmeta:: is_seed_request 34 | 35 | is_seed_request 36 | =============== 37 | 38 | Default: ``False`` 39 | 40 | The key is used for :class:`~zyte_spider_templates.OffsiteRequestsPerSeedMiddleware`. 41 | 42 | The `is_seed_request` meta key is a boolean flag that identifies whether the 43 | request is a start request (i.e., originating from the initial seed URL). When 44 | set to True, the middleware extracts seed domains from the response. 45 | 46 | Example: 47 | :: 48 | 49 | meta = { 50 | 'is_seed_request': True, 51 | } 52 | 53 | .. reqmeta:: seed_domains 54 | 55 | seed_domains 56 | ============ 57 | 58 | Default: ``Initial URL and redirected URLs`` 59 | 60 | The key is used for :class:`~zyte_spider_templates.OffsiteRequestsPerSeedMiddleware`. 61 | 62 | The `seed_domains` meta key is a list of domains that the middleware uses to 63 | check whether a request belongs to these domains or not. By default, this list 64 | includes the initial URL's domain and domains of any redirected URLs `(if there 65 | was a redirection)`. This list can also be set by the user in the spider to 66 | specify additional domains for which the middleware should allow requests. 67 | 68 | Here's an example: 69 | 70 | .. code-block:: python 71 | 72 | meta = {"seed_domains": ["example.com", "another-example.com"]} 73 | 74 | .. reqmeta:: is_hop 75 | 76 | increase_navigation_depth 77 | ========================= 78 | 79 | Default: ``True`` 80 | 81 | The key is used for :class:`~zyte_spider_templates.TrackNavigationDepthSpiderMiddleware`. 82 | 83 | The `increase_navigation_depth` meta key is a boolean flag that determines whether the 84 | navigation_depth for a request should be increased. By default, the middleware increases 85 | navigation_depth for all requests. 
Specific spiders can override this behavior for certain 86 | types of requests, such as pagination or RSS feeds, by explicitly setting the meta key. 87 | 88 | Example: 89 | :: 90 | 91 | meta = { 92 | 'increase_navigation_depth': False, 93 | } 94 | 95 | .. reqmeta:: only_feeds 96 | 97 | only_feeds 98 | ========== 99 | Default: ``False`` 100 | 101 | The key is used for :class:`~zyte_spider_templates.OnlyFeedsMiddleware`. 102 | 103 | The `only_feeds` meta key is a boolean flag that identifies whether the 104 | spider should discover all links on the website or extract links from RSS/Atom feeds only. 105 | 106 | Example: 107 | :: 108 | 109 | meta = { 110 | 'page_params': {'only_feeds': True} 111 | } 112 | 113 | -------------------------------------------------------------------------------- /docs/reference/settings.rst: -------------------------------------------------------------------------------- 1 | .. _settings: 2 | 3 | ======== 4 | Settings 5 | ======== 6 | 7 | .. setting:: NAVIGATION_DEPTH_LIMIT 8 | 9 | NAVIGATION_DEPTH_LIMIT 10 | ====================== 11 | 12 | Default: ``0`` 13 | 14 | The maximum navigation depth to crawl. If ``0``, no limit is imposed. 15 | 16 | We increase *navigation_depth* for requests navigating to a subcategory originating from 17 | its parent category, including a request targeting a category starting at the website home page. 18 | We don't increase *navigation_depth* for requests accessing item details (e.g., an article) or for 19 | additional pages of a visited webpage. For example, if you set ``NAVIGATION_DEPTH_LIMIT`` to ``1``, 20 | only item details and pagination links from your start URLs are followed. 21 | 22 | .. note:: 23 | Currently, only the :ref:`Article spider template
` implements proper 24 | navigation_depth support. Other spider templates treat all follow-up requests as 25 | increasing navigation_depth. 26 | 27 | Setting a navigation_depth limit can prevent a spider from delving too deeply into 28 | subcategories. This is especially useful if you only need data from the 29 | top-level categories or specific subcategories. 30 | 31 | When :ref:`customizing a spider template `, set the 32 | :reqmeta:`increase_navigation_depth` request metadata key to override whether a request is 33 | considered as increasing navigation depth (``True``) or not (``False``): 34 | 35 | .. code-block:: python 36 | 37 | Request("https://example.com", meta={"increase_navigation_depth": False}) 38 | 39 | If you want to limit all link following, including pagination and item details, 40 | consider using the :setting:`DEPTH_LIMIT ` setting instead. 41 | 42 | Implemented by :class:`~zyte_spider_templates.TrackNavigationDepthSpiderMiddleware`. 43 | 44 | .. setting:: MAX_REQUESTS_PER_SEED 45 | 46 | MAX_REQUESTS_PER_SEED 47 | ===================== 48 | 49 | .. tip:: When using the :ref:`article spider template
`, you may use 51 | the 52 | :attr:`~zyte_spider_templates.spiders.article.ArticleSpiderParams.max_requests_per_seed` 53 | command-line parameter instead of this setting. 54 | 55 | Default: ``0`` 56 | 57 | Limit the number of follow-up requests per initial URL to the specified amount. 58 | Non-positive integers (i.e. 0 and below) impose no limit and disable this middleware. 59 | 60 | The limit is the total limit for all direct and indirect follow-up requests 61 | of each initial URL. 62 | 63 | Implemented by 64 | :class:`~zyte_spider_templates.MaxRequestsPerSeedDownloaderMiddleware`. 65 | 66 | .. setting:: OFFSITE_REQUESTS_PER_SEED_ENABLED 67 | 68 | OFFSITE_REQUESTS_PER_SEED_ENABLED 69 | ================================= 70 | 71 | Default: ``True`` 72 | 73 | Setting this value to ``True`` enables the 74 | :class:`~zyte_spider_templates.OffsiteRequestsPerSeedMiddleware` while ``False`` 75 | completely disables it. 76 | 77 | The middleware ensures that *most* requests would belong to the domain of the 78 | seed URLs. However, it does allow offsite requests only if they were obtained 79 | from a response that belongs to the domain of the seed URLs. Any other requests 80 | obtained thereafter from a response in a domain outside of the seed URLs will 81 | not be allowed. 82 | 83 | This prevents the spider from completely crawling other domains while ensuring 84 | that aggregator websites *(e.g. a news website with articles from other domains)* 85 | are supported, as it can access pages from other domains. 86 | 87 | Disabling the middleware would not prevent offsite requests from being filtered 88 | and might generally lead to other domains being crawled completely, unless 89 | ``allowed_domains`` is set in the spider. 90 | 91 | .. note:: 92 | 93 | If a seed URL gets redirected to a different domain, both the domain from 94 | the original request and the domain from the redirected response will be 95 | used as references. 96 | 97 | If the seed URL is `https://books.toscrape.com`, all subsequent requests to 98 | `books.toscrape.com` and its subdomains are allowed, but requests to 99 | `toscrape.com` are not. Conversely, if the seed URL is `https://toscrape.com`, 100 | requests to both `toscrape.com` and `books.toscrape.com` are allowed. 101 |
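The settings above are regular Scrapy settings, so they can be combined in the
project settings module. A minimal sketch with purely illustrative values:

.. code-block:: python
    :caption: ``settings.py``

    # Illustrative values only; each setting is described on this page.
    NAVIGATION_DEPTH_LIMIT = 2  # limit category navigation depth
    MAX_REQUESTS_PER_SEED = 100  # at most 100 follow-up requests per initial URL
    OFFSITE_REQUESTS_PER_SEED_ENABLED = True  # enabled by default; False disables the middleware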
102 | .. setting:: ONLY_FEEDS_ENABLED 103 | 104 | ONLY_FEEDS_ENABLED 105 | ================== 106 | 107 | .. note:: 108 | 109 | Only works for the :ref:`article spider template <article>`. 109 | 110 | Default: ``False`` 111 | 112 | Whether to extract links from Atom and RSS news feeds only (``True``) or 113 | to also use extracted links from ``ArticleNavigation.subCategories`` (``False``). 114 | 115 | Implemented by :class:`~zyte_spider_templates.OnlyFeedsMiddleware`. 116 | 117 | .. setting:: INCREMENTAL_CRAWL_BATCH_SIZE 118 | 119 | INCREMENTAL_CRAWL_BATCH_SIZE 120 | ============================ 121 | 122 | Default: ``50`` 123 | 124 | The maximum number of seen URLs to read from or write to the corresponding 125 | :ref:`Zyte Scrapy Cloud collection ` per request during an incremental 126 | crawl (see :setting:`INCREMENTAL_CRAWL_ENABLED`). 127 | 128 | This setting determines the batch size for interactions with the Collection. 129 | If the response from a webpage contains more than 50 URLs, they will be split 130 | into smaller batches for processing. Conversely, if fewer than 50 URLs are present, 131 | all URLs will be handled in a single request to the Collection. 132 | 133 | Adjusting this value can optimize the performance of a crawl by balancing the number 134 | of requests sent to the Collection with processing efficiency. 135 | 136 | .. note:: 137 | 138 | Setting it too large (e.g. > 100) will cause issues due to the large query length. 139 | Setting it too small (less than 10) will remove the benefit of using a batch. 140 | 141 | Implemented by :class:`~zyte_spider_templates.IncrementalCrawlMiddleware`. 142 | 143 |
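An incremental crawl is typically configured by combining this setting with the
two settings covered in the following sections. A sketch with illustrative
values (the collection name is hypothetical):

.. code-block:: python
    :caption: ``settings.py``

    # Illustrative sketch; see INCREMENTAL_CRAWL_ENABLED and
    # INCREMENTAL_CRAWL_COLLECTION_NAME below for details.
    INCREMENTAL_CRAWL_ENABLED = True
    # Hypothetical name; only ASCII alphanumeric characters and underscores are allowed.
    INCREMENTAL_CRAWL_COLLECTION_NAME = "articles_production"
    INCREMENTAL_CRAWL_BATCH_SIZE = 50  # the default batch size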
144 | .. setting:: INCREMENTAL_CRAWL_COLLECTION_NAME 145 | 146 | INCREMENTAL_CRAWL_COLLECTION_NAME 147 | ================================= 148 | 149 | .. note:: 150 | 151 | :ref:`virtual spiders ` are spiders based on :ref:`spider templates <spider-templates>`. 152 | The explanation of using INCREMENTAL_CRAWL_COLLECTION_NAME applies to both types of spiders. 153 | 154 | .. tip:: When using the :ref:`article spider template <article>`, you may use 155 | the 156 | :attr:`~zyte_spider_templates.spiders.article.ArticleSpiderParams.incremental_collection_name` 157 | command-line parameter instead of this setting. 158 | 159 | .. note:: 160 | Only ASCII alphanumeric characters and underscores are allowed. 161 | 162 | Default: `<spider name>_incremental`. 163 | The current spider's name here will be the virtual spider's name, if it's a virtual spider; 164 | otherwise, :data:`Spider.name <scrapy.Spider.name>`. 165 | 166 | Name of the :ref:`Zyte Scrapy Cloud collection ` used during 167 | an incremental crawl (see :setting:`INCREMENTAL_CRAWL_ENABLED`). 168 | 169 | By default, a collection named after the spider is used, meaning that matching URLs from 170 | previous runs of the same spider are skipped, provided those previous runs had 171 | the :setting:`INCREMENTAL_CRAWL_ENABLED` setting set to ``True`` or the spider 172 | argument `incremental` set to `true`. 173 | 174 | Using a different collection name makes sense, for example, in the following cases: 175 | - Different spiders share a collection. 176 | - The same spider uses different collections (e.g., for development runs vs. production runs). 177 | 178 | Implemented by :class:`~zyte_spider_templates.IncrementalCrawlMiddleware`. 179 | 180 | 181 | .. setting:: INCREMENTAL_CRAWL_ENABLED 182 | 183 | INCREMENTAL_CRAWL_ENABLED 184 | ========================= 185 | 186 | .. tip:: When using the :ref:`article spider template <article>
`, you may use 187 | the 188 | :attr:`~zyte_spider_templates.spiders.article.ArticleSpiderParams.incremental` 189 | command-line parameter instead of this setting. 190 | 191 | Default: ``False`` 192 | 193 | If set to ``True``, items seen in previous crawls with the same 194 | :setting:`INCREMENTAL_CRAWL_COLLECTION_NAME` value are skipped. 195 | 196 | Implemented by :class:`~zyte_spider_templates.IncrementalCrawlMiddleware`. 197 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | autodoc_pydantic==2.0.1 2 | enum-tools==0.11.0 3 | Sphinx==7.2.6 4 | sphinx-reredirects==0.1.3 5 | sphinx-rtd-theme==1.3.0 6 | sphinx-toolbox==3.5.0 # optional dependency of enum-tools 7 | -------------------------------------------------------------------------------- /docs/setup.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Initial setup 3 | ============= 4 | 5 | Learn how to get :ref:`spider templates ` installed and 6 | configured on an existing Scrapy_ project. 7 | 8 | .. _Scrapy: https://docs.scrapy.org/en/latest/ 9 | 10 | .. tip:: If you do not have a Scrapy project yet, use 11 | `zyte-spider-templates-project`_ as a starting template to get started 12 | quickly. 13 | 14 | .. _zyte-spider-templates-project: https://github.com/zytedata/zyte-spider-templates-project 15 | 16 | Requirements 17 | ============ 18 | 19 | - Python 3.9+ 20 | 21 | - Scrapy 2.11+ 22 | 23 | For Zyte API features, including AI-powered parsing, you need a `Zyte API`_ 24 | subscription. 25 | 26 | .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html 27 | 28 | Installation 29 | ============ 30 | 31 | .. code-block:: shell 32 | 33 | pip install zyte-spider-templates 34 | 35 | 36 | .. _config: 37 | 38 | Configuration 39 | ============= 40 | 41 | In your Scrapy project settings (usually in ``settings.py``): 42 | 43 | #. `Configure scrapy-poet`_. 44 | 45 | .. _Configure scrapy-poet: https://scrapy-poet.readthedocs.io/en/stable/intro/install.html#configuring-the-project 46 | 47 | #. For Zyte API features, including AI-powered parsing, :ref:`configure 48 | scrapy-zyte-api `. 49 | 50 | #. Configure :class:`zyte_common_items.ZyteItemAdapter`: 51 | 52 | .. code-block:: python 53 | :caption: ``settings.py`` 54 | 55 | from itemadapter import ItemAdapter 56 | from zyte_common_items import ZyteItemAdapter 57 | 58 | ItemAdapter.ADAPTER_CLASSES.appendleft(ZyteItemAdapter) 59 | 60 | #. Add the zyte-spider-templates add-on to your :setting:`ADDONS 61 | ` setting: 62 | 63 | .. code-block:: python 64 | :caption: ``settings.py`` 65 | 66 | ADDONS = { 67 | "zyte_spider_templates.Addon": 1000, 68 | } 69 | 70 | For an example of a properly configured ``settings.py`` file, see `the one 71 | in zyte-spider-templates-project`_. 72 | 73 | .. _the one in zyte-spider-templates-project: https://github.com/zytedata/zyte-spider-templates-project/blob/main/zyte_spider_templates_project/settings.py 74 | -------------------------------------------------------------------------------- /docs/templates/article.rst: -------------------------------------------------------------------------------- 1 | .. _article: 2 | 3 | ===================================== 4 | Article spider template (``article``) 5 | ===================================== 6 | 7 | Basic use 8 | ========= 9 | 10 | .. 
code-block:: shell 11 | 12 | scrapy crawl article -a url="https://www.zyte.com/blog/" 13 | 14 | Parameters 15 | ========== 16 | 17 | .. autopydantic_model:: zyte_spider_templates.spiders.article.ArticleSpiderParams 18 | :inherited-members: BaseModel 19 | :exclude-members: model_computed_fields, single_input 20 | 21 | Settings 22 | ======== 23 | 24 | The following :ref:`zyte-spider-templates settings ` may be useful 25 | for the article spider template: 26 | 27 | :setting:`NAVIGATION_DEPTH_LIMIT` 28 | Limit the crawling depth of subcategories. 29 | 30 | :setting:`OFFSITE_REQUESTS_PER_SEED_ENABLED` 31 | Skip follow-up requests if their URL points to a domain different from the 32 | domain of their initial URL. 33 | 34 | :setting:`ONLY_FEEDS_ENABLED` 35 | Extract links only from Atom and RSS news feeds. 36 | -------------------------------------------------------------------------------- /docs/templates/e-commerce.rst: -------------------------------------------------------------------------------- 1 | .. _e-commerce: 2 | 3 | ========================================== 4 | E-commerce spider template (``ecommerce``) 5 | ========================================== 6 | 7 | Basic use 8 | ========= 9 | 10 | .. code-block:: shell 11 | 12 | scrapy crawl ecommerce -a url="https://books.toscrape.com" 13 | 14 | Parameters 15 | ========== 16 | 17 | .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams 18 | :inherited-members: BaseModel 19 | :exclude-members: model_computed_fields, single_input 20 | 21 | Settings 22 | ======== 23 | 24 | The following :ref:`zyte-spider-templates settings ` may be useful 25 | for the e-commerce spider template: 26 | 27 | :setting:`MAX_REQUESTS_PER_SEED` 28 | Limit the number of follow-up requests per initial URL. 29 | -------------------------------------------------------------------------------- /docs/templates/google-search.rst: -------------------------------------------------------------------------------- 1 | .. _google-search: 2 | 3 | ================================================= 4 | Google search spider template (``google_search``) 5 | ================================================= 6 | 7 | Basic use 8 | ========= 9 | 10 | .. code-block:: shell 11 | 12 | scrapy crawl google_search -a search_queries="foo bar" 13 | 14 | Parameters 15 | ========== 16 | 17 | .. autopydantic_model:: zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams 18 | :inherited-members: BaseModel 19 | :exclude-members: model_computed_fields 20 | -------------------------------------------------------------------------------- /docs/templates/index.rst: -------------------------------------------------------------------------------- 1 | .. _spider-templates: 2 | 3 | ================ 4 | Spider templates 5 | ================ 6 | 7 | Built-in `spider templates`_ use `Zyte API automatic extraction`_ to provide 8 | automatic crawling and parsing, i.e. you can run these spiders on any website 9 | of the right type to automatically extract the desired structured data. 10 | 11 | .. _spider templates: https://docs.zyte.com/scrapy-cloud/usage/spiders.html#spider-templates-and-virtual-spiders 12 | .. _Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html 13 | 14 | For example, to extract all products from an e-commerce website, you can run 15 | the :ref:`e-commerce spider ` spider as follows: 16 | 17 | .. 
code-block:: shell 18 | 19 | scrapy crawl ecommerce -a url="https://books.toscrape.com" 20 | 21 | Spider templates support additional parameters beyond ``url``. See the 22 | documentation of each specific spider for details. 23 | 24 | You can also :ref:`customize spider templates ` to meet your 25 | needs. 26 | 27 | Spider template list 28 | ==================== 29 | 30 | :ref:`E-commerce <e-commerce>` 31 | Get products from an e-commerce website. 32 | 33 | :ref:`Google Search <google-search>` 34 | Get Google search results. 35 | 36 | :ref:`Article <article>
` 37 | Get articles from websites. 38 | 39 | :ref:`Job posting ` 40 | Get job postings from job websites. 41 | -------------------------------------------------------------------------------- /docs/templates/job-posting.rst: -------------------------------------------------------------------------------- 1 | .. _job-posting: 2 | 3 | ============================================= 4 | Job posting spider template (``job_posting``) 5 | ============================================= 6 | 7 | Basic use 8 | ========= 9 | 10 | .. code-block:: shell 11 | 12 | scrapy crawl job_posting -a url="https://books.toscrape.com" 13 | 14 | Parameters 15 | ========== 16 | 17 | .. autopydantic_model:: zyte_spider_templates.spiders.job_posting.JobPostingSpiderParams 18 | :inherited-members: BaseModel 19 | :exclude-members: model_computed_fields 20 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | profile = "black" 3 | multi_line_output = 3 4 | 5 | [tool.mypy] 6 | check_untyped_defs = true 7 | ignore_missing_imports = true 8 | 9 | [tool.black] 10 | target-version = ["py38", "py39", "py310", "py311", "py312"] 11 | force-exclude = "template.py" 12 | 13 | [tool.pytest.ini_options] 14 | filterwarnings = [ 15 | "ignore:deprecated string literal syntax::jmespath.lexer", 16 | ] 17 | addopts = [ 18 | "--reactor=asyncio", 19 | ] 20 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pre-commit 2 | pytest 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = 3 | # Refers to the max-line length. Let's suppress the error and simply 4 | # let black take care on how it wants to format the lines. 5 | E501, 6 | 7 | # E203 whitespace before ':' 8 | E203, 9 | 10 | # Refers to "line break before binary operator". 11 | # Similar to above, let black take care of the formatting. 12 | W503, 13 | 14 | # Refers to "necessary dict call - rewrite as a literal". 
15 | C408, 16 | 17 | # 1 blank line required between summary line and description 18 | D205, 19 | 20 | # First line should end with a period 21 | D400, 22 | 23 | # First line should be in imperative mood 24 | D401, 25 | 26 | # First line should not be the function's "signature" 27 | D402 28 | 29 | exclude = 30 | template.py 31 | 32 | per-file-ignores = 33 | # F401: Ignore "imported but unused" errors in __init__ files, as those 34 | # imports are there to expose submodule functions so they can be imported 35 | # directly from that module 36 | zyte_spider_templates/__init__.py:F401 37 | zyte_spider_templates/page_objects/__init__.py:F401 38 | zyte_spider_templates/page_objects/product_navigation_heuristics.py:F401 39 | zyte_spider_templates/pages/__init__.py:F401 40 | 41 | # E731: Ignore "do not assign a lambda expression, use a def" since 42 | # we're using quick shortcuts for the tests 43 | tests/test_ecommerce.py:E731 44 | tests/test_job_posting.py:E731 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="zyte-spider-templates", 5 | version="0.12.0", 6 | description="Spider templates for automatic crawlers.", 7 | long_description=open("README.rst").read(), 8 | long_description_content_type="text/x-rst", 9 | author="Zyte Group Ltd", 10 | author_email="info@zyte.com", 11 | url="https://github.com/zytedata/zyte-spider-templates", 12 | packages=find_packages(), 13 | include_package_data=True, 14 | install_requires=[ 15 | "duplicate-url-discarder>=0.2.0", 16 | "duplicate-url-discarder-rules>=2024.11.05", 17 | "extruct>=0.18.0", 18 | "feedparser>=6.0.11", 19 | "form2request>=0.2.0", 20 | "formasaurus>=0.10.0", 21 | "jmespath>=0.9.5", 22 | "pydantic>=2.1", 23 | "requests>=2.31.0", 24 | "scrapinghub >= 2.4.0", 25 | "scrapy>=2.11.0", 26 | "scrapy-poet>=0.24.0", 27 | "scrapy-spider-metadata>=0.2.0", 28 | "scrapy-zyte-api[provider]>=0.25.0", 29 | "web-poet>=0.17.1", 30 | "xtractmime>=0.2.1", 31 | "zyte-common-items>=0.26.2", 32 | ], 33 | classifiers=[ 34 | "Development Status :: 3 - Alpha", 35 | "Intended Audience :: Developers", 36 | "License :: OSI Approved :: BSD License", 37 | "Operating System :: OS Independent", 38 | "Programming Language :: Python :: 3", 39 | "Programming Language :: Python :: 3.9", 40 | "Programming Language :: Python :: 3.10", 41 | "Programming Language :: Python :: 3.11", 42 | "Programming Language :: Python :: 3.12", 43 | "Programming Language :: Python :: 3.13", 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Type 2 | 3 | import pytest 4 | from scrapy import Spider 5 | from scrapy.utils.test import TestSpider 6 | 7 | # https://docs.pytest.org/en/stable/how-to/writing_plugins.html#assertion-rewriting 8 | pytest.register_assert_rewrite("tests.utils") 9 | 10 | 11 | # scrapy.utils.test.get_crawler alternative that does not freeze settings. 12 | def get_crawler( 13 | *, settings: Optional[Dict[str, Any]] = None, spider_cls: Type[Spider] = TestSpider 14 | ): 15 | from scrapy.crawler import CrawlerRunner 16 | 17 | settings = settings or {} 18 | # Set by default settings that prevent deprecation warnings. 
19 | settings["REQUEST_FINGERPRINTER_IMPLEMENTATION"] = "2.7" 20 | runner = CrawlerRunner(settings) 21 | crawler = runner.create_crawler(spider_cls) 22 | return crawler 23 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, Any, Optional 4 | 5 | import pytest 6 | import pytest_twisted 7 | from aiohttp.test_utils import TestServer 8 | 9 | if TYPE_CHECKING: 10 | from aiohttp.web import Application 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def mockserver(): 15 | from .mockserver import MockServer 16 | 17 | with MockServer() as server: 18 | yield server 19 | 20 | 21 | # Copied verbatim from pytest-aiohttp. We can't use pytest-asyncio fixtures with 22 | # pytest-twisted, so we need to decorate this one with a pytest-twisted decorator. 23 | # See also https://github.com/pytest-dev/pytest-twisted/issues/188 24 | @pytest_twisted.async_yield_fixture(scope="module") 25 | async def aiohttp_server(): 26 | """Factory to create a TestServer instance, given an app. 27 | 28 | aiohttp_server(app, **kwargs) 29 | """ 30 | servers = [] 31 | 32 | async def go( 33 | app: Application, 34 | *, 35 | host: str = "127.0.0.1", 36 | port: Optional[int] = None, 37 | **kwargs: Any, 38 | ) -> TestServer: 39 | server = TestServer(app, host=host, port=port) 40 | await server.start_server(**kwargs) 41 | servers.append(server) 42 | return server 43 | 44 | yield go 45 | 46 | while servers: 47 | await servers.pop().close() 48 | 49 | 50 | @pytest_twisted.async_fixture(scope="module") 51 | async def zyte_api_server(aiohttp_server) -> TestServer: 52 | from fake_zyte_api.main import make_app 53 | 54 | app = make_app() 55 | return await aiohttp_server(app) 56 | -------------------------------------------------------------------------------- /tests/incremental/test_collection_fp_manager.py: -------------------------------------------------------------------------------- 1 | from asyncio import ensure_future 2 | from unittest.mock import MagicMock, patch 3 | 4 | import pytest 5 | from scrapy.statscollectors import StatsCollector 6 | from scrapy.utils.request import RequestFingerprinter 7 | from twisted.internet.defer import Deferred, inlineCallbacks 8 | 9 | from tests import get_crawler 10 | from zyte_spider_templates._incremental.manager import CollectionsFingerprintsManager 11 | from zyte_spider_templates.spiders.article import ArticleSpider 12 | 13 | 14 | @pytest.fixture 15 | def mock_crawler(): 16 | return MagicMock() 17 | 18 | 19 | def crawler_for_incremental(): 20 | url = "https://example.com" 21 | crawler = get_crawler() 22 | crawler.request_fingerprinter = RequestFingerprinter() 23 | crawler.stats = StatsCollector(crawler) 24 | crawler.spider = ArticleSpider.from_crawler(crawler, url=url) 25 | crawler.settings["ZYTE_PROJECT_ID"] = "000000" 26 | return crawler 27 | 28 | 29 | @pytest.mark.parametrize("batch_size", [50, 2]) 30 | @pytest.mark.parametrize( 31 | "fingerprints, keys_in_collection, fingerprints_batch, expected_result", 32 | [ 33 | ([], [], {"fp1", "fp2", "fp3"}, set()), 34 | (["fp1", "fp2", "fp3"], [], set(), set()), 35 | (["fp1", "fp2", "fp3"], ["fp1"], set(), {"fp1"}), 36 | (["fp1", "fp2", "fp3"], ["fp1", "fp2"], set(), {"fp1", "fp2"}), 37 | (["fp1", "fp2", "fp3"], ["fp1", "fp2", "fp3"], set(), {"fp1", "fp2", "fp3"}), 38 | ( 39 | ["fp1", "fp2", "fp3"], 40 | ["fp1", "fp2"], 41 | {("fp3", 
"url3")}, 42 | {"fp1", "fp2", "fp3"}, 43 | ), 44 | (["fp1", "fp2", "fp3"], [], {("fp3", "url3")}, {"fp3"}), 45 | ], 46 | ) 47 | @patch("scrapinghub.ScrapinghubClient") 48 | @inlineCallbacks 49 | def test_get_existing_fingerprints( 50 | mock_scrapinghub_client, 51 | batch_size, 52 | fingerprints, 53 | keys_in_collection, 54 | fingerprints_batch, 55 | expected_result, 56 | ): 57 | mock_client = MagicMock() 58 | mock_scrapinghub_client.return_value = mock_client 59 | 60 | mock_collection = MagicMock() 61 | mock_collection.count.return_value = 0 62 | mock_client.get_project.return_value.collections.get_store.return_value = ( 63 | mock_collection 64 | ) 65 | 66 | mock_crawler = MagicMock() 67 | mock_crawler.settings.getint.return_value = batch_size 68 | 69 | mock_manager = CollectionsFingerprintsManager(mock_crawler) 70 | mock_manager.get_keys_from_collection = MagicMock(return_value=keys_in_collection) # type: ignore 71 | mock_manager.batch = fingerprints_batch 72 | 73 | r = yield Deferred.fromFuture( 74 | ensure_future(mock_manager.get_existing_fingerprints_async(fingerprints)) 75 | ) 76 | assert r == expected_result 77 | 78 | 79 | @pytest.mark.parametrize( 80 | "fingerprints, expected_keys", 81 | [ 82 | ({"fp1", "fp2", "fp3"}, {"fp1", "fp2", "fp3"}), 83 | ({}, set()), 84 | ], 85 | ) 86 | @patch("scrapinghub.ScrapinghubClient") 87 | def test_get_keys_from_collection(mock_crawler, fingerprints, expected_keys): 88 | mock_collection = MagicMock() 89 | mock_collection.list.return_value = [ 90 | {"_key": key, "value": {}} for key in expected_keys 91 | ] 92 | mock_crawler.settings.getint.return_value = 50 93 | manager = CollectionsFingerprintsManager(mock_crawler) 94 | manager.collection = mock_collection # type: ignore 95 | assert manager.get_keys_from_collection(fingerprints) == expected_keys 96 | 97 | 98 | @pytest.mark.parametrize( 99 | "keys, expected_items_written", 100 | [ 101 | ( 102 | [("fp1", "url1"), ("fp2", "url2"), ("fp3", "url3")], 103 | [("fp1", "url1"), ("fp2", "url2"), ("fp3", "url3")], 104 | ), 105 | ([], []), 106 | ], 107 | ) 108 | @patch("scrapinghub.ScrapinghubClient") 109 | def test_save_to_collection(mock_crawler, keys, expected_items_written): 110 | mock_writer = MagicMock() 111 | mock_writer.write.return_value = expected_items_written 112 | mock_crawler.settings.getint.return_value = 50 113 | manager = CollectionsFingerprintsManager(mock_crawler) 114 | manager.writer = mock_writer # type: ignore 115 | manager.save_to_collection(keys) 116 | mock_writer.write.assert_called_once_with( 117 | [{"_key": key, "value": value} for key, value in keys] 118 | ) 119 | 120 | 121 | @pytest.mark.parametrize( 122 | "fingerprints, expected_batch, batch_size", 123 | [ 124 | ( 125 | [(f"fp{i}", f"url{i}") for i in range(1, 5)], 126 | {("fp4", "url4")}, 127 | 3, 128 | ), # No default min 129 | ([], set(), 20), 130 | ([("fp1", "url1")] * 19, {("fp1", "url1")}, 20), 131 | ( 132 | [(f"fp{i}", f"url{i}") for i in range(1, 103)], 133 | {(f"fp{i}", f"url{i}") for i in range(1, 103)}, 134 | 150, 135 | ), # No default max 136 | ( 137 | [(f"fp{i}", f"url{i}") for i in range(1, 53)], 138 | [("fp51", "url51"), ("fp52", "url52")], 139 | 0, 140 | ), # 50 by default 141 | ], 142 | ) 143 | @patch("scrapinghub.ScrapinghubClient") 144 | def test_save_fingerprints( 145 | mock_scrapinghub_client, fingerprints, expected_batch, batch_size 146 | ): 147 | crawler = crawler_for_incremental() 148 | if batch_size != 0: 149 | crawler.settings.set("INCREMENTAL_CRAWL_BATCH_SIZE", batch_size) 150 | fp_manager = 
CollectionsFingerprintsManager(crawler) 151 | fp_manager.save_batch = MagicMock(side_effect=fp_manager.save_batch) # type: ignore 152 | fp_manager.add_to_batch(fingerprints) 153 | assert fp_manager.batch == set(sorted(expected_batch, key=lambda x: int(x[0][2:]))) 154 | 155 | if len(fingerprints) >= fp_manager.batch_size: 156 | fp_manager.save_batch.assert_called_once() 157 | else: 158 | fp_manager.save_batch.assert_not_called() 159 | 160 | 161 | @pytest.mark.parametrize( 162 | "fingerprints_batch, expected_batch_size", 163 | [ 164 | ([], 0), 165 | ([("fp1", "url1"), ("fp2", "url2"), ("fp3", "url3")], 0), 166 | ], 167 | ) 168 | @patch("scrapinghub.ScrapinghubClient") 169 | def test_save_batch(mock_crawler, fingerprints_batch, expected_batch_size): 170 | crawler = crawler_for_incremental() 171 | fp_manager = CollectionsFingerprintsManager(crawler) 172 | fp_manager.batch = set(fingerprints_batch) 173 | fp_manager.save_batch() 174 | assert len(fp_manager.batch) == expected_batch_size 175 | 176 | 177 | @pytest.mark.parametrize( 178 | "project_id, collection_name, expected_collection", 179 | [ 180 | ("project1", "collection1", MagicMock()), 181 | ("project2", "collection2", MagicMock()), 182 | ], 183 | ) 184 | @patch("scrapinghub.ScrapinghubClient") 185 | def test_init_collection( 186 | mock_scrapinghub_client, 187 | mock_crawler, 188 | project_id, 189 | collection_name, 190 | expected_collection, 191 | ): 192 | mock_scrapinghub_instance = MagicMock() 193 | mock_get_project = MagicMock() 194 | mock_get_project.collections.get_store.return_value = expected_collection 195 | mock_scrapinghub_instance.get_project.return_value = mock_get_project 196 | mock_scrapinghub_client.return_value = mock_scrapinghub_instance 197 | mock_crawler.settings.getint.return_value = 50 198 | manager = CollectionsFingerprintsManager(mock_crawler) 199 | manager.init_collection(project_id, collection_name) 200 | assert manager.collection == expected_collection 201 | 202 | 203 | @patch("scrapinghub.ScrapinghubClient") 204 | def test_spider_closed(mock_scrapinghub_client): 205 | crawler = crawler_for_incremental() 206 | fp_manager = CollectionsFingerprintsManager(crawler) 207 | fp_manager.save_batch = MagicMock(side_effect=fp_manager.save_batch) # type: ignore 208 | fp_manager.spider_closed() 209 | fp_manager.save_batch.assert_called_once() 210 | -------------------------------------------------------------------------------- /tests/incremental/test_middleware.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | from pytest_twisted import ensureDeferred 5 | from scrapy.exceptions import CloseSpider, NotConfigured 6 | from scrapy.http import Request, Response 7 | from scrapy.settings import Settings 8 | from scrapy.statscollectors import StatsCollector 9 | from scrapy.utils.request import RequestFingerprinter 10 | 11 | from tests import get_crawler 12 | from zyte_spider_templates import IncrementalCrawlMiddleware 13 | from zyte_spider_templates._incremental.manager import IncrementalCrawlingManager 14 | from zyte_spider_templates.spiders.article import ArticleSpider 15 | 16 | 17 | def crawler_for_incremental(): 18 | url = "https://example.com" 19 | crawler = get_crawler() 20 | crawler.request_fingerprinter = RequestFingerprinter() 21 | crawler.stats = StatsCollector(crawler) 22 | crawler.spider = ArticleSpider.from_crawler(crawler, url=url) 23 | crawler.settings["ZYTE_PROJECT_ID"] = "000000" 24 | return crawler 25 | 26 | 27 | def 
test_middleware_init_not_configured(): 28 | crawler = crawler_for_incremental() 29 | crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": False}) 30 | 31 | with pytest.raises(NotConfigured): 32 | IncrementalCrawlMiddleware(crawler) 33 | 34 | 35 | @patch("scrapinghub.ScrapinghubClient") 36 | def test_middleware_init_configured(mock_scrapinghub_client): 37 | crawler = crawler_for_incremental() 38 | crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True}) 39 | 40 | middleware = IncrementalCrawlMiddleware(crawler) 41 | assert isinstance(middleware.inc_manager, IncrementalCrawlingManager) 42 | 43 | 44 | @patch("scrapinghub.ScrapinghubClient") 45 | def test_prepare_manager_with_collection_fp_success(mock_scrapinghub_client): 46 | crawler = crawler_for_incremental() 47 | crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True}) 48 | 49 | manager = IncrementalCrawlMiddleware.prepare_incremental_manager(crawler) 50 | assert isinstance(manager, IncrementalCrawlingManager) 51 | 52 | 53 | def test_prepare_manager_with_collection_fp_failure(caplog): 54 | crawler = crawler_for_incremental() 55 | crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True}) 56 | 57 | caplog.clear() 58 | with pytest.raises(CloseSpider): 59 | IncrementalCrawlMiddleware.prepare_incremental_manager(crawler) 60 | 61 | 62 | @patch("scrapinghub.ScrapinghubClient") 63 | @ensureDeferred 64 | async def test_middleware_process_spider_output(mock_scrapinghub_client): 65 | crawler = crawler_for_incremental() 66 | crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True}) 67 | 68 | middleware = IncrementalCrawlMiddleware(crawler) 69 | request = Request(url=crawler.spider.url) 70 | response = Response(url=crawler.spider.url, request=request) 71 | input_result = [ 72 | Request(url="https://example.com/1"), 73 | Request(url="https://example.com/2"), 74 | Request(url="https://example.com/3"), 75 | ] 76 | 77 | async def async_generator(): 78 | for item in input_result: 79 | yield item 80 | 81 | processed_result_list = [] 82 | 83 | async for processed_item in middleware.process_spider_output( 84 | response, async_generator(), crawler.spider 85 | ): 86 | processed_result_list.append(processed_item) 87 | 88 | for res_ex, res_proc in zip(input_result, processed_result_list): 89 | assert res_ex == res_proc 90 | -------------------------------------------------------------------------------- /tests/mockserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import socket 4 | import sys 5 | import time 6 | from importlib import import_module 7 | from subprocess import PIPE, Popen 8 | from typing import Any, Dict 9 | 10 | from scrapy_zyte_api.responses import _API_RESPONSE 11 | from twisted.internet import reactor 12 | from twisted.web.resource import Resource 13 | from twisted.web.server import Site 14 | 15 | 16 | def get_ephemeral_port(): 17 | s = socket.socket() 18 | s.bind(("", 0)) 19 | return s.getsockname()[1] 20 | 21 | 22 | class DefaultResource(Resource): 23 | """Mock server to fake Zyte API responses. 24 | 25 | To use, include the mockserver fixture in the signature of your test, and 26 | point the ZYTE_API_URL setting to the mock server. See 27 | ``tests/test_ecommerce.py::test_crawl_strategies`` for an example. 
28 | 29 | This mock server is designed to fake the following: 30 | 31 | - An e-commerce website with the following pages: 32 | 33 | ``` 34 | https://example.com/ 35 | https://example.com/page/2 36 | https://example.com/category/1 37 | https://example.com/category/1/page/2 38 | https://example.com/non-navigation 39 | ``` 40 | 41 | When browserHtml is requested (for any URL, listed above or not), it is 42 | a minimal HTML with an anchor tag pointing to 43 | https://example.com/non-navigation. 44 | 45 | When productNavigation is requested, nextPage and subCategories are filled 46 | accordingly. productNavigation.items always has 2 product URLs, which are 47 | the result of appending ``/product/`` to the request URL. 48 | https://example.com/non-navigation is not reachable through 49 | productNavigation. 50 | 51 | When product or productList is requested, an item with the current URL is 52 | always returned. 53 | 54 | All output also includes unsupported links (mailto:…). 55 | 56 | - Job-posting websites with the following endpoints: 57 | 58 | - https://jobs.example (jobPostingNavigation pointing to the 2 items 59 | below). 60 | 61 | - https://jobs.offsite.example/jobs/1 (jobPosting) 62 | 63 | - https://jobs.offsite.example/jobs/2 (jobPosting) 64 | """ 65 | 66 | def getChild(self, path, request): 67 | return self 68 | 69 | def render_POST(self, request): 70 | request_data = json.loads(request.content.read()) 71 | request.responseHeaders.setRawHeaders( 72 | b"Content-Type", 73 | [b"application/json"], 74 | ) 75 | request.responseHeaders.setRawHeaders( 76 | b"request-id", 77 | [b"abcd1234"], 78 | ) 79 | 80 | response_data: _API_RESPONSE = {} 81 | 82 | response_data["url"] = request_data["url"] 83 | 84 | if request_data["url"] == "https://jobs.example": 85 | assert request_data["jobPostingNavigation"] is True 86 | response_data["jobPostingNavigation"] = { 87 | "url": request_data["url"], 88 | "items": [ 89 | {"url": "https://jobs.offsite.example/jobs/1"}, 90 | {"url": "https://jobs.offsite.example/jobs/2"}, 91 | ], 92 | } 93 | return json.dumps(response_data).encode() 94 | 95 | if request_data["url"].startswith("https://jobs.offsite.example/"): 96 | assert request_data["jobPosting"] is True 97 | response_data["jobPosting"] = { 98 | "url": request_data["url"], 99 | } 100 | return json.dumps(response_data).encode() 101 | 102 | non_navigation_url = "https://example.com/non-navigation" 103 | html = f"""""" 104 | if request_data.get("browserHtml", False) is True: 105 | response_data["browserHtml"] = html 106 | 107 | if request_data.get("product", False) is True: 108 | response_data["product"] = { 109 | "url": request_data["url"], 110 | } 111 | 112 | if request_data.get("productList", False) is True: 113 | response_data["productList"] = { 114 | "url": request_data["url"], 115 | } 116 | 117 | if request_data.get("productNavigation", False) is True: 118 | kwargs: Dict[str, Any] = {} 119 | if ( 120 | "/page/" not in request_data["url"] 121 | and "/non-navigation" not in request_data["url"] 122 | ): 123 | kwargs["nextPage"] = { 124 | "url": f"{request_data['url'].rstrip('/')}/page/2" 125 | } 126 | if "/category/" not in request_data["url"]: 127 | kwargs["subCategories"] = [ 128 | {"url": "mailto:jane@example.com"}, 129 | {"url": f"{request_data['url'].rstrip('/')}/category/1"}, 130 | ] 131 | else: 132 | kwargs["nextPage"] = {"url": "mailto:jane@example.com"} 133 | response_data["productNavigation"] = { 134 | "url": request_data["url"], 135 | "items": [ 136 | {"url": "mailto:jane@example.com"}, 137 | 
{"url": f"{request_data['url'].rstrip('/')}/product/1"}, 138 | {"url": f"{request_data['url'].rstrip('/')}/product/2"}, 139 | ], 140 | **kwargs, 141 | } 142 | 143 | return json.dumps(response_data).encode() 144 | 145 | 146 | class MockServer: 147 | def __init__(self, resource=None, port=None): 148 | resource = resource or DefaultResource 149 | self.resource = "{}.{}".format(resource.__module__, resource.__name__) 150 | self.proc = None 151 | self.host = socket.gethostbyname(socket.gethostname()) 152 | self.port = port or get_ephemeral_port() 153 | self.root_url = "http://%s:%d" % (self.host, self.port) 154 | 155 | def __enter__(self): 156 | self.proc = Popen( 157 | [ 158 | sys.executable, 159 | "-u", 160 | "-m", 161 | "tests.mockserver", 162 | self.resource, 163 | "--port", 164 | str(self.port), 165 | ], 166 | stdout=PIPE, 167 | ) 168 | assert self.proc.stdout is not None 169 | self.proc.stdout.readline() 170 | return self 171 | 172 | def __exit__(self, exc_type, exc_value, traceback): 173 | assert self.proc is not None 174 | self.proc.kill() 175 | self.proc.wait() 176 | time.sleep(0.2) 177 | 178 | def urljoin(self, path): 179 | return self.root_url + path 180 | 181 | 182 | def main(): 183 | parser = argparse.ArgumentParser() 184 | parser.add_argument("resource") 185 | parser.add_argument("--port", type=int) 186 | args = parser.parse_args() 187 | module_name, name = args.resource.rsplit(".", 1) 188 | sys.path.append(".") 189 | resource = getattr(import_module(module_name), name)() 190 | # Typing issue: https://github.com/twisted/twisted/issues/9909 191 | http_port = reactor.listenTCP(args.port, Site(resource)) # type: ignore[attr-defined] 192 | 193 | def print_listening(): 194 | host = http_port.getHost() 195 | print( 196 | "Mock server {} running at http://{}:{}".format( 197 | resource, host.host, host.port 198 | ) 199 | ) 200 | 201 | # Typing issue: https://github.com/twisted/twisted/issues/9909 202 | reactor.callWhenRunning(print_listening) # type: ignore[attr-defined] 203 | reactor.run() # type: ignore[attr-defined] 204 | 205 | 206 | if __name__ == "__main__": 207 | main() 208 | -------------------------------------------------------------------------------- /tests/pages/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zytedata/zyte-spider-templates/d87e3e4c23b83fba5860ae3428e6ff4a49c3f536/tests/pages/__init__.py -------------------------------------------------------------------------------- /tests/pages/test_article_navigation_heuristics.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | from pytest_twisted import ensureDeferred 4 | from web_poet import ( 5 | AnyResponse, 6 | HttpResponse, 7 | HttpResponseHeaders, 8 | PageParams, 9 | RequestUrl, 10 | Stats, 11 | ) 12 | from zyte_common_items import ProbabilityMetadata, ProbabilityRequest 13 | 14 | from zyte_spider_templates.pages.article_heuristics import ( 15 | HeuristicsArticleNavigationPage, 16 | ) 17 | 18 | 19 | @ensureDeferred 20 | async def test_article_page(): 21 | body = b""" 22 | 23 | 24 |
25 |

Categories

26 |
27 | UX 28 | CSS 29 |
30 |

31 |
32 |

Articles

33 |
34 | Modern CSS 35 | How to run UX 36 |
37 | 38 | Next Page 39 | 40 |

41 | 46 | 47 | 48 | """ 49 | response = AnyResponse(HttpResponse("https://example.com", body)) 50 | 51 | rss_content = b""" 52 | 53 | 54 | Sample RSS Feed 55 | http://example.com/feed/rss.xml 56 | This is a sample RSS feed 57 | 58 | Item 1 59 | http://example.com/item1 60 | Description of Item 1 61 | 62 | 63 | Item 2 64 | http://example.com/item2 65 | Description of Item 2 66 | 67 | 68 | 69 | """ 70 | rss_response = AnyResponse( 71 | HttpResponse( 72 | "https://example.com/feed/rss.xml", 73 | rss_content, 74 | headers=HttpResponseHeaders({"Content-Type": "text/xml"}), 75 | ) 76 | ) 77 | 78 | urls_subcategories = [ 79 | {"url": "https://example.com/category/UX", "name": "UX"}, 80 | {"url": "https://example.com/category/CSS", "name": "CSS"}, 81 | {"url": "https://example.com/2024/05/modern-css", "name": "Modern CSS"}, 82 | {"url": "https://example.com/2024/04/how-run-ux", "name": "How to run UX"}, 83 | {"url": "https://example.com/page-2", "name": "Next Page"}, 84 | {"url": "https://another-example.com", "name": "Link to other domain"}, 85 | ] 86 | requests_subcategories = [ 87 | ProbabilityRequest( 88 | url=subcat["url"], 89 | name=f"[heuristics][articleNavigation][subCategories] {subcat['name']}", 90 | headers=None, 91 | metadata=ProbabilityMetadata(probability=0.5), 92 | ) 93 | for subcat in urls_subcategories 94 | ] 95 | 96 | urls_feed = [ 97 | {"url": "https://example.com/feed/rss.xml"}, 98 | ] 99 | requests_feed = [ 100 | ProbabilityRequest( 101 | url=feed["url"], 102 | name="[heuristics][articleNavigation][feed] ", 103 | headers=None, 104 | metadata=ProbabilityMetadata(probability=1.0), 105 | ) 106 | for feed in urls_feed 107 | ] 108 | 109 | feed_items = ["http://example.com/item1", "http://example.com/item2"] 110 | 111 | urls_items = [ 112 | {"url": "https://example.com/category/UX", "name": "UX"}, 113 | {"url": "https://example.com/category/CSS", "name": "CSS"}, 114 | {"url": "https://example.com/2024/05/modern-css", "name": "Modern CSS"}, 115 | {"url": "https://example.com/2024/04/how-run-ux", "name": "How to run UX"}, 116 | {"url": "https://example.com/page-2", "name": "Next Page"}, 117 | {"url": "https://another-example.com", "name": "Link to other domain"}, 118 | ] 119 | requests_items = [ 120 | ProbabilityRequest( 121 | url=item["url"], 122 | name=f"[heuristics][articleNavigation][article] {item['name']}", 123 | headers=None, 124 | metadata=ProbabilityMetadata(probability=0.5), 125 | ) 126 | for item in urls_items 127 | ] 128 | 129 | request_url = RequestUrl(response.url) 130 | rss_url = RequestUrl(rss_response.url) 131 | 132 | # final_navigation_page = True 133 | page_params = PageParams({"skip_subcategories": True}) 134 | page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) 135 | item = await page.to_item() 136 | 137 | assert page.skip_subcategories() 138 | assert item.subCategories[0].url == "https://example.com/feed/rss.xml" 139 | assert [item.url for item in item.items] == [item["url"] for item in urls_items] 140 | 141 | # final_navigation_page = False 142 | page_params = PageParams({"skip_subcategories": False}) 143 | page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) 144 | item = await page.to_item() 145 | 146 | assert not page.skip_subcategories() 147 | assert item.subCategories == requests_feed + requests_subcategories 148 | assert item.items == requests_items 149 | 150 | # no final_navigation_page (False by default) 151 | page_params = PageParams() 152 | page = HeuristicsArticleNavigationPage(request_url, 
response, Stats(), page_params) 153 | item = await page.to_item() 154 | 155 | assert not page.skip_subcategories() 156 | assert item.subCategories == requests_feed + requests_subcategories 157 | assert item.items == requests_items 158 | 159 | # only_feeds = True, request to page 160 | page_params = PageParams({"only_feeds": True}) 161 | page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) 162 | item = await page.to_item() 163 | 164 | assert page.is_only_feeds() 165 | assert item.subCategories[0].url == str(rss_url) 166 | assert [item.url for item in item.items] == [] 167 | 168 | # only_feeds = True, request to feed 169 | page = HeuristicsArticleNavigationPage(rss_url, rss_response, Stats(), page_params) 170 | with patch.object( 171 | HeuristicsArticleNavigationPage, "_is_response_feed", return_value=True 172 | ): 173 | item = await page.to_item() 174 | assert page.is_only_feeds() 175 | assert item.subCategories == [] 176 | assert [item.url for item in item.items] == feed_items 177 | 178 | # only_feeds = False, request to page 179 | page_params = PageParams({"only_feeds": False}) 180 | page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) 181 | item = await page.to_item() 182 | 183 | assert not page.is_only_feeds() 184 | assert item.subCategories == requests_feed + requests_subcategories 185 | assert item.items == requests_items 186 | 187 | # only_feeds = False, request to feed 188 | page = HeuristicsArticleNavigationPage(rss_url, rss_response, Stats(), page_params) 189 | with patch.object( 190 | HeuristicsArticleNavigationPage, "_is_response_feed", return_value=True 191 | ): 192 | item = await page.to_item() 193 | assert not page.is_only_feeds() 194 | assert item.subCategories == [] 195 | assert [item.url for item in item.items] == feed_items 196 | 197 | # no only_feeds (False by default) 198 | page_params = PageParams() 199 | page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) 200 | item = await page.to_item() 201 | 202 | assert not page.is_only_feeds() 203 | assert item.subCategories == requests_feed + requests_subcategories 204 | assert item.items == requests_items 205 | 206 | # no only_feeds (False by default), request to feed 207 | page = HeuristicsArticleNavigationPage(rss_url, rss_response, Stats(), page_params) 208 | with patch.object( 209 | HeuristicsArticleNavigationPage, "_is_response_feed", return_value=True 210 | ): 211 | item = await page.to_item() 212 | assert not page.is_only_feeds() 213 | assert item.subCategories == [] 214 | assert [item.url for item in item.items] == feed_items 215 | -------------------------------------------------------------------------------- /tests/pages/test_product_navigation_heuristics.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pytest_twisted import ensureDeferred 3 | from web_poet import AnyResponse, HttpResponse, PageParams, RequestUrl 4 | from zyte_common_items import ProbabilityRequest, ProductNavigation 5 | 6 | from zyte_spider_templates.pages.product_navigation_heuristics import ( 7 | HeuristicsProductNavigationPage, 8 | ) 9 | 10 | 11 | @ensureDeferred 12 | async def test_unknown_product_page(): 13 | body = b""" 14 | 15 | 16 |
17 |

Subcategories

18 |
19 | Sentinels 20 | Duelists 21 |
22 |

23 |
24 |

Items

25 |
26 | Reyna 27 | Jett 28 |
29 | 30 | Next Page 31 | 32 |

33 | category?? 34 | 38 | 39 | 40 | """ 41 | response = AnyResponse(HttpResponse("https://example.com", body)) 42 | navigation = ProductNavigation.from_dict( 43 | { 44 | "url": "https://example.com", 45 | "subCategories": [ 46 | {"url": "https://example.com/categ/sentinels", "name": "Sentinels"}, 47 | {"url": "https://example.com/categ/duelists", "name": "Duelists"}, 48 | ], 49 | "items": [ 50 | {"url": "https://example.com/p?id=reyna", "name": "Reyna"}, 51 | {"url": "https://example.com/p?id=jett", "name": "Jett"}, 52 | ], 53 | "nextPage": { 54 | "url": "https://example.com/page-2", 55 | "name": "Next Page", 56 | }, 57 | "metadata": {"dateDownloaded": "2024-01-09T14:37:58Z"}, 58 | } 59 | ) 60 | all_valid_urls = [ 61 | "https://example.com/categ/sentinels", 62 | "https://example.com/categ/duelists", 63 | "https://example.com/p?id=reyna", 64 | "https://example.com/p?id=jett", 65 | "https://example.com/page-2", 66 | ] 67 | urls_subcategories = [ 68 | ProbabilityRequest.from_dict( 69 | {"url": "https://example.com/categ/sentinels", "name": "Sentinels"} 70 | ), 71 | ProbabilityRequest.from_dict( 72 | {"url": "https://example.com/categ/duelists", "name": "Duelists"} 73 | ), 74 | ] 75 | 76 | # Heuristics turned OFF 77 | request_url = RequestUrl(response.url) 78 | page_params = PageParams({"allow_domains": "example.com"}) 79 | page = HeuristicsProductNavigationPage( 80 | request_url, navigation, response, page_params 81 | ) 82 | item = await page.to_item() 83 | 84 | assert item.subCategories == urls_subcategories 85 | assert page._urls_for_category() == all_valid_urls 86 | 87 | # Heuristics turned ON 88 | page_params = PageParams({"full_domain": "example.com"}) 89 | page = HeuristicsProductNavigationPage( 90 | request_url, navigation, response, page_params 91 | ) 92 | item = await page.to_item() 93 | 94 | assert item.subCategories == urls_subcategories + [ 95 | ProbabilityRequest.from_dict( 96 | { 97 | "url": "https://example.com/categ/probably", 98 | "name": "[heuristics] category??", 99 | "metadata": {"probability": 0.1}, 100 | } 101 | ) 102 | ] 103 | assert page._urls_for_category() == all_valid_urls 104 | 105 | 106 | @ensureDeferred 107 | async def test_crawl_nofollow_links(): 108 | page_params = PageParams({"full_domain": "example.com"}) 109 | body = b""" 110 | 111 | 112 |
113 | Outside link 114 | Can follow 115 | Dont follow 116 |
117 | 118 | 119 | """ 120 | url = "https://example.com" 121 | response = AnyResponse(HttpResponse(url, body)) 122 | request_url = RequestUrl(response.url) 123 | navigation = ProductNavigation(url=url) 124 | 125 | page = HeuristicsProductNavigationPage( 126 | request_url, navigation, response, page_params 127 | ) 128 | assert [req.url for req in page.subCategories] == ["https://example.com/can-follow"] 129 | 130 | 131 | @pytest.mark.deprication_warning 132 | def test_deprecated_page_objects(): 133 | with pytest.warns(DeprecationWarning, match="page_objects"): 134 | from zyte_spider_templates.page_objects import ( # noqa: F401 135 | HeuristicsProductNavigationPage, 136 | ) 137 | 138 | # We cannot test the warning again because duplicate warnings are ignored, 139 | # but we still want to ensure that we can import the class. 140 | from zyte_spider_templates.page_objects.product_navigation_heuristics import ( # noqa: F401, F811 141 | HeuristicsProductNavigationPage, 142 | ) 143 | -------------------------------------------------------------------------------- /tests/test_addon.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import scrapy 3 | from duplicate_url_discarder_rules import RULE_PATHS 4 | from packaging import version 5 | from scrapy.utils.test import get_crawler 6 | 7 | from zyte_spider_templates import ( 8 | AllowOffsiteMiddleware, 9 | CrawlingLogsMiddleware, 10 | IncrementalCrawlMiddleware, 11 | MaxRequestsPerSeedDownloaderMiddleware, 12 | OffsiteRequestsPerSeedMiddleware, 13 | OnlyFeedsMiddleware, 14 | TrackNavigationDepthSpiderMiddleware, 15 | TrackSeedsSpiderMiddleware, 16 | ) 17 | 18 | _crawler = get_crawler() 19 | BASELINE_SETTINGS = _crawler.settings.copy_to_dict() 20 | 21 | try: 22 | from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware # noqa: F401 23 | except ImportError: 24 | BUILTIN_OFFSITE_MIDDLEWARE_IMPORT_PATH = ( 25 | "scrapy.spidermiddlewares.offsite.OffsiteMiddleware" 26 | ) 27 | else: 28 | BUILTIN_OFFSITE_MIDDLEWARE_IMPORT_PATH = ( 29 | "scrapy.downloadermiddlewares.offsite.OffsiteMiddleware" 30 | ) 31 | 32 | 33 | # https://github.com/scrapy-plugins/scrapy-zyte-api/blob/a1d81d11854b420248f38e7db49c685a8d46d943/tests/test_addon.py#L109 34 | def _test_setting_changes(initial_settings, expected_settings): 35 | settings = { 36 | **initial_settings, 37 | "ADDONS": { 38 | "zyte_spider_templates.Addon": 1000, 39 | }, 40 | } 41 | crawler = get_crawler(settings_dict=settings) 42 | crawler._apply_settings() 43 | actual_settings = crawler.settings.copy_to_dict() 44 | 45 | # Test separately settings that copy_to_dict messes up. 
46 | for setting in ( 47 | "DOWNLOADER_MIDDLEWARES", 48 | "SCRAPY_POET_PROVIDERS", 49 | "SPIDER_MIDDLEWARES", 50 | ): 51 | if setting not in crawler.settings: 52 | assert setting not in expected_settings 53 | continue 54 | assert crawler.settings.getdict(setting) == expected_settings.pop(setting) 55 | del actual_settings[setting] 56 | 57 | for key in BASELINE_SETTINGS: 58 | if key in actual_settings and actual_settings[key] == BASELINE_SETTINGS[key]: 59 | del actual_settings[key] 60 | del actual_settings["ADDONS"] 61 | 62 | assert actual_settings == expected_settings 63 | 64 | 65 | @pytest.mark.parametrize( 66 | ("initial_settings", "expected_settings"), 67 | ( 68 | ( 69 | {}, 70 | { 71 | "CLOSESPIDER_TIMEOUT_NO_ITEM": 600, 72 | "DOWNLOADER_MIDDLEWARES": { 73 | MaxRequestsPerSeedDownloaderMiddleware: 100, 74 | BUILTIN_OFFSITE_MIDDLEWARE_IMPORT_PATH: None, 75 | AllowOffsiteMiddleware: 50, 76 | }, 77 | "SCHEDULER_DISK_QUEUE": "scrapy.squeues.PickleFifoDiskQueue", 78 | "SCHEDULER_MEMORY_QUEUE": "scrapy.squeues.FifoMemoryQueue", 79 | "SCHEDULER_PRIORITY_QUEUE": "scrapy.pqueues.DownloaderAwarePriorityQueue", 80 | "ITEM_PROBABILITY_THRESHOLDS": { 81 | "zyte_common_items.items.Article": 0.1, 82 | "zyte_common_items.items.Product": 0.1, 83 | }, 84 | "DUD_LOAD_RULE_PATHS": RULE_PATHS, 85 | "SCRAPY_POET_DISCOVER": [ 86 | "zyte_spider_templates.pages", 87 | ], 88 | "SPIDER_MIDDLEWARES": { 89 | IncrementalCrawlMiddleware: 45, 90 | OffsiteRequestsPerSeedMiddleware: 49, 91 | OnlyFeedsMiddleware: 108, 92 | TrackNavigationDepthSpiderMiddleware: 110, 93 | TrackSeedsSpiderMiddleware: 550, 94 | CrawlingLogsMiddleware: 1000, 95 | }, 96 | "SPIDER_MODULES": [ 97 | "zyte_spider_templates.spiders", 98 | ], 99 | }, 100 | ), 101 | ), 102 | ) 103 | @pytest.mark.skipif( 104 | version.parse(scrapy.__version__) < version.parse("2.11.2"), 105 | reason="Test applicable only for Scrapy versions >= 2.11.2", 106 | ) 107 | def test_poet_setting_changes_since_scrapy_2_11_2(initial_settings, expected_settings): 108 | _test_setting_changes(initial_settings, expected_settings) 109 | 110 | 111 | @pytest.mark.parametrize( 112 | ("initial_settings", "expected_settings"), 113 | ( 114 | ( 115 | {}, 116 | { 117 | "CLOSESPIDER_TIMEOUT_NO_ITEM": 600, 118 | "DOWNLOADER_MIDDLEWARES": {MaxRequestsPerSeedDownloaderMiddleware: 100}, 119 | "SCHEDULER_DISK_QUEUE": "scrapy.squeues.PickleFifoDiskQueue", 120 | "SCHEDULER_MEMORY_QUEUE": "scrapy.squeues.FifoMemoryQueue", 121 | "SCHEDULER_PRIORITY_QUEUE": "scrapy.pqueues.DownloaderAwarePriorityQueue", 122 | "ITEM_PROBABILITY_THRESHOLDS": { 123 | "zyte_common_items.items.Article": 0.1, 124 | "zyte_common_items.items.Product": 0.1, 125 | }, 126 | "DUD_LOAD_RULE_PATHS": RULE_PATHS, 127 | "SCRAPY_POET_DISCOVER": [ 128 | "zyte_spider_templates.pages", 129 | ], 130 | "SPIDER_MIDDLEWARES": { 131 | IncrementalCrawlMiddleware: 45, 132 | OffsiteRequestsPerSeedMiddleware: 49, 133 | OnlyFeedsMiddleware: 108, 134 | TrackNavigationDepthSpiderMiddleware: 110, 135 | BUILTIN_OFFSITE_MIDDLEWARE_IMPORT_PATH: None, 136 | AllowOffsiteMiddleware: 500, 137 | TrackSeedsSpiderMiddleware: 550, 138 | CrawlingLogsMiddleware: 1000, 139 | }, 140 | "SPIDER_MODULES": [ 141 | "zyte_spider_templates.spiders", 142 | ], 143 | }, 144 | ), 145 | ), 146 | ) 147 | @pytest.mark.skipif( 148 | version.parse(scrapy.__version__) >= version.parse("2.11.2"), 149 | reason="Test applicable only for Scrapy versions < 2.11.2", 150 | ) 151 | def test_poet_setting_changes(initial_settings, expected_settings): 152 | 
_test_setting_changes(initial_settings, expected_settings) 153 | -------------------------------------------------------------------------------- /tests/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from zyte_spider_templates import BaseSpiderParams 4 | 5 | 6 | def test_deprecation(): 7 | with pytest.deprecated_call(match="^BaseSpiderParams is deprecated.*"): 8 | BaseSpiderParams(url="https://example.com") # type: ignore[call-arg] 9 | -------------------------------------------------------------------------------- /tests/test_feeds.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | import pytest 4 | from web_poet import ( 5 | AnyResponse, 6 | BrowserHtml, 7 | BrowserResponse, 8 | HttpResponse, 9 | HttpResponseBody, 10 | ResponseUrl, 11 | ) 12 | 13 | from zyte_spider_templates.feeds import get_feed_urls, parse_feed, unique_urls 14 | 15 | 16 | @pytest.fixture 17 | def sample_urls() -> List[str]: 18 | return [ 19 | "http://example.com", 20 | "http://example.com/", 21 | "https://example.com", 22 | "https://example.com/", 23 | "http://example.com/page", 24 | "http://example.com/page/", 25 | ] 26 | 27 | 28 | def test_unique_urls(sample_urls): 29 | unique_list = unique_urls(sample_urls) 30 | assert len(unique_list) == 4 31 | 32 | 33 | def test_unique_urls_order(sample_urls): 34 | unique_list = unique_urls(sample_urls) 35 | expected_order = [ 36 | "http://example.com", 37 | "https://example.com", 38 | "http://example.com/page", 39 | "http://example.com/page/", 40 | ] 41 | assert unique_list == expected_order 42 | 43 | 44 | @pytest.fixture 45 | def sample_response_feed() -> Union[AnyResponse, HttpResponse, BrowserResponse]: 46 | html_content = """ 47 | 48 | 49 | 50 | 51 | 52 | 53 | RSS Feed 54 | Atom Feed 55 | 56 | 57 | """ 58 | return HttpResponse( 59 | url=ResponseUrl("http://example.com"), 60 | body=HttpResponseBody(html_content.encode(encoding="utf-8")), 61 | ) 62 | 63 | 64 | def test_get_feed_urls(sample_response_feed): 65 | feed_urls = get_feed_urls(sample_response_feed) 66 | assert len(feed_urls) == 3 67 | assert "http://example.com/rss.xml" in feed_urls 68 | assert "http://example.com/atom.xml" in feed_urls 69 | assert "http://example.com/feed/rss.xml" in feed_urls 70 | 71 | 72 | @pytest.fixture 73 | def sample_response_feeds() -> Union[AnyResponse, HttpResponse, BrowserResponse]: 74 | rss_content = """ 75 | 76 | 77 | Sample RSS Feed 78 | http://example.com/feed/rss.xml 79 | This is a sample RSS feed 80 | 81 | Item 1 82 | http://example.com/item1 83 | Description of Item 1 84 | 85 | 86 | Item 2 87 | http://example.com/item2 88 | Description of Item 2 89 | 90 | 91 | Item 3 92 | http://example.com/item2 93 | Description of Item 3 94 | 95 | 96 | 97 | """ 98 | return HttpResponse( 99 | url=ResponseUrl("http://example.com/feed/rss.xml"), 100 | body=HttpResponseBody(rss_content.encode(encoding="utf-8")), 101 | ) 102 | 103 | 104 | @pytest.mark.parametrize("is_browser_response", [False, True]) 105 | def test_parse_feed(sample_response_feeds, is_browser_response): 106 | if is_browser_response: 107 | sample_response_feeds = BrowserResponse( 108 | url=ResponseUrl("http://example.com"), 109 | html=BrowserHtml(str(sample_response_feeds.text)), 110 | ) 111 | feed_urls = parse_feed(sample_response_feeds) 112 | expected_urls = ["http://example.com/item1", "http://example.com/item2"] 113 | assert feed_urls == expected_urls 114 | 
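The tests above pin down the call signatures of the feed helpers: ``get_feed_urls()`` takes a response and returns the feed URLs it advertises, ``parse_feed()`` returns the unique article URLs listed in a feed response, and ``unique_urls()`` de-duplicates a URL list while preserving order (treating ``http://example.com`` and ``http://example.com/`` as the same URL). A minimal sketch of how they might be combined outside the test suite follows; the markup, feed content and URLs below are illustrative assumptions, not part of the library:

.. code-block:: python

    from web_poet import HttpResponse, HttpResponseBody, ResponseUrl

    from zyte_spider_templates.feeds import get_feed_urls, parse_feed, unique_urls

    # Hypothetical landing page advertising an RSS feed.
    page = HttpResponse(
        url=ResponseUrl("http://example.com"),
        body=HttpResponseBody(
            b'<html><head><link rel="alternate" type="application/rss+xml"'
            b' href="http://example.com/rss.xml"></head><body></body></html>'
        ),
    )
    feed_urls = get_feed_urls(page)  # feed URLs advertised by the page

    # Hypothetical feed response, e.g. fetched in a follow-up request.
    feed = HttpResponse(
        url=ResponseUrl("http://example.com/rss.xml"),
        body=HttpResponseBody(
            b"<?xml version='1.0'?><rss version='2.0'><channel>"
            b"<item><link>http://example.com/item1</link></item>"
            b"<item><link>http://example.com/item2</link></item>"
            b"<item><link>http://example.com/item2</link></item>"
            b"</channel></rss>"
        ),
    )
    article_urls = parse_feed(feed)  # unique article URLs listed in the feed

    # Order-preserving de-duplication of any URL list.
    seeds = unique_urls(["http://example.com", "http://example.com/", *article_urls])
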
-------------------------------------------------------------------------------- /tests/test_params.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | from pydantic import ValidationError 5 | 6 | from zyte_spider_templates import EcommerceSpider, GoogleSearchSpider 7 | from zyte_spider_templates.params import URL_FIELD_KWARGS 8 | from zyte_spider_templates.spiders.ecommerce import EcommerceCrawlStrategy 9 | 10 | from . import get_crawler 11 | 12 | 13 | @pytest.mark.parametrize( 14 | "valid,url", 15 | [ 16 | (False, ""), 17 | (False, "http://"), 18 | (False, "http:/example.com"), 19 | (False, "ftp://example.com"), 20 | (False, "example.com"), 21 | (False, "//example.com"), 22 | (False, "http://foo:bar@example.com"), 23 | (False, " http://example.com"), 24 | (False, "http://example.com "), 25 | (False, "http://examp le.com"), 26 | (False, "https://example.com:232323"), 27 | (True, "http://example.com"), 28 | (True, "http://bücher.example"), 29 | (True, "http://xn--bcher-kva.example"), 30 | (True, "https://i❤.ws"), 31 | (True, "https://example.com"), 32 | (True, "https://example.com/"), 33 | (True, "https://example.com:2323"), 34 | (True, "https://example.com:2323/"), 35 | (True, "https://example.com:2323/foo"), 36 | (True, "https://example.com/f"), 37 | (True, "https://example.com/foo"), 38 | (True, "https://example.com/foo/"), 39 | (True, "https://example.com/foo/bar"), 40 | (True, "https://example.com/foo/bar/"), 41 | (True, "https://example.com/foo/bar?baz"), 42 | (True, "https://example.com/foo/bar/?baz"), 43 | (True, "https://example.com?foo"), 44 | (True, "https://example.com?foo=bar"), 45 | (True, "https://example.com/?foo=bar&baz"), 46 | (True, "https://example.com/?foo=bar&baz#"), 47 | (True, "https://example.com/?foo=bar&baz#frag"), 48 | (True, "https://example.com#"), 49 | (True, "https://example.com/#"), 50 | (True, "https://example.com/&"), 51 | (True, "https://example.com/&#"), 52 | ], 53 | ) 54 | def test_url_pattern(url, valid): 55 | assert isinstance(URL_FIELD_KWARGS["pattern"], str) 56 | assert bool(re.match(URL_FIELD_KWARGS["pattern"], url)) == valid 57 | 58 | 59 | REQUIRED_ARGS = { 60 | EcommerceSpider: {"url": "https://example.com"}, 61 | GoogleSearchSpider: {"search_queries": "foo"}, 62 | } 63 | 64 | 65 | @pytest.mark.parametrize( 66 | ("spider_cls",), ((spider_cls,) for spider_cls in REQUIRED_ARGS) 67 | ) 68 | def test_required_args(spider_cls): 69 | crawler = get_crawler() 70 | 71 | with pytest.raises(ValidationError): 72 | spider_cls.from_crawler(crawler) 73 | 74 | spider_cls.from_crawler(crawler, **REQUIRED_ARGS[spider_cls]) 75 | 76 | 77 | @pytest.mark.parametrize( 78 | ("spider_cls", "args", "valid"), 79 | ( 80 | ( 81 | EcommerceSpider, 82 | { 83 | "url": "https://example.com", 84 | "crawl_strategy": EcommerceCrawlStrategy.automatic, 85 | }, 86 | True, 87 | ), 88 | ( 89 | EcommerceSpider, 90 | {"url": "https://example.com", "crawl_strategy": "automatic"}, 91 | True, 92 | ), 93 | ( 94 | EcommerceSpider, 95 | {"url": "https://example.com", "crawl_strategy": "unknown"}, 96 | False, 97 | ), 98 | ( 99 | EcommerceSpider, 100 | { 101 | "url": "https://example.com", 102 | "crawl_strategy": "direct_item", 103 | "search_queries": "", 104 | }, 105 | True, 106 | ), 107 | ( 108 | EcommerceSpider, 109 | { 110 | "url": "https://example.com", 111 | "crawl_strategy": "automatic", 112 | "search_queries": "foo", 113 | }, 114 | True, 115 | ), 116 | ( 117 | EcommerceSpider, 118 | { 119 | "url": 
"https://example.com", 120 | "crawl_strategy": "direct_item", 121 | "search_queries": "foo", 122 | }, 123 | False, 124 | ), 125 | ( 126 | EcommerceSpider, 127 | { 128 | "url": "https://example.com", 129 | "extract": "product", 130 | "crawl_strategy": "direct_item", 131 | "search_queries": "foo", 132 | }, 133 | False, 134 | ), 135 | ( 136 | EcommerceSpider, 137 | { 138 | "url": "https://example.com", 139 | "extract": "productList", 140 | "crawl_strategy": "direct_item", 141 | "search_queries": "foo", 142 | }, 143 | True, 144 | ), 145 | (GoogleSearchSpider, {"domain": "google.com"}, False), 146 | ( 147 | GoogleSearchSpider, 148 | {"domain": "google.cat", "search_queries": "foo bar"}, 149 | True, 150 | ), 151 | ( 152 | GoogleSearchSpider, 153 | {"domain": "google.cat", "search_queries": "foo bar", "max_pages": 10}, 154 | True, 155 | ), 156 | ( 157 | GoogleSearchSpider, 158 | {"domain": "google.foo", "search_queries": "foo bar"}, 159 | False, 160 | ), 161 | (GoogleSearchSpider, {"search_queries": "foo bar", "max_pages": "all"}, False), 162 | (GoogleSearchSpider, {"search_queries": "foo", "results_per_page": 0}, False), 163 | ), 164 | ) 165 | def test_arg_combinations(spider_cls, args, valid): 166 | crawler = get_crawler() 167 | if valid: 168 | spider_cls.from_crawler(crawler, **args) 169 | else: 170 | with pytest.raises(ValidationError): 171 | spider_cls.from_crawler(crawler, **args) 172 | 173 | 174 | @pytest.mark.parametrize( 175 | ("spider_cls", "param", "arg", "setting", "old", "getter", "new"), 176 | ( 177 | # extract_from 178 | *( 179 | (EcommerceSpider, *scenario) 180 | for scenario in ( 181 | ( 182 | "extract_from", 183 | "browserHtml", 184 | "ZYTE_API_PROVIDER_PARAMS", 185 | None, 186 | "getdict", 187 | { 188 | "productOptions": {"extractFrom": "browserHtml"}, 189 | "productNavigationOptions": {"extractFrom": "browserHtml"}, 190 | "productListOptions": {"extractFrom": "browserHtml"}, 191 | }, 192 | ), 193 | ( 194 | "extract_from", 195 | "httpResponseBody", 196 | "ZYTE_API_PROVIDER_PARAMS", 197 | {"geolocation": "US"}, 198 | "getdict", 199 | { 200 | "productOptions": {"extractFrom": "httpResponseBody"}, 201 | "productNavigationOptions": {"extractFrom": "httpResponseBody"}, 202 | "productListOptions": {"extractFrom": "httpResponseBody"}, 203 | "geolocation": "US", 204 | }, 205 | ), 206 | ( 207 | "extract_from", 208 | None, 209 | "ZYTE_API_PROVIDER_PARAMS", 210 | {"geolocation": "US"}, 211 | "getdict", 212 | {"geolocation": "US"}, 213 | ), 214 | ) 215 | ), 216 | # geolocation 217 | *( 218 | (spider_cls, *scenario) 219 | for spider_cls in (EcommerceSpider, GoogleSearchSpider) 220 | for scenario in ( 221 | ( 222 | "geolocation", 223 | "DE", 224 | "ZYTE_API_AUTOMAP_PARAMS", 225 | None, 226 | "getdict", 227 | {"geolocation": "DE"}, 228 | ), 229 | ( 230 | "geolocation", 231 | "DE", 232 | "ZYTE_API_AUTOMAP_PARAMS", 233 | '{"browserHtml": true}', 234 | "getdict", 235 | {"browserHtml": True, "geolocation": "DE"}, 236 | ), 237 | ( 238 | "geolocation", 239 | "DE", 240 | "ZYTE_API_AUTOMAP_PARAMS", 241 | '{"geolocation": "IE"}', 242 | "getdict", 243 | {"geolocation": "DE"}, 244 | ), 245 | ( 246 | "geolocation", 247 | "DE", 248 | "ZYTE_API_PROVIDER_PARAMS", 249 | None, 250 | "getdict", 251 | {"geolocation": "DE"}, 252 | ), 253 | ( 254 | "geolocation", 255 | "DE", 256 | "ZYTE_API_PROVIDER_PARAMS", 257 | '{"browserHtml": true}', 258 | "getdict", 259 | {"browserHtml": True, "geolocation": "DE"}, 260 | ), 261 | ( 262 | "geolocation", 263 | "DE", 264 | "ZYTE_API_PROVIDER_PARAMS", 265 | '{"geolocation": 
"IE"}', 266 | "getdict", 267 | {"geolocation": "DE"}, 268 | ), 269 | ) 270 | ), 271 | # max_requests 272 | *( 273 | ( 274 | spider_cls, 275 | "max_requests", 276 | "123", 277 | "ZYTE_API_MAX_REQUESTS", 278 | None, 279 | "getint", 280 | 123, 281 | ) 282 | for spider_cls in (EcommerceSpider, GoogleSearchSpider) 283 | ), 284 | ), 285 | ) 286 | def test_setting_setter_params(spider_cls, param, arg, setting, old, getter, new): 287 | settings = {} 288 | if old is not None: 289 | settings[setting] = old 290 | crawler = get_crawler(settings=settings) 291 | spider_cls.from_crawler(crawler, **REQUIRED_ARGS[spider_cls], **{param: arg}) 292 | read = getattr(crawler.settings, getter) 293 | assert read(setting) == new 294 | -------------------------------------------------------------------------------- /tests/test_params_location_param.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pydantic import ValidationError 3 | 4 | from zyte_spider_templates.params import LocationParam 5 | 6 | 7 | def test_valid_location_param(): 8 | valid_address_dict = { 9 | "streetAddress": "123 Main St", 10 | "addressCountry": "US", 11 | "addressRegion": "CA", 12 | "postalCode": "12345", 13 | } 14 | location_param = LocationParam(location=valid_address_dict) # type: ignore[arg-type] 15 | assert location_param.location is not None 16 | assert location_param.location.streetAddress == "123 Main St" 17 | assert location_param.location.addressCountry == "US" 18 | assert location_param.location.addressRegion == "CA" 19 | assert location_param.location.postalCode == "12345" 20 | 21 | 22 | def test_valid_location_param_from_json(): 23 | valid_address_json = '{"streetAddress": "456 Elm St", "addressCountry": "US", "addressRegion": "NY", "postalCode": "54321"}' 24 | location_param = LocationParam(location=valid_address_json) # type: ignore[arg-type] 25 | assert location_param.location is not None 26 | assert location_param.location.streetAddress == "456 Elm St" 27 | assert location_param.location.addressCountry == "US" 28 | assert location_param.location.addressRegion == "NY" 29 | assert location_param.location.postalCode == "54321" 30 | 31 | 32 | def test_none_location_param(): 33 | location_param = LocationParam(location=None) 34 | assert location_param.location is None 35 | 36 | 37 | def test_invalid_json_location_param(): 38 | invalid_address_json = '{"streetAddress": "789 Pine St", "addressCountry": "AnotheraddressCountry", "addressRegion": "FL", "postalCode": "67890"' 39 | with pytest.raises(ValueError, match=r".* is not a valid JSON object"): 40 | LocationParam(location=invalid_address_json) # type: ignore[arg-type] 41 | 42 | 43 | def test_invalid_type_location_param(): 44 | invalid_type_value = 12345 # Invalid type, should raise ValueError 45 | with pytest.raises(ValueError, match=r".* type .* is not a supported type"): 46 | LocationParam(location=invalid_type_value) # type: ignore[arg-type] 47 | 48 | 49 | def test_invalid_validation_location_param(): 50 | invalid_address_json = '{"nonExpectedInputField": "67890"}' 51 | with pytest.raises(ValidationError, match=r"Extra inputs are not permitted .*"): 52 | LocationParam(location=invalid_address_json) # type: ignore[arg-type] 53 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from unittest.mock import patch 4 | 5 | import pytest 6 | from scrapy 
import Request, Spider 7 | 8 | from tests import get_crawler 9 | from zyte_spider_templates.utils import ( 10 | get_domain, 11 | get_domain_fingerprint, 12 | get_project_id, 13 | get_request_fingerprint, 14 | get_spider_name, 15 | load_url_list, 16 | ) 17 | 18 | URL_TO_DOMAIN = ( 19 | ("https://example.com", "example.com"), 20 | ("https://www.example.com", "example.com"), 21 | ("https://www2.example.com", "example.com"), 22 | ("https://prefixwww.example.com", "prefixwww.example.com"), 23 | ("https://wwworld.example.com", "wwworld.example.com"), 24 | ("https://my.wwworld-example.com", "my.wwworld-example.com"), 25 | ("https://wwwow.com", "wwwow.com"), 26 | ("https://wowww.com", "wowww.com"), 27 | ("https://awww.com", "awww.com"), 28 | ) 29 | 30 | 31 | @pytest.mark.parametrize("url,domain", URL_TO_DOMAIN) 32 | def test_get_domain(url, domain): 33 | assert get_domain(url) == domain 34 | 35 | 36 | @pytest.mark.parametrize( 37 | "input_urls,expected", 38 | ( 39 | ( 40 | "https://a.example", 41 | ["https://a.example"], 42 | ), 43 | ( 44 | " https://a.example ", 45 | ["https://a.example"], 46 | ), 47 | ( 48 | "https://a.example\n \nhttps://b.example\nhttps://c.example\n\n", 49 | ["https://a.example", "https://b.example", "https://c.example"], 50 | ), 51 | ( 52 | "ftp://a.example", 53 | ValueError, 54 | ), 55 | ), 56 | ) 57 | def test_load_url_list(input_urls, expected): 58 | if isinstance(expected, list): 59 | assert load_url_list(input_urls) == expected 60 | return 61 | with pytest.raises(expected): 62 | load_url_list(input_urls) 63 | 64 | 65 | @pytest.mark.parametrize( 66 | "url, expected_fingerprint", 67 | [ 68 | # No subdomain 69 | ("https://example.com", "c300"), 70 | # One subdomain 71 | ("https://sub.example.com", "c35d"), 72 | # Multiple subdomains 73 | ("https://sub1.sub2.example.com", "c3c9"), 74 | # No TLD (localhost or internal addresses) 75 | ("http://localhost", "3300"), 76 | # Complex TLD (e.g., .co.uk) and subdomains 77 | ("https://sub.example.co.uk", "c35d"), 78 | ], 79 | ) 80 | def test_get_domain_fingerprint(url, expected_fingerprint): 81 | assert get_domain_fingerprint(url) == expected_fingerprint 82 | 83 | 84 | @pytest.mark.parametrize( 85 | "env_var_value, spider_name, expected_result, expected_log", 86 | [ 87 | ( 88 | "virtual_spider_name", 89 | "regular_spider_name", 90 | "virtual_spider_name", 91 | "Picked virtual spider name virtual_spider_name from the spider's SHUB_VIRTUAL_SPIDER setting.", 92 | ), 93 | ( 94 | None, 95 | "regular_spider_name", 96 | "regular_spider_name", 97 | "Picked spider name regular_spider_name from the spider.", 98 | ), 99 | ], 100 | ) 101 | def test_get_spider_name( 102 | env_var_value, spider_name, expected_result, expected_log, caplog 103 | ): 104 | class TestSpider(Spider): 105 | name = spider_name 106 | 107 | caplog.clear() 108 | crawler = get_crawler() 109 | crawler.spider = TestSpider() 110 | 111 | logger = logging.getLogger("zyte_spider_templates.utils") 112 | logger.setLevel(logging.INFO) 113 | 114 | with patch.dict( 115 | os.environ, 116 | {"SHUB_VIRTUAL_SPIDER": env_var_value} if env_var_value else {}, 117 | clear=True, 118 | ): 119 | result = get_spider_name(crawler) 120 | assert result == expected_result 121 | assert expected_log in caplog.text 122 | 123 | 124 | @pytest.mark.parametrize( 125 | "env_scrapy, env_zyte, settings_zyte, expected_result, expected_log, expect_exception", 126 | [ 127 | # SCRAPY_PROJECT_ID is set 128 | ( 129 | "123456", 130 | None, 131 | None, 132 | "123456", 133 | "Picked project id 123456 from 
SCRAPY_PROJECT_ID env variable.", 134 | False, 135 | ), 136 | # ZYTE_PROJECT_ID is set in the environment 137 | ( 138 | None, 139 | "654321", 140 | None, 141 | "654321", 142 | "Picked project id 654321 from ZYTE_PROJECT_ID env variable.", 143 | False, 144 | ), 145 | # ZYTE_PROJECT_ID is set in the settings 146 | ( 147 | None, 148 | None, 149 | "126534", 150 | "126534", 151 | "Picked project id 126534 from the spider's ZYTE_PROJECT_ID setting.", 152 | False, 153 | ), 154 | # No project ID found, expect an exception 155 | ( 156 | None, 157 | None, 158 | None, 159 | None, # No result expected 160 | None, # No log expected 161 | True, # Expect an exception 162 | ), 163 | ], 164 | ) 165 | def test_get_project_id( 166 | env_scrapy, 167 | env_zyte, 168 | settings_zyte, 169 | expected_result, 170 | expected_log, 171 | expect_exception, 172 | caplog, 173 | ): 174 | caplog.clear() 175 | 176 | env_vars = {} 177 | if env_scrapy: 178 | env_vars["SCRAPY_PROJECT_ID"] = env_scrapy 179 | if env_zyte: 180 | env_vars["ZYTE_PROJECT_ID"] = env_zyte 181 | 182 | with patch.dict(os.environ, env_vars, clear=True): 183 | crawler = get_crawler() 184 | 185 | if settings_zyte: 186 | crawler.settings.set("ZYTE_PROJECT_ID", settings_zyte) 187 | 188 | with caplog.at_level(logging.INFO, logger="zyte_spider_templates.utils"): 189 | if expect_exception: 190 | with pytest.raises( 191 | ValueError, 192 | match="Zyte project id wasn't found in job data, env, or settings.", 193 | ): 194 | get_project_id(crawler) 195 | else: 196 | assert get_project_id(crawler) == expected_result 197 | assert expected_log in caplog.text 198 | 199 | 200 | def test_get_request_fingerprint(): 201 | url = "https://example.com" 202 | domain_fp = "ffeeddccbbaa" 203 | request_fp = "aabbccddeeff" 204 | 205 | with patch( 206 | "zyte_spider_templates.utils.get_domain_fingerprint", return_value=domain_fp 207 | ): 208 | crawler = get_crawler() 209 | with patch.object(crawler, "request_fingerprinter") as mock_fingerprinter: 210 | mock_fingerprinter.fingerprint.return_value = bytes.fromhex(request_fp) 211 | request = Request(url) 212 | result = get_request_fingerprint(crawler, request) 213 | assert result == domain_fp + request_fp 214 | mock_fingerprinter.fingerprint.assert_called_once_with(request) 215 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from typing import Any 5 | 6 | from aiohttp.test_utils import TestServer 7 | from scrapy import Spider, signals 8 | from scrapy.utils.defer import deferred_to_future 9 | 10 | from . import get_crawler 11 | 12 | 13 | def assertEqualSpiderMetadata(actual, expected): 14 | """Compare 2 JSON schemas of spider metadata. 15 | 16 | The parameter order in the parameter schema is taken into account, given 17 | how it affects the UI, while the order of other object keys may be 18 | different. 19 | 20 | It also generates a better diff in pytest output when enums are involved, 21 | e.g. geolocation values. 
22 | """ 23 | assert tuple(actual["param_schema"]["properties"]) == tuple( 24 | expected["param_schema"]["properties"] 25 | ) 26 | actual_json = json.dumps(actual, indent=2, sort_keys=True) 27 | expected_json = json.dumps(expected, indent=2, sort_keys=True) 28 | assert actual_json == expected_json 29 | 30 | 31 | def get_addons() -> dict[str | type, int]: 32 | addons: dict[str | type, int] = { 33 | "scrapy_zyte_api.Addon": 500, 34 | "zyte_spider_templates.Addon": 1000, 35 | } 36 | try: 37 | from scrapy_poet import Addon 38 | except ImportError: 39 | pass 40 | else: 41 | addons[Addon] = 300 42 | return addons 43 | 44 | 45 | def get_zyte_api_settings(zyte_api_server) -> dict[str, Any]: 46 | return { 47 | "ZYTE_API_URL": str(zyte_api_server.make_url("/")), 48 | "ZYTE_API_KEY": "a", 49 | "ADDONS": get_addons(), 50 | } 51 | 52 | 53 | async def crawl_fake_zyte_api( 54 | zyte_api_server: TestServer, 55 | spider_cls: type[Spider], 56 | spider_kwargs: dict[str, Any], 57 | settings: dict[str, Any] | None = None, 58 | ): 59 | settings = {**get_zyte_api_settings(zyte_api_server), **(settings or {})} 60 | crawler = get_crawler(settings=settings, spider_cls=spider_cls) 61 | items = [] 62 | 63 | def track_item(item, response, spider): 64 | items.append(item) 65 | 66 | crawler.signals.connect(track_item, signal=signals.item_scraped) 67 | await deferred_to_future(crawler.crawl(**spider_kwargs)) 68 | return items 69 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = min,py39,py310,py311,py312,py313,mypy,linters,twine 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | pytest-cov 8 | pytest-twisted 9 | freezegun 10 | zyte-test-websites @ git+https://github.com/zytedata/zyte-test-websites@c48564f 11 | fake-zyte-api @ git+https://github.com/zytedata/fake-zyte-api@1352eec 12 | commands = 13 | py.test \ 14 | --cov-report=html:coverage-html \ 15 | --doctest-modules \ 16 | --cov-report=html \ 17 | --cov-report=xml \ 18 | --cov=zyte_spider_templates \ 19 | -vv \ 20 | -m "not deprication_warning" \ 21 | {posargs:zyte_spider_templates tests} 22 | 23 | [testenv:min] 24 | basepython = python3.9 25 | deps = 26 | {[testenv]deps} 27 | extruct==0.18.0 28 | form2request==0.2.0 29 | formasaurus==0.10.0 30 | jmespath==0.9.5 31 | pydantic==2.1 32 | requests==2.31.0 33 | scrapinghub==2.4.0 34 | scrapy==2.11.0 35 | scrapy-poet==0.24.0 36 | scrapy-spider-metadata==0.2.0 37 | scrapy-zyte-api[provider]==0.25.0 38 | web-poet==0.17.1 39 | xtractmime==0.2.1 40 | zyte-common-items==0.26.2 41 | 42 | [testenv:mypy] 43 | deps = 44 | mypy==1.12.0 45 | enum-tools==0.12.0 46 | freezegun==1.5.1 47 | pytest==8.3.3 48 | types-requests==2.32.0.20240914 49 | commands = mypy zyte_spider_templates tests 50 | 51 | [testenv:linters] 52 | deps = -rrequirements-dev.txt 53 | commands = pre-commit run --all-files --show-diff-on-failure 54 | 55 | [testenv:twine] 56 | deps = 57 | twine==6.1.0 58 | build==1.2.2.post1 59 | commands = 60 | python setup.py sdist 61 | twine check dist/* 62 | 63 | [testenv:docs] 64 | changedir = docs 65 | deps = 66 | -rdocs/requirements.txt 67 | commands = 68 | sphinx-build -W -b html . 
{envtmpdir}/html 69 | -------------------------------------------------------------------------------- /utils/google-gl-updater/requirements.in: -------------------------------------------------------------------------------- 1 | jinja2 2 | parsel 3 | requests 4 | -------------------------------------------------------------------------------- /utils/google-gl-updater/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile 6 | # 7 | certifi==2024.8.30 8 | # via requests 9 | charset-normalizer==3.4.0 10 | # via requests 11 | cssselect==1.2.0 12 | # via parsel 13 | idna==3.10 14 | # via requests 15 | jinja2==3.1.5 16 | # via -r requirements.in 17 | jmespath==1.0.1 18 | # via parsel 19 | lxml==5.3.0 20 | # via parsel 21 | markupsafe==3.0.2 22 | # via jinja2 23 | packaging==24.2 24 | # via parsel 25 | parsel==1.9.1 26 | # via -r requirements.in 27 | requests==2.32.3 28 | # via -r requirements.in 29 | urllib3==2.2.3 30 | # via requests 31 | w3lib==2.2.1 32 | # via parsel 33 | -------------------------------------------------------------------------------- /utils/google-gl-updater/template.py: -------------------------------------------------------------------------------- 1 | {% raw %}# ../_geolocations.py counterpart for 2 | # https://developers.google.com/custom-search/docs/json_api_reference#countryCodes 3 | # 4 | # Built automatically with ../../utils/google-gl-updater 5 | 6 | from enum import Enum 7 | 8 | GOOGLE_GL_OPTIONS = {{% endraw %}{% for country in countries %} 9 | "{{ country.code }}": "{{ country.name }}",{% endfor %}{% raw %} 10 | } 11 | GOOGLE_GL_OPTIONS_WITH_CODE = { 12 | code: f"{name} ({code})" for code, name in GOOGLE_GL_OPTIONS.items() 13 | } 14 | 15 | 16 | class GoogleGl(str, Enum):{% endraw %}{% for country in countries %} 17 | {{ country.keyword }}: str = "{{ country.code }}"{% endfor %} 18 | 19 | -------------------------------------------------------------------------------- /utils/google-gl-updater/update.py: -------------------------------------------------------------------------------- 1 | from keyword import iskeyword 2 | from pathlib import Path 3 | 4 | import jinja2 5 | import requests 6 | from parsel import Selector 7 | 8 | countries = [] 9 | 10 | response = requests.get( 11 | "https://developers.google.com/custom-search/docs/json_api_reference" 12 | ) 13 | selector = Selector(text=response.text) 14 | table = selector.xpath('//*[@id="country-codes"]/following-sibling::table[1]') 15 | for tr in table.css("tr"): 16 | name = tr.xpath("td/text()").get() 17 | if not name: # header 18 | continue 19 | code = tr.xpath("td/span/text()").get() 20 | keyword = f"{code}_" if iskeyword(code) else code 21 | countries.append({"code": code, "keyword": keyword, "name": name}) 22 | 23 | template_path = Path(__file__).parent / "template.py" 24 | template_environment = jinja2.Environment() 25 | with template_path.open() as f: 26 | template = template_environment.from_string(f.read()) 27 | output = template.render(countries=countries) 28 | output_path = ( 29 | Path(__file__).parent.parent.parent 30 | / "zyte_spider_templates" 31 | / "spiders" 32 | / "_google_gl.py" 33 | ) 34 | with output_path.open("w") as f: 35 | f.write(output) 36 | -------------------------------------------------------------------------------- /utils/google-hl-updater/requirements.in: 
-------------------------------------------------------------------------------- 1 | jinja2 2 | parsel 3 | requests 4 | -------------------------------------------------------------------------------- /utils/google-hl-updater/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile 6 | # 7 | certifi==2024.8.30 8 | # via requests 9 | charset-normalizer==3.4.0 10 | # via requests 11 | cssselect==1.2.0 12 | # via parsel 13 | idna==3.10 14 | # via requests 15 | jinja2==3.1.6 16 | # via -r requirements.in 17 | jmespath==1.0.1 18 | # via parsel 19 | lxml==5.3.0 20 | # via parsel 21 | markupsafe==3.0.2 22 | # via jinja2 23 | packaging==24.2 24 | # via parsel 25 | parsel==1.9.1 26 | # via -r requirements.in 27 | requests==2.32.3 28 | # via -r requirements.in 29 | urllib3==2.2.3 30 | # via requests 31 | w3lib==2.2.1 32 | # via parsel 33 | -------------------------------------------------------------------------------- /utils/google-hl-updater/template.py: -------------------------------------------------------------------------------- 1 | {% raw %}# _google_gl.py counterpart for 2 | # https://developers.google.com/custom-search/docs/json_api_reference#interfaceLanguages 3 | # 4 | # Built automatically with ../../utils/google-hl-updater 5 | 6 | from enum import Enum 7 | 8 | GOOGLE_HL_OPTIONS = {{% endraw %}{% for language in languages %} 9 | "{{ language.code }}": "{{ language.name }}",{% endfor %}{% raw %} 10 | } 11 | GOOGLE_HL_OPTIONS_WITH_CODE = { 12 | code: f"{name} ({code})" for code, name in GOOGLE_HL_OPTIONS.items() 13 | } 14 | 15 | 16 | class GoogleHl(str, Enum):{% endraw %}{% for language in languages %} 17 | {{ language.keyword }}: str = "{{ language.code }}"{% endfor %} 18 | 19 | -------------------------------------------------------------------------------- /utils/google-hl-updater/update.py: -------------------------------------------------------------------------------- 1 | from keyword import iskeyword 2 | from pathlib import Path 3 | 4 | import jinja2 5 | import requests 6 | from parsel import Selector 7 | 8 | languages = [] 9 | 10 | response = requests.get( 11 | "https://developers.google.com/custom-search/docs/json_api_reference" 12 | ) 13 | selector = Selector(text=response.text) 14 | table = selector.xpath( 15 | '//*[@id="supported-interface-languages"]/following-sibling::table[1]' 16 | ) 17 | for tr in table.css("tr"): 18 | name = tr.xpath("td/text()").get() 19 | if not name: # header 20 | continue 21 | code = tr.xpath("td/span/text()").get() 22 | keyword = f"{code}_" if iskeyword(code) else code 23 | keyword = keyword.replace("-", "_") 24 | languages.append({"code": code, "keyword": keyword, "name": name}) 25 | 26 | template_path = Path(__file__).parent / "template.py" 27 | template_environment = jinja2.Environment() 28 | with template_path.open() as f: 29 | template = template_environment.from_string(f.read()) 30 | output = template.render(languages=languages) 31 | output_path = ( 32 | Path(__file__).parent.parent.parent 33 | / "zyte_spider_templates" 34 | / "spiders" 35 | / "_google_hl.py" 36 | ) 37 | with output_path.open("w") as f: 38 | f.write(output) 39 | -------------------------------------------------------------------------------- /zyte_spider_templates/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version 2 | from 
logging import getLogger 3 | 4 | from ._incremental.middleware import IncrementalCrawlMiddleware 5 | from .middlewares import ( 6 | AllowOffsiteMiddleware, 7 | CrawlingLogsMiddleware, 8 | MaxRequestsPerSeedDownloaderMiddleware, 9 | OffsiteRequestsPerSeedMiddleware, 10 | OnlyFeedsMiddleware, 11 | TrackNavigationDepthSpiderMiddleware, 12 | TrackSeedsSpiderMiddleware, 13 | ) 14 | from .spiders.article import ArticleSpider 15 | from .spiders.base import BaseSpider, BaseSpiderParams 16 | from .spiders.ecommerce import EcommerceSpider 17 | from .spiders.job_posting import JobPostingSpider 18 | from .spiders.serp import GoogleSearchSpider 19 | 20 | from ._addon import Addon # isort: skip 21 | 22 | logger = getLogger(__name__) 23 | package = "zyte-spider-templates" 24 | logger.info(f"Running {package} {version(package)}") 25 | -------------------------------------------------------------------------------- /zyte_spider_templates/_addon.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | from typing import Any, List, Optional, Type 3 | 4 | from duplicate_url_discarder_rules import RULE_PATHS 5 | from scrapy.settings import BaseSettings 6 | from scrapy.utils.misc import load_object 7 | 8 | from zyte_spider_templates import ( 9 | AllowOffsiteMiddleware, 10 | CrawlingLogsMiddleware, 11 | IncrementalCrawlMiddleware, 12 | MaxRequestsPerSeedDownloaderMiddleware, 13 | OffsiteRequestsPerSeedMiddleware, 14 | OnlyFeedsMiddleware, 15 | TrackNavigationDepthSpiderMiddleware, 16 | TrackSeedsSpiderMiddleware, 17 | ) 18 | 19 | logger = getLogger(__name__) 20 | 21 | 22 | def _extend_module_list(settings: BaseSettings, setting: str, item: str) -> None: 23 | spider_modules: List[str] = settings.getlist(setting) 24 | if item not in spider_modules: 25 | spider_modules_priority = settings.getpriority(setting) 26 | settings.set( 27 | setting, 28 | spider_modules + [item], 29 | priority=spider_modules_priority, # type: ignore[arg-type] 30 | ) 31 | 32 | 33 | def _replace_builtin( 34 | settings: BaseSettings, setting: str, builtin_cls: Type, new_cls: Type 35 | ) -> None: 36 | setting_value = settings[setting] 37 | if not setting_value: 38 | logger.warning( 39 | f"Setting {setting!r} is empty. Could not replace the built-in " 40 | f"{builtin_cls} entry with {new_cls}. Add {new_cls} manually to " 41 | f"silence this warning." 42 | ) 43 | return None 44 | 45 | if new_cls in setting_value: 46 | return None 47 | for cls_or_path in setting_value: 48 | if isinstance(cls_or_path, str): 49 | _cls = load_object(cls_or_path) 50 | if _cls == new_cls: 51 | return None 52 | 53 | builtin_entry: Optional[Any] = None 54 | for _setting_value in (setting_value, settings[f"{setting}_BASE"]): 55 | if builtin_cls in _setting_value: 56 | builtin_entry = builtin_cls 57 | pos = _setting_value[builtin_entry] 58 | break 59 | for cls_or_path in _setting_value: 60 | if isinstance(cls_or_path, str): 61 | _cls = load_object(cls_or_path) 62 | if _cls == builtin_cls: 63 | builtin_entry = cls_or_path 64 | pos = _setting_value[builtin_entry] 65 | break 66 | if builtin_entry: 67 | break 68 | 69 | if not builtin_entry: 70 | logger.warning( 71 | f"Settings {setting!r} and {setting + '_BASE'!r} are both " 72 | f"missing built-in entry {builtin_cls}. Cannot replace it with {new_cls}. " 73 | f"Add {new_cls} manually to silence this warning." 
74 | ) 75 | return None 76 | 77 | if pos is None: 78 | logger.warning( 79 | f"Built-in entry {builtin_cls} of setting {setting!r} is disabled " 80 | f"(None). Cannot replace it with {new_cls}. Add {new_cls} " 81 | f"manually to silence this warning. If you had replaced " 82 | f"{builtin_cls} with some other entry, you might also need to " 83 | f"disable that other entry for things to work as expected." 84 | ) 85 | return 86 | 87 | settings[setting][builtin_entry] = None 88 | settings[setting][new_cls] = pos 89 | 90 | 91 | # https://github.com/scrapy-plugins/scrapy-zyte-api/blob/a1d81d11854b420248f38e7db49c685a8d46d943/scrapy_zyte_api/addon.py#L12 92 | def _setdefault(settings: BaseSettings, setting: str, cls: Type, pos: int) -> None: 93 | setting_value = settings[setting] 94 | if not setting_value: 95 | settings[setting] = {cls: pos} 96 | return None 97 | if cls in setting_value: 98 | return None 99 | for cls_or_path in setting_value: 100 | if isinstance(cls_or_path, str): 101 | _cls = load_object(cls_or_path) 102 | if _cls == cls: 103 | return None 104 | settings[setting][cls] = pos 105 | 106 | 107 | class Addon: 108 | def update_settings(self, settings: BaseSettings) -> None: 109 | for setting, value in ( 110 | ("CLOSESPIDER_TIMEOUT_NO_ITEM", 600), 111 | ("SCHEDULER_DISK_QUEUE", "scrapy.squeues.PickleFifoDiskQueue"), 112 | ("SCHEDULER_MEMORY_QUEUE", "scrapy.squeues.FifoMemoryQueue"), 113 | ("SCHEDULER_PRIORITY_QUEUE", "scrapy.pqueues.DownloaderAwarePriorityQueue"), 114 | ( 115 | "ITEM_PROBABILITY_THRESHOLDS", 116 | { 117 | "zyte_common_items.items.Article": 0.1, 118 | "zyte_common_items.items.Product": 0.1, 119 | }, 120 | ), 121 | ("DUD_LOAD_RULE_PATHS", RULE_PATHS), 122 | ): 123 | settings.set(setting, value, priority="addon") 124 | 125 | _extend_module_list( 126 | settings, "SCRAPY_POET_DISCOVER", "zyte_spider_templates.pages" 127 | ) 128 | _extend_module_list(settings, "SPIDER_MODULES", "zyte_spider_templates.spiders") 129 | 130 | _setdefault( 131 | settings, 132 | "DOWNLOADER_MIDDLEWARES", 133 | MaxRequestsPerSeedDownloaderMiddleware, 134 | 100, 135 | ) 136 | _setdefault(settings, "SPIDER_MIDDLEWARES", IncrementalCrawlMiddleware, 45) 137 | _setdefault( 138 | settings, "SPIDER_MIDDLEWARES", OffsiteRequestsPerSeedMiddleware, 49 139 | ) 140 | _setdefault(settings, "SPIDER_MIDDLEWARES", TrackSeedsSpiderMiddleware, 550) 141 | _setdefault(settings, "SPIDER_MIDDLEWARES", OnlyFeedsMiddleware, 108) 142 | _setdefault( 143 | settings, "SPIDER_MIDDLEWARES", TrackNavigationDepthSpiderMiddleware, 110 144 | ) 145 | _setdefault(settings, "SPIDER_MIDDLEWARES", CrawlingLogsMiddleware, 1000) 146 | 147 | try: 148 | from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware 149 | except ImportError: 150 | from scrapy.spidermiddlewares.offsite import ( # type: ignore[assignment] 151 | OffsiteMiddleware, 152 | ) 153 | 154 | _replace_builtin( 155 | settings, 156 | "SPIDER_MIDDLEWARES", 157 | OffsiteMiddleware, 158 | AllowOffsiteMiddleware, 159 | ) 160 | else: 161 | _replace_builtin( 162 | settings, 163 | "DOWNLOADER_MIDDLEWARES", 164 | OffsiteMiddleware, 165 | AllowOffsiteMiddleware, 166 | ) 167 | -------------------------------------------------------------------------------- /zyte_spider_templates/_incremental/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zytedata/zyte-spider-templates/d87e3e4c23b83fba5860ae3428e6ff4a49c3f536/zyte_spider_templates/_incremental/__init__.py 
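A minimal sketch (not part of the repository) of how the pieces above and below are typically wired together in a consuming Scrapy project: the add-on entries and priorities mirror tests/utils.py (get_addons()), and the INCREMENTAL_CRAWL_* settings are the ones read by _incremental/manager.py and _incremental/middleware.py. The settings.py placement and the API key value are placeholder assumptions.

# settings.py of a hypothetical project using zyte-spider-templates
ADDONS = {
    "scrapy_zyte_api.Addon": 500,  # Zyte API integration; same priority as in tests/utils.py
    "zyte_spider_templates.Addon": 1000,  # applies the defaults set in zyte_spider_templates/_addon.py
}
ZYTE_API_KEY = "YOUR_ZYTE_API_KEY"  # placeholder; the tests instead point ZYTE_API_URL at a fake server

# Optional incremental crawling, handled by IncrementalCrawlMiddleware (off by default):
INCREMENTAL_CRAWL_ENABLED = True
INCREMENTAL_CRAWL_BATCH_SIZE = 50  # default batch size used by CollectionsFingerprintsManager
# INCREMENTAL_CRAWL_COLLECTION_NAME defaults to the spider name plus the "_incremental" suffix.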
-------------------------------------------------------------------------------- /zyte_spider_templates/_incremental/manager.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from collections import defaultdict 4 | from concurrent.futures import ThreadPoolExecutor 5 | from typing import Dict, List, Optional, Set, Tuple, Union 6 | 7 | import scrapinghub 8 | from itemadapter import ItemAdapter 9 | from scrapinghub.client.exceptions import Unauthorized 10 | from scrapy import signals 11 | from scrapy.crawler import Crawler 12 | from scrapy.http.request import Request 13 | from zyte_common_items import Item 14 | 15 | from zyte_spider_templates.utils import ( 16 | get_client, 17 | get_project_id, 18 | get_request_fingerprint, 19 | get_spider_name, 20 | ) 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | INCREMENTAL_SUFFIX = "_incremental" 25 | COLLECTION_API_URL = "https://storage.scrapinghub.com/collections" 26 | 27 | THREAD_POOL_EXECUTOR = ThreadPoolExecutor(max_workers=10) 28 | 29 | 30 | class CollectionsFingerprintsManager: 31 | def __init__(self, crawler: Crawler) -> None: 32 | self.writer = None 33 | self.collection = None 34 | self.crawler = crawler 35 | 36 | self.batch: Set[Tuple[str, str]] = set() 37 | self.batch_size = crawler.settings.getint("INCREMENTAL_CRAWL_BATCH_SIZE", 50) 38 | 39 | project_id = get_project_id(crawler) 40 | collection_name = self.get_collection_name(crawler) 41 | 42 | self.init_collection(project_id, collection_name) 43 | self.api_url = f"{COLLECTION_API_URL}/{project_id}/s/{collection_name}" 44 | 45 | logger.info( 46 | f"Configuration of CollectionsFingerprintsManager for IncrementalCrawlMiddleware:\n" 47 | f"batch_size: {self.batch_size},\n" 48 | f"project: {project_id},\n" 49 | f"collection_name: {collection_name}" 50 | ) 51 | 52 | crawler.signals.connect(self.spider_closed, signal=signals.spider_closed) 53 | 54 | def get_collection_name(self, crawler): 55 | return ( 56 | crawler.settings.get("INCREMENTAL_CRAWL_COLLECTION_NAME") 57 | or f"{get_spider_name(crawler)}{INCREMENTAL_SUFFIX}" 58 | ) 59 | 60 | def init_collection(self, project_id, collection_name) -> None: 61 | client = get_client() 62 | collection = client.get_project(project_id).collections.get_store( 63 | collection_name 64 | ) 65 | try: 66 | # Trying to get a random key to make sure the collection exists. 67 | collection.list(key=["init_key"]) 68 | except scrapinghub.client.exceptions.NotFound as e: 69 | if f"unknown collection {collection_name}" in str(e): 70 | logger.info( 71 | f"The collection: {collection_name} for {project_id=} doesn't exist" 72 | f" and will be created automatically" 73 | ) 74 | # This trick forces the creation of a collection. 
75 | collection.set({"_key": "init", "value": "1"}) 76 | collection.delete("init") 77 | else: 78 | logger.error(f"The error {e} for {project_id=}") 79 | raise RuntimeError("incremental_crawling__not_found_exception") 80 | except Unauthorized: 81 | logger.error("The api key (SH_APIKEY or SHUB_JOBAUTH) is not valid.") 82 | raise ValueError("incremental_crawling__api_key_not_vaild") 83 | 84 | self.collection = collection 85 | self.writer = self.collection.create_writer() # type: ignore 86 | 87 | def save_to_collection(self, items_to_save) -> None: 88 | """Saves the current batch of fingerprints to the collection.""" 89 | items = [{"_key": key, "value": value} for key, value in items_to_save] 90 | self.writer.write(items) # type: ignore 91 | self.writer.flush() # type: ignore 92 | 93 | async def get_keys_from_collection_async(self, keys: Set[str]) -> Set[str]: 94 | """Asynchronously fetches a set of keys from the collection using an executor to run in separate threads.""" 95 | return await asyncio.get_event_loop().run_in_executor( 96 | THREAD_POOL_EXECUTOR, lambda: self.get_keys_from_collection(keys) 97 | ) 98 | 99 | async def read_batches(self, fingerprints: List[str], batch_start: int) -> Set[str]: 100 | """Reads a specific batch of fingerprints and fetches corresponding keys asynchronously.""" 101 | return await self.get_keys_from_collection_async( 102 | set(fingerprints[batch_start : batch_start + self.batch_size]) 103 | ) 104 | 105 | def get_keys_from_collection(self, keys: Set[str]) -> Set[str]: 106 | """Synchronously fetches a set of keys from the collection.""" 107 | return {item.get("_key", "") for item in self.collection.list(key=keys)} # type: ignore 108 | 109 | async def get_existing_fingerprints_async( 110 | self, fingerprints: List[str] 111 | ) -> Set[str]: 112 | """Asynchronously checks for duplicate fingerprints in both the collection and the local buffer. 
113 | Async interaction with the collection could be replaced by 114 | https://github.com/scrapinghub/python-scrapinghub/issues/169 in the future""" 115 | 116 | fingerprints_size = len(fingerprints) 117 | 118 | if fingerprints_size == 0: 119 | return set() 120 | 121 | duplicated_fingerprints = set() 122 | 123 | tasks = [ 124 | self.read_batches(fingerprints, i) 125 | for i in range(0, fingerprints_size, self.batch_size) 126 | ] 127 | for future in asyncio.as_completed(tasks): 128 | try: 129 | batch_keys = await future 130 | duplicated_fingerprints.update(batch_keys) 131 | except Exception as e: 132 | logging.error(f"Error while processing batch: {e}") 133 | 134 | # Check duplicates in the local buffer 135 | local_duplicates = set(fingerprints) & {fp for fp, _ in self.batch} 136 | duplicated_fingerprints.update(local_duplicates) 137 | 138 | return duplicated_fingerprints 139 | 140 | def add_to_batch(self, fp_url_map: Set[Tuple[str, str]]) -> None: 141 | """ 142 | Add the list of provided fingerprints and corresponding URLs per one item to the batch 143 | """ 144 | for fp_url in fp_url_map: 145 | logger.debug(f"Adding fingerprint and URL ({fp_url}) to batch.") 146 | self.crawler.stats.inc_value( # type: ignore[union-attr] 147 | "incremental_crawling/fingerprint_url_to_batch" 148 | ) 149 | self.batch.add(fp_url) 150 | if len(self.batch) >= self.batch_size: 151 | self.save_batch() 152 | self.crawler.stats.inc_value("incremental_crawling/add_to_batch") # type: ignore[union-attr] 153 | 154 | def save_batch(self) -> None: 155 | if not self.batch: 156 | return 157 | logger.debug( 158 | f"Saving {len(self.batch)} fingerprints to the Collection. " 159 | f"The fingerprints are: {self.batch}." 160 | ) 161 | self.crawler.stats.inc_value("incremental_crawling/batch_saved") # type: ignore[union-attr] 162 | self.save_to_collection(items_to_save=self.batch) 163 | self.batch.clear() 164 | 165 | def spider_closed(self) -> None: 166 | """Save fingerprints and corresponding URLs remaining in the batch, before spider closes.""" 167 | self.save_batch() 168 | 169 | 170 | class IncrementalCrawlingManager: 171 | def __init__(self, crawler: Crawler, fm: CollectionsFingerprintsManager) -> None: 172 | self.crawler = crawler 173 | self.fm = fm 174 | 175 | async def process_incremental_async( 176 | self, request: Request, result: List 177 | ) -> List[Union[Request, Item]]: 178 | """ 179 | Processes the spider's parsing callbacks when IncrementalCrawlMiddleware is enabled. 180 | 181 | The function handles both requests and items returned by the spider. 182 | - If an item is found: 183 | - It saves the `request.url` and `item.url/item.canonicalURL` (if they differ) to the collection. 184 | - If the result is a Request: 185 | - It checks whether the request was processed previously. 186 | - If it was processed, the request is removed from the result. 187 | - If it was not, the request remains in the result. 188 | """ 189 | item: Optional[Item] = None 190 | to_check = defaultdict(list) 191 | fingerprint_to_url_map: Set[Tuple[str, str]] = set() 192 | for i, element in enumerate(result): 193 | if isinstance(element, Request): 194 | # The requests are only checked to see if the links exist in the Collection 195 | fp = get_request_fingerprint(self.crawler, element) 196 | to_check[fp].append(i) 197 | self.crawler.stats.inc_value("incremental_crawling/requests_to_check") # type: ignore[union-attr] 198 | else: 199 | if item: 200 | raise NotImplementedError( 201 | f"Unexpected number of returned items for {request.url}. 
" 202 | f"None or one was expected." 203 | ) 204 | 205 | item = element 206 | unique_urls = self._get_unique_urls(request.url, item) 207 | for url, url_field in unique_urls.items(): 208 | fp = get_request_fingerprint(self.crawler, request.replace(url=url)) 209 | if url_field != "request_url": 210 | to_check[fp].append(i) 211 | 212 | # Storing the fingerprint-to-URL mapping for the item only. 213 | # This will be used when storing the item in the Collection. 214 | fingerprint_to_url_map.add((fp, url)) 215 | 216 | if url_field == "url": 217 | self.crawler.stats.inc_value( # type: ignore[union-attr] 218 | "incremental_crawling/redirected_urls" 219 | ) 220 | logger.debug( 221 | f"Request URL for the item {request.url} was redirected to {url}." 222 | ) 223 | 224 | # Prepare list of duplications 225 | duplicated_fingerprints = await self.fm.get_existing_fingerprints_async( 226 | list(to_check.keys()) 227 | ) 228 | 229 | if duplicated_fingerprints: 230 | logging.debug( 231 | f"Skipping {len(duplicated_fingerprints)} Request fingerprints that were processed previously." 232 | ) 233 | 234 | n_dups = 0 235 | for dupe_fp in duplicated_fingerprints: 236 | # Marking duplicates for removal as None 237 | for index in to_check[dupe_fp]: 238 | result[index] = None 239 | n_dups += 1 240 | 241 | filtered_result = [x for x in result if x is not None] 242 | 243 | self.crawler.stats.inc_value( # type: ignore[union-attr] 244 | "incremental_crawling/filtered_items_and_requests", n_dups 245 | ) 246 | # Check for any new fingerprints and their corresponding URLs for the item 247 | fingerprint_url_map_new = { 248 | (fp, url) 249 | for fp, url in fingerprint_to_url_map 250 | if fp not in duplicated_fingerprints 251 | } 252 | # Add any new fingerprints and their corresponding URLs to the batch for future saving 253 | if fingerprint_url_map_new: 254 | self.fm.add_to_batch(fingerprint_url_map_new) 255 | return filtered_result 256 | 257 | def _get_unique_urls( 258 | self, request_url: str, item: Optional[Item], discard_request_url: bool = False 259 | ) -> Dict[str, Optional[str]]: 260 | """Retrieves a dictionary of unique URLs associated with an item.""" 261 | 262 | urls: Dict[str, Optional[str]] = {request_url: "request_url"} 263 | if not item: 264 | return urls 265 | 266 | url_fields = ["url", "canonicalUrl"] 267 | 268 | adapter = ItemAdapter(item) 269 | for url_field in url_fields: 270 | if (url := adapter[url_field]) and url not in urls: 271 | urls[url] = url_field 272 | 273 | if discard_request_url: 274 | urls.pop(request_url) 275 | 276 | return urls 277 | -------------------------------------------------------------------------------- /zyte_spider_templates/_incremental/middleware.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import AsyncGenerator, Union 3 | 4 | from scrapinghub.client.exceptions import Unauthorized 5 | from scrapy.crawler import Crawler 6 | from scrapy.exceptions import CloseSpider, NotConfigured 7 | from scrapy.http import Request 8 | from zyte_common_items import Item 9 | 10 | from .manager import CollectionsFingerprintsManager, IncrementalCrawlingManager 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class IncrementalCrawlMiddleware: 16 | """:ref:`Downloader middleware ` to skip 17 | items seen in previous crawls. 18 | 19 | To enable this middleware, set the :setting:`INCREMENTAL_CRAWL_ENABLED` 20 | setting to ``True``. 
21 | 22 | This middleware keeps a record of URLs of crawled items in the :ref:`Zyte Scrapy Cloud 23 | collection ` specified in the :setting:`INCREMENTAL_CRAWL_COLLECTION_NAME` 24 | setting, and skips items, responses and requests with matching URLs. 25 | 26 | Use :setting:`INCREMENTAL_CRAWL_BATCH_SIZE` to fine-tune interactions with 27 | the collection for performance. 28 | """ 29 | 30 | def __init__(self, crawler: Crawler): 31 | assert crawler.spider 32 | if not crawler.spider.settings.getbool("INCREMENTAL_CRAWL_ENABLED", False): 33 | raise NotConfigured 34 | self.inc_manager: IncrementalCrawlingManager = self.prepare_incremental_manager( 35 | crawler 36 | ) 37 | 38 | @staticmethod 39 | def prepare_incremental_manager(crawler): 40 | try: 41 | collection_fp = CollectionsFingerprintsManager(crawler) 42 | except (AttributeError, Unauthorized, RuntimeError, ValueError) as exc_info: 43 | logger.error( 44 | f"IncrementalCrawlMiddleware is enabled, but something went wrong with Collections.\n" 45 | f"The reason: {exc_info}" 46 | ) 47 | raise CloseSpider("incremental_crawling_middleware_collection_issue") 48 | 49 | return IncrementalCrawlingManager(crawler, collection_fp) 50 | 51 | @classmethod 52 | def from_crawler(cls, crawler: Crawler): 53 | return cls(crawler) 54 | 55 | async def process_spider_output( 56 | self, response, result, spider 57 | ) -> AsyncGenerator[Union[Request, Item], None]: 58 | result_list = [] 59 | async for item_or_request in result: 60 | result_list.append(item_or_request) 61 | 62 | unique_items_or_requests = await self.inc_manager.process_incremental_async( 63 | response.request, result_list 64 | ) 65 | 66 | for item_or_request in unique_items_or_requests: 67 | yield item_or_request 68 | -------------------------------------------------------------------------------- /zyte_spider_templates/_lang_codes.py: -------------------------------------------------------------------------------- 1 | # ISO 639-1 language codes 2 | # Taken from https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes 3 | 4 | LANG_CODES = [ 5 | "ab", 6 | "aa", 7 | "af", 8 | "ak", 9 | "sq", 10 | "am", 11 | "ar", 12 | "an", 13 | "hy", 14 | "as", 15 | "av", 16 | "ae", 17 | "ay", 18 | "az", 19 | "bm", 20 | "ba", 21 | "eu", 22 | "be", 23 | "bn", 24 | "bi", 25 | "bs", 26 | "br", 27 | "bg", 28 | "my", 29 | "ca", 30 | "ch", 31 | "ce", 32 | "ny", 33 | "zh", 34 | "cu", 35 | "cv", 36 | "kw", 37 | "co", 38 | "cr", 39 | "hr", 40 | "cs", 41 | "da", 42 | "dv", 43 | "nl", 44 | "dz", 45 | "en", 46 | "eo", 47 | "et", 48 | "ee", 49 | "fo", 50 | "fj", 51 | "fi", 52 | "fr", 53 | "fy", 54 | "ff", 55 | "gd", 56 | "gl", 57 | "lg", 58 | "ka", 59 | "de", 60 | "el", 61 | "kl", 62 | "gn", 63 | "gu", 64 | "ht", 65 | "ha", 66 | "he", 67 | "hz", 68 | "hi", 69 | "ho", 70 | "hu", 71 | "is", 72 | "io", 73 | "ig", 74 | "id", 75 | "ia", 76 | "ie", 77 | "iu", 78 | "ik", 79 | "ga", 80 | "it", 81 | "ja", 82 | "jv", 83 | "kn", 84 | "kr", 85 | "ks", 86 | "kk", 87 | "km", 88 | "ki", 89 | "rw", 90 | "ky", 91 | "kv", 92 | "kg", 93 | "ko", 94 | "kj", 95 | "ku", 96 | "lo", 97 | "la", 98 | "lv", 99 | "li", 100 | "ln", 101 | "lt", 102 | "lu", 103 | "lb", 104 | "mk", 105 | "mg", 106 | "ms", 107 | "ml", 108 | "mt", 109 | "gv", 110 | "mi", 111 | "mr", 112 | "mh", 113 | "mn", 114 | "na", 115 | "nv", 116 | "nd", 117 | "nr", 118 | "ng", 119 | "ne", 120 | "no", 121 | "nb", 122 | "nn", 123 | "ii", 124 | "oc", 125 | "oj", 126 | "or", 127 | "om", 128 | "os", 129 | "pi", 130 | "ps", 131 | "fa", 132 | "pl", 133 | "pt", 134 | "pa", 135 | "qu", 136 | "ro", 
137 | "rm", 138 | "rn", 139 | "ru", 140 | "se", 141 | "sm", 142 | "sg", 143 | "sa", 144 | "sc", 145 | "sr", 146 | "sn", 147 | "sd", 148 | "si", 149 | "sk", 150 | "sl", 151 | "so", 152 | "st", 153 | "es", 154 | "su", 155 | "sw", 156 | "ss", 157 | "sv", 158 | "tl", 159 | "ty", 160 | "tg", 161 | "ta", 162 | "tt", 163 | "te", 164 | "th", 165 | "bo", 166 | "ti", 167 | "to", 168 | "ts", 169 | "tn", 170 | "tr", 171 | "tk", 172 | "tw", 173 | "ug", 174 | "uk", 175 | "ur", 176 | "uz", 177 | "ve", 178 | "vi", 179 | "vo", 180 | "wa", 181 | "cy", 182 | "wo", 183 | "xh", 184 | "yi", 185 | "yo", 186 | "za", 187 | "zu", 188 | ] 189 | -------------------------------------------------------------------------------- /zyte_spider_templates/documentation.py: -------------------------------------------------------------------------------- 1 | try: 2 | from enum_tools.documentation import document_enum 3 | except ImportError: 4 | 5 | def document_enum(func): # type: ignore[misc] 6 | return func 7 | -------------------------------------------------------------------------------- /zyte_spider_templates/feeds.py: -------------------------------------------------------------------------------- 1 | from typing import List, Set, Union 2 | 3 | import feedparser 4 | from scrapy.utils.python import unique 5 | from w3lib.html import strip_html5_whitespace 6 | from w3lib.url import canonicalize_url 7 | from web_poet import AnyResponse, BrowserResponse, HttpResponse, RequestUrl, ResponseUrl 8 | 9 | 10 | def unique_urls(urls: List[str]) -> List[str]: 11 | return unique(urls, key=canonicalize_url) 12 | 13 | 14 | def get_feed_urls( 15 | response: Union[AnyResponse, HttpResponse, BrowserResponse] 16 | ) -> Set[str]: 17 | """Find all RSS or Atom feeds from a page""" 18 | feed_urls = set() 19 | 20 | for link in response.xpath("//link[@type]"): 21 | link_type: str = strip_html5_whitespace(link.attrib["type"]) 22 | link_href: Union[str, RequestUrl, ResponseUrl] = strip_html5_whitespace( 23 | link.attrib.get("href", "") 24 | ) 25 | if link_href: 26 | link_href = response.urljoin(link_href) 27 | rss_url = atom_url = None 28 | if "rss+xml" in link_type: 29 | rss_url = link_href 30 | elif "atom+xml" in link_type: 31 | atom_url = link_href 32 | feed_url = rss_url or atom_url 33 | if feed_url: 34 | feed_urls.add(str(feed_url)) 35 | 36 | for link in response.xpath("//a/@href").getall(): 37 | link_href = strip_html5_whitespace(link) 38 | if link_href.endswith("rss.xml"): 39 | feed_urls.add(str(response.urljoin(link_href))) 40 | 41 | return feed_urls 42 | 43 | 44 | def parse_feed( 45 | response: Union[AnyResponse, HttpResponse, BrowserResponse] 46 | ) -> List[str]: 47 | response_text = ( 48 | str(response.html) if isinstance(response, BrowserResponse) else response.text 49 | ) 50 | 51 | feed = feedparser.parse(response_text) 52 | urls = [ 53 | strip_html5_whitespace(entry.get("link", "")) 54 | for entry in feed.get("entries", []) 55 | ] 56 | return unique_urls([str(response.urljoin(url)) for url in urls if url]) 57 | -------------------------------------------------------------------------------- /zyte_spider_templates/heuristics.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Tuple 3 | from urllib.parse import urlparse, urlsplit 4 | 5 | from scrapy.link import Link 6 | from scrapy.linkextractors import IGNORED_EXTENSIONS 7 | from web_poet import BrowserResponse 8 | 9 | from zyte_spider_templates._geolocations import GEOLOCATION_OPTIONS 10 | from 
zyte_spider_templates._lang_codes import LANG_CODES as _LANG_CODES 11 | 12 | COUNTRY_CODES = set([k.lower() for k in GEOLOCATION_OPTIONS]) 13 | LANG_CODES = set(_LANG_CODES) 14 | 15 | ATOM_PATTERN = re.compile(r"<feed[^>]*>.*?<entry[^>]*>.*?</entry>", re.IGNORECASE | re.DOTALL) 16 | RDF_PATTERN = re.compile(r"<rdf:RDF[^>]*>\s*<channel[^>]*>", re.IGNORECASE) 17 | RSS_PATTERN = re.compile(r"<rss[^>]*>\s*<channel[^>]*>", re.IGNORECASE) 18 | 19 | 20 | NO_CONTENT_KEYWORDS = ( 21 | "authenticate", 22 | "my-account", 23 | "account", 24 | "my-wishlist", 25 | "search", 26 | "archive", 27 | "privacy-policy", 28 | "cookie-policy", 29 | "terms-conditions", 30 | "tos", 31 | "admin", 32 | "rss.xml", 33 | "subscribe", 34 | "newsletter", 35 | "settings", 36 | "cart", 37 | "articles", 38 | "artykuly", # Polish for articles 39 | "news", 40 | "blog", 41 | "about", 42 | "about-us", 43 | "affiliate", 44 | "press", 45 | "careers", 46 | ) 47 | 48 | SUFFIXES = [".html", ".php", ".cgi", ".asp"] 49 | 50 | NO_CONTENT_RE = ( 51 | r"/sign[_-]?in", 52 | r"/log[_-]?(in|out)", 53 | r"/contact[_-]?(us)?", 54 | r"/(lost|forgot)[_-]password", 55 | r"/terms[_-]of[_-](service|use|conditions)", 56 | ) 57 | 58 | NO_ARTICLES_CONTENT_PATHS = ( 59 | "/archive", 60 | "/about", 61 | "/about-us", 62 | "/account", 63 | "/admin", 64 | "/affiliate", 65 | "/authenticate", 66 | "/best-deals", 67 | "/careers", 68 | "/cart", 69 | "/checkout", 70 | "/contactez-nous", 71 | "/cookie-policy", 72 | "/my-account", 73 | "/my-wishlist", 74 | "/press", 75 | "/pricing", 76 | "/privacy-policy", 77 | "/returns", 78 | "/rss.xml", 79 | "/search", 80 | "/settings", 81 | "/shipping", 82 | "/subscribe", 83 | "/terms-conditions", 84 | "/tos", 85 | ) 86 | 87 | 88 | SEED_URL_RE = re.compile(r"^https?:\/\/[^:\/\s]+(:\d{1,5})?(\/[^\s]*)*(#[^\s]*)?") 89 | 90 | NON_HTML_FILE_EXTENSION_RE = re.compile( 91 | ".*(?:{}$)".format("|".join(re.escape("."
+ ext) for ext in IGNORED_EXTENSIONS)), 92 | re.IGNORECASE, 93 | ) 94 | 95 | SOCIAL_DOMAINS = ( 96 | "facebook.com", 97 | "youtube.com", 98 | "youtu.be", 99 | "twitter.com", 100 | "t.co", 101 | "instagram.com", 102 | "mail.yahoo.com", 103 | "plus.google.com", 104 | "play.google.com", 105 | "www.google.com", 106 | "itunes.apple.com", 107 | "login.yahoo.com", 108 | "consent.yahoo.com", 109 | "outlook.live.com", 110 | "linkedin.com", 111 | "vk.com", 112 | "www.odnoklassniki.ru", 113 | "api.whatsapp.com", 114 | "telegram.me", 115 | "telegram.org", 116 | # ads 117 | "doubleclick.net", 118 | ) 119 | domains = "|".join(re.escape(domain) for domain in SOCIAL_DOMAINS) 120 | pattern = rf"(?:^(?:[./])(?:{domains})|\b(?:{domains}))$" 121 | SOCIAL_DOMAINS_RE = re.compile(pattern) 122 | 123 | 124 | def might_be_category(url: str) -> bool: 125 | """Returns True if the given url might be a category based on its path.""" 126 | 127 | url = url.lower().rstrip("/") 128 | parsed_url = urlparse(url) 129 | 130 | for suffix in [""] + SUFFIXES: 131 | for path in NO_CONTENT_KEYWORDS: 132 | if parsed_url.path.endswith(f"/{path}{suffix}"): 133 | return False 134 | if parsed_url.netloc.startswith(f"{path}."): 135 | return False 136 | for rule in NO_CONTENT_RE: 137 | if re.search(rule + suffix, url): 138 | return False 139 | 140 | return True 141 | 142 | 143 | INDEX_URL_PATHS = { 144 | "", 145 | "/index", 146 | "/index.html", 147 | "/index.htm", 148 | "/index.php", 149 | "/home", 150 | } 151 | 152 | 153 | def is_homepage(url: str) -> bool: 154 | """Given a URL, returns True if the URL could be a homepage.""" 155 | url_split = urlsplit(url) 156 | url_path = url_split.path.rstrip("/").lower() 157 | 158 | # Finds and removes URL subpaths like "/us/en", "en-us", "en-uk", etc. 159 | if _url_has_locale_pair(url_path): 160 | url_path = url_path[6:] 161 | 162 | # Finds and removes URL subpaths like "/en", "/fr", etc. 163 | match = re.search(r"/(\w{2})(?!\w)", url_path) 164 | if match and (match.group(1) in LANG_CODES or match.group(1) in COUNTRY_CODES): 165 | url_path = url_path[3:] 166 | 167 | if url_path in INDEX_URL_PATHS and not url_split.query: 168 | return True 169 | 170 | return False 171 | 172 | 173 | def _url_has_locale_pair(url_path: str) -> bool: 174 | if match := re.search(r"/(\w{2})[^a-z](\w{2})(?!\w)", url_path): 175 | x, y = match.groups() 176 | if x in LANG_CODES and y in COUNTRY_CODES: 177 | return True 178 | if y in LANG_CODES and x in COUNTRY_CODES: 179 | return True 180 | return False 181 | 182 | 183 | def is_comments_article_feed(url: str) -> bool: 184 | """ 185 | Try to guess if a feed URL is for comments, not for articles. 186 | """ 187 | if "comments/feed" in url or "feed=comments-rss2" in url: 188 | return True 189 | return False 190 | 191 | 192 | def is_non_html_file(url: str) -> bool: 193 | """ 194 | True for urls with extensions that clearly are not HTML. For example, 195 | they are images, or a compressed file, etc. 
196 | >>> is_non_html_file("http://example.com/article") 197 | False 198 | >>> is_non_html_file("http://example.com/image.jpg") 199 | True 200 | """ 201 | return bool(NON_HTML_FILE_EXTENSION_RE.match(url)) 202 | 203 | 204 | def is_social_link(url: str) -> bool: 205 | """ 206 | True for urls corresponding to the typical social networks 207 | >>> is_social_link("http://facebook.com") 208 | True 209 | >>> is_social_link("http://www.facebook.com") 210 | True 211 | >>> is_social_link("http://rrr.t.co") 212 | True 213 | >>> is_social_link("http://t.co") 214 | True 215 | >>> is_social_link("http://sport.co") 216 | False 217 | >>> is_social_link("http://sport.com") 218 | False 219 | >>> is_social_link("http://example.com") 220 | False 221 | """ 222 | netloc = urlsplit(url).netloc 223 | 224 | if SOCIAL_DOMAINS_RE.search(netloc): 225 | return True 226 | return False 227 | 228 | 229 | def classify_article_crawling_links(links: List[Link]) -> Tuple[List[Link], List[Link]]: 230 | """In accordance with the rules, it divides the list of links into two new lists with allowed and disallowed links. 231 | Returns a tuple of these new lists.""" 232 | allowed_links = [] 233 | disallowed_links = [] 234 | for link in links: 235 | url = link.url 236 | if ( 237 | is_social_link(url) 238 | or is_non_html_file(url) 239 | or url.endswith(NO_ARTICLES_CONTENT_PATHS) 240 | ): 241 | disallowed_links.append(link) 242 | continue 243 | allowed_links.append(link) 244 | 245 | return allowed_links, disallowed_links 246 | 247 | 248 | def classify_article_feed_links(links: List[Link]) -> Tuple[List[Link], List[Link]]: 249 | """In accordance with the rules, it divides the list of urls into two new lists with allowed and disallowed urls. 250 | Returns a tuple of these new lists.""" 251 | allowed_links = [] 252 | disallowed_links = [] 253 | for link in links: 254 | if is_comments_article_feed(link.url): 255 | disallowed_links.append(link) 256 | continue 257 | allowed_links.append(link) 258 | return allowed_links, disallowed_links 259 | 260 | 261 | def is_feed_content(response: BrowserResponse) -> bool: 262 | # RSS 0.91, 0.92, 2.0 263 | if RSS_PATTERN.search(response.html): 264 | return True 265 | # Atom feed 266 | if ATOM_PATTERN.search(response.html): 267 | return True 268 | # RSS 1.0/RDF 269 | if RDF_PATTERN.search(response.html): 270 | return True 271 | return False 272 | -------------------------------------------------------------------------------- /zyte_spider_templates/page_objects/__init__.py: -------------------------------------------------------------------------------- 1 | from warnings import warn 2 | 3 | from ..pages import HeuristicsProductNavigationPage 4 | 5 | warn( 6 | "The zyte_spider_templates.page_objects module is deprecated, use " 7 | "zyte_spider_templates.pages instead.", 8 | DeprecationWarning, 9 | stacklevel=2, 10 | ) 11 | -------------------------------------------------------------------------------- /zyte_spider_templates/page_objects/product_navigation_heuristics.py: -------------------------------------------------------------------------------- 1 | from ..pages import HeuristicsProductNavigationPage 2 | -------------------------------------------------------------------------------- /zyte_spider_templates/pages/__init__.py: -------------------------------------------------------------------------------- 1 | from .article_heuristics import HeuristicsArticleNavigationPage 2 | from .product_navigation_heuristics import HeuristicsProductNavigationPage 3 | from .search_request_template import 
DefaultSearchRequestTemplatePage 4 | -------------------------------------------------------------------------------- /zyte_spider_templates/pages/article_heuristics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Iterable, List 4 | 5 | import attrs 6 | import xtractmime 7 | from scrapy.http import TextResponse 8 | from scrapy.link import Link 9 | from scrapy.linkextractors import LinkExtractor 10 | from web_poet import AnyResponse, HttpResponse, PageParams, Stats, field, handle_urls 11 | from web_poet.utils import cached_method 12 | from zyte_common_items import ( 13 | BaseArticleNavigationPage, 14 | ProbabilityMetadata, 15 | ProbabilityRequest, 16 | ) 17 | 18 | from zyte_spider_templates.feeds import get_feed_urls, parse_feed 19 | from zyte_spider_templates.heuristics import ( 20 | classify_article_crawling_links, 21 | classify_article_feed_links, 22 | ) 23 | 24 | from ..heuristics import is_feed_content 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | def is_feed_request(request: ProbabilityRequest) -> bool: 30 | return bool( 31 | request.name 32 | and request.name.startswith("[heuristics][articleNavigation][feed]") 33 | ) 34 | 35 | 36 | @handle_urls("") 37 | @attrs.define 38 | class HeuristicsArticleNavigationPage(BaseArticleNavigationPage): 39 | response: AnyResponse 40 | stats: Stats 41 | page_params: PageParams 42 | _ARTICLE_HEURISTIC = {"name": "article", "dummy probability": 0.5} 43 | _NAVIGATION_HEURISTIC = {"name": "subCategories", "dummy probability": 0.5} 44 | _FEED_HEURISTIC = {"name": "feed", "dummy probability": 1.0} 45 | _FEED_ITEMS_HEURISTIC = {"name": "feed items", "dummy probability": 0.99} 46 | 47 | @field 48 | def url(self) -> str: 49 | return str(self.response.url) 50 | 51 | @field 52 | def subCategories(self) -> Iterable[ProbabilityRequest]: 53 | if self._is_response_feed(): 54 | return 55 | 56 | feeds = self._get_feed_links() 57 | feed_urls = {link.url for link in feeds} 58 | for link in feeds: 59 | yield self._get_request(link, self._FEED_HEURISTIC) 60 | 61 | if self.skip_subcategories() or self.is_only_feeds(): 62 | return 63 | 64 | sub_categories = [ 65 | link 66 | for link in self._get_article_or_navigation_links() 67 | if link.url not in feed_urls 68 | ] 69 | for link in sub_categories: 70 | yield self._get_request(link, self._NAVIGATION_HEURISTIC) 71 | 72 | @field 73 | def items(self) -> Iterable[ProbabilityRequest]: 74 | if self._is_response_feed(): 75 | links = self._get_feed_items_links() 76 | heuristic = self._FEED_ITEMS_HEURISTIC 77 | elif not self.is_only_feeds(): 78 | links = self._get_article_or_navigation_links() 79 | heuristic = self._ARTICLE_HEURISTIC 80 | else: 81 | return 82 | 83 | for link in links: 84 | yield self._get_request(link, heuristic) 85 | 86 | @cached_method 87 | def _get_article_or_navigation_links(self) -> List[Link]: 88 | """Extract links from an HTML web page.""" 89 | response = TextResponse( 90 | url=str(self.response.url), body=self.response.text.encode() 91 | ) 92 | link_extractor = LinkExtractor() 93 | links = link_extractor.extract_links(response) 94 | allowed_links, disallowed_links = classify_article_crawling_links(links) 95 | 96 | _log_and_stats( 97 | self, 98 | "heuristic_navigation_or_article", 99 | links, 100 | allowed_links, 101 | disallowed_links, 102 | ) 103 | return allowed_links 104 | 105 | @cached_method 106 | def _get_feed_items_links(self) -> List[Link]: 107 | """Extract links from an RSS/Atom feed.""" 108 
| links = [Link(url) for url in parse_feed(self.response)] 109 | allowed_links, disallowed_links = classify_article_crawling_links(links) 110 | 111 | _log_and_stats( 112 | self, "heuristic_feed_items", links, allowed_links, disallowed_links 113 | ) 114 | return allowed_links 115 | 116 | @cached_method 117 | def _get_feed_links(self) -> List[Link]: 118 | """Extract links to RSS/Atom feeds from an HTML web page.""" 119 | links = [Link(url) for url in get_feed_urls(self.response)] 120 | allowed_links, disallowed_links = classify_article_feed_links(links) 121 | 122 | _log_and_stats(self, "heuristic_feed", links, allowed_links, disallowed_links) 123 | return allowed_links 124 | 125 | @cached_method 126 | def _is_response_feed(self) -> bool: 127 | """Return True if a response is an RSS or Atom feed.""" 128 | 129 | content_type = "" 130 | if isinstance(self.response.response, HttpResponse): 131 | content_type = self.response.response.headers.get("Content-Type", "") 132 | elif is_feed_content(self.response.response): 133 | logger.warning( 134 | "It is likely that the spider is using BrowserHtml to extract the RSS feed. " 135 | "Please note that using HttpResponse is more efficient." 136 | ) 137 | return True 138 | 139 | mime_type = xtractmime.extract_mime( 140 | self.response.text.encode(), 141 | content_types=(content_type.encode(),), 142 | ) 143 | 144 | return xtractmime.mimegroups.is_xml_mime_type( 145 | mime_type 146 | ) or xtractmime.mimegroups.is_json_mime_type(mime_type) 147 | 148 | def _get_request(self, link, heuristic) -> ProbabilityRequest: 149 | return ProbabilityRequest( 150 | url=link.url, 151 | name=f"[heuristics][articleNavigation][{heuristic['name']}] {link.text.strip()}", 152 | metadata=ProbabilityMetadata(probability=heuristic["dummy probability"]), 153 | ) 154 | 155 | def skip_subcategories(self) -> bool: 156 | return self.page_params.get("skip_subcategories", False) 157 | 158 | def is_only_feeds(self) -> bool: 159 | return self.page_params.get("only_feeds", False) 160 | 161 | 162 | def _log_and_stats(self, urls_type, links, allowed_links, disallowed_links): 163 | _logs(self, urls_type, links, allowed_links, disallowed_links) 164 | _stats(self, urls_type, links, allowed_links, disallowed_links) 165 | 166 | 167 | def _stats(page, urls_type, urls, allowed_urls, disallowed_urls): 168 | page.stats.inc(f"article_spider/{urls_type}/visited", 1) 169 | page.stats.inc(f"article_spider/{urls_type}/no_links", 0 if urls else 1) 170 | page.stats.inc(f"article_spider/{urls_type}/with_links", 1 if urls else 0) 171 | page.stats.inc(f"article_spider/{urls_type}/links/total", len(urls)) 172 | page.stats.inc(f"article_spider/{urls_type}/links/allow", len(allowed_urls)) 173 | page.stats.inc(f"article_spider/{urls_type}/links/disallow", len(disallowed_urls)) 174 | 175 | 176 | def _logs(page, urls_type, urls, allowed_urls, disallowed_urls): 177 | page_name = page.item_cls.__name__ 178 | data = { 179 | "page": page_name, 180 | "page url": page.url, 181 | "urls type": urls_type, 182 | "urls found": len(urls), 183 | "allowed urls": len(allowed_urls), 184 | "urls to skip": len(disallowed_urls), 185 | "list of urls to skip": [ 186 | url.url if isinstance(url, Link) else url for url in disallowed_urls 187 | ], 188 | } 189 | logger.debug(f"Article Heuristic Logs:\n{json.dumps(data, indent=2)}") 190 | -------------------------------------------------------------------------------- /zyte_spider_templates/pages/product_navigation_heuristics.py: 
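A minimal usage sketch for HeuristicsArticleNavigationPage and is_feed_request() from article_heuristics.py above; it is not taken from the repository itself, and the URLs and link text are made-up examples. It shows the request shape that _get_request() builds and how the feed name prefix is later recognized.

from zyte_common_items import ProbabilityMetadata, ProbabilityRequest

from zyte_spider_templates.pages.article_heuristics import is_feed_request

# Requests shaped like the ones _get_request() builds above.
feed_request = ProbabilityRequest(
    url="https://example.com/rss.xml",
    name="[heuristics][articleNavigation][feed] RSS",
    metadata=ProbabilityMetadata(probability=1.0),  # _FEED_HEURISTIC "dummy probability"
)
article_request = ProbabilityRequest(
    url="https://example.com/blog/post-1",
    name="[heuristics][articleNavigation][article] A post",
    metadata=ProbabilityMetadata(probability=0.5),  # _ARTICLE_HEURISTIC "dummy probability"
)

# is_feed_request() only inspects the request name prefix.
assert is_feed_request(feed_request) is True
assert is_feed_request(article_request) is False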
-------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | import attrs 4 | from scrapy.http import TextResponse 5 | from scrapy.linkextractors import LinkExtractor 6 | from web_poet import AnyResponse, PageParams, field, handle_urls 7 | from zyte_common_items import AutoProductNavigationPage, ProbabilityRequest 8 | 9 | from zyte_spider_templates.heuristics import might_be_category 10 | 11 | 12 | @handle_urls("") 13 | @attrs.define 14 | class HeuristicsProductNavigationPage(AutoProductNavigationPage): 15 | response: AnyResponse 16 | page_params: PageParams 17 | 18 | @field 19 | def subCategories(self) -> Optional[List[ProbabilityRequest]]: 20 | if self.page_params.get("full_domain"): 21 | return ( 22 | self.product_navigation.subCategories or [] 23 | ) + self._probably_category_links() 24 | return self.product_navigation.subCategories 25 | 26 | def _urls_for_category(self) -> List[str]: 27 | """Return a list of all URLs in the ProductNavigation item: 28 | - items 29 | - next page 30 | - subcategories 31 | """ 32 | 33 | category_urls = [] 34 | if self.product_navigation.items: 35 | category_urls.extend( 36 | [r.url for r in self.product_navigation.subCategories or []] 37 | ) 38 | category_urls.extend([r.url for r in self.product_navigation.items or []]) 39 | if self.product_navigation.nextPage: 40 | category_urls.append(self.product_navigation.nextPage.url) 41 | return category_urls 42 | 43 | def _probably_category_links(self) -> List[ProbabilityRequest]: 44 | # TODO: This should be tuned later 45 | default_probability = 0.1 46 | 47 | link_extractor = LinkExtractor( 48 | allow_domains=self.page_params.get("full_domain", []) 49 | ) 50 | ignore_urls = set(self._urls_for_category()) 51 | 52 | links = [] 53 | response = TextResponse( 54 | url=str(self.response.url), body=self.response.text.encode() 55 | ) 56 | for link in link_extractor.extract_links(response): 57 | if link.url in ignore_urls: 58 | continue 59 | 60 | # TODO: Convert to a configurable parameter like 'obey_nofollow_links' 61 | # some time after the MVP launch. 62 | if link.nofollow: 63 | continue 64 | 65 | if not might_be_category(link.url): 66 | continue 67 | 68 | name = (link.text or "").strip() 69 | request = ProbabilityRequest.from_dict( 70 | { 71 | "url": link.url, 72 | "name": f"[heuristics] {name}", 73 | "metadata": {"probability": default_probability}, 74 | } 75 | ) 76 | links.append(request) 77 | 78 | return links 79 | -------------------------------------------------------------------------------- /zyte_spider_templates/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zytedata/zyte-spider-templates/d87e3e4c23b83fba5860ae3428e6ff4a49c3f536/zyte_spider_templates/spiders/__init__.py -------------------------------------------------------------------------------- /zyte_spider_templates/spiders/_google_domains.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | # https://www.google.com/supported_domains 5 | # Sorted alphabetically, except for keeping the main domain first. 
6 | class GoogleDomain(str, Enum): 7 | google_com: str = "google.com" 8 | google_ad: str = "google.ad" 9 | google_ae: str = "google.ae" 10 | google_al: str = "google.al" 11 | google_am: str = "google.am" 12 | google_as: str = "google.as" 13 | google_at: str = "google.at" 14 | google_az: str = "google.az" 15 | google_ba: str = "google.ba" 16 | google_be: str = "google.be" 17 | google_bf: str = "google.bf" 18 | google_bg: str = "google.bg" 19 | google_bi: str = "google.bi" 20 | google_bj: str = "google.bj" 21 | google_bs: str = "google.bs" 22 | google_bt: str = "google.bt" 23 | google_by: str = "google.by" 24 | google_ca: str = "google.ca" 25 | google_cat: str = "google.cat" 26 | google_cd: str = "google.cd" 27 | google_cf: str = "google.cf" 28 | google_cg: str = "google.cg" 29 | google_ch: str = "google.ch" 30 | google_ci: str = "google.ci" 31 | google_cl: str = "google.cl" 32 | google_cm: str = "google.cm" 33 | google_cn: str = "google.cn" 34 | google_co_ao: str = "google.co.ao" 35 | google_co_bw: str = "google.co.bw" 36 | google_co_ck: str = "google.co.ck" 37 | google_co_cr: str = "google.co.cr" 38 | google_co_id: str = "google.co.id" 39 | google_co_il: str = "google.co.il" 40 | google_co_in: str = "google.co.in" 41 | google_co_jp: str = "google.co.jp" 42 | google_co_ke: str = "google.co.ke" 43 | google_co_kr: str = "google.co.kr" 44 | google_co_ls: str = "google.co.ls" 45 | google_co_ma: str = "google.co.ma" 46 | google_co_mz: str = "google.co.mz" 47 | google_co_nz: str = "google.co.nz" 48 | google_co_th: str = "google.co.th" 49 | google_co_tz: str = "google.co.tz" 50 | google_co_ug: str = "google.co.ug" 51 | google_co_uk: str = "google.co.uk" 52 | google_co_uz: str = "google.co.uz" 53 | google_co_ve: str = "google.co.ve" 54 | google_co_vi: str = "google.co.vi" 55 | google_co_za: str = "google.co.za" 56 | google_co_zm: str = "google.co.zm" 57 | google_co_zw: str = "google.co.zw" 58 | google_com_af: str = "google.com.af" 59 | google_com_ag: str = "google.com.ag" 60 | google_com_ar: str = "google.com.ar" 61 | google_com_au: str = "google.com.au" 62 | google_com_bd: str = "google.com.bd" 63 | google_com_bh: str = "google.com.bh" 64 | google_com_bn: str = "google.com.bn" 65 | google_com_bo: str = "google.com.bo" 66 | google_com_br: str = "google.com.br" 67 | google_com_bz: str = "google.com.bz" 68 | google_com_co: str = "google.com.co" 69 | google_com_cu: str = "google.com.cu" 70 | google_com_cy: str = "google.com.cy" 71 | google_com_do: str = "google.com.do" 72 | google_com_ec: str = "google.com.ec" 73 | google_com_eg: str = "google.com.eg" 74 | google_com_et: str = "google.com.et" 75 | google_com_fj: str = "google.com.fj" 76 | google_com_gh: str = "google.com.gh" 77 | google_com_gi: str = "google.com.gi" 78 | google_com_gt: str = "google.com.gt" 79 | google_com_hk: str = "google.com.hk" 80 | google_com_jm: str = "google.com.jm" 81 | google_com_kh: str = "google.com.kh" 82 | google_com_kw: str = "google.com.kw" 83 | google_com_lb: str = "google.com.lb" 84 | google_com_ly: str = "google.com.ly" 85 | google_com_mm: str = "google.com.mm" 86 | google_com_mt: str = "google.com.mt" 87 | google_com_mx: str = "google.com.mx" 88 | google_com_my: str = "google.com.my" 89 | google_com_na: str = "google.com.na" 90 | google_com_ng: str = "google.com.ng" 91 | google_com_ni: str = "google.com.ni" 92 | google_com_np: str = "google.com.np" 93 | google_com_om: str = "google.com.om" 94 | google_com_pa: str = "google.com.pa" 95 | google_com_pe: str = "google.com.pe" 96 | google_com_pg: str = "google.com.pg" 97 
| google_com_ph: str = "google.com.ph" 98 | google_com_pk: str = "google.com.pk" 99 | google_com_pr: str = "google.com.pr" 100 | google_com_py: str = "google.com.py" 101 | google_com_qa: str = "google.com.qa" 102 | google_com_sa: str = "google.com.sa" 103 | google_com_sb: str = "google.com.sb" 104 | google_com_sg: str = "google.com.sg" 105 | google_com_sl: str = "google.com.sl" 106 | google_com_sv: str = "google.com.sv" 107 | google_com_tj: str = "google.com.tj" 108 | google_com_tr: str = "google.com.tr" 109 | google_com_tw: str = "google.com.tw" 110 | google_com_ua: str = "google.com.ua" 111 | google_com_uy: str = "google.com.uy" 112 | google_com_vc: str = "google.com.vc" 113 | google_com_vn: str = "google.com.vn" 114 | google_cv: str = "google.cv" 115 | google_cz: str = "google.cz" 116 | google_de: str = "google.de" 117 | google_dj: str = "google.dj" 118 | google_dk: str = "google.dk" 119 | google_dm: str = "google.dm" 120 | google_dz: str = "google.dz" 121 | google_ee: str = "google.ee" 122 | google_es: str = "google.es" 123 | google_fi: str = "google.fi" 124 | google_fm: str = "google.fm" 125 | google_fr: str = "google.fr" 126 | google_ga: str = "google.ga" 127 | google_ge: str = "google.ge" 128 | google_gg: str = "google.gg" 129 | google_gl: str = "google.gl" 130 | google_gm: str = "google.gm" 131 | google_gr: str = "google.gr" 132 | google_gy: str = "google.gy" 133 | google_hn: str = "google.hn" 134 | google_hr: str = "google.hr" 135 | google_ht: str = "google.ht" 136 | google_hu: str = "google.hu" 137 | google_ie: str = "google.ie" 138 | google_im: str = "google.im" 139 | google_iq: str = "google.iq" 140 | google_is: str = "google.is" 141 | google_it: str = "google.it" 142 | google_je: str = "google.je" 143 | google_jo: str = "google.jo" 144 | google_kg: str = "google.kg" 145 | google_ki: str = "google.ki" 146 | google_kz: str = "google.kz" 147 | google_la: str = "google.la" 148 | google_li: str = "google.li" 149 | google_lk: str = "google.lk" 150 | google_lt: str = "google.lt" 151 | google_lu: str = "google.lu" 152 | google_lv: str = "google.lv" 153 | google_md: str = "google.md" 154 | google_me: str = "google.me" 155 | google_mg: str = "google.mg" 156 | google_mk: str = "google.mk" 157 | google_ml: str = "google.ml" 158 | google_mn: str = "google.mn" 159 | google_mu: str = "google.mu" 160 | google_mv: str = "google.mv" 161 | google_mw: str = "google.mw" 162 | google_ne: str = "google.ne" 163 | google_nl: str = "google.nl" 164 | google_no: str = "google.no" 165 | google_nr: str = "google.nr" 166 | google_nu: str = "google.nu" 167 | google_pl: str = "google.pl" 168 | google_pn: str = "google.pn" 169 | google_ps: str = "google.ps" 170 | google_pt: str = "google.pt" 171 | google_ro: str = "google.ro" 172 | google_rs: str = "google.rs" 173 | google_ru: str = "google.ru" 174 | google_rw: str = "google.rw" 175 | google_sc: str = "google.sc" 176 | google_se: str = "google.se" 177 | google_sh: str = "google.sh" 178 | google_si: str = "google.si" 179 | google_sk: str = "google.sk" 180 | google_sm: str = "google.sm" 181 | google_sn: str = "google.sn" 182 | google_so: str = "google.so" 183 | google_sr: str = "google.sr" 184 | google_st: str = "google.st" 185 | google_td: str = "google.td" 186 | google_tg: str = "google.tg" 187 | google_tl: str = "google.tl" 188 | google_tm: str = "google.tm" 189 | google_tn: str = "google.tn" 190 | google_to: str = "google.to" 191 | google_tt: str = "google.tt" 192 | google_vu: str = "google.vu" 193 | google_ws: str = "google.ws" 194 | 
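A minimal usage sketch, not taken from the repository itself: because GoogleDomain subclasses both str and Enum, members compare equal to their domain strings and can be interpolated straight into URLs; member names are the domains with dots replaced by underscores. The search URL below is only an illustration.

from zyte_spider_templates.spiders._google_domains import GoogleDomain

domain = GoogleDomain.google_co_uk
assert domain == "google.co.uk"  # str-valued Enum members equal their values
assert GoogleDomain("google.com") is GoogleDomain.google_com  # lookup by value

# Example of interpolating the selected domain into a URL.
search_url = f"https://www.{domain.value}/search?q=zyte"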
-------------------------------------------------------------------------------- /zyte_spider_templates/spiders/_google_hl.py: -------------------------------------------------------------------------------- 1 | # _google_gl.py counterpart for 2 | # https://developers.google.com/custom-search/docs/json_api_reference#interfaceLanguages 3 | # 4 | # Built automatically with ../../utils/google-hl-updater 5 | 6 | from enum import Enum 7 | 8 | GOOGLE_HL_OPTIONS = { 9 | "af": "Afrikaans", 10 | "sq": "Albanian", 11 | "sm": "Amharic", 12 | "ar": "Arabic", 13 | "az": "Azerbaijani", 14 | "eu": "Basque", 15 | "be": "Belarusian", 16 | "bn": "Bengali", 17 | "bh": "Bihari", 18 | "bs": "Bosnian", 19 | "bg": "Bulgarian", 20 | "ca": "Catalan", 21 | "zh-CN": "Chinese (Simplified)", 22 | "zh-TW": "Chinese (Traditional)", 23 | "hr": "Croatian", 24 | "cs": "Czech", 25 | "da": "Danish", 26 | "nl": "Dutch", 27 | "en": "English", 28 | "eo": "Esperanto", 29 | "et": "Estonian", 30 | "fo": "Faroese", 31 | "fi": "Finnish", 32 | "fr": "French", 33 | "fy": "Frisian", 34 | "gl": "Galician", 35 | "ka": "Georgian", 36 | "de": "German", 37 | "el": "Greek", 38 | "gu": "Gujarati", 39 | "iw": "Hebrew", 40 | "hi": "Hindi", 41 | "hu": "Hungarian", 42 | "is": "Icelandic", 43 | "id": "Indonesian", 44 | "ia": "Interlingua", 45 | "ga": "Irish", 46 | "it": "Italian", 47 | "ja": "Japanese", 48 | "jw": "Javanese", 49 | "kn": "Kannada", 50 | "ko": "Korean", 51 | "la": "Latin", 52 | "lv": "Latvian", 53 | "lt": "Lithuanian", 54 | "mk": "Macedonian", 55 | "ms": "Malay", 56 | "ml": "Malayam", 57 | "mt": "Maltese", 58 | "mr": "Marathi", 59 | "ne": "Nepali", 60 | "no": "Norwegian", 61 | "nn": "Norwegian (Nynorsk)", 62 | "oc": "Occitan", 63 | "fa": "Persian", 64 | "pl": "Polish", 65 | "pt-BR": "Portuguese (Brazil)", 66 | "pt-PT": "Portuguese (Portugal)", 67 | "pa": "Punjabi", 68 | "ro": "Romanian", 69 | "ru": "Russian", 70 | "gd": "Scots Gaelic", 71 | "sr": "Serbian", 72 | "si": "Sinhalese", 73 | "sk": "Slovak", 74 | "sl": "Slovenian", 75 | "es": "Spanish", 76 | "su": "Sudanese", 77 | "sw": "Swahili", 78 | "sv": "Swedish", 79 | "tl": "Tagalog", 80 | "ta": "Tamil", 81 | "te": "Telugu", 82 | "th": "Thai", 83 | "ti": "Tigrinya", 84 | "tr": "Turkish", 85 | "uk": "Ukrainian", 86 | "ur": "Urdu", 87 | "uz": "Uzbek", 88 | "vi": "Vietnamese", 89 | "cy": "Welsh", 90 | "xh": "Xhosa", 91 | "zu": "Zulu", 92 | } 93 | GOOGLE_HL_OPTIONS_WITH_CODE = { 94 | code: f"{name} ({code})" for code, name in GOOGLE_HL_OPTIONS.items() 95 | } 96 | 97 | 98 | class GoogleHl(str, Enum): 99 | af: str = "af" 100 | sq: str = "sq" 101 | sm: str = "sm" 102 | ar: str = "ar" 103 | az: str = "az" 104 | eu: str = "eu" 105 | be: str = "be" 106 | bn: str = "bn" 107 | bh: str = "bh" 108 | bs: str = "bs" 109 | bg: str = "bg" 110 | ca: str = "ca" 111 | zh_CN: str = "zh-CN" 112 | zh_TW: str = "zh-TW" 113 | hr: str = "hr" 114 | cs: str = "cs" 115 | da: str = "da" 116 | nl: str = "nl" 117 | en: str = "en" 118 | eo: str = "eo" 119 | et: str = "et" 120 | fo: str = "fo" 121 | fi: str = "fi" 122 | fr: str = "fr" 123 | fy: str = "fy" 124 | gl: str = "gl" 125 | ka: str = "ka" 126 | de: str = "de" 127 | el: str = "el" 128 | gu: str = "gu" 129 | iw: str = "iw" 130 | hi: str = "hi" 131 | hu: str = "hu" 132 | is_: str = "is" 133 | id: str = "id" 134 | ia: str = "ia" 135 | ga: str = "ga" 136 | it: str = "it" 137 | ja: str = "ja" 138 | jw: str = "jw" 139 | kn: str = "kn" 140 | ko: str = "ko" 141 | la: str = "la" 142 | lv: str = "lv" 143 | lt: str = "lt" 144 | mk: str = "mk" 145 | ms: str = "ms" 146 | 
ml: str = "ml" 147 | mt: str = "mt" 148 | mr: str = "mr" 149 | ne: str = "ne" 150 | no: str = "no" 151 | nn: str = "nn" 152 | oc: str = "oc" 153 | fa: str = "fa" 154 | pl: str = "pl" 155 | pt_BR: str = "pt-BR" 156 | pt_PT: str = "pt-PT" 157 | pa: str = "pa" 158 | ro: str = "ro" 159 | ru: str = "ru" 160 | gd: str = "gd" 161 | sr: str = "sr" 162 | si: str = "si" 163 | sk: str = "sk" 164 | sl: str = "sl" 165 | es: str = "es" 166 | su: str = "su" 167 | sw: str = "sw" 168 | sv: str = "sv" 169 | tl: str = "tl" 170 | ta: str = "ta" 171 | te: str = "te" 172 | th: str = "th" 173 | ti: str = "ti" 174 | tr: str = "tr" 175 | uk: str = "uk" 176 | ur: str = "ur" 177 | uz: str = "uz" 178 | vi: str = "vi" 179 | cy: str = "cy" 180 | xh: str = "xh" 181 | zu: str = "zu" 182 | -------------------------------------------------------------------------------- /zyte_spider_templates/spiders/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from importlib.metadata import version 4 | from typing import TYPE_CHECKING, Annotated, Any, Dict 5 | from warnings import warn 6 | 7 | import scrapy 8 | from pydantic import BaseModel, ConfigDict, model_validator 9 | from scrapy.crawler import Crawler 10 | from scrapy_zyte_api import custom_attrs 11 | from zyte_common_items import CustomAttributes 12 | 13 | from ..params import ( 14 | INPUT_GROUP, 15 | ExtractFromParam, 16 | GeolocationParam, 17 | MaxRequestsParam, 18 | SearchQueriesParam, 19 | UrlParam, 20 | UrlsFileParam, 21 | UrlsParam, 22 | ) 23 | 24 | if TYPE_CHECKING: 25 | # typing.Self requires Python 3.11 26 | from typing_extensions import Self 27 | 28 | 29 | class _LogExceptionsContextManager: 30 | def __init__(self, spider, exc_info): 31 | self._spider = spider 32 | self._exc_info = exc_info 33 | 34 | def __enter__(self): 35 | return 36 | 37 | def __exit__(self, exc_type, exc_value, exc_traceback): 38 | if exc_type is None: 39 | return True 40 | if issubclass(exc_type, self._exc_info): 41 | self._spider.logger.exception(exc_value) 42 | return True 43 | return False 44 | 45 | 46 | # Higher priority than command-line-defined settings (40). 47 | ARG_SETTING_PRIORITY: int = 50 48 | 49 | 50 | class BaseSpiderParams( 51 | ExtractFromParam, 52 | MaxRequestsParam, 53 | GeolocationParam, 54 | SearchQueriesParam, 55 | UrlsFileParam, 56 | UrlsParam, 57 | UrlParam, 58 | BaseModel, 59 | ): 60 | model_config = ConfigDict( 61 | json_schema_extra={ 62 | "groups": [ 63 | INPUT_GROUP, 64 | ], 65 | }, 66 | ) 67 | 68 | @model_validator(mode="after") 69 | def deprecated(self): 70 | warn( 71 | ( 72 | "BaseSpiderParams is deprecated, use pydantic.BaseModel and " 73 | "your desired combination of classes from " 74 | "zyte_spider_templates.params instead." 
75 | ), 76 | DeprecationWarning, 77 | ) 78 | return self 79 | 80 | 81 | class BaseSpider(scrapy.Spider): 82 | custom_settings: Dict[str, Any] = { # type: ignore[assignment] 83 | "ZYTE_API_TRANSPARENT_MODE": True, 84 | "_ZYTE_API_USER_AGENT": f"zyte-spider-templates/{version('zyte-spider-templates')}", 85 | } 86 | 87 | metadata: Dict[str, Any] = { 88 | "template": True, 89 | "title": "Base", 90 | "description": "Base template.", 91 | } 92 | 93 | _NEXT_PAGE_PRIORITY: int = 100 94 | 95 | _custom_attrs_dep = None 96 | _log_request_exception: _LogExceptionsContextManager = None # type: ignore[assignment] 97 | 98 | @classmethod 99 | def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self: 100 | spider = super().from_crawler(crawler, *args, **kwargs) 101 | 102 | # all subclasses of this need to also have Args as a subclass 103 | # this may be possible to express in type hints instead 104 | assert hasattr(spider, "args") 105 | 106 | if geolocation := getattr(spider.args, "geolocation", None): 107 | # We set the geolocation in ZYTE_API_PROVIDER_PARAMS for injected 108 | # dependencies, and in ZYTE_API_AUTOMAP_PARAMS for page object 109 | # additional requests. 110 | for component in ("AUTOMAP", "PROVIDER"): 111 | default_params = spider.settings.getdict(f"ZYTE_API_{component}_PARAMS") 112 | default_params["geolocation"] = geolocation 113 | spider.settings.set( 114 | f"ZYTE_API_{component}_PARAMS", 115 | default_params, 116 | priority=ARG_SETTING_PRIORITY, 117 | ) 118 | 119 | if spider.args.max_requests: 120 | spider.settings.set( 121 | "ZYTE_API_MAX_REQUESTS", 122 | spider.args.max_requests, 123 | priority=ARG_SETTING_PRIORITY, 124 | ) 125 | 126 | if custom_attrs_input := getattr(spider.args, "custom_attrs_input", None): 127 | custom_attrs_options = { 128 | "method": spider.args.custom_attrs_method, 129 | } 130 | if max_input_tokens := crawler.settings.getint("ZYTE_API_MAX_INPUT_TOKENS"): 131 | custom_attrs_options["maxInputTokens"] = max_input_tokens 132 | if max_output_tokens := crawler.settings.getint( 133 | "ZYTE_API_MAX_OUTPUT_TOKENS" 134 | ): 135 | custom_attrs_options["maxOutputTokens"] = max_output_tokens 136 | 137 | spider._custom_attrs_dep = Annotated[ 138 | CustomAttributes, 139 | custom_attrs(custom_attrs_input, custom_attrs_options), 140 | ] 141 | 142 | spider._log_request_exception = _LogExceptionsContextManager(spider, ValueError) 143 | 144 | return spider 145 | -------------------------------------------------------------------------------- /zyte_spider_templates/spiders/job_posting.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from enum import Enum 4 | from typing import ( 5 | TYPE_CHECKING, 6 | Any, 7 | Callable, 8 | Dict, 9 | Iterable, 10 | List, 11 | Optional, 12 | Union, 13 | cast, 14 | ) 15 | 16 | import scrapy 17 | from pydantic import BaseModel, ConfigDict, Field 18 | from scrapy.crawler import Crawler 19 | from scrapy_poet import DummyResponse, DynamicDeps 20 | from scrapy_spider_metadata import Args 21 | from web_poet import BrowserResponse 22 | from zyte_common_items import ( 23 | CustomAttributes, 24 | JobPosting, 25 | JobPostingNavigation, 26 | ProbabilityRequest, 27 | SearchRequestTemplate, 28 | ) 29 | 30 | from zyte_spider_templates.spiders.base import ( 31 | ARG_SETTING_PRIORITY, 32 | INPUT_GROUP, 33 | BaseSpider, 34 | ) 35 | 36 | from ..documentation import document_enum 37 | from ..params import ( 38 | CustomAttrsInputParam, 39 | CustomAttrsMethodParam, 40 | ExtractFrom, 
41 | ExtractFromParam, 42 | GeolocationParam, 43 | MaxRequestsParam, 44 | SearchQueriesParam, 45 | UrlParam, 46 | UrlsFileParam, 47 | UrlsParam, 48 | parse_input_params, 49 | ) 50 | 51 | if TYPE_CHECKING: 52 | # typing.Self requires Python 3.11 53 | from typing_extensions import Self 54 | 55 | 56 | @document_enum 57 | class JobPostingCrawlStrategy(str, Enum): 58 | navigation: str = "navigation" 59 | """Follow pagination and job posting detail pages.""" 60 | 61 | direct_item: str = "direct_item" 62 | """Treat input URLs as direct links to job posting detail pages, and extract a 63 | job posting from each.""" 64 | 65 | 66 | class JobPostingCrawlStrategyParam(BaseModel): 67 | crawl_strategy: JobPostingCrawlStrategy = Field( 68 | title="Crawl strategy", 69 | description="Determines how input URLs and follow-up URLs are crawled.", 70 | default=JobPostingCrawlStrategy.navigation, 71 | json_schema_extra={ 72 | "enumMeta": { 73 | JobPostingCrawlStrategy.navigation: { 74 | "title": "Navigation", 75 | "description": "Follow pagination and job posting detail pages.", 76 | }, 77 | JobPostingCrawlStrategy.direct_item: { 78 | "title": "Direct URLs to job postings", 79 | "description": ( 80 | "Treat input URLs as direct links to job posting detail pages, and " 81 | "extract a job posting from each." 82 | ), 83 | }, 84 | }, 85 | }, 86 | ) 87 | 88 | 89 | class JobPostingSearchQueriesParam(SearchQueriesParam): 90 | search_queries: List[str] = Field( 91 | title="Search Queries", 92 | description=( 93 | "A list of search queries, one per line, to submit using the " 94 | "search form found on each input URL. Only works for input URLs " 95 | "that support search. May not work on every website." 96 | ), 97 | default_factory=list, 98 | json_schema_extra={ 99 | "default": [], 100 | "widget": "textarea", 101 | }, 102 | ) 103 | 104 | 105 | class JobPostingSpiderParams( 106 | CustomAttrsMethodParam, 107 | CustomAttrsInputParam, 108 | ExtractFromParam, 109 | MaxRequestsParam, 110 | GeolocationParam, 111 | JobPostingCrawlStrategyParam, 112 | JobPostingSearchQueriesParam, 113 | UrlsFileParam, 114 | UrlsParam, 115 | UrlParam, 116 | BaseModel, 117 | ): 118 | model_config = ConfigDict( 119 | json_schema_extra={ 120 | "groups": [ 121 | INPUT_GROUP, 122 | ], 123 | }, 124 | ) 125 | 126 | 127 | class JobPostingSpider(Args[JobPostingSpiderParams], BaseSpider): 128 | """Yield job postings from a job website. 129 | 130 | See :class:`~zyte_spider_templates.spiders.job_posting.JobPostingSpiderParams` 131 | for supported parameters. 132 | 133 | .. seealso:: :ref:`job-posting`. 
134 | """ 135 | 136 | name = "job_posting" 137 | 138 | metadata: Dict[str, Any] = { 139 | **BaseSpider.metadata, 140 | "title": "Job posting", 141 | "description": "[Experimental] Template for spiders that extract job posting data from websites.", 142 | } 143 | 144 | @classmethod 145 | def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self: 146 | spider = super().from_crawler(crawler, *args, **kwargs) 147 | parse_input_params(spider) 148 | spider._init_extract_from() 149 | return spider 150 | 151 | def _init_extract_from(self): 152 | if self.args.extract_from is not None: 153 | self.settings.set( 154 | "ZYTE_API_PROVIDER_PARAMS", 155 | { 156 | "jobPostingOptions": {"extractFrom": self.args.extract_from}, 157 | "jobPostingNavigationOptions": { 158 | "extractFrom": self.args.extract_from 159 | }, 160 | **self.settings.get("ZYTE_API_PROVIDER_PARAMS", {}), 161 | }, 162 | priority=ARG_SETTING_PRIORITY, 163 | ) 164 | 165 | def get_start_request(self, url): 166 | callback = ( 167 | self.parse_job_posting 168 | if self.args.crawl_strategy == JobPostingCrawlStrategy.direct_item 169 | else self.parse_navigation 170 | ) 171 | meta: Dict[str, Any] = { 172 | "crawling_logs": { 173 | "page_type": "jobPosting" 174 | if self.args.crawl_strategy == JobPostingCrawlStrategy.direct_item 175 | else "jobPostingNavigation" 176 | }, 177 | } 178 | if ( 179 | self.args.crawl_strategy == JobPostingCrawlStrategy.direct_item 180 | and self._custom_attrs_dep 181 | ): 182 | meta["inject"] = [ 183 | self._custom_attrs_dep, 184 | ] 185 | return scrapy.Request( 186 | url=url, 187 | callback=callback, 188 | meta=meta, 189 | ) 190 | 191 | def start_requests(self) -> Iterable[scrapy.Request]: 192 | if self.args.search_queries: 193 | for url in self.start_urls: 194 | meta: Dict[str, Any] = { 195 | "crawling_logs": {"page_type": "searchRequestTemplate"}, 196 | } 197 | if self.args.extract_from == ExtractFrom.browserHtml: 198 | meta["inject"] = [BrowserResponse] 199 | with self._log_request_exception: 200 | yield scrapy.Request( 201 | url=url, 202 | callback=self.parse_search_request_template, 203 | meta=meta, 204 | ) 205 | else: 206 | for url in self.start_urls: 207 | with self._log_request_exception: 208 | yield self.get_start_request(url) 209 | 210 | def parse_search_request_template( 211 | self, 212 | response: DummyResponse, 213 | search_request_template: SearchRequestTemplate, 214 | dynamic: DynamicDeps, 215 | ) -> Iterable[scrapy.Request]: 216 | probability = search_request_template.get_probability() 217 | if probability is not None and probability <= 0: 218 | return 219 | for query in self.args.search_queries: 220 | meta: Dict[str, Any] = { 221 | "crawling_logs": {"page_type": "jobPostingNavigation"}, 222 | } 223 | with self._log_request_exception: 224 | yield search_request_template.request(query=query).to_scrapy( 225 | callback=self.parse_navigation, 226 | meta=meta, 227 | ) 228 | 229 | def parse_navigation( 230 | self, response: DummyResponse, navigation: JobPostingNavigation 231 | ) -> Iterable[scrapy.Request]: 232 | job_postings = navigation.items or [] 233 | for request in job_postings: 234 | with self._log_request_exception: 235 | yield self.get_parse_job_posting_request(request) 236 | 237 | if navigation.nextPage: 238 | if not job_postings: 239 | self.logger.info( 240 | f"Ignoring nextPage link {navigation.nextPage} since there " 241 | f"are no job posting links found in {navigation.url}" 242 | ) 243 | else: 244 | with self._log_request_exception: 245 | yield self.get_nextpage_request( 246 | 
cast(ProbabilityRequest, navigation.nextPage) 247 | ) 248 | 249 | def parse_job_posting( 250 | self, response: DummyResponse, job_posting: JobPosting, dynamic: DynamicDeps 251 | ) -> Iterable[ 252 | Union[JobPosting, Dict[str, Union[JobPosting, Optional[CustomAttributes]]]] 253 | ]: 254 | probability = job_posting.get_probability() 255 | 256 | # TODO: convert to a configurable parameter later on after the launch 257 | if probability is None or probability >= 0.1: 258 | if self.args.custom_attrs_input: 259 | yield { 260 | "jobPosting": job_posting, 261 | "customAttributes": dynamic.get(CustomAttributes), 262 | } 263 | else: 264 | yield job_posting 265 | else: 266 | assert self.crawler.stats 267 | self.crawler.stats.inc_value("drop_item/job_posting/low_probability") 268 | self.logger.info( 269 | f"Ignoring item from {response.url} since its probability is " 270 | f"less than threshold of 0.1:\n{job_posting}" 271 | ) 272 | 273 | def get_parse_navigation_request( 274 | self, 275 | request: ProbabilityRequest, 276 | callback: Optional[Callable] = None, 277 | page_params: Optional[Dict[str, Any]] = None, 278 | page_type: str = "jobPostingNavigation", 279 | ) -> scrapy.Request: 280 | callback = callback or self.parse_navigation 281 | 282 | return request.to_scrapy( 283 | callback=callback, 284 | meta={ 285 | "page_params": page_params or {}, 286 | "crawling_logs": { 287 | "name": request.name or "", 288 | "probability": request.get_probability(), 289 | "page_type": page_type, 290 | }, 291 | }, 292 | ) 293 | 294 | def get_nextpage_request( 295 | self, 296 | request: ProbabilityRequest, 297 | callback: Optional[Callable] = None, 298 | page_params: Optional[Dict[str, Any]] = None, 299 | ): 300 | return self.get_parse_navigation_request( 301 | request, callback, page_params, "nextPage" 302 | ) 303 | 304 | def get_parse_job_posting_request( 305 | self, request: ProbabilityRequest, callback: Optional[Callable] = None 306 | ) -> scrapy.Request: 307 | callback = callback or self.parse_job_posting 308 | 309 | probability = request.get_probability() 310 | meta: Dict[str, Any] = { 311 | "crawling_logs": { 312 | "name": request.name, 313 | "probability": probability, 314 | "page_type": "jobPosting", 315 | }, 316 | } 317 | if self._custom_attrs_dep: 318 | meta["inject"] = [ 319 | self._custom_attrs_dep, 320 | ] 321 | 322 | scrapy_request = request.to_scrapy( 323 | callback=callback, 324 | meta=meta, 325 | ) 326 | scrapy_request.meta["allow_offsite"] = True 327 | return scrapy_request 328 | -------------------------------------------------------------------------------- /zyte_spider_templates/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | import os 4 | import re 5 | from typing import List, Optional 6 | 7 | import scrapinghub 8 | import tldextract 9 | from scrapy.crawler import Crawler 10 | from scrapy.http import Request 11 | from scrapy.utils.url import parse_url 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | _URL_PATTERN = r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$" 16 | 17 | 18 | def get_domain(url: str, include_port: bool = True) -> str: 19 | return re.sub( 20 | r"^www\d*\.", 21 | "", 22 | parse_url(url).netloc if include_port else parse_url(url).hostname or "", 23 | ) 24 | 25 | 26 | def load_url_list(urls: str) -> List[str]: 27 | result = [] 28 | bad_urls = [] 29 | for url in urls.split("\n"): 30 | if not (url := url.strip()): 31 | continue 32 | if not re.search(_URL_PATTERN, url): 33 | 
bad_urls.append(url) 34 | elif not bad_urls: 35 | result.append(url) 36 | if bad_urls: 37 | bad_url_list = "\n".join(bad_urls) 38 | raise ValueError( 39 | f"URL list contained the following invalid URLs:\n{bad_url_list}" 40 | ) 41 | return result 42 | 43 | 44 | def get_domain_fingerprint(url: str) -> str: 45 | """ 46 | Create a consistent 2-byte domain fingerprint by combining partial hashes 47 | of the main domain (without TLD) and the subdomain components. 48 | """ 49 | extracted = tldextract.extract(url) 50 | main_domain = extracted.domain 51 | subdomains = extracted.subdomain 52 | 53 | # Calculate partial hashes for each component 54 | main_domain_hash = hashlib.sha1(main_domain.encode("utf-8")).hexdigest()[:2] 55 | subdomain_hash = ( 56 | hashlib.sha1(subdomains.encode("utf-8")).hexdigest()[:2] if subdomains else "00" 57 | ) 58 | 59 | return main_domain_hash + subdomain_hash 60 | 61 | 62 | def get_request_fingerprint(crawler: Crawler, request: Request) -> str: 63 | """Create a fingerprint by including a domain-specific part.""" 64 | 65 | # Calculate domain fingerprint 66 | domain_fingerprint = get_domain_fingerprint(request.url) 67 | 68 | # Calculate request fingerprint 69 | request_fingerprint = crawler.request_fingerprinter.fingerprint(request).hex() # type: ignore[union-attr] 70 | 71 | # Combine the fingerprints by taking the 2-bytes (4 chars) domain fingerprint 72 | # to create a domain-specific identifier. 73 | # This optimization aids in efficient read/write operations in the Collection. 74 | 75 | return domain_fingerprint + request_fingerprint 76 | 77 | 78 | def get_project_id(crawler: Crawler) -> Optional[str]: 79 | """ 80 | Retrieve the project ID required for IncrementalCrawlMiddleware. 81 | 82 | The function attempts to obtain the project ID in the following order: 83 | 1. For Scrapy Cloud deployments, the project ID is automatically set as SCRAPY_PROJECT_ID 84 | in the environment variables. 85 | 2. Otherwise, it checks the ZYTE_PROJECT_ID environment variable. 86 | 3. If still not found, it checks the spider setting named ZYTE_PROJECT_ID. 87 | 88 | """ 89 | 90 | if project_id := os.environ.get("SCRAPY_PROJECT_ID"): 91 | logger.info( 92 | f"Picked project id {project_id} from SCRAPY_PROJECT_ID env variable." 93 | ) 94 | return project_id 95 | # Try to pick from manually set environmental variable 96 | if project_id := os.environ.get("ZYTE_PROJECT_ID"): 97 | logger.info( 98 | f"Picked project id {project_id} from ZYTE_PROJECT_ID env variable." 99 | ) 100 | return project_id 101 | # Try to pick from settings 102 | if project_id := crawler.settings.get("ZYTE_PROJECT_ID"): 103 | logger.info( 104 | f"Picked project id {project_id} from the spider's ZYTE_PROJECT_ID setting." 105 | ) 106 | return project_id 107 | raise ValueError( 108 | "Zyte project id wasn't found in job data, env, or settings. " 109 | "The env variable SCRAPY_PROJECT_ID or settings property ZYTE_PROJECT_ID was expected." 110 | ) 111 | 112 | 113 | def get_spider_name(crawler: Crawler) -> str: 114 | if spider_name := os.environ.get("SHUB_VIRTUAL_SPIDER"): 115 | logger.info( 116 | f"Picked virtual spider name {spider_name} from the spider's SHUB_VIRTUAL_SPIDER setting." 
117 | ) 118 | return spider_name 119 | 120 | logger.info(f"Picked spider name {crawler.spider.name} from the spider.") # type: ignore[union-attr] 121 | return crawler.spider.name # type: ignore[union-attr] 122 | 123 | 124 | def get_client() -> scrapinghub.ScrapinghubClient: 125 | # auth is taken from SH_APIKEY or SHUB_JOBAUTH 126 | return scrapinghub.ScrapinghubClient( 127 | dash_endpoint=os.getenv("SHUB_APIURL"), 128 | endpoint=os.getenv("SHUB_STORAGE"), 129 | ) 130 | --------------------------------------------------------------------------------
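A minimal usage sketch, not taken from the repository itself, exercising the URL helpers defined in zyte_spider_templates/utils.py above; the input URLs are made-up examples.

from zyte_spider_templates.utils import (
    get_domain,
    get_domain_fingerprint,
    load_url_list,
)

# get_domain() strips a leading "www"/"www2"/... label and keeps the port by default.
assert get_domain("https://www.example.com/shop") == "example.com"
assert get_domain("https://www2.example.com:8080/shop") == "example.com:8080"

# load_url_list() skips blank lines, strips whitespace, and raises ValueError
# if any non-blank line is not a valid http(s) URL.
assert load_url_list("https://a.example\n\n  https://b.example/page  ") == [
    "https://a.example",
    "https://b.example/page",
]

# get_domain_fingerprint() returns a 2-byte (4 hex character) domain-level prefix.
assert len(get_domain_fingerprint("https://blog.example.com/post")) == 4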