├── .bumpversion.cfg ├── .codecov.yml ├── .github └── workflows │ ├── publish.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGES.rst ├── LICENSE ├── README.rst ├── docs ├── Makefile ├── _ext │ └── __init__.py ├── changes.rst ├── conf.py ├── customization │ ├── index.rst │ ├── pages.rst │ └── spiders.rst ├── features │ └── search.rst ├── index.rst ├── make.bat ├── reference │ ├── api.rst │ ├── reqmeta.rst │ └── settings.rst ├── requirements.txt ├── setup.rst └── templates │ ├── article.rst │ ├── e-commerce.rst │ ├── google-search.rst │ ├── index.rst │ └── job-posting.rst ├── pyproject.toml ├── requirements-dev.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── incremental │ ├── test_collection_fp_manager.py │ ├── test_incremental_manager.py │ └── test_middleware.py ├── mockserver.py ├── pages │ ├── __init__.py │ ├── test_article_navigation_heuristics.py │ └── test_product_navigation_heuristics.py ├── test_addon.py ├── test_article.py ├── test_base.py ├── test_ecommerce.py ├── test_feeds.py ├── test_heuristics.py ├── test_job_posting.py ├── test_middlewares.py ├── test_params.py ├── test_params_location_param.py ├── test_search.py ├── test_serp.py ├── test_utils.py └── utils.py ├── tox.ini ├── utils ├── google-gl-updater │ ├── requirements.in │ ├── requirements.txt │ ├── template.py │ └── update.py └── google-hl-updater │ ├── requirements.in │ ├── requirements.txt │ ├── template.py │ └── update.py └── zyte_spider_templates ├── __init__.py ├── _addon.py ├── _geolocations.py ├── _incremental ├── __init__.py ├── manager.py └── middleware.py ├── _lang_codes.py ├── documentation.py ├── feeds.py ├── heuristics.py ├── middlewares.py ├── page_objects ├── __init__.py └── product_navigation_heuristics.py ├── pages ├── __init__.py ├── article_heuristics.py ├── product_navigation_heuristics.py └── search_request_template.py ├── params.py ├── spiders ├── __init__.py ├── _google_domains.py ├── _google_gl.py ├── _google_hl.py ├── article.py ├── base.py ├── ecommerce.py ├── job_posting.py └── serp.py └── utils.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.12.0 3 | commit = True 4 | tag = True 5 | tag_name = {new_version} 6 | 7 | [bumpversion:file:setup.py] 8 | search = version="{current_version}" 9 | replace = version="{new_version}" 10 | 11 | [bumpversion:file:docs/conf.py] 12 | -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | comment: 2 | layout: "header, diff, tree" 3 | 4 | coverage: 5 | status: 6 | project: false 7 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: publish 5 | 6 | on: 7 | push: 8 | tags: 9 | - "[0-9]+.[0-9]+.[0-9]+" 10 | 11 | jobs: 12 | deploy: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.13' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip 
install --upgrade pip 24 | pip install build 25 | - name: Build package 26 | run: python -m build 27 | - name: Publish package 28 | if: startsWith(github.ref, 'refs/tags') 29 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 30 | with: 31 | user: __token__ 32 | password: ${{ secrets.PYPI_API_TOKEN }} 33 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: tox 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | 11 | jobs: 12 | test: 13 | 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | include: 19 | - python-version: '3.9' 20 | toxenv: min 21 | - python-version: '3.9' 22 | - python-version: '3.10' 23 | - python-version: '3.11' 24 | - python-version: '3.12' 25 | - python-version: '3.13' 26 | 27 | steps: 28 | - uses: actions/checkout@v4 29 | - name: Set up Python ${{ matrix.python-version }} 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip 36 | python -m pip install tox 37 | - name: tox 38 | run: | 39 | tox -e ${{ matrix.toxenv || 'py' }} 40 | - name: coverage 41 | if: ${{ success() }} 42 | uses: codecov/codecov-action@v4.0.1 43 | with: 44 | token: ${{ secrets.CODECOV_TOKEN }} 45 | 46 | check: 47 | runs-on: ubuntu-latest 48 | strategy: 49 | fail-fast: false 50 | matrix: 51 | python-version: ["3.13"] 52 | tox-job: ["mypy", "linters", "twine", "docs"] 53 | 54 | steps: 55 | - uses: actions/checkout@v4 56 | - name: Set up Python ${{ matrix.python-version }} 57 | uses: actions/setup-python@v5 58 | with: 59 | python-version: ${{ matrix.python-version }} 60 | - name: Install dependencies 61 | run: | 62 | python -m pip install --upgrade pip 63 | python -m pip install tox 64 | - name: tox 65 | run: | 66 | tox -e ${{ matrix.tox-job }} 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .coverage 2 | .mypy_cache/ 3 | .tox/ 4 | dist/ 5 | htmlcov/ 6 | coverage.xml 7 | docs/_build 8 | *.egg-info/ 9 | __pycache__/ 10 | coverage-html/ 11 | build/ 12 | .idea/ 13 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | - repo: https://github.com/psf/black 7 | rev: 23.10.1 8 | hooks: 9 | - id: black 10 | - repo: https://github.com/pycqa/flake8 11 | rev: 6.1.0 12 | hooks: 13 | - id: flake8 14 | - repo: https://github.com/adamchainz/blacken-docs 15 | rev: 1.16.0 16 | hooks: 17 | - id: blacken-docs 18 | additional_dependencies: 19 | - black==23.10.1 20 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: all 3 | sphinx: 4 | configuration: docs/conf.py 5 | build: 6 | os: ubuntu-22.04 7 | tools: 8 | python: "3.12" 
# Keep in sync with .github/workflows/test.yml 9 | python: 10 | install: 11 | - requirements: docs/requirements.txt 12 | - path: . 13 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | Changes 2 | ======= 3 | 4 | 0.12.0 (2025-03-31) 5 | ------------------- 6 | 7 | * :ref:`Search queries ` support is added to the :ref:`job 8 | posting spider template `. 9 | * Fixed support for POST requests in search queries. 10 | * Improved validation in the :ref:`Google search spider template 11 | `. 12 | 13 | 0.11.2 (2024-12-30) 14 | ------------------- 15 | 16 | * Do not log warning about disabled components. 17 | 18 | 0.11.1 (2024-12-26) 19 | ------------------- 20 | 21 | * The :ref:`e-commerce ` and :ref:`job posting ` 22 | spider templates no longer ignore item requests for a different domain. 23 | 24 | 0.11.0 (2024-12-16) 25 | ------------------- 26 | 27 | * New :ref:`Articles spider template
`, built on top of 28 | Zyte API’s :http:`request:article` and :http:`request:articleNavigation`. 29 | 30 | * New :ref:`Job Posting spider template `, built on top of 31 | Zyte API’s :http:`request:jobPosting` and :http:`request:jobPostingNavigation`. 32 | 33 | * :ref:`Search queries ` support is added to the 34 | :ref:`e-commerce spider template `. 35 | This allows to provide a list of search queries to the 36 | spider; the spider finds a search form on the target webpage, and submits all the queries. 37 | 38 | * ProductList extraction support is added to the 39 | :ref:`e-commerce spider template `. This allows spiders to 40 | extract basic product information without going into product detail pages. 41 | 42 | * New features are added to the :ref:`Google Search spider template `: 43 | 44 | * An option to follow the result links and extract data 45 | from the target pages (via the ``extract`` argument) 46 | * Content Languages (lr) parameter 47 | * Content Countries (cr) parameter 48 | * User Country (gl) parameter 49 | * User Language (hl) parameter 50 | * results_per_page parameter 51 | 52 | * Added a Scrapy add-on. This allows to greatly simplify the initial 53 | zyte-spider-templates configuration. 54 | 55 | * Bug fix: incorrectly extracted URLs no longer make spiders drop 56 | other requests. 57 | 58 | * Cleaned up the CI; improved the testing suite; cleaned up the documentation. 59 | 60 | 0.10.0 (2024-11-22) 61 | ------------------- 62 | 63 | * Dropped Python 3.8 support, added Python 3.13 support. 64 | 65 | * Increased the minimum required versions of some dependencies: 66 | 67 | * ``pydantic``: ``2`` → ``2.1`` 68 | 69 | * ``scrapy-poet``: ``0.21.0`` → ``0.24.0`` 70 | 71 | * ``scrapy-spider-metadata``: ``0.1.2`` → ``0.2.0`` 72 | 73 | * ``scrapy-zyte-api[provider]``: ``0.16.0`` → ``0.23.0`` 74 | 75 | * ``zyte-common-items``: ``0.22.0`` → ``0.23.0`` 76 | 77 | * Added :ref:`custom attributes ` support to the 78 | :ref:`e-commerce spider template ` through its new 79 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.custom_attrs_input` 80 | and 81 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.custom_attrs_method` 82 | parameters. 83 | 84 | * The 85 | :class:`~zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams.max_pages` 86 | parameter of the :ref:`Google Search spider template ` can no 87 | longer be 0 or lower. 88 | 89 | * The :ref:`Google Search spider template ` now follows 90 | pagination for the results of each query page by page, instead of sending a 91 | request for every page in parallel. It stops once it reaches a page without 92 | organic results. 93 | 94 | * Improved the description of 95 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy` 96 | values. 97 | 98 | * Fixed type hint issues related to Scrapy. 99 | 100 | 101 | 0.9.0 (2024-09-17) 102 | ------------------ 103 | 104 | * Now requires ``zyte-common-items >= 0.22.0``. 105 | 106 | * New :ref:`Google Search spider template `, built on top of 107 | Zyte API’s :http:`request:serp`. 108 | 109 | * The heuristics of the :ref:`e-commerce spider template ` to 110 | ignore certain URLs when following category links now also handles 111 | subdomains. For example, before https://example.com/blog was ignored, now 112 | https://blog.example.com is also ignored. 
113 | 114 | * In the :ref:`spider parameters JSON schema `, the 115 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.crawl_strategy` 116 | parameter of the :ref:`e-commerce spider template ` switches 117 | position, from being the last parameter to being between 118 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls_file` 119 | and 120 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.geolocation`. 121 | 122 | * Removed the ``valid_page_types`` attribute of 123 | :class:`zyte_spider_templates.middlewares.CrawlingLogsMiddleware`. 124 | 125 | 126 | 0.8.0 (2024-08-21) 127 | ------------------ 128 | 129 | * Added new input parameters: 130 | 131 | * ``urls`` accepts a newline-delimited list of URLs. 132 | 133 | * ``urls_file`` accepts a URL that points to a plain-text file with a 134 | newline-delimited list of URLs. 135 | 136 | Only one of ``url``, ``urls`` and ``urls_file`` should be used at a time. 137 | 138 | * Added new crawling strategies: 139 | 140 | * ``automatic`` - uses heuristics to see if an input URL is a homepage, for 141 | which it uses a modified ``full`` strategy where other links are discovered 142 | only in the homepage. Otherwise, it assumes it's a navigation page and uses 143 | the existing ``navigation`` strategy. 144 | 145 | * ``direct_item`` - input URLs are directly extracted as products. 146 | 147 | * Added new parameters classes: ``LocationParam`` and ``PostalAddress``. Note 148 | that these are available for use when customizing the templates and are not 149 | currently being utilized by any template. 150 | 151 | * Backward incompatible changes: 152 | 153 | * ``automatic`` becomes the new default crawling strategy instead of ``full``. 154 | 155 | * CI test improvements. 156 | 157 | 158 | 0.7.2 (2024-05-07) 159 | ------------------ 160 | 161 | * Implemented :ref:`mixin classes for spider parameters `, to 162 | improve reuse. 163 | 164 | * Improved docs, providing an example about overriding existing parameters when 165 | :ref:`customizing parameters `, and featuring 166 | :class:`~web_poet.AnyResponse` in the :ref:`example about overriding parsing 167 | `. 168 | 169 | 170 | 0.7.1 (2024-02-22) 171 | ------------------ 172 | 173 | * The 174 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.crawl_strategy` 175 | parameter of 176 | :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpider` 177 | now defaults to 178 | :attr:`~zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy.full` 179 | instead of 180 | :attr:`~zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy.navigation`. 181 | We also reworded some descriptions of :enum:`~.EcommerceCrawlStrategy` values 182 | for clarification. 183 | 184 | 0.7.0 (2024-02-09) 185 | ------------------ 186 | 187 | * Updated requirement versions: 188 | 189 | * :doc:`scrapy-poet ` >= 0.21.0 190 | * :doc:`scrapy-zyte-api ` >= 0.16.0 191 | 192 | * With the updated dependencies above, this fixes the issue of having 2 separate 193 | Zyte API Requests (*productNavigation* and *httpResponseBody*) for the same URL. Note 194 | that this issue only occurs when requesting product navigation pages. 195 | 196 | * Moved :class:`zyte_spider_templates.spiders.ecommerce.ExtractFrom` into 197 | :class:`zyte_spider_templates.spiders.base.ExtractFrom`. 198 | 199 | 200 | 0.6.1 (2024-02-02) 201 | ------------------ 202 | 203 | * Improved the :attr:`zyte_spider_templates.spiders.base.BaseSpiderParams.url` 204 | description. 
205 | 206 | 0.6.0 (2024-01-31) 207 | ------------------ 208 | 209 | * Fixed the ``extract_from`` spider parameter that wasn't working. 210 | 211 | * The *"www."* prefix is now removed when setting the spider's 212 | :attr:`~scrapy.Spider.allowed_domains`. 213 | 214 | * The :attr:`zyte_common_items.ProductNavigation.nextPage` link won't be crawled 215 | if :attr:`zyte_common_items.ProductNavigation.items` is empty. 216 | 217 | * :class:`zyte_common_items.Product` items that are dropped due to low probability 218 | *(below 0.1)* are now logged in stats: ``drop_item/product/low_probability``. 219 | 220 | * :class:`zyte_spider_templates.pages.HeuristicsProductNavigationPage` now 221 | inherits from :class:`zyte_common_items.AutoProductNavigationPage` instead of 222 | :class:`zyte_common_items.BaseProductNavigationPage`. 223 | 224 | * Moved e-commerce code from :class:`zyte_spider_templates.spiders.base.BaseSpider` 225 | to :class:`zyte_spider_templates.spiders.ecommerce.EcommerceSpider`. 226 | 227 | * Documentation improvements. 228 | 229 | 0.5.0 (2023-12-18) 230 | ------------------ 231 | 232 | * The ``zyte_spider_templates.page_objects`` module is now deprecated in favor 233 | of ``zyte_spider_templates.pages``, in line with ``web_poet.pages``. 234 | 235 | 0.4.0 (2023-12-14) 236 | ------------------ 237 | 238 | * Products outside of the target domain can now be crawled using 239 | :class:`zyte_spider_templates.middlewares.AllowOffsiteMiddleware`. 240 | 241 | * Updated the documentation to also set up ``zyte_common_items.ZyteItemAdapter``. 242 | 243 | * The ``max_requests`` spider parameter has now a default value of 100. Previously, 244 | it was ``None`` which was unlimited. 245 | 246 | * Improved the description of the ``max_requests`` spider parameter. 247 | 248 | * Official support for Python 3.12. 249 | 250 | * Misc documentation improvements. 251 | 252 | 0.3.0 (2023-11-03) 253 | ------------------ 254 | 255 | * Added documentation. 256 | 257 | * Added a middleware that logs information about the crawl in JSON format, 258 | :class:`zyte_spider_templates.middlewares.CrawlingLogsMiddleware`. This 259 | replaces the old crawling information that was difficult to parse using 260 | regular expressions. 261 | 262 | 0.2.0 (2023-10-30) 263 | ------------------ 264 | 265 | * Now requires ``zyte-common-items >= 0.12.0``. 266 | 267 | * Added a new crawl strategy, "Pagination Only". 268 | 269 | * Improved the request priority calculation based on the metadata probability 270 | value. 271 | 272 | * CI improvements. 273 | 274 | 275 | 0.1.0 (2023-10-24) 276 | ------------------ 277 | 278 | Initial release. 279 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Zyte Group Ltd 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. 
Neither the name of Zyte nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | zyte-spider-templates 3 | ===================== 4 | 5 | .. image:: https://img.shields.io/pypi/v/zyte-spider-templates.svg 6 | :target: https://pypi.python.org/pypi/zyte-spider-templates 7 | :alt: PyPI Version 8 | 9 | .. image:: https://img.shields.io/pypi/pyversions/zyte-spider-templates.svg 10 | :target: https://pypi.python.org/pypi/zyte-spider-templates 11 | :alt: Supported Python Versions 12 | 13 | .. image:: https://github.com/zytedata/zyte-spider-templates/actions/workflows/test.yml/badge.svg 14 | :target: https://github.com/zytedata/zyte-spider-templates/actions/workflows/test.yml 15 | :alt: Automated tests 16 | 17 | .. image:: https://codecov.io/github/zytedata/zyte-spider-templates/coverage.svg?branch=main 18 | :target: https://codecov.io/gh/zytedata/zyte-spider-templates 19 | :alt: Coverage report 20 | 21 | 22 | .. description starts 23 | 24 | Spider templates for automatic crawlers. 25 | 26 | This library contains Scrapy_ spider templates. They can be used out of the box 27 | with the Zyte features such as `Zyte API`_ or modified to be used standalone. 28 | There is a `sample Scrapy project`_ for this library that you can use as a 29 | starting point for your own projects. 30 | 31 | .. _Scrapy: https://docs.scrapy.org/ 32 | .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html 33 | .. _sample Scrapy project: https://github.com/zytedata/zyte-spider-templates-project 34 | 35 | .. description ends 36 | 37 | * Documentation: https://zyte-spider-templates.readthedocs.io/en/latest/ 38 | * License: BSD 3-clause 39 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_ext/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from docutils import nodes 4 | from docutils.parsers.rst.roles import set_classes 5 | 6 | 7 | def http_api_reference_role( 8 | name, rawtext, text, lineno, inliner, options={}, content=[] 9 | ): 10 | match = re.search( 11 | r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text 12 | ) 13 | if match: 14 | display_text = match[1] 15 | reference = match[2] 16 | else: 17 | display_text = None 18 | reference = text 19 | if reference.startswith("request:"): 20 | request_or_response = "request" 21 | elif reference.startswith("response:"): 22 | request_or_response = "response/200" 23 | else: 24 | raise ValueError( 25 | f":http: directive reference must start with request: or " 26 | f"response:, got {reference} from {text!r}." 27 | ) 28 | 29 | field = reference.split(":", maxsplit=1)[1] 30 | if not display_text: 31 | display_text = field 32 | refuri = ( 33 | f"https://docs.zyte.com/zyte-api/usage/reference.html" 34 | f"#operation/extract/{request_or_response}/{field}" 35 | ) 36 | set_classes(options) 37 | node = nodes.reference(rawtext, display_text, refuri=refuri, **options) 38 | return [node], [] 39 | 40 | 41 | def setup(app): 42 | app.add_role("http", http_api_reference_role) 43 | # https://stackoverflow.com/a/13663325 44 | # 45 | # Scrapy’s 46 | # https://github.com/scrapy/scrapy/blob/dba37674e6eaa6c2030c8eb35ebf8127cd488062/docs/_ext/scrapydocs.py#L90C16-L110C6 47 | app.add_crossref_type( 48 | directivename="setting", 49 | rolename="setting", 50 | indextemplate="pair: %s; setting", 51 | ) 52 | app.add_crossref_type( 53 | directivename="signal", 54 | rolename="signal", 55 | indextemplate="pair: %s; signal", 56 | ) 57 | app.add_crossref_type( 58 | directivename="command", 59 | rolename="command", 60 | indextemplate="pair: %s; command", 61 | ) 62 | app.add_crossref_type( 63 | directivename="reqmeta", 64 | rolename="reqmeta", 65 | indextemplate="pair: %s; reqmeta", 66 | ) 67 | -------------------------------------------------------------------------------- /docs/changes.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CHANGES.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | project = "zyte-spider-templates" 5 | copyright = "2023, Zyte Group Ltd" 6 | author = "Zyte Group Ltd" 7 | release = "0.12.0" 8 | 9 | sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext 10 | extensions = [ 11 | "_ext", 12 | "enum_tools.autoenum", 13 | "sphinx.ext.autodoc", 14 | "sphinx.ext.intersphinx", 15 | "sphinx.ext.viewcode", 16 | "sphinx_reredirects", 17 | "sphinxcontrib.autodoc_pydantic", 18 | ] 19 | 20 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 21 | 22 | html_theme = "sphinx_rtd_theme" 23 | 24 | intersphinx_mapping = { 25 | "form2request": ( 26 | "https://form2request.readthedocs.io/en/latest", 27 | None, 28 | ), 29 | "formasaurus": ( 30 | "https://formasaurus.readthedocs.io/en/latest", 31 | None, 32 | ), 33 | "python": ( 34 | "https://docs.python.org/3", 35 | None, 36 | ), 37 | "scrapy": ( 38 | "https://docs.scrapy.org/en/latest", 39 | None, 40 | ), 41 | "scrapy-poet": ( 42 | "https://scrapy-poet.readthedocs.io/en/stable", 43 | None, 44 | ), 45 | "scrapy-spider-metadata": ( 46 | "https://scrapy-spider-metadata.readthedocs.io/en/latest", 47 | None, 48 | ), 49 | "scrapy-zyte-api": ( 50 | "https://scrapy-zyte-api.readthedocs.io/en/stable", 51 | None, 52 | ), 53 | "web-poet": ( 54 | "https://web-poet.readthedocs.io/en/stable", 55 | None, 56 | ), 57 | "zyte": ( 58 | "https://docs.zyte.com", 59 | None, 60 | ), 61 | "zyte-common-items": ( 62 | "https://zyte-common-items.readthedocs.io/en/latest", 63 | None, 64 | ), 65 | } 66 | 67 | autodoc_pydantic_model_show_config_summary = False 68 | autodoc_pydantic_model_show_field_summary = False 69 | autodoc_pydantic_model_show_json = False 70 | autodoc_pydantic_model_show_validator_members = False 71 | autodoc_pydantic_model_show_validator_summary = False 72 | autodoc_pydantic_field_list_validators = False 73 | autodoc_pydantic_field_show_constraints = False 74 | 75 | # sphinx-reredirects 76 | redirects = { 77 | "customization/page-objects": "pages.html", 78 | } 79 | 80 | # workaround for https://github.com/pydantic/pydantic/discussions/7763 81 | import zyte_spider_templates.spiders.job_posting # noqa: F401, E402 82 | -------------------------------------------------------------------------------- /docs/customization/index.rst: -------------------------------------------------------------------------------- 1 | .. _customization: 2 | 3 | ============= 4 | Customization 5 | ============= 6 | 7 | :ref:`Built-in spider templates ` can be highly customized: 8 | 9 | - :ref:`Subclass spider templates ` to customize metadata, 10 | parameters, and crawling logic. 11 | 12 | - :ref:`Implement page objects ` to override parsing 13 | logic for all or some websites, both for navigation and item detail data. 14 | -------------------------------------------------------------------------------- /docs/customization/pages.rst: -------------------------------------------------------------------------------- 1 | .. _custom-page-objects: 2 | 3 | ======================== 4 | Customizing page objects 5 | ======================== 6 | 7 | All parsing is implemented using :ref:`web-poet page objects ` 8 | that use `Zyte API automatic extraction`_ to extract :ref:`standard items 9 | `: for navigation, for item details, and even for :ref:`search 10 | request generation `. 11 | 12 | .. 
_Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html 13 | 14 | You can implement your own page object classes to override how extraction works 15 | for any given combination of URL and item type. 16 | 17 | .. tip:: Make sure the import path of your page objects module is in the 18 | :ref:`SCRAPY_POET_DISCOVER ` setting, otherwise your 19 | page objects might be ignored. 20 | 21 | .. _configured scrapy-poet: https://scrapy-poet.readthedocs.io/en/stable/intro/install.html#configuring-the-project 22 | 23 | .. _override-parsing: 24 | 25 | Overriding parsing 26 | ================== 27 | 28 | To change or fix how a given field is extracted, overriding the value from 29 | `Zyte API automatic extraction`_, create a page object class, configured to run 30 | on some given URLs (:func:`web_poet.handle_urls`), that defines the logic to 31 | extract that field. For example: 32 | 33 | .. code-block:: python 34 | :caption: pages/books_toscrape_com.py 35 | 36 | import attrs 37 | from number_parser import parse_number 38 | from web_poet import AnyResponse, field, handle_urls 39 | from zyte_common_items import AggregateRating, AutoProductPage 40 | 41 | 42 | @handle_urls("books.toscrape.com") 43 | @attrs.define 44 | class BooksToScrapeComProductPage(AutoProductPage): 45 | response: AnyResponse 46 | 47 | @field 48 | async def aggregateRating(self): 49 | element_class = self.response.css(".star-rating::attr(class)").get() 50 | if not element_class: 51 | return None 52 | rating_str = element_class.split(" ")[-1] 53 | rating = parse_number(rating_str) 54 | if not rating: 55 | return None 56 | return AggregateRating(ratingValue=rating, bestRating=5) 57 | 58 | ``AutoProductPage`` and other page objects from `zyte-common-items`_ 59 | prefixed with ``Auto`` define fields for all standard items that return 60 | the value from `Zyte API automatic extraction`_, so that you only need 61 | to define your new field. 62 | 63 | .. _zyte-common-items: https://zyte-common-items.readthedocs.io/en/latest/ 64 | 65 | The page object above is decorated with ``@attrs.define`` so that it can 66 | declare a dependency on :class:`~web_poet.page_inputs.response.AnyResponse` and 67 | use that to implement custom parsing logic. You could alternatively use 68 | :class:`~web_poet.page_inputs.browser.BrowserHtml` if needed. 69 | 70 | 71 | .. _add-field: 72 | 73 | Parsing a new field 74 | =================== 75 | 76 | To extract a new field for one or more websites: 77 | 78 | #. Declare a new item type that extends a :ref:`standard item ` with 79 | your new field. For example: 80 | 81 | .. code-block:: python 82 | :caption: items.py 83 | 84 | from typing import Optional 85 | 86 | import attrs 87 | from zyte_common_items import Product 88 | 89 | 90 | @attrs.define 91 | class CustomProduct(Product): 92 | stock: Optional[int] 93 | 94 | #. Create a page object class, configured to run for your new item type 95 | (:class:`web_poet.pages.Returns`) on some given URLs 96 | (:func:`web_poet.handle_urls`), that defines the logic to extract your new 97 | field. For example: 98 | 99 | .. 
code-block:: python 100 | :caption: pages/books_toscrape_com.py 101 | 102 | import re 103 | 104 | from web_poet import Returns, field, handle_urls 105 | from zyte_common_items import AutoProductPage 106 | 107 | from ..items import CustomProduct 108 | 109 | 110 | @handle_urls("books.toscrape.com") 111 | class BookPage(AutoProductPage, Returns[CustomProduct]): 112 | @field 113 | async def stock(self): 114 | for entry in await self.additionalProperties: 115 | if entry.name == "availability": 116 | match = re.search(r"\d([.,\s]*\d+)*(?=\s+available\b)", entry.value) 117 | if not match: 118 | return None 119 | stock_str = re.sub(r"[.,\s]", "", match[0]) 120 | return int(stock_str) 121 | return None 122 | 123 | #. Create a spider template subclass that requests your new item type instead 124 | of the standard one. For example: 125 | 126 | .. code-block:: python 127 | :caption: spiders/books_toscrape_com.py 128 | 129 | from scrapy_poet import DummyResponse 130 | from zyte_spider_templates import EcommerceSpider 131 | 132 | from ..items import CustomProduct 133 | 134 | 135 | class BooksToScrapeComSpider(EcommerceSpider): 136 | name = "books_toscrape_com" 137 | metadata = { 138 | **EcommerceSpider.metadata, 139 | "title": "Books to Scrape", 140 | "description": "Spider template for books.toscrape.com", 141 | } 142 | 143 | def parse_product(self, response: DummyResponse, product: CustomProduct): 144 | yield from super().parse_product(response, product) 145 | 146 | .. _fix-search: 147 | 148 | Fixing search support 149 | ===================== 150 | 151 | If the default implementation to build a request out of :ref:`search queries 152 | ` does not work on a given website, you can implement your 153 | own search request page object to fix that. See 154 | :ref:`custom-request-template-page`. 155 | 156 | For example: 157 | 158 | .. code-block:: python 159 | 160 | from web_poet import handle_urls 161 | from zyte_common_items import BaseSearchRequestTemplatePage 162 | 163 | 164 | @handle_urls("example.com") 165 | class ExampleComSearchRequestTemplatePage(BaseSearchRequestTemplatePage): 166 | @field 167 | def url(self): 168 | return "https://example.com/search?q={{ query|quote_plus }}" 169 | -------------------------------------------------------------------------------- /docs/customization/spiders.rst: -------------------------------------------------------------------------------- 1 | .. _custom-spiders: 2 | 3 | ============================ 4 | Customizing spider templates 5 | ============================ 6 | 7 | Subclass a :ref:`built-in spider template ` to customize its 8 | :ref:`metadata `, :ref:`parameters `, and 9 | :ref:`crawling logic `. 10 | 11 | .. _custom-metadata: 12 | 13 | Customizing metadata 14 | ==================== 15 | 16 | Spider template metadata is defined using `scrapy-spider-metadata`_, and can be 17 | `redefined or customized in a subclass`_. 18 | 19 | For example, to keep the upstream ``title`` but change the ``description``: 20 | 21 | .. _redefined or customized in a subclass: https://scrapy-spider-metadata.readthedocs.io/en/latest/metadata.html#defining-spider-metadata 22 | 23 | .. code-block:: python 24 | 25 | from zyte_spider_templates import EcommerceSpider 26 | 27 | 28 | class MySpider(EcommerceSpider): 29 | name = "my_spider" 30 | metadata = { 31 | **EcommerceSpider.metadata, 32 | "description": "Custom e-commerce spider template.", 33 | } 34 | 35 | 36 | .. 
_custom-params: 37 | 38 | Customizing parameters 39 | ====================== 40 | 41 | Spider template parameters are also defined using `scrapy-spider-metadata`_, 42 | and can be `redefined or customized in a subclass as well`_. 43 | 44 | For example, to add a ``min_price`` parameter and filter out products with a 45 | lower price: 46 | 47 | .. _redefined or customized in a subclass as well: https://scrapy-spider-metadata.readthedocs.io/en/latest/params.html 48 | 49 | .. code-block:: python 50 | 51 | from decimal import Decimal 52 | from typing import Iterable 53 | 54 | from scrapy_poet import DummyResponse 55 | from scrapy_spider_metadata import Args 56 | from zyte_common_items import Product 57 | from zyte_spider_templates import EcommerceSpider 58 | from zyte_spider_templates.spiders.ecommerce import EcommerceSpiderParams 59 | 60 | 61 | class MyParams(EcommerceSpiderParams): 62 | min_price: str = "0.00" 63 | 64 | 65 | class MySpider(EcommerceSpider, Args[MyParams]): 66 | name = "my_spider" 67 | 68 | def parse_product( 69 | self, response: DummyResponse, product: Product 70 | ) -> Iterable[Product]: 71 | for product in super().parse_product(response, product): 72 | if Decimal(product.price) >= Decimal(self.args.min_price): 73 | yield product 74 | 75 | 76 | You can also override existing parameters. For example, to hard-code the start 77 | URL: 78 | 79 | .. code-block:: python 80 | 81 | from scrapy_spider_metadata import Args 82 | from zyte_spider_templates import EcommerceSpider 83 | from zyte_spider_templates.spiders.ecommerce import EcommerceSpiderParams 84 | 85 | 86 | class MyParams(EcommerceSpiderParams): 87 | url: str = "https://books.toscrape.com" 88 | 89 | 90 | class MySpider(EcommerceSpider, Args[MyParams]): 91 | name = "my_spider" 92 | 93 | A mixin class exists for every spider parameter (see :ref:`parameter-mixins`), 94 | so you can use any combination of them in any order you like in your custom 95 | classes, while enjoying future improvements to validation, documentation or 96 | UI integration for Scrapy Cloud: 97 | 98 | .. code-block:: python 99 | 100 | from scrapy_spider_metadata import Args 101 | from zyte_spider_templates.params import GeolocationParam, UrlParam 102 | 103 | 104 | class MyParams(GeolocationParam, UrlParam): 105 | pass 106 | 107 | 108 | class MySpider(Args[MyParams]): 109 | name = "my_spider" 110 | 111 | 112 | .. _custom-crawl: 113 | 114 | Customizing the crawling logic 115 | ============================== 116 | 117 | The crawling logic of spider templates can be customized as any other 118 | :ref:`Scrapy spider `. 119 | 120 | For example, you can make a spider that expects a product details URL and does 121 | not follow navigation at all: 122 | 123 | .. code-block:: python 124 | 125 | from typing import Iterable 126 | 127 | from scrapy import Request 128 | from zyte_spider_templates import EcommerceSpider 129 | 130 | 131 | class MySpider(EcommerceSpider): 132 | name = "my_spider" 133 | 134 | def start_requests(self) -> Iterable[Request]: 135 | for request in super().start_requests(): 136 | yield request.replace(callback=self.parse_product) 137 | 138 | All parsing logic is implemented separately in :ref:`page objects 139 | `, making it easier to read the code of :ref:`built-in 140 | spider templates ` to modify them as desired. 141 | 142 | .. 
_scrapy-spider-metadata: https://scrapy-spider-metadata.readthedocs.io/en/latest 143 | -------------------------------------------------------------------------------- /docs/features/search.rst: -------------------------------------------------------------------------------- 1 | .. _search-queries: 2 | 3 | ============== 4 | Search queries 5 | ============== 6 | 7 | The :ref:`e-commerce spider template <e-commerce>` supports a spider argument, 8 | :data:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.search_queries`, 9 | that allows you to define a different search query per line, and 10 | turns the input URLs into search requests for those queries. 11 | 12 | For example, given the following input URLs: 13 | 14 | .. code-block:: none 15 | 16 | https://a.example 17 | https://b.example 18 | 19 | And the following list of search queries: 20 | 21 | .. code-block:: none 22 | 23 | foo bar 24 | baz 25 | 26 | By default, the spider would send 2 initial requests to those 2 input URLs, 27 | to try and find out how to build a search request for them, and if it succeeds, 28 | it will then send 4 search requests, 1 per combination of input URL and search 29 | query. For example: 30 | 31 | .. code-block:: none 32 | 33 | https://a.example/search?q=foo+bar 34 | https://a.example/search?q=baz 35 | https://b.example/s/foo%20bar 36 | https://b.example/s/baz 37 | 38 | The default implementation uses a combination of HTML metadata, AI-based HTML 39 | form inspection and heuristics to find the most likely way to build a search 40 | request for a given website. 41 | 42 | If this default implementation does not work as expected on a given website, 43 | you can :ref:`write a page object to fix that <fix-search>`. 44 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | =================================== 2 | zyte-spider-templates documentation 3 | =================================== 4 | 5 | .. include:: ../README.rst 6 | :start-after: .. description starts 7 | :end-before: .. description ends 8 | 9 | .. toctree:: 10 | :caption: First steps 11 | :hidden: 12 | 13 | setup 14 | 15 | .. toctree:: 16 | :caption: Templates 17 | :hidden: 18 | 19 | templates/index 20 | E-commerce <templates/e-commerce> 21 | Article <templates/article> 22 | Google search <templates/google-search> 23 | Job posting <templates/job-posting> 24 | 25 | .. toctree:: 26 | :caption: Features 27 | :hidden: 28 | 29 | Search queries <features/search> 30 | 31 | .. toctree:: 32 | :caption: Customization 33 | :hidden: 34 | 35 | customization/index 36 | customization/spiders 37 | customization/pages 38 | 39 | .. toctree:: 40 | :caption: Reference 41 | :hidden: 42 | 43 | reference/settings 44 | reference/reqmeta 45 | reference/api 46 | 47 | .. toctree:: 48 | :caption: All the rest 49 | :hidden: 50 | 51 | changes 52 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH.
20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/reference/api.rst: -------------------------------------------------------------------------------- 1 | === 2 | API 3 | === 4 | 5 | Spiders 6 | ======= 7 | 8 | .. autoclass:: zyte_spider_templates.ArticleSpider 9 | 10 | .. autoclass:: zyte_spider_templates.BaseSpider 11 | 12 | .. autoclass:: zyte_spider_templates.EcommerceSpider 13 | 14 | .. autoclass:: zyte_spider_templates.GoogleSearchSpider 15 | 16 | .. autoclass:: zyte_spider_templates.JobPostingSpider 17 | 18 | 19 | Pages 20 | ===== 21 | 22 | .. autoclass:: zyte_spider_templates.pages.DefaultSearchRequestTemplatePage 23 | 24 | .. autoclass:: zyte_spider_templates.pages.HeuristicsArticleNavigationPage 25 | 26 | .. autoclass:: zyte_spider_templates.pages.HeuristicsProductNavigationPage 27 | 28 | 29 | .. _parameter-mixins: 30 | 31 | Parameter mixins 32 | ================ 33 | 34 | .. autopydantic_model:: zyte_spider_templates.params.CustomAttrsInputParam 35 | :exclude-members: model_computed_fields 36 | 37 | .. autopydantic_model:: zyte_spider_templates.params.CustomAttrsMethodParam 38 | :exclude-members: model_computed_fields 39 | 40 | .. autoenum:: zyte_spider_templates.params.CustomAttrsMethod 41 | 42 | .. autopydantic_model:: zyte_spider_templates.params.ExtractFromParam 43 | :exclude-members: model_computed_fields 44 | 45 | .. autoenum:: zyte_spider_templates.params.ExtractFrom 46 | 47 | .. autopydantic_model:: zyte_spider_templates.params.GeolocationParam 48 | :exclude-members: model_computed_fields 49 | 50 | .. autoenum:: zyte_spider_templates.params.Geolocation 51 | 52 | .. autopydantic_model:: zyte_spider_templates.params.MaxRequestsParam 53 | :exclude-members: model_computed_fields 54 | 55 | .. autopydantic_model:: zyte_spider_templates.params.UrlParam 56 | :exclude-members: model_computed_fields 57 | 58 | .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategyParam 59 | :exclude-members: model_computed_fields 60 | 61 | .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy 62 | 63 | .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceExtractParam 64 | :exclude-members: model_computed_fields 65 | 66 | .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceExtract 67 | 68 | .. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam 69 | :exclude-members: model_computed_fields 70 | 71 | .. autoenum:: zyte_spider_templates.spiders.serp.SerpItemType 72 | 73 | .. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam 74 | :exclude-members: model_computed_fields 75 | 76 | .. autopydantic_model:: zyte_spider_templates.spiders.article.ArticleCrawlStrategyParam 77 | :exclude-members: model_computed_fields 78 | 79 | .. autoenum:: zyte_spider_templates.spiders.article.ArticleCrawlStrategy 80 | 81 | .. autopydantic_model:: zyte_spider_templates.spiders.job_posting.JobPostingCrawlStrategyParam 82 | :exclude-members: model_computed_fields 83 | 84 | .. autoenum:: zyte_spider_templates.spiders.job_posting.JobPostingCrawlStrategy 85 | 86 | 87 | .. 
_middlewares: 88 | 89 | Middlewares 90 | =========== 91 | 92 | .. autoclass:: zyte_spider_templates.CrawlingLogsMiddleware 93 | .. autoclass:: zyte_spider_templates.TrackNavigationDepthSpiderMiddleware 94 | .. autoclass:: zyte_spider_templates.MaxRequestsPerSeedDownloaderMiddleware 95 | .. autoclass:: zyte_spider_templates.OffsiteRequestsPerSeedMiddleware 96 | .. autoclass:: zyte_spider_templates.OnlyFeedsMiddleware 97 | .. autoclass:: zyte_spider_templates.TrackSeedsSpiderMiddleware 98 | .. autoclass:: zyte_spider_templates.IncrementalCrawlMiddleware 99 | -------------------------------------------------------------------------------- /docs/reference/reqmeta.rst: -------------------------------------------------------------------------------- 1 | .. _meta: 2 | 3 | ================= 4 | Request.meta keys 5 | ================= 6 | 7 | Keys that can be defined in :attr:`Request.meta ` for 8 | zyte-spider-templates. 9 | 10 | .. reqmeta:: seed 11 | 12 | seed 13 | ==== 14 | 15 | Default: ``The seed URL (or value) from which the request originated.`` 16 | 17 | The key is used for :class:`~zyte_spider_templates.OffsiteRequestsPerSeedMiddleware` and 18 | :class:`~zyte_spider_templates.MaxRequestsPerSeedDownloaderMiddleware`. 19 | 20 | The `seed` meta key is used to track and identify the origin of a request. It 21 | is initially set for each request that originates from the start request and 22 | can be used to manage domain constraints for subsequent requests. This key can 23 | also be set to an arbitrary value by the user to identify the seed source. 24 | 25 | Here's an example: 26 | 27 | .. code-block:: python 28 | 29 | meta = { 30 | "seed": "http://example.com", 31 | } 32 | 33 | .. reqmeta:: is_seed_request 34 | 35 | is_seed_request 36 | =============== 37 | 38 | Default: ``False`` 39 | 40 | The key is used for :class:`~zyte_spider_templates.OffsiteRequestsPerSeedMiddleware`. 41 | 42 | The `is_seed_request` meta key is a boolean flag that identifies whether the 43 | request is a start request (i.e., originating from the initial seed URL). When 44 | set to True, the middleware extracts seed domains from the response. 45 | 46 | Example: 47 | :: 48 | 49 | meta = { 50 | 'is_seed_request': True, 51 | } 52 | 53 | .. reqmeta:: seed_domains 54 | 55 | seed_domains 56 | ============ 57 | 58 | Default: ``Initial URL and redirected URLs`` 59 | 60 | The key is used for :class:`~zyte_spider_templates.OffsiteRequestsPerSeedMiddleware`. 61 | 62 | The `seed_domains` meta key is a list of domains that the middleware uses to 63 | check whether a request belongs to these domains or not. By default, this list 64 | includes the initial URL's domain and domains of any redirected URLs `(if there 65 | was a redirection)`. This list can also be set by the user in the spider to 66 | specify additional domains for which the middleware should allow requests. 67 | 68 | Here's an example: 69 | 70 | .. code-block:: python 71 | 72 | meta = {"seed_domains": ["example.com", "another-example.com"]} 73 | 74 | .. reqmeta:: is_hop 75 | 76 | increase_navigation_depth 77 | ========================= 78 | 79 | Default: ``True`` 80 | 81 | The key is used for :class:`~zyte_spider_templates.TrackNavigationDepthSpiderMiddleware`. 82 | 83 | The `increase_navigation_depth` meta key is a boolean flag that determines whether the 84 | navigation_depth for a request should be increased. By default, the middleware increases 85 | navigation_depth for all requests. 
Specific spiders can override this behavior for certain 86 | types of requests, such as pagination or RSS feeds, by explicitly setting the meta key. 87 | 88 | Example: 89 | :: 90 | 91 | meta = { 92 | 'increase_navigation_depth': False, 93 | } 94 | 95 | .. reqmeta:: only_feeds 96 | 97 | only_feeds 98 | ========== 99 | Default: ``False`` 100 | 101 | The key is used for :class:`~zyte_spider_templates.OnlyFeedsMiddleware`. 102 | 103 | The `only_feeds` meta key is a boolean flag that identifies whether the 104 | spider should discover all links on the website or extract links from RSS/Atom feeds only. 105 | 106 | Example: 107 | :: 108 | 109 | meta = { 110 | 'page_params': {'only_feeds': True} 111 | } 112 | 113 | -------------------------------------------------------------------------------- /docs/reference/settings.rst: -------------------------------------------------------------------------------- 1 | .. _settings: 2 | 3 | ======== 4 | Settings 5 | ======== 6 | 7 | .. setting:: NAVIGATION_DEPTH_LIMIT 8 | 9 | NAVIGATION_DEPTH_LIMIT 10 | ====================== 11 | 12 | Default: ``0`` 13 | 14 | The maximum navigation depth to crawl. If ``0``, no limit is imposed. 15 | 16 | We increase *navigation_depth* for requests navigating to a subcategory originating from 17 | its parent category, including a request targeting a category starting at the website home page. 18 | We don't increase *navigation_depth* for requests accessing item details (e.g., an article) or for 19 | additional pages of a visited webpage. For example, if you set ``NAVIGATION_DEPTH_LIMIT`` to ``1``, 20 | only item details and pagination links from your start URLs are followed. 21 | 22 | .. note:: 23 | Currently, only the :ref:`Article spider template
` implements proper 24 | navigation_depth support. Other spider templates treat all follow-up requests as 25 | increasing navigation_depth. 26 | 27 | Setting a navigation_depth limit can prevent a spider from delving too deeply into 28 | subcategories. This is especially useful if you only need data from the 29 | top-level categories or specific subcategories. 30 | 31 | When :ref:`customizing a spider template `, set the 32 | :reqmeta:`increase_navigation_depth` request metadata key to override whether a request is 33 | considered as increasing navigation depth (``True``) or not (``False``): 34 | 35 | .. code-block:: python 36 | 37 | Request("https://example.com", meta={"increase_navigation_depth": False}) 38 | 39 | If you want to limit all link following, including pagination and item details, 40 | consider using the :setting:`DEPTH_LIMIT ` setting instead. 41 | 42 | Implemented by :class:`~zyte_spider_templates.TrackNavigationDepthSpiderMiddleware`. 43 | 44 | .. setting:: MAX_REQUESTS_PER_SEED 45 | 46 | MAX_REQUESTS_PER_SEED 47 | ===================== 48 | 49 | .. tip:: When using the :ref:`article spider template
`, you may use 51 | the 52 | :attr:`~zyte_spider_templates.spiders.article.ArticleSpiderParams.max_requests_per_seed` 53 | command-line parameter instead of this setting. 54 | 55 | Default: ``0`` 56 | 57 | Limit the number of follow-up requests per initial URL to the specified amount. 58 | Non-positive integers (i.e. 0 and below) impose no limit and disable this middleware. 59 | 60 | The limit is the total limit for all direct and indirect follow-up requests 61 | of each initial URL. 62 | 63 | Implemented by 64 | :class:`~zyte_spider_templates.MaxRequestsPerSeedDownloaderMiddleware`. 65 | 66 | .. setting:: OFFSITE_REQUESTS_PER_SEED_ENABLED 67 | 68 | OFFSITE_REQUESTS_PER_SEED_ENABLED 69 | ================================= 70 | 71 | Default: ``True`` 72 | 73 | Setting this value to ``True`` enables the 74 | :class:`~zyte_spider_templates.OffsiteRequestsPerSeedMiddleware` while ``False`` 75 | completely disables it. 76 | 77 | The middleware ensures that *most* requests would belong to the domain of the 78 | seed URLs. However, it does allow offsite requests only if they were obtained 79 | from a response that belongs to the domain of the seed URLs. Any other requests 80 | obtained thereafter from a response in a domain outside of the seed URLs will 81 | not be allowed. 82 | 83 | This prevents the spider from completely crawling other domains while ensuring 84 | that aggregator websites *(e.g. a news website with articles from other domains)* 85 | are supported, as it can access pages from other domains. 86 | 87 | Disabling the middleware would not prevent offsite requests from being filtered 88 | and might generally lead to other domains being crawled completely, unless 89 | ``allowed_domains`` is set in the spider. 90 | 91 | .. note:: 92 | 93 | If a seed URL gets redirected to a different domain, both the domain from 94 | the original request and the domain from the redirected response will be 95 | used as references. 96 | 97 | If the seed URL is `https://books.toscrape.com`, all subsequent requests to 98 | `books.toscrape.com` and its subdomains are allowed, but requests to 99 | `toscrape.com` are not. Conversely, if the seed URL is `https://toscrape.com`, 100 | requests to both `toscrape.com` and `books.toscrape.com` are allowed. 101 |
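The settings above are regular Scrapy settings, so they can be combined in the
project settings module. A minimal sketch with purely illustrative values:

.. code-block:: python
    :caption: ``settings.py``

    # Illustrative values only; each setting is described on this page.
    NAVIGATION_DEPTH_LIMIT = 2  # limit category navigation depth
    MAX_REQUESTS_PER_SEED = 100  # at most 100 follow-up requests per initial URL
    OFFSITE_REQUESTS_PER_SEED_ENABLED = True  # enabled by default; False disables the middleware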
102 | .. setting:: ONLY_FEEDS_ENABLED 103 | 104 | ONLY_FEEDS_ENABLED 105 | ================== 106 | 107 | .. note:: 108 | 109 | Only works for the :ref:`article spider template <article>`. 109 | 110 | Default: ``False`` 111 | 112 | Whether to extract links from Atom and RSS news feeds only (``True``) or 113 | to also use extracted links from ``ArticleNavigation.subCategories`` (``False``). 114 | 115 | Implemented by :class:`~zyte_spider_templates.OnlyFeedsMiddleware`. 116 | 117 | .. setting:: INCREMENTAL_CRAWL_BATCH_SIZE 118 | 119 | INCREMENTAL_CRAWL_BATCH_SIZE 120 | ============================ 121 | 122 | Default: ``50`` 123 | 124 | The maximum number of seen URLs to read from or write to the corresponding 125 | :ref:`Zyte Scrapy Cloud collection ` per request during an incremental 126 | crawl (see :setting:`INCREMENTAL_CRAWL_ENABLED`). 127 | 128 | This setting determines the batch size for interactions with the Collection. 129 | If the response from a webpage contains more than 50 URLs, they will be split 130 | into smaller batches for processing. Conversely, if fewer than 50 URLs are present, 131 | all URLs will be handled in a single request to the Collection. 132 | 133 | Adjusting this value can optimize the performance of a crawl by balancing the number 134 | of requests sent to the Collection with processing efficiency. 135 | 136 | .. note:: 137 | 138 | Setting it too large (e.g. > 100) will cause issues due to the large query length. 139 | Setting it too small (less than 10) will remove the benefit of using a batch. 140 | 141 | Implemented by :class:`~zyte_spider_templates.IncrementalCrawlMiddleware`. 142 | 143 |
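An incremental crawl is typically configured by combining this setting with the
two settings covered in the following sections. A sketch with illustrative
values (the collection name is hypothetical):

.. code-block:: python
    :caption: ``settings.py``

    # Illustrative sketch; see INCREMENTAL_CRAWL_ENABLED and
    # INCREMENTAL_CRAWL_COLLECTION_NAME below for details.
    INCREMENTAL_CRAWL_ENABLED = True
    # Hypothetical name; only ASCII alphanumeric characters and underscores are allowed.
    INCREMENTAL_CRAWL_COLLECTION_NAME = "articles_production"
    INCREMENTAL_CRAWL_BATCH_SIZE = 50  # the default batch size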
144 | .. setting:: INCREMENTAL_CRAWL_COLLECTION_NAME 145 | 146 | INCREMENTAL_CRAWL_COLLECTION_NAME 147 | ================================= 148 | 149 | .. note:: 150 | 151 | :ref:`virtual spiders ` are spiders based on :ref:`spider templates <spider-templates>`. 152 | The explanation of using INCREMENTAL_CRAWL_COLLECTION_NAME applies to both types of spiders. 153 | 154 | .. tip:: When using the :ref:`article spider template <article>`, you may use 155 | the 156 | :attr:`~zyte_spider_templates.spiders.article.ArticleSpiderParams.incremental_collection_name` 157 | command-line parameter instead of this setting. 158 | 159 | .. note:: 160 | Only ASCII alphanumeric characters and underscores are allowed. 161 | 162 | Default: `<spider name>_incremental`. 163 | The current spider's name here will be the virtual spider's name, if it's a virtual spider; 164 | otherwise, :data:`Spider.name <scrapy.Spider.name>`. 165 | 166 | Name of the :ref:`Zyte Scrapy Cloud collection ` used during 167 | an incremental crawl (see :setting:`INCREMENTAL_CRAWL_ENABLED`). 168 | 169 | By default, a collection named after the spider is used, meaning that matching URLs from 170 | previous runs of the same spider are skipped, provided those previous runs had 171 | the :setting:`INCREMENTAL_CRAWL_ENABLED` setting set to ``True`` or the spider 172 | argument `incremental` set to `true`. 173 | 174 | Using a different collection name makes sense, for example, in the following cases: 175 | - Different spiders share a collection. 176 | - The same spider uses different collections (e.g., for development runs vs. production runs). 177 | 178 | Implemented by :class:`~zyte_spider_templates.IncrementalCrawlMiddleware`. 179 | 180 | 181 | .. setting:: INCREMENTAL_CRAWL_ENABLED 182 | 183 | INCREMENTAL_CRAWL_ENABLED 184 | ========================= 185 | 186 | .. tip:: When using the :ref:`article spider template <article>
`, you may use 187 | the 188 | :attr:`~zyte_spider_templates.spiders.article.ArticleSpiderParams.incremental` 189 | command-line parameter instead of this setting. 190 | 191 | Default: ``False`` 192 | 193 | If set to ``True``, items seen in previous crawls with the same 194 | :setting:`INCREMENTAL_CRAWL_COLLECTION_NAME` value are skipped. 195 | 196 | Implemented by :class:`~zyte_spider_templates.IncrementalCrawlMiddleware`. 197 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | autodoc_pydantic==2.0.1 2 | enum-tools==0.11.0 3 | Sphinx==7.2.6 4 | sphinx-reredirects==0.1.3 5 | sphinx-rtd-theme==1.3.0 6 | sphinx-toolbox==3.5.0 # optional dependency of enum-tools 7 | -------------------------------------------------------------------------------- /docs/setup.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Initial setup 3 | ============= 4 | 5 | Learn how to get :ref:`spider templates ` installed and 6 | configured on an existing Scrapy_ project. 7 | 8 | .. _Scrapy: https://docs.scrapy.org/en/latest/ 9 | 10 | .. tip:: If you do not have a Scrapy project yet, use 11 | `zyte-spider-templates-project`_ as a starting template to get started 12 | quickly. 13 | 14 | .. _zyte-spider-templates-project: https://github.com/zytedata/zyte-spider-templates-project 15 | 16 | Requirements 17 | ============ 18 | 19 | - Python 3.9+ 20 | 21 | - Scrapy 2.11+ 22 | 23 | For Zyte API features, including AI-powered parsing, you need a `Zyte API`_ 24 | subscription. 25 | 26 | .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html 27 | 28 | Installation 29 | ============ 30 | 31 | .. code-block:: shell 32 | 33 | pip install zyte-spider-templates 34 | 35 | 36 | .. _config: 37 | 38 | Configuration 39 | ============= 40 | 41 | In your Scrapy project settings (usually in ``settings.py``): 42 | 43 | #. `Configure scrapy-poet`_. 44 | 45 | .. _Configure scrapy-poet: https://scrapy-poet.readthedocs.io/en/stable/intro/install.html#configuring-the-project 46 | 47 | #. For Zyte API features, including AI-powered parsing, :ref:`configure 48 | scrapy-zyte-api `. 49 | 50 | #. Configure :class:`zyte_common_items.ZyteItemAdapter`: 51 | 52 | .. code-block:: python 53 | :caption: ``settings.py`` 54 | 55 | from itemadapter import ItemAdapter 56 | from zyte_common_items import ZyteItemAdapter 57 | 58 | ItemAdapter.ADAPTER_CLASSES.appendleft(ZyteItemAdapter) 59 | 60 | #. Add the zyte-spider-templates add-on to your :setting:`ADDONS 61 | ` setting: 62 | 63 | .. code-block:: python 64 | :caption: ``settings.py`` 65 | 66 | ADDONS = { 67 | "zyte_spider_templates.Addon": 1000, 68 | } 69 | 70 | For an example of a properly configured ``settings.py`` file, see `the one 71 | in zyte-spider-templates-project`_. 72 | 73 | .. _the one in zyte-spider-templates-project: https://github.com/zytedata/zyte-spider-templates-project/blob/main/zyte_spider_templates_project/settings.py 74 | -------------------------------------------------------------------------------- /docs/templates/article.rst: -------------------------------------------------------------------------------- 1 | .. _article: 2 | 3 | ===================================== 4 | Article spider template (``article``) 5 | ===================================== 6 | 7 | Basic use 8 | ========= 9 | 10 | .. 
code-block:: shell 11 | 12 | scrapy crawl article -a url="https://www.zyte.com/blog/" 13 | 14 | Parameters 15 | ========== 16 | 17 | .. autopydantic_model:: zyte_spider_templates.spiders.article.ArticleSpiderParams 18 | :inherited-members: BaseModel 19 | :exclude-members: model_computed_fields, single_input 20 | 21 | Settings 22 | ======== 23 | 24 | The following :ref:`zyte-spider-templates settings ` may be useful 25 | for the article spider template: 26 | 27 | :setting:`NAVIGATION_DEPTH_LIMIT` 28 | Limit the crawling depth of subcategories. 29 | 30 | :setting:`OFFSITE_REQUESTS_PER_SEED_ENABLED` 31 | Skip follow-up requests if their URL points to a domain different from the 32 | domain of their initial URL. 33 | 34 | :setting:`ONLY_FEEDS_ENABLED` 35 | Extract links only from Atom and RSS news feeds. 36 | -------------------------------------------------------------------------------- /docs/templates/e-commerce.rst: -------------------------------------------------------------------------------- 1 | .. _e-commerce: 2 | 3 | ========================================== 4 | E-commerce spider template (``ecommerce``) 5 | ========================================== 6 | 7 | Basic use 8 | ========= 9 | 10 | .. code-block:: shell 11 | 12 | scrapy crawl ecommerce -a url="https://books.toscrape.com" 13 | 14 | Parameters 15 | ========== 16 | 17 | .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams 18 | :inherited-members: BaseModel 19 | :exclude-members: model_computed_fields, single_input 20 | 21 | Settings 22 | ======== 23 | 24 | The following :ref:`zyte-spider-templates settings ` may be useful 25 | for the e-commerce spider template: 26 | 27 | :setting:`MAX_REQUESTS_PER_SEED` 28 | Limit the number of follow-up requests per initial URL. 29 | -------------------------------------------------------------------------------- /docs/templates/google-search.rst: -------------------------------------------------------------------------------- 1 | .. _google-search: 2 | 3 | ================================================= 4 | Google search spider template (``google_search``) 5 | ================================================= 6 | 7 | Basic use 8 | ========= 9 | 10 | .. code-block:: shell 11 | 12 | scrapy crawl google_search -a search_queries="foo bar" 13 | 14 | Parameters 15 | ========== 16 | 17 | .. autopydantic_model:: zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams 18 | :inherited-members: BaseModel 19 | :exclude-members: model_computed_fields 20 | -------------------------------------------------------------------------------- /docs/templates/index.rst: -------------------------------------------------------------------------------- 1 | .. _spider-templates: 2 | 3 | ================ 4 | Spider templates 5 | ================ 6 | 7 | Built-in `spider templates`_ use `Zyte API automatic extraction`_ to provide 8 | automatic crawling and parsing, i.e. you can run these spiders on any website 9 | of the right type to automatically extract the desired structured data. 10 | 11 | .. _spider templates: https://docs.zyte.com/scrapy-cloud/usage/spiders.html#spider-templates-and-virtual-spiders 12 | .. _Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html 13 | 14 | For example, to extract all products from an e-commerce website, you can run 15 | the :ref:`e-commerce spider ` spider as follows: 16 | 17 | .. 
code-block:: shell 18 | 19 | scrapy crawl ecommerce -a url="https://books.toscrape.com" 20 | 21 | Spider templates support additional parameters beyond ``url``. See the 22 | documentation of each specific spider for details. 23 | 24 | You can also :ref:`customize spider templates ` to meet your 25 | needs. 26 | 27 | Spider template list 28 | ==================== 29 | 30 | :ref:`E-commerce <e-commerce>` 31 | Get products from an e-commerce website. 32 | 33 | :ref:`Google Search <google-search>` 34 | Get Google search results. 35 | 36 | :ref:`Article <article>
` 37 | Get articles from websites. 38 | 39 | :ref:`Job posting ` 40 | Get job postings from job websites. 41 | -------------------------------------------------------------------------------- /docs/templates/job-posting.rst: -------------------------------------------------------------------------------- 1 | .. _job-posting: 2 | 3 | ============================================= 4 | Job posting spider template (``job_posting``) 5 | ============================================= 6 | 7 | Basic use 8 | ========= 9 | 10 | .. code-block:: shell 11 | 12 | scrapy crawl job_posting -a url="https://books.toscrape.com" 13 | 14 | Parameters 15 | ========== 16 | 17 | .. autopydantic_model:: zyte_spider_templates.spiders.job_posting.JobPostingSpiderParams 18 | :inherited-members: BaseModel 19 | :exclude-members: model_computed_fields 20 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | profile = "black" 3 | multi_line_output = 3 4 | 5 | [tool.mypy] 6 | check_untyped_defs = true 7 | ignore_missing_imports = true 8 | 9 | [tool.black] 10 | target-version = ["py38", "py39", "py310", "py311", "py312"] 11 | force-exclude = "template.py" 12 | 13 | [tool.pytest.ini_options] 14 | filterwarnings = [ 15 | "ignore:deprecated string literal syntax::jmespath.lexer", 16 | ] 17 | addopts = [ 18 | "--reactor=asyncio", 19 | ] 20 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pre-commit 2 | pytest 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = 3 | # Refers to the max-line length. Let's suppress the error and simply 4 | # let black take care on how it wants to format the lines. 5 | E501, 6 | 7 | # E203 whitespace before ':' 8 | E203, 9 | 10 | # Refers to "line break before binary operator". 11 | # Similar to above, let black take care of the formatting. 12 | W503, 13 | 14 | # Refers to "necessary dict call - rewrite as a literal". 
15 | C408, 16 | 17 | # 1 blank line required between summary line and description 18 | D205, 19 | 20 | # First line should end with a period 21 | D400, 22 | 23 | # First line should be in imperative mood 24 | D401, 25 | 26 | # First line should not be the function's "signature" 27 | D402 28 | 29 | exclude = 30 | template.py 31 | 32 | per-file-ignores = 33 | # F401: Ignore "imported but unused" errors in __init__ files, as those 34 | # imports are there to expose submodule functions so they can be imported 35 | # directly from that module 36 | zyte_spider_templates/__init__.py:F401 37 | zyte_spider_templates/page_objects/__init__.py:F401 38 | zyte_spider_templates/page_objects/product_navigation_heuristics.py:F401 39 | zyte_spider_templates/pages/__init__.py:F401 40 | 41 | # E731: Ignore "do not assign a lambda expression, use a def" since 42 | # we're using quick shortcuts for the tests 43 | tests/test_ecommerce.py:E731 44 | tests/test_job_posting.py:E731 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="zyte-spider-templates", 5 | version="0.12.0", 6 | description="Spider templates for automatic crawlers.", 7 | long_description=open("README.rst").read(), 8 | long_description_content_type="text/x-rst", 9 | author="Zyte Group Ltd", 10 | author_email="info@zyte.com", 11 | url="https://github.com/zytedata/zyte-spider-templates", 12 | packages=find_packages(), 13 | include_package_data=True, 14 | install_requires=[ 15 | "duplicate-url-discarder>=0.2.0", 16 | "duplicate-url-discarder-rules>=2024.11.05", 17 | "extruct>=0.18.0", 18 | "feedparser>=6.0.11", 19 | "form2request>=0.2.0", 20 | "formasaurus>=0.10.0", 21 | "jmespath>=0.9.5", 22 | "pydantic>=2.1", 23 | "requests>=2.31.0", 24 | "scrapinghub >= 2.4.0", 25 | "scrapy>=2.11.0", 26 | "scrapy-poet>=0.24.0", 27 | "scrapy-spider-metadata>=0.2.0", 28 | "scrapy-zyte-api[provider]>=0.25.0", 29 | "web-poet>=0.17.1", 30 | "xtractmime>=0.2.1", 31 | "zyte-common-items>=0.26.2", 32 | ], 33 | classifiers=[ 34 | "Development Status :: 3 - Alpha", 35 | "Intended Audience :: Developers", 36 | "License :: OSI Approved :: BSD License", 37 | "Operating System :: OS Independent", 38 | "Programming Language :: Python :: 3", 39 | "Programming Language :: Python :: 3.9", 40 | "Programming Language :: Python :: 3.10", 41 | "Programming Language :: Python :: 3.11", 42 | "Programming Language :: Python :: 3.12", 43 | "Programming Language :: Python :: 3.13", 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Type 2 | 3 | import pytest 4 | from scrapy import Spider 5 | from scrapy.utils.test import TestSpider 6 | 7 | # https://docs.pytest.org/en/stable/how-to/writing_plugins.html#assertion-rewriting 8 | pytest.register_assert_rewrite("tests.utils") 9 | 10 | 11 | # scrapy.utils.test.get_crawler alternative that does not freeze settings. 12 | def get_crawler( 13 | *, settings: Optional[Dict[str, Any]] = None, spider_cls: Type[Spider] = TestSpider 14 | ): 15 | from scrapy.crawler import CrawlerRunner 16 | 17 | settings = settings or {} 18 | # Set by default settings that prevent deprecation warnings. 
19 | settings["REQUEST_FINGERPRINTER_IMPLEMENTATION"] = "2.7" 20 | runner = CrawlerRunner(settings) 21 | crawler = runner.create_crawler(spider_cls) 22 | return crawler 23 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, Any, Optional 4 | 5 | import pytest 6 | import pytest_twisted 7 | from aiohttp.test_utils import TestServer 8 | 9 | if TYPE_CHECKING: 10 | from aiohttp.web import Application 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def mockserver(): 15 | from .mockserver import MockServer 16 | 17 | with MockServer() as server: 18 | yield server 19 | 20 | 21 | # Copied verbatim from pytest-aiohttp. We can't use pytest-asyncio fixtures with 22 | # pytest-twisted, so we need to decorate this one with a pytest-twisted decorator. 23 | # See also https://github.com/pytest-dev/pytest-twisted/issues/188 24 | @pytest_twisted.async_yield_fixture(scope="module") 25 | async def aiohttp_server(): 26 | """Factory to create a TestServer instance, given an app. 27 | 28 | aiohttp_server(app, **kwargs) 29 | """ 30 | servers = [] 31 | 32 | async def go( 33 | app: Application, 34 | *, 35 | host: str = "127.0.0.1", 36 | port: Optional[int] = None, 37 | **kwargs: Any, 38 | ) -> TestServer: 39 | server = TestServer(app, host=host, port=port) 40 | await server.start_server(**kwargs) 41 | servers.append(server) 42 | return server 43 | 44 | yield go 45 | 46 | while servers: 47 | await servers.pop().close() 48 | 49 | 50 | @pytest_twisted.async_fixture(scope="module") 51 | async def zyte_api_server(aiohttp_server) -> TestServer: 52 | from fake_zyte_api.main import make_app 53 | 54 | app = make_app() 55 | return await aiohttp_server(app) 56 | -------------------------------------------------------------------------------- /tests/incremental/test_collection_fp_manager.py: -------------------------------------------------------------------------------- 1 | from asyncio import ensure_future 2 | from unittest.mock import MagicMock, patch 3 | 4 | import pytest 5 | from scrapy.statscollectors import StatsCollector 6 | from scrapy.utils.request import RequestFingerprinter 7 | from twisted.internet.defer import Deferred, inlineCallbacks 8 | 9 | from tests import get_crawler 10 | from zyte_spider_templates._incremental.manager import CollectionsFingerprintsManager 11 | from zyte_spider_templates.spiders.article import ArticleSpider 12 | 13 | 14 | @pytest.fixture 15 | def mock_crawler(): 16 | return MagicMock() 17 | 18 | 19 | def crawler_for_incremental(): 20 | url = "https://example.com" 21 | crawler = get_crawler() 22 | crawler.request_fingerprinter = RequestFingerprinter() 23 | crawler.stats = StatsCollector(crawler) 24 | crawler.spider = ArticleSpider.from_crawler(crawler, url=url) 25 | crawler.settings["ZYTE_PROJECT_ID"] = "000000" 26 | return crawler 27 | 28 | 29 | @pytest.mark.parametrize("batch_size", [50, 2]) 30 | @pytest.mark.parametrize( 31 | "fingerprints, keys_in_collection, fingerprints_batch, expected_result", 32 | [ 33 | ([], [], {"fp1", "fp2", "fp3"}, set()), 34 | (["fp1", "fp2", "fp3"], [], set(), set()), 35 | (["fp1", "fp2", "fp3"], ["fp1"], set(), {"fp1"}), 36 | (["fp1", "fp2", "fp3"], ["fp1", "fp2"], set(), {"fp1", "fp2"}), 37 | (["fp1", "fp2", "fp3"], ["fp1", "fp2", "fp3"], set(), {"fp1", "fp2", "fp3"}), 38 | ( 39 | ["fp1", "fp2", "fp3"], 40 | ["fp1", "fp2"], 41 | {("fp3", 
"url3")}, 42 | {"fp1", "fp2", "fp3"}, 43 | ), 44 | (["fp1", "fp2", "fp3"], [], {("fp3", "url3")}, {"fp3"}), 45 | ], 46 | ) 47 | @patch("scrapinghub.ScrapinghubClient") 48 | @inlineCallbacks 49 | def test_get_existing_fingerprints( 50 | mock_scrapinghub_client, 51 | batch_size, 52 | fingerprints, 53 | keys_in_collection, 54 | fingerprints_batch, 55 | expected_result, 56 | ): 57 | mock_client = MagicMock() 58 | mock_scrapinghub_client.return_value = mock_client 59 | 60 | mock_collection = MagicMock() 61 | mock_collection.count.return_value = 0 62 | mock_client.get_project.return_value.collections.get_store.return_value = ( 63 | mock_collection 64 | ) 65 | 66 | mock_crawler = MagicMock() 67 | mock_crawler.settings.getint.return_value = batch_size 68 | 69 | mock_manager = CollectionsFingerprintsManager(mock_crawler) 70 | mock_manager.get_keys_from_collection = MagicMock(return_value=keys_in_collection) # type: ignore 71 | mock_manager.batch = fingerprints_batch 72 | 73 | r = yield Deferred.fromFuture( 74 | ensure_future(mock_manager.get_existing_fingerprints_async(fingerprints)) 75 | ) 76 | assert r == expected_result 77 | 78 | 79 | @pytest.mark.parametrize( 80 | "fingerprints, expected_keys", 81 | [ 82 | ({"fp1", "fp2", "fp3"}, {"fp1", "fp2", "fp3"}), 83 | ({}, set()), 84 | ], 85 | ) 86 | @patch("scrapinghub.ScrapinghubClient") 87 | def test_get_keys_from_collection(mock_crawler, fingerprints, expected_keys): 88 | mock_collection = MagicMock() 89 | mock_collection.list.return_value = [ 90 | {"_key": key, "value": {}} for key in expected_keys 91 | ] 92 | mock_crawler.settings.getint.return_value = 50 93 | manager = CollectionsFingerprintsManager(mock_crawler) 94 | manager.collection = mock_collection # type: ignore 95 | assert manager.get_keys_from_collection(fingerprints) == expected_keys 96 | 97 | 98 | @pytest.mark.parametrize( 99 | "keys, expected_items_written", 100 | [ 101 | ( 102 | [("fp1", "url1"), ("fp2", "url2"), ("fp3", "url3")], 103 | [("fp1", "url1"), ("fp2", "url2"), ("fp3", "url3")], 104 | ), 105 | ([], []), 106 | ], 107 | ) 108 | @patch("scrapinghub.ScrapinghubClient") 109 | def test_save_to_collection(mock_crawler, keys, expected_items_written): 110 | mock_writer = MagicMock() 111 | mock_writer.write.return_value = expected_items_written 112 | mock_crawler.settings.getint.return_value = 50 113 | manager = CollectionsFingerprintsManager(mock_crawler) 114 | manager.writer = mock_writer # type: ignore 115 | manager.save_to_collection(keys) 116 | mock_writer.write.assert_called_once_with( 117 | [{"_key": key, "value": value} for key, value in keys] 118 | ) 119 | 120 | 121 | @pytest.mark.parametrize( 122 | "fingerprints, expected_batch, batch_size", 123 | [ 124 | ( 125 | [(f"fp{i}", f"url{i}") for i in range(1, 5)], 126 | {("fp4", "url4")}, 127 | 3, 128 | ), # No default min 129 | ([], set(), 20), 130 | ([("fp1", "url1")] * 19, {("fp1", "url1")}, 20), 131 | ( 132 | [(f"fp{i}", f"url{i}") for i in range(1, 103)], 133 | {(f"fp{i}", f"url{i}") for i in range(1, 103)}, 134 | 150, 135 | ), # No default max 136 | ( 137 | [(f"fp{i}", f"url{i}") for i in range(1, 53)], 138 | [("fp51", "url51"), ("fp52", "url52")], 139 | 0, 140 | ), # 50 by default 141 | ], 142 | ) 143 | @patch("scrapinghub.ScrapinghubClient") 144 | def test_save_fingerprints( 145 | mock_scrapinghub_client, fingerprints, expected_batch, batch_size 146 | ): 147 | crawler = crawler_for_incremental() 148 | if batch_size != 0: 149 | crawler.settings.set("INCREMENTAL_CRAWL_BATCH_SIZE", batch_size) 150 | fp_manager = 
CollectionsFingerprintsManager(crawler) 151 | fp_manager.save_batch = MagicMock(side_effect=fp_manager.save_batch) # type: ignore 152 | fp_manager.add_to_batch(fingerprints) 153 | assert fp_manager.batch == set(sorted(expected_batch, key=lambda x: int(x[0][2:]))) 154 | 155 | if len(fingerprints) >= fp_manager.batch_size: 156 | fp_manager.save_batch.assert_called_once() 157 | else: 158 | fp_manager.save_batch.assert_not_called() 159 | 160 | 161 | @pytest.mark.parametrize( 162 | "fingerprints_batch, expected_batch_size", 163 | [ 164 | ([], 0), 165 | ([("fp1", "url1"), ("fp2", "url2"), ("fp3", "url3")], 0), 166 | ], 167 | ) 168 | @patch("scrapinghub.ScrapinghubClient") 169 | def test_save_batch(mock_crawler, fingerprints_batch, expected_batch_size): 170 | crawler = crawler_for_incremental() 171 | fp_manager = CollectionsFingerprintsManager(crawler) 172 | fp_manager.batch = set(fingerprints_batch) 173 | fp_manager.save_batch() 174 | assert len(fp_manager.batch) == expected_batch_size 175 | 176 | 177 | @pytest.mark.parametrize( 178 | "project_id, collection_name, expected_collection", 179 | [ 180 | ("project1", "collection1", MagicMock()), 181 | ("project2", "collection2", MagicMock()), 182 | ], 183 | ) 184 | @patch("scrapinghub.ScrapinghubClient") 185 | def test_init_collection( 186 | mock_scrapinghub_client, 187 | mock_crawler, 188 | project_id, 189 | collection_name, 190 | expected_collection, 191 | ): 192 | mock_scrapinghub_instance = MagicMock() 193 | mock_get_project = MagicMock() 194 | mock_get_project.collections.get_store.return_value = expected_collection 195 | mock_scrapinghub_instance.get_project.return_value = mock_get_project 196 | mock_scrapinghub_client.return_value = mock_scrapinghub_instance 197 | mock_crawler.settings.getint.return_value = 50 198 | manager = CollectionsFingerprintsManager(mock_crawler) 199 | manager.init_collection(project_id, collection_name) 200 | assert manager.collection == expected_collection 201 | 202 | 203 | @patch("scrapinghub.ScrapinghubClient") 204 | def test_spider_closed(mock_scrapinghub_client): 205 | crawler = crawler_for_incremental() 206 | fp_manager = CollectionsFingerprintsManager(crawler) 207 | fp_manager.save_batch = MagicMock(side_effect=fp_manager.save_batch) # type: ignore 208 | fp_manager.spider_closed() 209 | fp_manager.save_batch.assert_called_once() 210 | -------------------------------------------------------------------------------- /tests/incremental/test_middleware.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | from pytest_twisted import ensureDeferred 5 | from scrapy.exceptions import CloseSpider, NotConfigured 6 | from scrapy.http import Request, Response 7 | from scrapy.settings import Settings 8 | from scrapy.statscollectors import StatsCollector 9 | from scrapy.utils.request import RequestFingerprinter 10 | 11 | from tests import get_crawler 12 | from zyte_spider_templates import IncrementalCrawlMiddleware 13 | from zyte_spider_templates._incremental.manager import IncrementalCrawlingManager 14 | from zyte_spider_templates.spiders.article import ArticleSpider 15 | 16 | 17 | def crawler_for_incremental(): 18 | url = "https://example.com" 19 | crawler = get_crawler() 20 | crawler.request_fingerprinter = RequestFingerprinter() 21 | crawler.stats = StatsCollector(crawler) 22 | crawler.spider = ArticleSpider.from_crawler(crawler, url=url) 23 | crawler.settings["ZYTE_PROJECT_ID"] = "000000" 24 | return crawler 25 | 26 | 27 | def 
test_middleware_init_not_configured(): 28 | crawler = crawler_for_incremental() 29 | crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": False}) 30 | 31 | with pytest.raises(NotConfigured): 32 | IncrementalCrawlMiddleware(crawler) 33 | 34 | 35 | @patch("scrapinghub.ScrapinghubClient") 36 | def test_middleware_init_configured(mock_scrapinghub_client): 37 | crawler = crawler_for_incremental() 38 | crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True}) 39 | 40 | middleware = IncrementalCrawlMiddleware(crawler) 41 | assert isinstance(middleware.inc_manager, IncrementalCrawlingManager) 42 | 43 | 44 | @patch("scrapinghub.ScrapinghubClient") 45 | def test_prepare_manager_with_collection_fp_success(mock_scrapinghub_client): 46 | crawler = crawler_for_incremental() 47 | crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True}) 48 | 49 | manager = IncrementalCrawlMiddleware.prepare_incremental_manager(crawler) 50 | assert isinstance(manager, IncrementalCrawlingManager) 51 | 52 | 53 | def test_prepare_manager_with_collection_fp_failure(caplog): 54 | crawler = crawler_for_incremental() 55 | crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True}) 56 | 57 | caplog.clear() 58 | with pytest.raises(CloseSpider): 59 | IncrementalCrawlMiddleware.prepare_incremental_manager(crawler) 60 | 61 | 62 | @patch("scrapinghub.ScrapinghubClient") 63 | @ensureDeferred 64 | async def test_middleware_process_spider_output(mock_scrapinghub_client): 65 | crawler = crawler_for_incremental() 66 | crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True}) 67 | 68 | middleware = IncrementalCrawlMiddleware(crawler) 69 | request = Request(url=crawler.spider.url) 70 | response = Response(url=crawler.spider.url, request=request) 71 | input_result = [ 72 | Request(url="https://example.com/1"), 73 | Request(url="https://example.com/2"), 74 | Request(url="https://example.com/3"), 75 | ] 76 | 77 | async def async_generator(): 78 | for item in input_result: 79 | yield item 80 | 81 | processed_result_list = [] 82 | 83 | async for processed_item in middleware.process_spider_output( 84 | response, async_generator(), crawler.spider 85 | ): 86 | processed_result_list.append(processed_item) 87 | 88 | for res_ex, res_proc in zip(input_result, processed_result_list): 89 | assert res_ex == res_proc 90 | -------------------------------------------------------------------------------- /tests/mockserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import socket 4 | import sys 5 | import time 6 | from importlib import import_module 7 | from subprocess import PIPE, Popen 8 | from typing import Any, Dict 9 | 10 | from scrapy_zyte_api.responses import _API_RESPONSE 11 | from twisted.internet import reactor 12 | from twisted.web.resource import Resource 13 | from twisted.web.server import Site 14 | 15 | 16 | def get_ephemeral_port(): 17 | s = socket.socket() 18 | s.bind(("", 0)) 19 | return s.getsockname()[1] 20 | 21 | 22 | class DefaultResource(Resource): 23 | """Mock server to fake Zyte API responses. 24 | 25 | To use, include the mockserver fixture in the signature of your test, and 26 | point the ZYTE_API_URL setting to the mock server. See 27 | ``tests/test_ecommerce.py::test_crawl_strategies`` for an example. 
28 | 29 | This mock server is designed to fake the following: 30 | 31 | - An e-commerce website with the following pages: 32 | 33 | ``` 34 | https://example.com/ 35 | https://example.com/page/2 36 | https://example.com/category/1 37 | https://example.com/category/1/page/2 38 | https://example.com/non-navigation 39 | ``` 40 | 41 | When browserHtml is requested (for any URL, listed above or not), it is 42 | a minimal HTML with an anchor tag pointing to 43 | https://example.com/non-navigation. 44 | 45 | When productNavigation is requested, nextPage and subCategories are filled 46 | accordingly. productNavigation.items always has 2 product URLs, which are 47 | the result of appending ``/product/`` to the request URL. 48 | https://example.com/non-navigation is not reachable through 49 | productNavigation. 50 | 51 | When product or productList is requested, an item with the current URL is 52 | always returned. 53 | 54 | All output also includes unsupported links (mailto:…). 55 | 56 | - Job-posting websites with the following endpoints: 57 | 58 | - https://jobs.example (jobPostingNavigation pointing to the 2 items 59 | below). 60 | 61 | - https://jobs.offsite.example/jobs/1 (jobPosting) 62 | 63 | - https://jobs.offsite.example/jobs/2 (jobPosting) 64 | """ 65 | 66 | def getChild(self, path, request): 67 | return self 68 | 69 | def render_POST(self, request): 70 | request_data = json.loads(request.content.read()) 71 | request.responseHeaders.setRawHeaders( 72 | b"Content-Type", 73 | [b"application/json"], 74 | ) 75 | request.responseHeaders.setRawHeaders( 76 | b"request-id", 77 | [b"abcd1234"], 78 | ) 79 | 80 | response_data: _API_RESPONSE = {} 81 | 82 | response_data["url"] = request_data["url"] 83 | 84 | if request_data["url"] == "https://jobs.example": 85 | assert request_data["jobPostingNavigation"] is True 86 | response_data["jobPostingNavigation"] = { 87 | "url": request_data["url"], 88 | "items": [ 89 | {"url": "https://jobs.offsite.example/jobs/1"}, 90 | {"url": "https://jobs.offsite.example/jobs/2"}, 91 | ], 92 | } 93 | return json.dumps(response_data).encode() 94 | 95 | if request_data["url"].startswith("https://jobs.offsite.example/"): 96 | assert request_data["jobPosting"] is True 97 | response_data["jobPosting"] = { 98 | "url": request_data["url"], 99 | } 100 | return json.dumps(response_data).encode() 101 | 102 | non_navigation_url = "https://example.com/non-navigation" 103 | html = f"""""" 104 | if request_data.get("browserHtml", False) is True: 105 | response_data["browserHtml"] = html 106 | 107 | if request_data.get("product", False) is True: 108 | response_data["product"] = { 109 | "url": request_data["url"], 110 | } 111 | 112 | if request_data.get("productList", False) is True: 113 | response_data["productList"] = { 114 | "url": request_data["url"], 115 | } 116 | 117 | if request_data.get("productNavigation", False) is True: 118 | kwargs: Dict[str, Any] = {} 119 | if ( 120 | "/page/" not in request_data["url"] 121 | and "/non-navigation" not in request_data["url"] 122 | ): 123 | kwargs["nextPage"] = { 124 | "url": f"{request_data['url'].rstrip('/')}/page/2" 125 | } 126 | if "/category/" not in request_data["url"]: 127 | kwargs["subCategories"] = [ 128 | {"url": "mailto:jane@example.com"}, 129 | {"url": f"{request_data['url'].rstrip('/')}/category/1"}, 130 | ] 131 | else: 132 | kwargs["nextPage"] = {"url": "mailto:jane@example.com"} 133 | response_data["productNavigation"] = { 134 | "url": request_data["url"], 135 | "items": [ 136 | {"url": "mailto:jane@example.com"}, 137 | 
{"url": f"{request_data['url'].rstrip('/')}/product/1"}, 138 | {"url": f"{request_data['url'].rstrip('/')}/product/2"}, 139 | ], 140 | **kwargs, 141 | } 142 | 143 | return json.dumps(response_data).encode() 144 | 145 | 146 | class MockServer: 147 | def __init__(self, resource=None, port=None): 148 | resource = resource or DefaultResource 149 | self.resource = "{}.{}".format(resource.__module__, resource.__name__) 150 | self.proc = None 151 | self.host = socket.gethostbyname(socket.gethostname()) 152 | self.port = port or get_ephemeral_port() 153 | self.root_url = "http://%s:%d" % (self.host, self.port) 154 | 155 | def __enter__(self): 156 | self.proc = Popen( 157 | [ 158 | sys.executable, 159 | "-u", 160 | "-m", 161 | "tests.mockserver", 162 | self.resource, 163 | "--port", 164 | str(self.port), 165 | ], 166 | stdout=PIPE, 167 | ) 168 | assert self.proc.stdout is not None 169 | self.proc.stdout.readline() 170 | return self 171 | 172 | def __exit__(self, exc_type, exc_value, traceback): 173 | assert self.proc is not None 174 | self.proc.kill() 175 | self.proc.wait() 176 | time.sleep(0.2) 177 | 178 | def urljoin(self, path): 179 | return self.root_url + path 180 | 181 | 182 | def main(): 183 | parser = argparse.ArgumentParser() 184 | parser.add_argument("resource") 185 | parser.add_argument("--port", type=int) 186 | args = parser.parse_args() 187 | module_name, name = args.resource.rsplit(".", 1) 188 | sys.path.append(".") 189 | resource = getattr(import_module(module_name), name)() 190 | # Typing issue: https://github.com/twisted/twisted/issues/9909 191 | http_port = reactor.listenTCP(args.port, Site(resource)) # type: ignore[attr-defined] 192 | 193 | def print_listening(): 194 | host = http_port.getHost() 195 | print( 196 | "Mock server {} running at http://{}:{}".format( 197 | resource, host.host, host.port 198 | ) 199 | ) 200 | 201 | # Typing issue: https://github.com/twisted/twisted/issues/9909 202 | reactor.callWhenRunning(print_listening) # type: ignore[attr-defined] 203 | reactor.run() # type: ignore[attr-defined] 204 | 205 | 206 | if __name__ == "__main__": 207 | main() 208 | -------------------------------------------------------------------------------- /tests/pages/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zytedata/zyte-spider-templates/d87e3e4c23b83fba5860ae3428e6ff4a49c3f536/tests/pages/__init__.py -------------------------------------------------------------------------------- /tests/pages/test_article_navigation_heuristics.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | from pytest_twisted import ensureDeferred 4 | from web_poet import ( 5 | AnyResponse, 6 | HttpResponse, 7 | HttpResponseHeaders, 8 | PageParams, 9 | RequestUrl, 10 | Stats, 11 | ) 12 | from zyte_common_items import ProbabilityMetadata, ProbabilityRequest 13 | 14 | from zyte_spider_templates.pages.article_heuristics import ( 15 | HeuristicsArticleNavigationPage, 16 | ) 17 | 18 | 19 | @ensureDeferred 20 | async def test_article_page(): 21 | body = b""" 22 | 23 | 24 |
25 |

Categories

26 |
27 | UX 28 | CSS 29 |
30 |

31 |
32 |

Articles

33 |
34 | Modern CSS 35 | How to run UX 36 |
37 | 38 | Next Page 39 | 40 |

41 | 46 | 47 | 48 | """ 49 | response = AnyResponse(HttpResponse("https://example.com", body)) 50 | 51 | rss_content = b""" 52 | 53 | 54 | Sample RSS Feed 55 | http://example.com/feed/rss.xml 56 | This is a sample RSS feed 57 | 58 | Item 1 59 | http://example.com/item1 60 | Description of Item 1 61 | 62 | 63 | Item 2 64 | http://example.com/item2 65 | Description of Item 2 66 | 67 | 68 | 69 | """ 70 | rss_response = AnyResponse( 71 | HttpResponse( 72 | "https://example.com/feed/rss.xml", 73 | rss_content, 74 | headers=HttpResponseHeaders({"Content-Type": "text/xml"}), 75 | ) 76 | ) 77 | 78 | urls_subcategories = [ 79 | {"url": "https://example.com/category/UX", "name": "UX"}, 80 | {"url": "https://example.com/category/CSS", "name": "CSS"}, 81 | {"url": "https://example.com/2024/05/modern-css", "name": "Modern CSS"}, 82 | {"url": "https://example.com/2024/04/how-run-ux", "name": "How to run UX"}, 83 | {"url": "https://example.com/page-2", "name": "Next Page"}, 84 | {"url": "https://another-example.com", "name": "Link to other domain"}, 85 | ] 86 | requests_subcategories = [ 87 | ProbabilityRequest( 88 | url=subcat["url"], 89 | name=f"[heuristics][articleNavigation][subCategories] {subcat['name']}", 90 | headers=None, 91 | metadata=ProbabilityMetadata(probability=0.5), 92 | ) 93 | for subcat in urls_subcategories 94 | ] 95 | 96 | urls_feed = [ 97 | {"url": "https://example.com/feed/rss.xml"}, 98 | ] 99 | requests_feed = [ 100 | ProbabilityRequest( 101 | url=feed["url"], 102 | name="[heuristics][articleNavigation][feed] ", 103 | headers=None, 104 | metadata=ProbabilityMetadata(probability=1.0), 105 | ) 106 | for feed in urls_feed 107 | ] 108 | 109 | feed_items = ["http://example.com/item1", "http://example.com/item2"] 110 | 111 | urls_items = [ 112 | {"url": "https://example.com/category/UX", "name": "UX"}, 113 | {"url": "https://example.com/category/CSS", "name": "CSS"}, 114 | {"url": "https://example.com/2024/05/modern-css", "name": "Modern CSS"}, 115 | {"url": "https://example.com/2024/04/how-run-ux", "name": "How to run UX"}, 116 | {"url": "https://example.com/page-2", "name": "Next Page"}, 117 | {"url": "https://another-example.com", "name": "Link to other domain"}, 118 | ] 119 | requests_items = [ 120 | ProbabilityRequest( 121 | url=item["url"], 122 | name=f"[heuristics][articleNavigation][article] {item['name']}", 123 | headers=None, 124 | metadata=ProbabilityMetadata(probability=0.5), 125 | ) 126 | for item in urls_items 127 | ] 128 | 129 | request_url = RequestUrl(response.url) 130 | rss_url = RequestUrl(rss_response.url) 131 | 132 | # final_navigation_page = True 133 | page_params = PageParams({"skip_subcategories": True}) 134 | page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) 135 | item = await page.to_item() 136 | 137 | assert page.skip_subcategories() 138 | assert item.subCategories[0].url == "https://example.com/feed/rss.xml" 139 | assert [item.url for item in item.items] == [item["url"] for item in urls_items] 140 | 141 | # final_navigation_page = False 142 | page_params = PageParams({"skip_subcategories": False}) 143 | page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) 144 | item = await page.to_item() 145 | 146 | assert not page.skip_subcategories() 147 | assert item.subCategories == requests_feed + requests_subcategories 148 | assert item.items == requests_items 149 | 150 | # no final_navigation_page (False by default) 151 | page_params = PageParams() 152 | page = HeuristicsArticleNavigationPage(request_url, 
response, Stats(), page_params) 153 | item = await page.to_item() 154 | 155 | assert not page.skip_subcategories() 156 | assert item.subCategories == requests_feed + requests_subcategories 157 | assert item.items == requests_items 158 | 159 | # only_feeds = True, request to page 160 | page_params = PageParams({"only_feeds": True}) 161 | page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) 162 | item = await page.to_item() 163 | 164 | assert page.is_only_feeds() 165 | assert item.subCategories[0].url == str(rss_url) 166 | assert [item.url for item in item.items] == [] 167 | 168 | # only_feeds = True, request to feed 169 | page = HeuristicsArticleNavigationPage(rss_url, rss_response, Stats(), page_params) 170 | with patch.object( 171 | HeuristicsArticleNavigationPage, "_is_response_feed", return_value=True 172 | ): 173 | item = await page.to_item() 174 | assert page.is_only_feeds() 175 | assert item.subCategories == [] 176 | assert [item.url for item in item.items] == feed_items 177 | 178 | # only_feeds = False, request to page 179 | page_params = PageParams({"only_feeds": False}) 180 | page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) 181 | item = await page.to_item() 182 | 183 | assert not page.is_only_feeds() 184 | assert item.subCategories == requests_feed + requests_subcategories 185 | assert item.items == requests_items 186 | 187 | # only_feeds = False, request to feed 188 | page = HeuristicsArticleNavigationPage(rss_url, rss_response, Stats(), page_params) 189 | with patch.object( 190 | HeuristicsArticleNavigationPage, "_is_response_feed", return_value=True 191 | ): 192 | item = await page.to_item() 193 | assert not page.is_only_feeds() 194 | assert item.subCategories == [] 195 | assert [item.url for item in item.items] == feed_items 196 | 197 | # no only_feeds (False by default) 198 | page_params = PageParams() 199 | page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) 200 | item = await page.to_item() 201 | 202 | assert not page.is_only_feeds() 203 | assert item.subCategories == requests_feed + requests_subcategories 204 | assert item.items == requests_items 205 | 206 | # no only_feeds (False by default), request to feed 207 | page = HeuristicsArticleNavigationPage(rss_url, rss_response, Stats(), page_params) 208 | with patch.object( 209 | HeuristicsArticleNavigationPage, "_is_response_feed", return_value=True 210 | ): 211 | item = await page.to_item() 212 | assert not page.is_only_feeds() 213 | assert item.subCategories == [] 214 | assert [item.url for item in item.items] == feed_items 215 | -------------------------------------------------------------------------------- /tests/pages/test_product_navigation_heuristics.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pytest_twisted import ensureDeferred 3 | from web_poet import AnyResponse, HttpResponse, PageParams, RequestUrl 4 | from zyte_common_items import ProbabilityRequest, ProductNavigation 5 | 6 | from zyte_spider_templates.pages.product_navigation_heuristics import ( 7 | HeuristicsProductNavigationPage, 8 | ) 9 | 10 | 11 | @ensureDeferred 12 | async def test_unknown_product_page(): 13 | body = b""" 14 | 15 | 16 |
17 |

Subcategories

18 |
19 | Sentinels 20 | Duelists 21 |
22 |

23 |
24 |

Items

25 |
26 | Reyna 27 | Jett 28 |
29 | 30 | Next Page 31 | 32 |

33 | category?? 34 | 38 | 39 | 40 | """ 41 | response = AnyResponse(HttpResponse("https://example.com", body)) 42 | navigation = ProductNavigation.from_dict( 43 | { 44 | "url": "https://example.com", 45 | "subCategories": [ 46 | {"url": "https://example.com/categ/sentinels", "name": "Sentinels"}, 47 | {"url": "https://example.com/categ/duelists", "name": "Duelists"}, 48 | ], 49 | "items": [ 50 | {"url": "https://example.com/p?id=reyna", "name": "Reyna"}, 51 | {"url": "https://example.com/p?id=jett", "name": "Jett"}, 52 | ], 53 | "nextPage": { 54 | "url": "https://example.com/page-2", 55 | "name": "Next Page", 56 | }, 57 | "metadata": {"dateDownloaded": "2024-01-09T14:37:58Z"}, 58 | } 59 | ) 60 | all_valid_urls = [ 61 | "https://example.com/categ/sentinels", 62 | "https://example.com/categ/duelists", 63 | "https://example.com/p?id=reyna", 64 | "https://example.com/p?id=jett", 65 | "https://example.com/page-2", 66 | ] 67 | urls_subcategories = [ 68 | ProbabilityRequest.from_dict( 69 | {"url": "https://example.com/categ/sentinels", "name": "Sentinels"} 70 | ), 71 | ProbabilityRequest.from_dict( 72 | {"url": "https://example.com/categ/duelists", "name": "Duelists"} 73 | ), 74 | ] 75 | 76 | # Heuristics turned OFF 77 | request_url = RequestUrl(response.url) 78 | page_params = PageParams({"allow_domains": "example.com"}) 79 | page = HeuristicsProductNavigationPage( 80 | request_url, navigation, response, page_params 81 | ) 82 | item = await page.to_item() 83 | 84 | assert item.subCategories == urls_subcategories 85 | assert page._urls_for_category() == all_valid_urls 86 | 87 | # Heuristics turned ON 88 | page_params = PageParams({"full_domain": "example.com"}) 89 | page = HeuristicsProductNavigationPage( 90 | request_url, navigation, response, page_params 91 | ) 92 | item = await page.to_item() 93 | 94 | assert item.subCategories == urls_subcategories + [ 95 | ProbabilityRequest.from_dict( 96 | { 97 | "url": "https://example.com/categ/probably", 98 | "name": "[heuristics] category??", 99 | "metadata": {"probability": 0.1}, 100 | } 101 | ) 102 | ] 103 | assert page._urls_for_category() == all_valid_urls 104 | 105 | 106 | @ensureDeferred 107 | async def test_crawl_nofollow_links(): 108 | page_params = PageParams({"full_domain": "example.com"}) 109 | body = b""" 110 | 111 | 112 |
113 | Outside link 114 | Can follow 115 | Dont follow 116 |
117 | 118 | 119 | """ 120 | url = "https://example.com" 121 | response = AnyResponse(HttpResponse(url, body)) 122 | request_url = RequestUrl(response.url) 123 | navigation = ProductNavigation(url=url) 124 | 125 | page = HeuristicsProductNavigationPage( 126 | request_url, navigation, response, page_params 127 | ) 128 | assert [req.url for req in page.subCategories] == ["https://example.com/can-follow"] 129 | 130 | 131 | @pytest.mark.deprication_warning 132 | def test_deprecated_page_objects(): 133 | with pytest.warns(DeprecationWarning, match="page_objects"): 134 | from zyte_spider_templates.page_objects import ( # noqa: F401 135 | HeuristicsProductNavigationPage, 136 | ) 137 | 138 | # We cannot test the warning again because duplicate warnings are ignored, 139 | # but we still want to ensure that we can import the class. 140 | from zyte_spider_templates.page_objects.product_navigation_heuristics import ( # noqa: F401, F811 141 | HeuristicsProductNavigationPage, 142 | ) 143 | -------------------------------------------------------------------------------- /tests/test_addon.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import scrapy 3 | from duplicate_url_discarder_rules import RULE_PATHS 4 | from packaging import version 5 | from scrapy.utils.test import get_crawler 6 | 7 | from zyte_spider_templates import ( 8 | AllowOffsiteMiddleware, 9 | CrawlingLogsMiddleware, 10 | IncrementalCrawlMiddleware, 11 | MaxRequestsPerSeedDownloaderMiddleware, 12 | OffsiteRequestsPerSeedMiddleware, 13 | OnlyFeedsMiddleware, 14 | TrackNavigationDepthSpiderMiddleware, 15 | TrackSeedsSpiderMiddleware, 16 | ) 17 | 18 | _crawler = get_crawler() 19 | BASELINE_SETTINGS = _crawler.settings.copy_to_dict() 20 | 21 | try: 22 | from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware # noqa: F401 23 | except ImportError: 24 | BUILTIN_OFFSITE_MIDDLEWARE_IMPORT_PATH = ( 25 | "scrapy.spidermiddlewares.offsite.OffsiteMiddleware" 26 | ) 27 | else: 28 | BUILTIN_OFFSITE_MIDDLEWARE_IMPORT_PATH = ( 29 | "scrapy.downloadermiddlewares.offsite.OffsiteMiddleware" 30 | ) 31 | 32 | 33 | # https://github.com/scrapy-plugins/scrapy-zyte-api/blob/a1d81d11854b420248f38e7db49c685a8d46d943/tests/test_addon.py#L109 34 | def _test_setting_changes(initial_settings, expected_settings): 35 | settings = { 36 | **initial_settings, 37 | "ADDONS": { 38 | "zyte_spider_templates.Addon": 1000, 39 | }, 40 | } 41 | crawler = get_crawler(settings_dict=settings) 42 | crawler._apply_settings() 43 | actual_settings = crawler.settings.copy_to_dict() 44 | 45 | # Test separately settings that copy_to_dict messes up. 
46 | for setting in ( 47 | "DOWNLOADER_MIDDLEWARES", 48 | "SCRAPY_POET_PROVIDERS", 49 | "SPIDER_MIDDLEWARES", 50 | ): 51 | if setting not in crawler.settings: 52 | assert setting not in expected_settings 53 | continue 54 | assert crawler.settings.getdict(setting) == expected_settings.pop(setting) 55 | del actual_settings[setting] 56 | 57 | for key in BASELINE_SETTINGS: 58 | if key in actual_settings and actual_settings[key] == BASELINE_SETTINGS[key]: 59 | del actual_settings[key] 60 | del actual_settings["ADDONS"] 61 | 62 | assert actual_settings == expected_settings 63 | 64 | 65 | @pytest.mark.parametrize( 66 | ("initial_settings", "expected_settings"), 67 | ( 68 | ( 69 | {}, 70 | { 71 | "CLOSESPIDER_TIMEOUT_NO_ITEM": 600, 72 | "DOWNLOADER_MIDDLEWARES": { 73 | MaxRequestsPerSeedDownloaderMiddleware: 100, 74 | BUILTIN_OFFSITE_MIDDLEWARE_IMPORT_PATH: None, 75 | AllowOffsiteMiddleware: 50, 76 | }, 77 | "SCHEDULER_DISK_QUEUE": "scrapy.squeues.PickleFifoDiskQueue", 78 | "SCHEDULER_MEMORY_QUEUE": "scrapy.squeues.FifoMemoryQueue", 79 | "SCHEDULER_PRIORITY_QUEUE": "scrapy.pqueues.DownloaderAwarePriorityQueue", 80 | "ITEM_PROBABILITY_THRESHOLDS": { 81 | "zyte_common_items.items.Article": 0.1, 82 | "zyte_common_items.items.Product": 0.1, 83 | }, 84 | "DUD_LOAD_RULE_PATHS": RULE_PATHS, 85 | "SCRAPY_POET_DISCOVER": [ 86 | "zyte_spider_templates.pages", 87 | ], 88 | "SPIDER_MIDDLEWARES": { 89 | IncrementalCrawlMiddleware: 45, 90 | OffsiteRequestsPerSeedMiddleware: 49, 91 | OnlyFeedsMiddleware: 108, 92 | TrackNavigationDepthSpiderMiddleware: 110, 93 | TrackSeedsSpiderMiddleware: 550, 94 | CrawlingLogsMiddleware: 1000, 95 | }, 96 | "SPIDER_MODULES": [ 97 | "zyte_spider_templates.spiders", 98 | ], 99 | }, 100 | ), 101 | ), 102 | ) 103 | @pytest.mark.skipif( 104 | version.parse(scrapy.__version__) < version.parse("2.11.2"), 105 | reason="Test applicable only for Scrapy versions >= 2.11.2", 106 | ) 107 | def test_poet_setting_changes_since_scrapy_2_11_2(initial_settings, expected_settings): 108 | _test_setting_changes(initial_settings, expected_settings) 109 | 110 | 111 | @pytest.mark.parametrize( 112 | ("initial_settings", "expected_settings"), 113 | ( 114 | ( 115 | {}, 116 | { 117 | "CLOSESPIDER_TIMEOUT_NO_ITEM": 600, 118 | "DOWNLOADER_MIDDLEWARES": {MaxRequestsPerSeedDownloaderMiddleware: 100}, 119 | "SCHEDULER_DISK_QUEUE": "scrapy.squeues.PickleFifoDiskQueue", 120 | "SCHEDULER_MEMORY_QUEUE": "scrapy.squeues.FifoMemoryQueue", 121 | "SCHEDULER_PRIORITY_QUEUE": "scrapy.pqueues.DownloaderAwarePriorityQueue", 122 | "ITEM_PROBABILITY_THRESHOLDS": { 123 | "zyte_common_items.items.Article": 0.1, 124 | "zyte_common_items.items.Product": 0.1, 125 | }, 126 | "DUD_LOAD_RULE_PATHS": RULE_PATHS, 127 | "SCRAPY_POET_DISCOVER": [ 128 | "zyte_spider_templates.pages", 129 | ], 130 | "SPIDER_MIDDLEWARES": { 131 | IncrementalCrawlMiddleware: 45, 132 | OffsiteRequestsPerSeedMiddleware: 49, 133 | OnlyFeedsMiddleware: 108, 134 | TrackNavigationDepthSpiderMiddleware: 110, 135 | BUILTIN_OFFSITE_MIDDLEWARE_IMPORT_PATH: None, 136 | AllowOffsiteMiddleware: 500, 137 | TrackSeedsSpiderMiddleware: 550, 138 | CrawlingLogsMiddleware: 1000, 139 | }, 140 | "SPIDER_MODULES": [ 141 | "zyte_spider_templates.spiders", 142 | ], 143 | }, 144 | ), 145 | ), 146 | ) 147 | @pytest.mark.skipif( 148 | version.parse(scrapy.__version__) >= version.parse("2.11.2"), 149 | reason="Test applicable only for Scrapy versions < 2.11.2", 150 | ) 151 | def test_poet_setting_changes(initial_settings, expected_settings): 152 | 
_test_setting_changes(initial_settings, expected_settings) 153 | -------------------------------------------------------------------------------- /tests/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from zyte_spider_templates import BaseSpiderParams 4 | 5 | 6 | def test_deprecation(): 7 | with pytest.deprecated_call(match="^BaseSpiderParams is deprecated.*"): 8 | BaseSpiderParams(url="https://example.com") # type: ignore[call-arg] 9 | -------------------------------------------------------------------------------- /tests/test_feeds.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | import pytest 4 | from web_poet import ( 5 | AnyResponse, 6 | BrowserHtml, 7 | BrowserResponse, 8 | HttpResponse, 9 | HttpResponseBody, 10 | ResponseUrl, 11 | ) 12 | 13 | from zyte_spider_templates.feeds import get_feed_urls, parse_feed, unique_urls 14 | 15 | 16 | @pytest.fixture 17 | def sample_urls() -> List[str]: 18 | return [ 19 | "http://example.com", 20 | "http://example.com/", 21 | "https://example.com", 22 | "https://example.com/", 23 | "http://example.com/page", 24 | "http://example.com/page/", 25 | ] 26 | 27 | 28 | def test_unique_urls(sample_urls): 29 | unique_list = unique_urls(sample_urls) 30 | assert len(unique_list) == 4 31 | 32 | 33 | def test_unique_urls_order(sample_urls): 34 | unique_list = unique_urls(sample_urls) 35 | expected_order = [ 36 | "http://example.com", 37 | "https://example.com", 38 | "http://example.com/page", 39 | "http://example.com/page/", 40 | ] 41 | assert unique_list == expected_order 42 | 43 | 44 | @pytest.fixture 45 | def sample_response_feed() -> Union[AnyResponse, HttpResponse, BrowserResponse]: 46 | html_content = """ 47 | 48 | 49 | 50 | 51 | 52 | 53 | RSS Feed 54 | Atom Feed 55 | 56 | 57 | """ 58 | return HttpResponse( 59 | url=ResponseUrl("http://example.com"), 60 | body=HttpResponseBody(html_content.encode(encoding="utf-8")), 61 | ) 62 | 63 | 64 | def test_get_feed_urls(sample_response_feed): 65 | feed_urls = get_feed_urls(sample_response_feed) 66 | assert len(feed_urls) == 3 67 | assert "http://example.com/rss.xml" in feed_urls 68 | assert "http://example.com/atom.xml" in feed_urls 69 | assert "http://example.com/feed/rss.xml" in feed_urls 70 | 71 | 72 | @pytest.fixture 73 | def sample_response_feeds() -> Union[AnyResponse, HttpResponse, BrowserResponse]: 74 | rss_content = """ 75 | 76 | 77 | Sample RSS Feed 78 | http://example.com/feed/rss.xml 79 | This is a sample RSS feed 80 | 81 | Item 1 82 | http://example.com/item1 83 | Description of Item 1 84 | 85 | 86 | Item 2 87 | http://example.com/item2 88 | Description of Item 2 89 | 90 | 91 | Item 3 92 | http://example.com/item2 93 | Description of Item 3 94 | 95 | 96 | 97 | """ 98 | return HttpResponse( 99 | url=ResponseUrl("http://example.com/feed/rss.xml"), 100 | body=HttpResponseBody(rss_content.encode(encoding="utf-8")), 101 | ) 102 | 103 | 104 | @pytest.mark.parametrize("is_browser_response", [False, True]) 105 | def test_parse_feed(sample_response_feeds, is_browser_response): 106 | if is_browser_response: 107 | sample_response_feeds = BrowserResponse( 108 | url=ResponseUrl("http://example.com"), 109 | html=BrowserHtml(str(sample_response_feeds.text)), 110 | ) 111 | feed_urls = parse_feed(sample_response_feeds) 112 | expected_urls = ["http://example.com/item1", "http://example.com/item2"] 113 | assert feed_urls == expected_urls 114 | 
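The tests above pin down the call signatures of the feed helpers: ``get_feed_urls()`` takes a response and returns the feed URLs it advertises, ``parse_feed()`` returns the unique article URLs listed in a feed response, and ``unique_urls()`` de-duplicates a URL list while preserving order (treating ``http://example.com`` and ``http://example.com/`` as the same URL). A minimal sketch of how they might be combined outside the test suite follows; the markup, feed content and URLs below are illustrative assumptions, not part of the library:

.. code-block:: python

    from web_poet import HttpResponse, HttpResponseBody, ResponseUrl

    from zyte_spider_templates.feeds import get_feed_urls, parse_feed, unique_urls

    # Hypothetical landing page advertising an RSS feed.
    page = HttpResponse(
        url=ResponseUrl("http://example.com"),
        body=HttpResponseBody(
            b'<html><head><link rel="alternate" type="application/rss+xml"'
            b' href="http://example.com/rss.xml"></head><body></body></html>'
        ),
    )
    feed_urls = get_feed_urls(page)  # feed URLs advertised by the page

    # Hypothetical feed response, e.g. fetched in a follow-up request.
    feed = HttpResponse(
        url=ResponseUrl("http://example.com/rss.xml"),
        body=HttpResponseBody(
            b"<?xml version='1.0'?><rss version='2.0'><channel>"
            b"<item><link>http://example.com/item1</link></item>"
            b"<item><link>http://example.com/item2</link></item>"
            b"<item><link>http://example.com/item2</link></item>"
            b"</channel></rss>"
        ),
    )
    article_urls = parse_feed(feed)  # unique article URLs listed in the feed

    # Order-preserving de-duplication of any URL list.
    seeds = unique_urls(["http://example.com", "http://example.com/", *article_urls])
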
-------------------------------------------------------------------------------- /tests/test_params.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | from pydantic import ValidationError 5 | 6 | from zyte_spider_templates import EcommerceSpider, GoogleSearchSpider 7 | from zyte_spider_templates.params import URL_FIELD_KWARGS 8 | from zyte_spider_templates.spiders.ecommerce import EcommerceCrawlStrategy 9 | 10 | from . import get_crawler 11 | 12 | 13 | @pytest.mark.parametrize( 14 | "valid,url", 15 | [ 16 | (False, ""), 17 | (False, "http://"), 18 | (False, "http:/example.com"), 19 | (False, "ftp://example.com"), 20 | (False, "example.com"), 21 | (False, "//example.com"), 22 | (False, "http://foo:bar@example.com"), 23 | (False, " http://example.com"), 24 | (False, "http://example.com "), 25 | (False, "http://examp le.com"), 26 | (False, "https://example.com:232323"), 27 | (True, "http://example.com"), 28 | (True, "http://bücher.example"), 29 | (True, "http://xn--bcher-kva.example"), 30 | (True, "https://i❤.ws"), 31 | (True, "https://example.com"), 32 | (True, "https://example.com/"), 33 | (True, "https://example.com:2323"), 34 | (True, "https://example.com:2323/"), 35 | (True, "https://example.com:2323/foo"), 36 | (True, "https://example.com/f"), 37 | (True, "https://example.com/foo"), 38 | (True, "https://example.com/foo/"), 39 | (True, "https://example.com/foo/bar"), 40 | (True, "https://example.com/foo/bar/"), 41 | (True, "https://example.com/foo/bar?baz"), 42 | (True, "https://example.com/foo/bar/?baz"), 43 | (True, "https://example.com?foo"), 44 | (True, "https://example.com?foo=bar"), 45 | (True, "https://example.com/?foo=bar&baz"), 46 | (True, "https://example.com/?foo=bar&baz#"), 47 | (True, "https://example.com/?foo=bar&baz#frag"), 48 | (True, "https://example.com#"), 49 | (True, "https://example.com/#"), 50 | (True, "https://example.com/&"), 51 | (True, "https://example.com/&#"), 52 | ], 53 | ) 54 | def test_url_pattern(url, valid): 55 | assert isinstance(URL_FIELD_KWARGS["pattern"], str) 56 | assert bool(re.match(URL_FIELD_KWARGS["pattern"], url)) == valid 57 | 58 | 59 | REQUIRED_ARGS = { 60 | EcommerceSpider: {"url": "https://example.com"}, 61 | GoogleSearchSpider: {"search_queries": "foo"}, 62 | } 63 | 64 | 65 | @pytest.mark.parametrize( 66 | ("spider_cls",), ((spider_cls,) for spider_cls in REQUIRED_ARGS) 67 | ) 68 | def test_required_args(spider_cls): 69 | crawler = get_crawler() 70 | 71 | with pytest.raises(ValidationError): 72 | spider_cls.from_crawler(crawler) 73 | 74 | spider_cls.from_crawler(crawler, **REQUIRED_ARGS[spider_cls]) 75 | 76 | 77 | @pytest.mark.parametrize( 78 | ("spider_cls", "args", "valid"), 79 | ( 80 | ( 81 | EcommerceSpider, 82 | { 83 | "url": "https://example.com", 84 | "crawl_strategy": EcommerceCrawlStrategy.automatic, 85 | }, 86 | True, 87 | ), 88 | ( 89 | EcommerceSpider, 90 | {"url": "https://example.com", "crawl_strategy": "automatic"}, 91 | True, 92 | ), 93 | ( 94 | EcommerceSpider, 95 | {"url": "https://example.com", "crawl_strategy": "unknown"}, 96 | False, 97 | ), 98 | ( 99 | EcommerceSpider, 100 | { 101 | "url": "https://example.com", 102 | "crawl_strategy": "direct_item", 103 | "search_queries": "", 104 | }, 105 | True, 106 | ), 107 | ( 108 | EcommerceSpider, 109 | { 110 | "url": "https://example.com", 111 | "crawl_strategy": "automatic", 112 | "search_queries": "foo", 113 | }, 114 | True, 115 | ), 116 | ( 117 | EcommerceSpider, 118 | { 119 | "url": 
"https://example.com", 120 | "crawl_strategy": "direct_item", 121 | "search_queries": "foo", 122 | }, 123 | False, 124 | ), 125 | ( 126 | EcommerceSpider, 127 | { 128 | "url": "https://example.com", 129 | "extract": "product", 130 | "crawl_strategy": "direct_item", 131 | "search_queries": "foo", 132 | }, 133 | False, 134 | ), 135 | ( 136 | EcommerceSpider, 137 | { 138 | "url": "https://example.com", 139 | "extract": "productList", 140 | "crawl_strategy": "direct_item", 141 | "search_queries": "foo", 142 | }, 143 | True, 144 | ), 145 | (GoogleSearchSpider, {"domain": "google.com"}, False), 146 | ( 147 | GoogleSearchSpider, 148 | {"domain": "google.cat", "search_queries": "foo bar"}, 149 | True, 150 | ), 151 | ( 152 | GoogleSearchSpider, 153 | {"domain": "google.cat", "search_queries": "foo bar", "max_pages": 10}, 154 | True, 155 | ), 156 | ( 157 | GoogleSearchSpider, 158 | {"domain": "google.foo", "search_queries": "foo bar"}, 159 | False, 160 | ), 161 | (GoogleSearchSpider, {"search_queries": "foo bar", "max_pages": "all"}, False), 162 | (GoogleSearchSpider, {"search_queries": "foo", "results_per_page": 0}, False), 163 | ), 164 | ) 165 | def test_arg_combinations(spider_cls, args, valid): 166 | crawler = get_crawler() 167 | if valid: 168 | spider_cls.from_crawler(crawler, **args) 169 | else: 170 | with pytest.raises(ValidationError): 171 | spider_cls.from_crawler(crawler, **args) 172 | 173 | 174 | @pytest.mark.parametrize( 175 | ("spider_cls", "param", "arg", "setting", "old", "getter", "new"), 176 | ( 177 | # extract_from 178 | *( 179 | (EcommerceSpider, *scenario) 180 | for scenario in ( 181 | ( 182 | "extract_from", 183 | "browserHtml", 184 | "ZYTE_API_PROVIDER_PARAMS", 185 | None, 186 | "getdict", 187 | { 188 | "productOptions": {"extractFrom": "browserHtml"}, 189 | "productNavigationOptions": {"extractFrom": "browserHtml"}, 190 | "productListOptions": {"extractFrom": "browserHtml"}, 191 | }, 192 | ), 193 | ( 194 | "extract_from", 195 | "httpResponseBody", 196 | "ZYTE_API_PROVIDER_PARAMS", 197 | {"geolocation": "US"}, 198 | "getdict", 199 | { 200 | "productOptions": {"extractFrom": "httpResponseBody"}, 201 | "productNavigationOptions": {"extractFrom": "httpResponseBody"}, 202 | "productListOptions": {"extractFrom": "httpResponseBody"}, 203 | "geolocation": "US", 204 | }, 205 | ), 206 | ( 207 | "extract_from", 208 | None, 209 | "ZYTE_API_PROVIDER_PARAMS", 210 | {"geolocation": "US"}, 211 | "getdict", 212 | {"geolocation": "US"}, 213 | ), 214 | ) 215 | ), 216 | # geolocation 217 | *( 218 | (spider_cls, *scenario) 219 | for spider_cls in (EcommerceSpider, GoogleSearchSpider) 220 | for scenario in ( 221 | ( 222 | "geolocation", 223 | "DE", 224 | "ZYTE_API_AUTOMAP_PARAMS", 225 | None, 226 | "getdict", 227 | {"geolocation": "DE"}, 228 | ), 229 | ( 230 | "geolocation", 231 | "DE", 232 | "ZYTE_API_AUTOMAP_PARAMS", 233 | '{"browserHtml": true}', 234 | "getdict", 235 | {"browserHtml": True, "geolocation": "DE"}, 236 | ), 237 | ( 238 | "geolocation", 239 | "DE", 240 | "ZYTE_API_AUTOMAP_PARAMS", 241 | '{"geolocation": "IE"}', 242 | "getdict", 243 | {"geolocation": "DE"}, 244 | ), 245 | ( 246 | "geolocation", 247 | "DE", 248 | "ZYTE_API_PROVIDER_PARAMS", 249 | None, 250 | "getdict", 251 | {"geolocation": "DE"}, 252 | ), 253 | ( 254 | "geolocation", 255 | "DE", 256 | "ZYTE_API_PROVIDER_PARAMS", 257 | '{"browserHtml": true}', 258 | "getdict", 259 | {"browserHtml": True, "geolocation": "DE"}, 260 | ), 261 | ( 262 | "geolocation", 263 | "DE", 264 | "ZYTE_API_PROVIDER_PARAMS", 265 | '{"geolocation": 
"IE"}', 266 | "getdict", 267 | {"geolocation": "DE"}, 268 | ), 269 | ) 270 | ), 271 | # max_requests 272 | *( 273 | ( 274 | spider_cls, 275 | "max_requests", 276 | "123", 277 | "ZYTE_API_MAX_REQUESTS", 278 | None, 279 | "getint", 280 | 123, 281 | ) 282 | for spider_cls in (EcommerceSpider, GoogleSearchSpider) 283 | ), 284 | ), 285 | ) 286 | def test_setting_setter_params(spider_cls, param, arg, setting, old, getter, new): 287 | settings = {} 288 | if old is not None: 289 | settings[setting] = old 290 | crawler = get_crawler(settings=settings) 291 | spider_cls.from_crawler(crawler, **REQUIRED_ARGS[spider_cls], **{param: arg}) 292 | read = getattr(crawler.settings, getter) 293 | assert read(setting) == new 294 | -------------------------------------------------------------------------------- /tests/test_params_location_param.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pydantic import ValidationError 3 | 4 | from zyte_spider_templates.params import LocationParam 5 | 6 | 7 | def test_valid_location_param(): 8 | valid_address_dict = { 9 | "streetAddress": "123 Main St", 10 | "addressCountry": "US", 11 | "addressRegion": "CA", 12 | "postalCode": "12345", 13 | } 14 | location_param = LocationParam(location=valid_address_dict) # type: ignore[arg-type] 15 | assert location_param.location is not None 16 | assert location_param.location.streetAddress == "123 Main St" 17 | assert location_param.location.addressCountry == "US" 18 | assert location_param.location.addressRegion == "CA" 19 | assert location_param.location.postalCode == "12345" 20 | 21 | 22 | def test_valid_location_param_from_json(): 23 | valid_address_json = '{"streetAddress": "456 Elm St", "addressCountry": "US", "addressRegion": "NY", "postalCode": "54321"}' 24 | location_param = LocationParam(location=valid_address_json) # type: ignore[arg-type] 25 | assert location_param.location is not None 26 | assert location_param.location.streetAddress == "456 Elm St" 27 | assert location_param.location.addressCountry == "US" 28 | assert location_param.location.addressRegion == "NY" 29 | assert location_param.location.postalCode == "54321" 30 | 31 | 32 | def test_none_location_param(): 33 | location_param = LocationParam(location=None) 34 | assert location_param.location is None 35 | 36 | 37 | def test_invalid_json_location_param(): 38 | invalid_address_json = '{"streetAddress": "789 Pine St", "addressCountry": "AnotheraddressCountry", "addressRegion": "FL", "postalCode": "67890"' 39 | with pytest.raises(ValueError, match=r".* is not a valid JSON object"): 40 | LocationParam(location=invalid_address_json) # type: ignore[arg-type] 41 | 42 | 43 | def test_invalid_type_location_param(): 44 | invalid_type_value = 12345 # Invalid type, should raise ValueError 45 | with pytest.raises(ValueError, match=r".* type .* is not a supported type"): 46 | LocationParam(location=invalid_type_value) # type: ignore[arg-type] 47 | 48 | 49 | def test_invalid_validation_location_param(): 50 | invalid_address_json = '{"nonExpectedInputField": "67890"}' 51 | with pytest.raises(ValidationError, match=r"Extra inputs are not permitted .*"): 52 | LocationParam(location=invalid_address_json) # type: ignore[arg-type] 53 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from unittest.mock import patch 4 | 5 | import pytest 6 | from scrapy 
import Request, Spider 7 | 8 | from tests import get_crawler 9 | from zyte_spider_templates.utils import ( 10 | get_domain, 11 | get_domain_fingerprint, 12 | get_project_id, 13 | get_request_fingerprint, 14 | get_spider_name, 15 | load_url_list, 16 | ) 17 | 18 | URL_TO_DOMAIN = ( 19 | ("https://example.com", "example.com"), 20 | ("https://www.example.com", "example.com"), 21 | ("https://www2.example.com", "example.com"), 22 | ("https://prefixwww.example.com", "prefixwww.example.com"), 23 | ("https://wwworld.example.com", "wwworld.example.com"), 24 | ("https://my.wwworld-example.com", "my.wwworld-example.com"), 25 | ("https://wwwow.com", "wwwow.com"), 26 | ("https://wowww.com", "wowww.com"), 27 | ("https://awww.com", "awww.com"), 28 | ) 29 | 30 | 31 | @pytest.mark.parametrize("url,domain", URL_TO_DOMAIN) 32 | def test_get_domain(url, domain): 33 | assert get_domain(url) == domain 34 | 35 | 36 | @pytest.mark.parametrize( 37 | "input_urls,expected", 38 | ( 39 | ( 40 | "https://a.example", 41 | ["https://a.example"], 42 | ), 43 | ( 44 | " https://a.example ", 45 | ["https://a.example"], 46 | ), 47 | ( 48 | "https://a.example\n \nhttps://b.example\nhttps://c.example\n\n", 49 | ["https://a.example", "https://b.example", "https://c.example"], 50 | ), 51 | ( 52 | "ftp://a.example", 53 | ValueError, 54 | ), 55 | ), 56 | ) 57 | def test_load_url_list(input_urls, expected): 58 | if isinstance(expected, list): 59 | assert load_url_list(input_urls) == expected 60 | return 61 | with pytest.raises(expected): 62 | load_url_list(input_urls) 63 | 64 | 65 | @pytest.mark.parametrize( 66 | "url, expected_fingerprint", 67 | [ 68 | # No subdomain 69 | ("https://example.com", "c300"), 70 | # One subdomain 71 | ("https://sub.example.com", "c35d"), 72 | # Multiple subdomains 73 | ("https://sub1.sub2.example.com", "c3c9"), 74 | # No TLD (localhost or internal addresses) 75 | ("http://localhost", "3300"), 76 | # Complex TLD (e.g., .co.uk) and subdomains 77 | ("https://sub.example.co.uk", "c35d"), 78 | ], 79 | ) 80 | def test_get_domain_fingerprint(url, expected_fingerprint): 81 | assert get_domain_fingerprint(url) == expected_fingerprint 82 | 83 | 84 | @pytest.mark.parametrize( 85 | "env_var_value, spider_name, expected_result, expected_log", 86 | [ 87 | ( 88 | "virtual_spider_name", 89 | "regular_spider_name", 90 | "virtual_spider_name", 91 | "Picked virtual spider name virtual_spider_name from the spider's SHUB_VIRTUAL_SPIDER setting.", 92 | ), 93 | ( 94 | None, 95 | "regular_spider_name", 96 | "regular_spider_name", 97 | "Picked spider name regular_spider_name from the spider.", 98 | ), 99 | ], 100 | ) 101 | def test_get_spider_name( 102 | env_var_value, spider_name, expected_result, expected_log, caplog 103 | ): 104 | class TestSpider(Spider): 105 | name = spider_name 106 | 107 | caplog.clear() 108 | crawler = get_crawler() 109 | crawler.spider = TestSpider() 110 | 111 | logger = logging.getLogger("zyte_spider_templates.utils") 112 | logger.setLevel(logging.INFO) 113 | 114 | with patch.dict( 115 | os.environ, 116 | {"SHUB_VIRTUAL_SPIDER": env_var_value} if env_var_value else {}, 117 | clear=True, 118 | ): 119 | result = get_spider_name(crawler) 120 | assert result == expected_result 121 | assert expected_log in caplog.text 122 | 123 | 124 | @pytest.mark.parametrize( 125 | "env_scrapy, env_zyte, settings_zyte, expected_result, expected_log, expect_exception", 126 | [ 127 | # SCRAPY_PROJECT_ID is set 128 | ( 129 | "123456", 130 | None, 131 | None, 132 | "123456", 133 | "Picked project id 123456 from 
SCRAPY_PROJECT_ID env variable.", 134 | False, 135 | ), 136 | # ZYTE_PROJECT_ID is set in the environment 137 | ( 138 | None, 139 | "654321", 140 | None, 141 | "654321", 142 | "Picked project id 654321 from ZYTE_PROJECT_ID env variable.", 143 | False, 144 | ), 145 | # ZYTE_PROJECT_ID is set in the settings 146 | ( 147 | None, 148 | None, 149 | "126534", 150 | "126534", 151 | "Picked project id 126534 from the spider's ZYTE_PROJECT_ID setting.", 152 | False, 153 | ), 154 | # No project ID found, expect an exception 155 | ( 156 | None, 157 | None, 158 | None, 159 | None, # No result expected 160 | None, # No log expected 161 | True, # Expect an exception 162 | ), 163 | ], 164 | ) 165 | def test_get_project_id( 166 | env_scrapy, 167 | env_zyte, 168 | settings_zyte, 169 | expected_result, 170 | expected_log, 171 | expect_exception, 172 | caplog, 173 | ): 174 | caplog.clear() 175 | 176 | env_vars = {} 177 | if env_scrapy: 178 | env_vars["SCRAPY_PROJECT_ID"] = env_scrapy 179 | if env_zyte: 180 | env_vars["ZYTE_PROJECT_ID"] = env_zyte 181 | 182 | with patch.dict(os.environ, env_vars, clear=True): 183 | crawler = get_crawler() 184 | 185 | if settings_zyte: 186 | crawler.settings.set("ZYTE_PROJECT_ID", settings_zyte) 187 | 188 | with caplog.at_level(logging.INFO, logger="zyte_spider_templates.utils"): 189 | if expect_exception: 190 | with pytest.raises( 191 | ValueError, 192 | match="Zyte project id wasn't found in job data, env, or settings.", 193 | ): 194 | get_project_id(crawler) 195 | else: 196 | assert get_project_id(crawler) == expected_result 197 | assert expected_log in caplog.text 198 | 199 | 200 | def test_get_request_fingerprint(): 201 | url = "https://example.com" 202 | domain_fp = "ffeeddccbbaa" 203 | request_fp = "aabbccddeeff" 204 | 205 | with patch( 206 | "zyte_spider_templates.utils.get_domain_fingerprint", return_value=domain_fp 207 | ): 208 | crawler = get_crawler() 209 | with patch.object(crawler, "request_fingerprinter") as mock_fingerprinter: 210 | mock_fingerprinter.fingerprint.return_value = bytes.fromhex(request_fp) 211 | request = Request(url) 212 | result = get_request_fingerprint(crawler, request) 213 | assert result == domain_fp + request_fp 214 | mock_fingerprinter.fingerprint.assert_called_once_with(request) 215 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from typing import Any 5 | 6 | from aiohttp.test_utils import TestServer 7 | from scrapy import Spider, signals 8 | from scrapy.utils.defer import deferred_to_future 9 | 10 | from . import get_crawler 11 | 12 | 13 | def assertEqualSpiderMetadata(actual, expected): 14 | """Compare 2 JSON schemas of spider metadata. 15 | 16 | The parameter order in the parameter schema is taken into account, given 17 | how it affects the UI, while the order of other object keys may be 18 | different. 19 | 20 | It also generates a better diff in pytest output when enums are involved, 21 | e.g. geolocation values. 
22 | """ 23 | assert tuple(actual["param_schema"]["properties"]) == tuple( 24 | expected["param_schema"]["properties"] 25 | ) 26 | actual_json = json.dumps(actual, indent=2, sort_keys=True) 27 | expected_json = json.dumps(expected, indent=2, sort_keys=True) 28 | assert actual_json == expected_json 29 | 30 | 31 | def get_addons() -> dict[str | type, int]: 32 | addons: dict[str | type, int] = { 33 | "scrapy_zyte_api.Addon": 500, 34 | "zyte_spider_templates.Addon": 1000, 35 | } 36 | try: 37 | from scrapy_poet import Addon 38 | except ImportError: 39 | pass 40 | else: 41 | addons[Addon] = 300 42 | return addons 43 | 44 | 45 | def get_zyte_api_settings(zyte_api_server) -> dict[str, Any]: 46 | return { 47 | "ZYTE_API_URL": str(zyte_api_server.make_url("/")), 48 | "ZYTE_API_KEY": "a", 49 | "ADDONS": get_addons(), 50 | } 51 | 52 | 53 | async def crawl_fake_zyte_api( 54 | zyte_api_server: TestServer, 55 | spider_cls: type[Spider], 56 | spider_kwargs: dict[str, Any], 57 | settings: dict[str, Any] | None = None, 58 | ): 59 | settings = {**get_zyte_api_settings(zyte_api_server), **(settings or {})} 60 | crawler = get_crawler(settings=settings, spider_cls=spider_cls) 61 | items = [] 62 | 63 | def track_item(item, response, spider): 64 | items.append(item) 65 | 66 | crawler.signals.connect(track_item, signal=signals.item_scraped) 67 | await deferred_to_future(crawler.crawl(**spider_kwargs)) 68 | return items 69 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = min,py39,py310,py311,py312,py313,mypy,linters,twine 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | pytest-cov 8 | pytest-twisted 9 | freezegun 10 | zyte-test-websites @ git+https://github.com/zytedata/zyte-test-websites@c48564f 11 | fake-zyte-api @ git+https://github.com/zytedata/fake-zyte-api@1352eec 12 | commands = 13 | py.test \ 14 | --cov-report=html:coverage-html \ 15 | --doctest-modules \ 16 | --cov-report=html \ 17 | --cov-report=xml \ 18 | --cov=zyte_spider_templates \ 19 | -vv \ 20 | -m "not deprication_warning" \ 21 | {posargs:zyte_spider_templates tests} 22 | 23 | [testenv:min] 24 | basepython = python3.9 25 | deps = 26 | {[testenv]deps} 27 | extruct==0.18.0 28 | form2request==0.2.0 29 | formasaurus==0.10.0 30 | jmespath==0.9.5 31 | pydantic==2.1 32 | requests==2.31.0 33 | scrapinghub==2.4.0 34 | scrapy==2.11.0 35 | scrapy-poet==0.24.0 36 | scrapy-spider-metadata==0.2.0 37 | scrapy-zyte-api[provider]==0.25.0 38 | web-poet==0.17.1 39 | xtractmime==0.2.1 40 | zyte-common-items==0.26.2 41 | 42 | [testenv:mypy] 43 | deps = 44 | mypy==1.12.0 45 | enum-tools==0.12.0 46 | freezegun==1.5.1 47 | pytest==8.3.3 48 | types-requests==2.32.0.20240914 49 | commands = mypy zyte_spider_templates tests 50 | 51 | [testenv:linters] 52 | deps = -rrequirements-dev.txt 53 | commands = pre-commit run --all-files --show-diff-on-failure 54 | 55 | [testenv:twine] 56 | deps = 57 | twine==6.1.0 58 | build==1.2.2.post1 59 | commands = 60 | python setup.py sdist 61 | twine check dist/* 62 | 63 | [testenv:docs] 64 | changedir = docs 65 | deps = 66 | -rdocs/requirements.txt 67 | commands = 68 | sphinx-build -W -b html . 
{envtmpdir}/html 69 | -------------------------------------------------------------------------------- /utils/google-gl-updater/requirements.in: -------------------------------------------------------------------------------- 1 | jinja2 2 | parsel 3 | requests 4 | -------------------------------------------------------------------------------- /utils/google-gl-updater/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile 6 | # 7 | certifi==2024.8.30 8 | # via requests 9 | charset-normalizer==3.4.0 10 | # via requests 11 | cssselect==1.2.0 12 | # via parsel 13 | idna==3.10 14 | # via requests 15 | jinja2==3.1.5 16 | # via -r requirements.in 17 | jmespath==1.0.1 18 | # via parsel 19 | lxml==5.3.0 20 | # via parsel 21 | markupsafe==3.0.2 22 | # via jinja2 23 | packaging==24.2 24 | # via parsel 25 | parsel==1.9.1 26 | # via -r requirements.in 27 | requests==2.32.3 28 | # via -r requirements.in 29 | urllib3==2.2.3 30 | # via requests 31 | w3lib==2.2.1 32 | # via parsel 33 | -------------------------------------------------------------------------------- /utils/google-gl-updater/template.py: -------------------------------------------------------------------------------- 1 | {% raw %}# ../_geolocations.py counterpart for 2 | # https://developers.google.com/custom-search/docs/json_api_reference#countryCodes 3 | # 4 | # Built automatically with ../../utils/google-gl-updater 5 | 6 | from enum import Enum 7 | 8 | GOOGLE_GL_OPTIONS = {{% endraw %}{% for country in countries %} 9 | "{{ country.code }}": "{{ country.name }}",{% endfor %}{% raw %} 10 | } 11 | GOOGLE_GL_OPTIONS_WITH_CODE = { 12 | code: f"{name} ({code})" for code, name in GOOGLE_GL_OPTIONS.items() 13 | } 14 | 15 | 16 | class GoogleGl(str, Enum):{% endraw %}{% for country in countries %} 17 | {{ country.keyword }}: str = "{{ country.code }}"{% endfor %} 18 | 19 | -------------------------------------------------------------------------------- /utils/google-gl-updater/update.py: -------------------------------------------------------------------------------- 1 | from keyword import iskeyword 2 | from pathlib import Path 3 | 4 | import jinja2 5 | import requests 6 | from parsel import Selector 7 | 8 | countries = [] 9 | 10 | response = requests.get( 11 | "https://developers.google.com/custom-search/docs/json_api_reference" 12 | ) 13 | selector = Selector(text=response.text) 14 | table = selector.xpath('//*[@id="country-codes"]/following-sibling::table[1]') 15 | for tr in table.css("tr"): 16 | name = tr.xpath("td/text()").get() 17 | if not name: # header 18 | continue 19 | code = tr.xpath("td/span/text()").get() 20 | keyword = f"{code}_" if iskeyword(code) else code 21 | countries.append({"code": code, "keyword": keyword, "name": name}) 22 | 23 | template_path = Path(__file__).parent / "template.py" 24 | template_environment = jinja2.Environment() 25 | with template_path.open() as f: 26 | template = template_environment.from_string(f.read()) 27 | output = template.render(countries=countries) 28 | output_path = ( 29 | Path(__file__).parent.parent.parent 30 | / "zyte_spider_templates" 31 | / "spiders" 32 | / "_google_gl.py" 33 | ) 34 | with output_path.open("w") as f: 35 | f.write(output) 36 | -------------------------------------------------------------------------------- /utils/google-hl-updater/requirements.in: 
-------------------------------------------------------------------------------- 1 | jinja2 2 | parsel 3 | requests 4 | -------------------------------------------------------------------------------- /utils/google-hl-updater/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile 6 | # 7 | certifi==2024.8.30 8 | # via requests 9 | charset-normalizer==3.4.0 10 | # via requests 11 | cssselect==1.2.0 12 | # via parsel 13 | idna==3.10 14 | # via requests 15 | jinja2==3.1.6 16 | # via -r requirements.in 17 | jmespath==1.0.1 18 | # via parsel 19 | lxml==5.3.0 20 | # via parsel 21 | markupsafe==3.0.2 22 | # via jinja2 23 | packaging==24.2 24 | # via parsel 25 | parsel==1.9.1 26 | # via -r requirements.in 27 | requests==2.32.3 28 | # via -r requirements.in 29 | urllib3==2.2.3 30 | # via requests 31 | w3lib==2.2.1 32 | # via parsel 33 | -------------------------------------------------------------------------------- /utils/google-hl-updater/template.py: -------------------------------------------------------------------------------- 1 | {% raw %}# _google_gl.py counterpart for 2 | # https://developers.google.com/custom-search/docs/json_api_reference#interfaceLanguages 3 | # 4 | # Built automatically with ../../utils/google-hl-updater 5 | 6 | from enum import Enum 7 | 8 | GOOGLE_HL_OPTIONS = {{% endraw %}{% for language in languages %} 9 | "{{ language.code }}": "{{ language.name }}",{% endfor %}{% raw %} 10 | } 11 | GOOGLE_HL_OPTIONS_WITH_CODE = { 12 | code: f"{name} ({code})" for code, name in GOOGLE_HL_OPTIONS.items() 13 | } 14 | 15 | 16 | class GoogleHl(str, Enum):{% endraw %}{% for language in languages %} 17 | {{ language.keyword }}: str = "{{ language.code }}"{% endfor %} 18 | 19 | -------------------------------------------------------------------------------- /utils/google-hl-updater/update.py: -------------------------------------------------------------------------------- 1 | from keyword import iskeyword 2 | from pathlib import Path 3 | 4 | import jinja2 5 | import requests 6 | from parsel import Selector 7 | 8 | languages = [] 9 | 10 | response = requests.get( 11 | "https://developers.google.com/custom-search/docs/json_api_reference" 12 | ) 13 | selector = Selector(text=response.text) 14 | table = selector.xpath( 15 | '//*[@id="supported-interface-languages"]/following-sibling::table[1]' 16 | ) 17 | for tr in table.css("tr"): 18 | name = tr.xpath("td/text()").get() 19 | if not name: # header 20 | continue 21 | code = tr.xpath("td/span/text()").get() 22 | keyword = f"{code}_" if iskeyword(code) else code 23 | keyword = keyword.replace("-", "_") 24 | languages.append({"code": code, "keyword": keyword, "name": name}) 25 | 26 | template_path = Path(__file__).parent / "template.py" 27 | template_environment = jinja2.Environment() 28 | with template_path.open() as f: 29 | template = template_environment.from_string(f.read()) 30 | output = template.render(languages=languages) 31 | output_path = ( 32 | Path(__file__).parent.parent.parent 33 | / "zyte_spider_templates" 34 | / "spiders" 35 | / "_google_hl.py" 36 | ) 37 | with output_path.open("w") as f: 38 | f.write(output) 39 | -------------------------------------------------------------------------------- /zyte_spider_templates/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version 2 | from 
logging import getLogger 3 | 4 | from ._incremental.middleware import IncrementalCrawlMiddleware 5 | from .middlewares import ( 6 | AllowOffsiteMiddleware, 7 | CrawlingLogsMiddleware, 8 | MaxRequestsPerSeedDownloaderMiddleware, 9 | OffsiteRequestsPerSeedMiddleware, 10 | OnlyFeedsMiddleware, 11 | TrackNavigationDepthSpiderMiddleware, 12 | TrackSeedsSpiderMiddleware, 13 | ) 14 | from .spiders.article import ArticleSpider 15 | from .spiders.base import BaseSpider, BaseSpiderParams 16 | from .spiders.ecommerce import EcommerceSpider 17 | from .spiders.job_posting import JobPostingSpider 18 | from .spiders.serp import GoogleSearchSpider 19 | 20 | from ._addon import Addon # isort: skip 21 | 22 | logger = getLogger(__name__) 23 | package = "zyte-spider-templates" 24 | logger.info(f"Running {package} {version(package)}") 25 | -------------------------------------------------------------------------------- /zyte_spider_templates/_addon.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | from typing import Any, List, Optional, Type 3 | 4 | from duplicate_url_discarder_rules import RULE_PATHS 5 | from scrapy.settings import BaseSettings 6 | from scrapy.utils.misc import load_object 7 | 8 | from zyte_spider_templates import ( 9 | AllowOffsiteMiddleware, 10 | CrawlingLogsMiddleware, 11 | IncrementalCrawlMiddleware, 12 | MaxRequestsPerSeedDownloaderMiddleware, 13 | OffsiteRequestsPerSeedMiddleware, 14 | OnlyFeedsMiddleware, 15 | TrackNavigationDepthSpiderMiddleware, 16 | TrackSeedsSpiderMiddleware, 17 | ) 18 | 19 | logger = getLogger(__name__) 20 | 21 | 22 | def _extend_module_list(settings: BaseSettings, setting: str, item: str) -> None: 23 | spider_modules: List[str] = settings.getlist(setting) 24 | if item not in spider_modules: 25 | spider_modules_priority = settings.getpriority(setting) 26 | settings.set( 27 | setting, 28 | spider_modules + [item], 29 | priority=spider_modules_priority, # type: ignore[arg-type] 30 | ) 31 | 32 | 33 | def _replace_builtin( 34 | settings: BaseSettings, setting: str, builtin_cls: Type, new_cls: Type 35 | ) -> None: 36 | setting_value = settings[setting] 37 | if not setting_value: 38 | logger.warning( 39 | f"Setting {setting!r} is empty. Could not replace the built-in " 40 | f"{builtin_cls} entry with {new_cls}. Add {new_cls} manually to " 41 | f"silence this warning." 42 | ) 43 | return None 44 | 45 | if new_cls in setting_value: 46 | return None 47 | for cls_or_path in setting_value: 48 | if isinstance(cls_or_path, str): 49 | _cls = load_object(cls_or_path) 50 | if _cls == new_cls: 51 | return None 52 | 53 | builtin_entry: Optional[Any] = None 54 | for _setting_value in (setting_value, settings[f"{setting}_BASE"]): 55 | if builtin_cls in _setting_value: 56 | builtin_entry = builtin_cls 57 | pos = _setting_value[builtin_entry] 58 | break 59 | for cls_or_path in _setting_value: 60 | if isinstance(cls_or_path, str): 61 | _cls = load_object(cls_or_path) 62 | if _cls == builtin_cls: 63 | builtin_entry = cls_or_path 64 | pos = _setting_value[builtin_entry] 65 | break 66 | if builtin_entry: 67 | break 68 | 69 | if not builtin_entry: 70 | logger.warning( 71 | f"Settings {setting!r} and {setting + '_BASE'!r} are both " 72 | f"missing built-in entry {builtin_cls}. Cannot replace it with {new_cls}. " 73 | f"Add {new_cls} manually to silence this warning." 
74 | ) 75 | return None 76 | 77 | if pos is None: 78 | logger.warning( 79 | f"Built-in entry {builtin_cls} of setting {setting!r} is disabled " 80 | f"(None). Cannot replace it with {new_cls}. Add {new_cls} " 81 | f"manually to silence this warning. If you had replaced " 82 | f"{builtin_cls} with some other entry, you might also need to " 83 | f"disable that other entry for things to work as expected." 84 | ) 85 | return 86 | 87 | settings[setting][builtin_entry] = None 88 | settings[setting][new_cls] = pos 89 | 90 | 91 | # https://github.com/scrapy-plugins/scrapy-zyte-api/blob/a1d81d11854b420248f38e7db49c685a8d46d943/scrapy_zyte_api/addon.py#L12 92 | def _setdefault(settings: BaseSettings, setting: str, cls: Type, pos: int) -> None: 93 | setting_value = settings[setting] 94 | if not setting_value: 95 | settings[setting] = {cls: pos} 96 | return None 97 | if cls in setting_value: 98 | return None 99 | for cls_or_path in setting_value: 100 | if isinstance(cls_or_path, str): 101 | _cls = load_object(cls_or_path) 102 | if _cls == cls: 103 | return None 104 | settings[setting][cls] = pos 105 | 106 | 107 | class Addon: 108 | def update_settings(self, settings: BaseSettings) -> None: 109 | for setting, value in ( 110 | ("CLOSESPIDER_TIMEOUT_NO_ITEM", 600), 111 | ("SCHEDULER_DISK_QUEUE", "scrapy.squeues.PickleFifoDiskQueue"), 112 | ("SCHEDULER_MEMORY_QUEUE", "scrapy.squeues.FifoMemoryQueue"), 113 | ("SCHEDULER_PRIORITY_QUEUE", "scrapy.pqueues.DownloaderAwarePriorityQueue"), 114 | ( 115 | "ITEM_PROBABILITY_THRESHOLDS", 116 | { 117 | "zyte_common_items.items.Article": 0.1, 118 | "zyte_common_items.items.Product": 0.1, 119 | }, 120 | ), 121 | ("DUD_LOAD_RULE_PATHS", RULE_PATHS), 122 | ): 123 | settings.set(setting, value, priority="addon") 124 | 125 | _extend_module_list( 126 | settings, "SCRAPY_POET_DISCOVER", "zyte_spider_templates.pages" 127 | ) 128 | _extend_module_list(settings, "SPIDER_MODULES", "zyte_spider_templates.spiders") 129 | 130 | _setdefault( 131 | settings, 132 | "DOWNLOADER_MIDDLEWARES", 133 | MaxRequestsPerSeedDownloaderMiddleware, 134 | 100, 135 | ) 136 | _setdefault(settings, "SPIDER_MIDDLEWARES", IncrementalCrawlMiddleware, 45) 137 | _setdefault( 138 | settings, "SPIDER_MIDDLEWARES", OffsiteRequestsPerSeedMiddleware, 49 139 | ) 140 | _setdefault(settings, "SPIDER_MIDDLEWARES", TrackSeedsSpiderMiddleware, 550) 141 | _setdefault(settings, "SPIDER_MIDDLEWARES", OnlyFeedsMiddleware, 108) 142 | _setdefault( 143 | settings, "SPIDER_MIDDLEWARES", TrackNavigationDepthSpiderMiddleware, 110 144 | ) 145 | _setdefault(settings, "SPIDER_MIDDLEWARES", CrawlingLogsMiddleware, 1000) 146 | 147 | try: 148 | from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware 149 | except ImportError: 150 | from scrapy.spidermiddlewares.offsite import ( # type: ignore[assignment] 151 | OffsiteMiddleware, 152 | ) 153 | 154 | _replace_builtin( 155 | settings, 156 | "SPIDER_MIDDLEWARES", 157 | OffsiteMiddleware, 158 | AllowOffsiteMiddleware, 159 | ) 160 | else: 161 | _replace_builtin( 162 | settings, 163 | "DOWNLOADER_MIDDLEWARES", 164 | OffsiteMiddleware, 165 | AllowOffsiteMiddleware, 166 | ) 167 | -------------------------------------------------------------------------------- /zyte_spider_templates/_incremental/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zytedata/zyte-spider-templates/d87e3e4c23b83fba5860ae3428e6ff4a49c3f536/zyte_spider_templates/_incremental/__init__.py 
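A minimal sketch (not part of the repository) of how the pieces above and below are typically wired together in a consuming Scrapy project: the add-on entries and priorities mirror tests/utils.py (get_addons()), and the INCREMENTAL_CRAWL_* settings are the ones read by _incremental/manager.py and _incremental/middleware.py. The settings.py placement and the API key value are placeholder assumptions.

# settings.py of a hypothetical project using zyte-spider-templates
ADDONS = {
    "scrapy_zyte_api.Addon": 500,  # Zyte API integration; same priority as in tests/utils.py
    "zyte_spider_templates.Addon": 1000,  # applies the defaults set in zyte_spider_templates/_addon.py
}
ZYTE_API_KEY = "YOUR_ZYTE_API_KEY"  # placeholder; the tests instead point ZYTE_API_URL at a fake server

# Optional incremental crawling, handled by IncrementalCrawlMiddleware (off by default):
INCREMENTAL_CRAWL_ENABLED = True
INCREMENTAL_CRAWL_BATCH_SIZE = 50  # default batch size used by CollectionsFingerprintsManager
# INCREMENTAL_CRAWL_COLLECTION_NAME defaults to the spider name plus the "_incremental" suffix.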
-------------------------------------------------------------------------------- /zyte_spider_templates/_incremental/manager.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from collections import defaultdict 4 | from concurrent.futures import ThreadPoolExecutor 5 | from typing import Dict, List, Optional, Set, Tuple, Union 6 | 7 | import scrapinghub 8 | from itemadapter import ItemAdapter 9 | from scrapinghub.client.exceptions import Unauthorized 10 | from scrapy import signals 11 | from scrapy.crawler import Crawler 12 | from scrapy.http.request import Request 13 | from zyte_common_items import Item 14 | 15 | from zyte_spider_templates.utils import ( 16 | get_client, 17 | get_project_id, 18 | get_request_fingerprint, 19 | get_spider_name, 20 | ) 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | INCREMENTAL_SUFFIX = "_incremental" 25 | COLLECTION_API_URL = "https://storage.scrapinghub.com/collections" 26 | 27 | THREAD_POOL_EXECUTOR = ThreadPoolExecutor(max_workers=10) 28 | 29 | 30 | class CollectionsFingerprintsManager: 31 | def __init__(self, crawler: Crawler) -> None: 32 | self.writer = None 33 | self.collection = None 34 | self.crawler = crawler 35 | 36 | self.batch: Set[Tuple[str, str]] = set() 37 | self.batch_size = crawler.settings.getint("INCREMENTAL_CRAWL_BATCH_SIZE", 50) 38 | 39 | project_id = get_project_id(crawler) 40 | collection_name = self.get_collection_name(crawler) 41 | 42 | self.init_collection(project_id, collection_name) 43 | self.api_url = f"{COLLECTION_API_URL}/{project_id}/s/{collection_name}" 44 | 45 | logger.info( 46 | f"Configuration of CollectionsFingerprintsManager for IncrementalCrawlMiddleware:\n" 47 | f"batch_size: {self.batch_size},\n" 48 | f"project: {project_id},\n" 49 | f"collection_name: {collection_name}" 50 | ) 51 | 52 | crawler.signals.connect(self.spider_closed, signal=signals.spider_closed) 53 | 54 | def get_collection_name(self, crawler): 55 | return ( 56 | crawler.settings.get("INCREMENTAL_CRAWL_COLLECTION_NAME") 57 | or f"{get_spider_name(crawler)}{INCREMENTAL_SUFFIX}" 58 | ) 59 | 60 | def init_collection(self, project_id, collection_name) -> None: 61 | client = get_client() 62 | collection = client.get_project(project_id).collections.get_store( 63 | collection_name 64 | ) 65 | try: 66 | # Trying to get a random key to make sure the collection exists. 67 | collection.list(key=["init_key"]) 68 | except scrapinghub.client.exceptions.NotFound as e: 69 | if f"unknown collection {collection_name}" in str(e): 70 | logger.info( 71 | f"The collection: {collection_name} for {project_id=} doesn't exist" 72 | f" and will be created automatically" 73 | ) 74 | # This trick forces the creation of a collection. 
75 | collection.set({"_key": "init", "value": "1"}) 76 | collection.delete("init") 77 | else: 78 | logger.error(f"The error {e} for {project_id=}") 79 | raise RuntimeError("incremental_crawling__not_found_exception") 80 | except Unauthorized: 81 | logger.error("The api key (SH_APIKEY or SHUB_JOBAUTH) is not valid.") 82 | raise ValueError("incremental_crawling__api_key_not_vaild") 83 | 84 | self.collection = collection 85 | self.writer = self.collection.create_writer() # type: ignore 86 | 87 | def save_to_collection(self, items_to_save) -> None: 88 | """Saves the current batch of fingerprints to the collection.""" 89 | items = [{"_key": key, "value": value} for key, value in items_to_save] 90 | self.writer.write(items) # type: ignore 91 | self.writer.flush() # type: ignore 92 | 93 | async def get_keys_from_collection_async(self, keys: Set[str]) -> Set[str]: 94 | """Asynchronously fetches a set of keys from the collection using an executor to run in separate threads.""" 95 | return await asyncio.get_event_loop().run_in_executor( 96 | THREAD_POOL_EXECUTOR, lambda: self.get_keys_from_collection(keys) 97 | ) 98 | 99 | async def read_batches(self, fingerprints: List[str], batch_start: int) -> Set[str]: 100 | """Reads a specific batch of fingerprints and fetches corresponding keys asynchronously.""" 101 | return await self.get_keys_from_collection_async( 102 | set(fingerprints[batch_start : batch_start + self.batch_size]) 103 | ) 104 | 105 | def get_keys_from_collection(self, keys: Set[str]) -> Set[str]: 106 | """Synchronously fetches a set of keys from the collection.""" 107 | return {item.get("_key", "") for item in self.collection.list(key=keys)} # type: ignore 108 | 109 | async def get_existing_fingerprints_async( 110 | self, fingerprints: List[str] 111 | ) -> Set[str]: 112 | """Asynchronously checks for duplicate fingerprints in both the collection and the local buffer. 
113 | Async interaction with the collection could be replaced by 114 | https://github.com/scrapinghub/python-scrapinghub/issues/169 in the future""" 115 | 116 | fingerprints_size = len(fingerprints) 117 | 118 | if fingerprints_size == 0: 119 | return set() 120 | 121 | duplicated_fingerprints = set() 122 | 123 | tasks = [ 124 | self.read_batches(fingerprints, i) 125 | for i in range(0, fingerprints_size, self.batch_size) 126 | ] 127 | for future in asyncio.as_completed(tasks): 128 | try: 129 | batch_keys = await future 130 | duplicated_fingerprints.update(batch_keys) 131 | except Exception as e: 132 | logging.error(f"Error while processing batch: {e}") 133 | 134 | # Check duplicates in the local buffer 135 | local_duplicates = set(fingerprints) & {fp for fp, _ in self.batch} 136 | duplicated_fingerprints.update(local_duplicates) 137 | 138 | return duplicated_fingerprints 139 | 140 | def add_to_batch(self, fp_url_map: Set[Tuple[str, str]]) -> None: 141 | """ 142 | Add the list of provided fingerprints and corresponding URLs per one item to the batch 143 | """ 144 | for fp_url in fp_url_map: 145 | logger.debug(f"Adding fingerprint and URL ({fp_url}) to batch.") 146 | self.crawler.stats.inc_value( # type: ignore[union-attr] 147 | "incremental_crawling/fingerprint_url_to_batch" 148 | ) 149 | self.batch.add(fp_url) 150 | if len(self.batch) >= self.batch_size: 151 | self.save_batch() 152 | self.crawler.stats.inc_value("incremental_crawling/add_to_batch") # type: ignore[union-attr] 153 | 154 | def save_batch(self) -> None: 155 | if not self.batch: 156 | return 157 | logger.debug( 158 | f"Saving {len(self.batch)} fingerprints to the Collection. " 159 | f"The fingerprints are: {self.batch}." 160 | ) 161 | self.crawler.stats.inc_value("incremental_crawling/batch_saved") # type: ignore[union-attr] 162 | self.save_to_collection(items_to_save=self.batch) 163 | self.batch.clear() 164 | 165 | def spider_closed(self) -> None: 166 | """Save fingerprints and corresponding URLs remaining in the batch, before spider closes.""" 167 | self.save_batch() 168 | 169 | 170 | class IncrementalCrawlingManager: 171 | def __init__(self, crawler: Crawler, fm: CollectionsFingerprintsManager) -> None: 172 | self.crawler = crawler 173 | self.fm = fm 174 | 175 | async def process_incremental_async( 176 | self, request: Request, result: List 177 | ) -> List[Union[Request, Item]]: 178 | """ 179 | Processes the spider's parsing callbacks when IncrementalCrawlMiddleware is enabled. 180 | 181 | The function handles both requests and items returned by the spider. 182 | - If an item is found: 183 | - It saves the `request.url` and `item.url/item.canonicalURL` (if they differ) to the collection. 184 | - If the result is a Request: 185 | - It checks whether the request was processed previously. 186 | - If it was processed, the request is removed from the result. 187 | - If it was not, the request remains in the result. 188 | """ 189 | item: Optional[Item] = None 190 | to_check = defaultdict(list) 191 | fingerprint_to_url_map: Set[Tuple[str, str]] = set() 192 | for i, element in enumerate(result): 193 | if isinstance(element, Request): 194 | # The requests are only checked to see if the links exist in the Collection 195 | fp = get_request_fingerprint(self.crawler, element) 196 | to_check[fp].append(i) 197 | self.crawler.stats.inc_value("incremental_crawling/requests_to_check") # type: ignore[union-attr] 198 | else: 199 | if item: 200 | raise NotImplementedError( 201 | f"Unexpected number of returned items for {request.url}. 
" 202 | f"None or one was expected." 203 | ) 204 | 205 | item = element 206 | unique_urls = self._get_unique_urls(request.url, item) 207 | for url, url_field in unique_urls.items(): 208 | fp = get_request_fingerprint(self.crawler, request.replace(url=url)) 209 | if url_field != "request_url": 210 | to_check[fp].append(i) 211 | 212 | # Storing the fingerprint-to-URL mapping for the item only. 213 | # This will be used when storing the item in the Collection. 214 | fingerprint_to_url_map.add((fp, url)) 215 | 216 | if url_field == "url": 217 | self.crawler.stats.inc_value( # type: ignore[union-attr] 218 | "incremental_crawling/redirected_urls" 219 | ) 220 | logger.debug( 221 | f"Request URL for the item {request.url} was redirected to {url}." 222 | ) 223 | 224 | # Prepare list of duplications 225 | duplicated_fingerprints = await self.fm.get_existing_fingerprints_async( 226 | list(to_check.keys()) 227 | ) 228 | 229 | if duplicated_fingerprints: 230 | logging.debug( 231 | f"Skipping {len(duplicated_fingerprints)} Request fingerprints that were processed previously." 232 | ) 233 | 234 | n_dups = 0 235 | for dupe_fp in duplicated_fingerprints: 236 | # Marking duplicates for removal as None 237 | for index in to_check[dupe_fp]: 238 | result[index] = None 239 | n_dups += 1 240 | 241 | filtered_result = [x for x in result if x is not None] 242 | 243 | self.crawler.stats.inc_value( # type: ignore[union-attr] 244 | "incremental_crawling/filtered_items_and_requests", n_dups 245 | ) 246 | # Check for any new fingerprints and their corresponding URLs for the item 247 | fingerprint_url_map_new = { 248 | (fp, url) 249 | for fp, url in fingerprint_to_url_map 250 | if fp not in duplicated_fingerprints 251 | } 252 | # Add any new fingerprints and their corresponding URLs to the batch for future saving 253 | if fingerprint_url_map_new: 254 | self.fm.add_to_batch(fingerprint_url_map_new) 255 | return filtered_result 256 | 257 | def _get_unique_urls( 258 | self, request_url: str, item: Optional[Item], discard_request_url: bool = False 259 | ) -> Dict[str, Optional[str]]: 260 | """Retrieves a dictionary of unique URLs associated with an item.""" 261 | 262 | urls: Dict[str, Optional[str]] = {request_url: "request_url"} 263 | if not item: 264 | return urls 265 | 266 | url_fields = ["url", "canonicalUrl"] 267 | 268 | adapter = ItemAdapter(item) 269 | for url_field in url_fields: 270 | if (url := adapter[url_field]) and url not in urls: 271 | urls[url] = url_field 272 | 273 | if discard_request_url: 274 | urls.pop(request_url) 275 | 276 | return urls 277 | -------------------------------------------------------------------------------- /zyte_spider_templates/_incremental/middleware.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import AsyncGenerator, Union 3 | 4 | from scrapinghub.client.exceptions import Unauthorized 5 | from scrapy.crawler import Crawler 6 | from scrapy.exceptions import CloseSpider, NotConfigured 7 | from scrapy.http import Request 8 | from zyte_common_items import Item 9 | 10 | from .manager import CollectionsFingerprintsManager, IncrementalCrawlingManager 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class IncrementalCrawlMiddleware: 16 | """:ref:`Downloader middleware ` to skip 17 | items seen in previous crawls. 18 | 19 | To enable this middleware, set the :setting:`INCREMENTAL_CRAWL_ENABLED` 20 | setting to ``True``. 
21 | 22 | This middleware keeps a record of URLs of crawled items in the :ref:`Zyte Scrapy Cloud 23 | collection ` specified in the :setting:`INCREMENTAL_CRAWL_COLLECTION_NAME` 24 | setting, and skips items, responses and requests with matching URLs. 25 | 26 | Use :setting:`INCREMENTAL_CRAWL_BATCH_SIZE` to fine-tune interactions with 27 | the collection for performance. 28 | """ 29 | 30 | def __init__(self, crawler: Crawler): 31 | assert crawler.spider 32 | if not crawler.spider.settings.getbool("INCREMENTAL_CRAWL_ENABLED", False): 33 | raise NotConfigured 34 | self.inc_manager: IncrementalCrawlingManager = self.prepare_incremental_manager( 35 | crawler 36 | ) 37 | 38 | @staticmethod 39 | def prepare_incremental_manager(crawler): 40 | try: 41 | collection_fp = CollectionsFingerprintsManager(crawler) 42 | except (AttributeError, Unauthorized, RuntimeError, ValueError) as exc_info: 43 | logger.error( 44 | f"IncrementalCrawlMiddleware is enabled, but something went wrong with Collections.\n" 45 | f"The reason: {exc_info}" 46 | ) 47 | raise CloseSpider("incremental_crawling_middleware_collection_issue") 48 | 49 | return IncrementalCrawlingManager(crawler, collection_fp) 50 | 51 | @classmethod 52 | def from_crawler(cls, crawler: Crawler): 53 | return cls(crawler) 54 | 55 | async def process_spider_output( 56 | self, response, result, spider 57 | ) -> AsyncGenerator[Union[Request, Item], None]: 58 | result_list = [] 59 | async for item_or_request in result: 60 | result_list.append(item_or_request) 61 | 62 | unique_items_or_requests = await self.inc_manager.process_incremental_async( 63 | response.request, result_list 64 | ) 65 | 66 | for item_or_request in unique_items_or_requests: 67 | yield item_or_request 68 | -------------------------------------------------------------------------------- /zyte_spider_templates/_lang_codes.py: -------------------------------------------------------------------------------- 1 | # ISO 639-1 language codes 2 | # Taken from https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes 3 | 4 | LANG_CODES = [ 5 | "ab", 6 | "aa", 7 | "af", 8 | "ak", 9 | "sq", 10 | "am", 11 | "ar", 12 | "an", 13 | "hy", 14 | "as", 15 | "av", 16 | "ae", 17 | "ay", 18 | "az", 19 | "bm", 20 | "ba", 21 | "eu", 22 | "be", 23 | "bn", 24 | "bi", 25 | "bs", 26 | "br", 27 | "bg", 28 | "my", 29 | "ca", 30 | "ch", 31 | "ce", 32 | "ny", 33 | "zh", 34 | "cu", 35 | "cv", 36 | "kw", 37 | "co", 38 | "cr", 39 | "hr", 40 | "cs", 41 | "da", 42 | "dv", 43 | "nl", 44 | "dz", 45 | "en", 46 | "eo", 47 | "et", 48 | "ee", 49 | "fo", 50 | "fj", 51 | "fi", 52 | "fr", 53 | "fy", 54 | "ff", 55 | "gd", 56 | "gl", 57 | "lg", 58 | "ka", 59 | "de", 60 | "el", 61 | "kl", 62 | "gn", 63 | "gu", 64 | "ht", 65 | "ha", 66 | "he", 67 | "hz", 68 | "hi", 69 | "ho", 70 | "hu", 71 | "is", 72 | "io", 73 | "ig", 74 | "id", 75 | "ia", 76 | "ie", 77 | "iu", 78 | "ik", 79 | "ga", 80 | "it", 81 | "ja", 82 | "jv", 83 | "kn", 84 | "kr", 85 | "ks", 86 | "kk", 87 | "km", 88 | "ki", 89 | "rw", 90 | "ky", 91 | "kv", 92 | "kg", 93 | "ko", 94 | "kj", 95 | "ku", 96 | "lo", 97 | "la", 98 | "lv", 99 | "li", 100 | "ln", 101 | "lt", 102 | "lu", 103 | "lb", 104 | "mk", 105 | "mg", 106 | "ms", 107 | "ml", 108 | "mt", 109 | "gv", 110 | "mi", 111 | "mr", 112 | "mh", 113 | "mn", 114 | "na", 115 | "nv", 116 | "nd", 117 | "nr", 118 | "ng", 119 | "ne", 120 | "no", 121 | "nb", 122 | "nn", 123 | "ii", 124 | "oc", 125 | "oj", 126 | "or", 127 | "om", 128 | "os", 129 | "pi", 130 | "ps", 131 | "fa", 132 | "pl", 133 | "pt", 134 | "pa", 135 | "qu", 136 | "ro", 
137 | "rm", 138 | "rn", 139 | "ru", 140 | "se", 141 | "sm", 142 | "sg", 143 | "sa", 144 | "sc", 145 | "sr", 146 | "sn", 147 | "sd", 148 | "si", 149 | "sk", 150 | "sl", 151 | "so", 152 | "st", 153 | "es", 154 | "su", 155 | "sw", 156 | "ss", 157 | "sv", 158 | "tl", 159 | "ty", 160 | "tg", 161 | "ta", 162 | "tt", 163 | "te", 164 | "th", 165 | "bo", 166 | "ti", 167 | "to", 168 | "ts", 169 | "tn", 170 | "tr", 171 | "tk", 172 | "tw", 173 | "ug", 174 | "uk", 175 | "ur", 176 | "uz", 177 | "ve", 178 | "vi", 179 | "vo", 180 | "wa", 181 | "cy", 182 | "wo", 183 | "xh", 184 | "yi", 185 | "yo", 186 | "za", 187 | "zu", 188 | ] 189 | -------------------------------------------------------------------------------- /zyte_spider_templates/documentation.py: -------------------------------------------------------------------------------- 1 | try: 2 | from enum_tools.documentation import document_enum 3 | except ImportError: 4 | 5 | def document_enum(func): # type: ignore[misc] 6 | return func 7 | -------------------------------------------------------------------------------- /zyte_spider_templates/feeds.py: -------------------------------------------------------------------------------- 1 | from typing import List, Set, Union 2 | 3 | import feedparser 4 | from scrapy.utils.python import unique 5 | from w3lib.html import strip_html5_whitespace 6 | from w3lib.url import canonicalize_url 7 | from web_poet import AnyResponse, BrowserResponse, HttpResponse, RequestUrl, ResponseUrl 8 | 9 | 10 | def unique_urls(urls: List[str]) -> List[str]: 11 | return unique(urls, key=canonicalize_url) 12 | 13 | 14 | def get_feed_urls( 15 | response: Union[AnyResponse, HttpResponse, BrowserResponse] 16 | ) -> Set[str]: 17 | """Find all RSS or Atom feeds from a page""" 18 | feed_urls = set() 19 | 20 | for link in response.xpath("//link[@type]"): 21 | link_type: str = strip_html5_whitespace(link.attrib["type"]) 22 | link_href: Union[str, RequestUrl, ResponseUrl] = strip_html5_whitespace( 23 | link.attrib.get("href", "") 24 | ) 25 | if link_href: 26 | link_href = response.urljoin(link_href) 27 | rss_url = atom_url = None 28 | if "rss+xml" in link_type: 29 | rss_url = link_href 30 | elif "atom+xml" in link_type: 31 | atom_url = link_href 32 | feed_url = rss_url or atom_url 33 | if feed_url: 34 | feed_urls.add(str(feed_url)) 35 | 36 | for link in response.xpath("//a/@href").getall(): 37 | link_href = strip_html5_whitespace(link) 38 | if link_href.endswith("rss.xml"): 39 | feed_urls.add(str(response.urljoin(link_href))) 40 | 41 | return feed_urls 42 | 43 | 44 | def parse_feed( 45 | response: Union[AnyResponse, HttpResponse, BrowserResponse] 46 | ) -> List[str]: 47 | response_text = ( 48 | str(response.html) if isinstance(response, BrowserResponse) else response.text 49 | ) 50 | 51 | feed = feedparser.parse(response_text) 52 | urls = [ 53 | strip_html5_whitespace(entry.get("link", "")) 54 | for entry in feed.get("entries", []) 55 | ] 56 | return unique_urls([str(response.urljoin(url)) for url in urls if url]) 57 | -------------------------------------------------------------------------------- /zyte_spider_templates/heuristics.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Tuple 3 | from urllib.parse import urlparse, urlsplit 4 | 5 | from scrapy.link import Link 6 | from scrapy.linkextractors import IGNORED_EXTENSIONS 7 | from web_poet import BrowserResponse 8 | 9 | from zyte_spider_templates._geolocations import GEOLOCATION_OPTIONS 10 | from 
zyte_spider_templates._lang_codes import LANG_CODES as _LANG_CODES 11 | 12 | COUNTRY_CODES = set([k.lower() for k in GEOLOCATION_OPTIONS]) 13 | LANG_CODES = set(_LANG_CODES) 14 | 15 | ATOM_PATTERN = re.compile(r"<feed[^>]*>.*?<entry[^>]*>.*?</entry>", re.IGNORECASE | re.DOTALL) 16 | RDF_PATTERN = re.compile(r"<rdf:RDF[^>]*>\s*<channel[^>]*>", re.IGNORECASE) 17 | RSS_PATTERN = re.compile(r"<rss[^>]*>\s*<channel[^>]*>", re.IGNORECASE) 18 | 19 | 20 | NO_CONTENT_KEYWORDS = ( 21 | "authenticate", 22 | "my-account", 23 | "account", 24 | "my-wishlist", 25 | "search", 26 | "archive", 27 | "privacy-policy", 28 | "cookie-policy", 29 | "terms-conditions", 30 | "tos", 31 | "admin", 32 | "rss.xml", 33 | "subscribe", 34 | "newsletter", 35 | "settings", 36 | "cart", 37 | "articles", 38 | "artykuly", # Polish for articles 39 | "news", 40 | "blog", 41 | "about", 42 | "about-us", 43 | "affiliate", 44 | "press", 45 | "careers", 46 | ) 47 | 48 | SUFFIXES = [".html", ".php", ".cgi", ".asp"] 49 | 50 | NO_CONTENT_RE = ( 51 | r"/sign[_-]?in", 52 | r"/log[_-]?(in|out)", 53 | r"/contact[_-]?(us)?", 54 | r"/(lost|forgot)[_-]password", 55 | r"/terms[_-]of[_-](service|use|conditions)", 56 | ) 57 | 58 | NO_ARTICLES_CONTENT_PATHS = ( 59 | "/archive", 60 | "/about", 61 | "/about-us", 62 | "/account", 63 | "/admin", 64 | "/affiliate", 65 | "/authenticate", 66 | "/best-deals", 67 | "/careers", 68 | "/cart", 69 | "/checkout", 70 | "/contactez-nous", 71 | "/cookie-policy", 72 | "/my-account", 73 | "/my-wishlist", 74 | "/press", 75 | "/pricing", 76 | "/privacy-policy", 77 | "/returns", 78 | "/rss.xml", 79 | "/search", 80 | "/settings", 81 | "/shipping", 82 | "/subscribe", 83 | "/terms-conditions", 84 | "/tos", 85 | ) 86 | 87 | 88 | SEED_URL_RE = re.compile(r"^https?:\/\/[^:\/\s]+(:\d{1,5})?(\/[^\s]*)*(#[^\s]*)?") 89 | 90 | NON_HTML_FILE_EXTENSION_RE = re.compile( 91 | ".*(?:{}$)".format("|".join(re.escape("."
+ ext) for ext in IGNORED_EXTENSIONS)), 92 | re.IGNORECASE, 93 | ) 94 | 95 | SOCIAL_DOMAINS = ( 96 | "facebook.com", 97 | "youtube.com", 98 | "youtu.be", 99 | "twitter.com", 100 | "t.co", 101 | "instagram.com", 102 | "mail.yahoo.com", 103 | "plus.google.com", 104 | "play.google.com", 105 | "www.google.com", 106 | "itunes.apple.com", 107 | "login.yahoo.com", 108 | "consent.yahoo.com", 109 | "outlook.live.com", 110 | "linkedin.com", 111 | "vk.com", 112 | "www.odnoklassniki.ru", 113 | "api.whatsapp.com", 114 | "telegram.me", 115 | "telegram.org", 116 | # ads 117 | "doubleclick.net", 118 | ) 119 | domains = "|".join(re.escape(domain) for domain in SOCIAL_DOMAINS) 120 | pattern = rf"(?:^(?:[./])(?:{domains})|\b(?:{domains}))$" 121 | SOCIAL_DOMAINS_RE = re.compile(pattern) 122 | 123 | 124 | def might_be_category(url: str) -> bool: 125 | """Returns True if the given url might be a category based on its path.""" 126 | 127 | url = url.lower().rstrip("/") 128 | parsed_url = urlparse(url) 129 | 130 | for suffix in [""] + SUFFIXES: 131 | for path in NO_CONTENT_KEYWORDS: 132 | if parsed_url.path.endswith(f"/{path}{suffix}"): 133 | return False 134 | if parsed_url.netloc.startswith(f"{path}."): 135 | return False 136 | for rule in NO_CONTENT_RE: 137 | if re.search(rule + suffix, url): 138 | return False 139 | 140 | return True 141 | 142 | 143 | INDEX_URL_PATHS = { 144 | "", 145 | "/index", 146 | "/index.html", 147 | "/index.htm", 148 | "/index.php", 149 | "/home", 150 | } 151 | 152 | 153 | def is_homepage(url: str) -> bool: 154 | """Given a URL, returns True if the URL could be a homepage.""" 155 | url_split = urlsplit(url) 156 | url_path = url_split.path.rstrip("/").lower() 157 | 158 | # Finds and removes URL subpaths like "/us/en", "en-us", "en-uk", etc. 159 | if _url_has_locale_pair(url_path): 160 | url_path = url_path[6:] 161 | 162 | # Finds and removes URL subpaths like "/en", "/fr", etc. 163 | match = re.search(r"/(\w{2})(?!\w)", url_path) 164 | if match and (match.group(1) in LANG_CODES or match.group(1) in COUNTRY_CODES): 165 | url_path = url_path[3:] 166 | 167 | if url_path in INDEX_URL_PATHS and not url_split.query: 168 | return True 169 | 170 | return False 171 | 172 | 173 | def _url_has_locale_pair(url_path: str) -> bool: 174 | if match := re.search(r"/(\w{2})[^a-z](\w{2})(?!\w)", url_path): 175 | x, y = match.groups() 176 | if x in LANG_CODES and y in COUNTRY_CODES: 177 | return True 178 | if y in LANG_CODES and x in COUNTRY_CODES: 179 | return True 180 | return False 181 | 182 | 183 | def is_comments_article_feed(url: str) -> bool: 184 | """ 185 | Try to guess if a feed URL is for comments, not for articles. 186 | """ 187 | if "comments/feed" in url or "feed=comments-rss2" in url: 188 | return True 189 | return False 190 | 191 | 192 | def is_non_html_file(url: str) -> bool: 193 | """ 194 | True for urls with extensions that clearly are not HTML. For example, 195 | they are images, or a compressed file, etc. 
196 | >>> is_non_html_file("http://example.com/article") 197 | False 198 | >>> is_non_html_file("http://example.com/image.jpg") 199 | True 200 | """ 201 | return bool(NON_HTML_FILE_EXTENSION_RE.match(url)) 202 | 203 | 204 | def is_social_link(url: str) -> bool: 205 | """ 206 | True for urls corresponding to the typical social networks 207 | >>> is_social_link("http://facebook.com") 208 | True 209 | >>> is_social_link("http://www.facebook.com") 210 | True 211 | >>> is_social_link("http://rrr.t.co") 212 | True 213 | >>> is_social_link("http://t.co") 214 | True 215 | >>> is_social_link("http://sport.co") 216 | False 217 | >>> is_social_link("http://sport.com") 218 | False 219 | >>> is_social_link("http://example.com") 220 | False 221 | """ 222 | netloc = urlsplit(url).netloc 223 | 224 | if SOCIAL_DOMAINS_RE.search(netloc): 225 | return True 226 | return False 227 | 228 | 229 | def classify_article_crawling_links(links: List[Link]) -> Tuple[List[Link], List[Link]]: 230 | """In accordance with the rules, it divides the list of links into two new lists with allowed and disallowed links. 231 | Returns a tuple of these new lists.""" 232 | allowed_links = [] 233 | disallowed_links = [] 234 | for link in links: 235 | url = link.url 236 | if ( 237 | is_social_link(url) 238 | or is_non_html_file(url) 239 | or url.endswith(NO_ARTICLES_CONTENT_PATHS) 240 | ): 241 | disallowed_links.append(link) 242 | continue 243 | allowed_links.append(link) 244 | 245 | return allowed_links, disallowed_links 246 | 247 | 248 | def classify_article_feed_links(links: List[Link]) -> Tuple[List[Link], List[Link]]: 249 | """In accordance with the rules, it divides the list of urls into two new lists with allowed and disallowed urls. 250 | Returns a tuple of these new lists.""" 251 | allowed_links = [] 252 | disallowed_links = [] 253 | for link in links: 254 | if is_comments_article_feed(link.url): 255 | disallowed_links.append(link) 256 | continue 257 | allowed_links.append(link) 258 | return allowed_links, disallowed_links 259 | 260 | 261 | def is_feed_content(response: BrowserResponse) -> bool: 262 | # RSS 0.91, 0.92, 2.0 263 | if RSS_PATTERN.search(response.html): 264 | return True 265 | # Atom feed 266 | if ATOM_PATTERN.search(response.html): 267 | return True 268 | # RSS 1.0/RDF 269 | if RDF_PATTERN.search(response.html): 270 | return True 271 | return False 272 | -------------------------------------------------------------------------------- /zyte_spider_templates/page_objects/__init__.py: -------------------------------------------------------------------------------- 1 | from warnings import warn 2 | 3 | from ..pages import HeuristicsProductNavigationPage 4 | 5 | warn( 6 | "The zyte_spider_templates.page_objects module is deprecated, use " 7 | "zyte_spider_templates.pages instead.", 8 | DeprecationWarning, 9 | stacklevel=2, 10 | ) 11 | -------------------------------------------------------------------------------- /zyte_spider_templates/page_objects/product_navigation_heuristics.py: -------------------------------------------------------------------------------- 1 | from ..pages import HeuristicsProductNavigationPage 2 | -------------------------------------------------------------------------------- /zyte_spider_templates/pages/__init__.py: -------------------------------------------------------------------------------- 1 | from .article_heuristics import HeuristicsArticleNavigationPage 2 | from .product_navigation_heuristics import HeuristicsProductNavigationPage 3 | from .search_request_template import 
DefaultSearchRequestTemplatePage 4 | -------------------------------------------------------------------------------- /zyte_spider_templates/pages/article_heuristics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Iterable, List 4 | 5 | import attrs 6 | import xtractmime 7 | from scrapy.http import TextResponse 8 | from scrapy.link import Link 9 | from scrapy.linkextractors import LinkExtractor 10 | from web_poet import AnyResponse, HttpResponse, PageParams, Stats, field, handle_urls 11 | from web_poet.utils import cached_method 12 | from zyte_common_items import ( 13 | BaseArticleNavigationPage, 14 | ProbabilityMetadata, 15 | ProbabilityRequest, 16 | ) 17 | 18 | from zyte_spider_templates.feeds import get_feed_urls, parse_feed 19 | from zyte_spider_templates.heuristics import ( 20 | classify_article_crawling_links, 21 | classify_article_feed_links, 22 | ) 23 | 24 | from ..heuristics import is_feed_content 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | def is_feed_request(request: ProbabilityRequest) -> bool: 30 | return bool( 31 | request.name 32 | and request.name.startswith("[heuristics][articleNavigation][feed]") 33 | ) 34 | 35 | 36 | @handle_urls("") 37 | @attrs.define 38 | class HeuristicsArticleNavigationPage(BaseArticleNavigationPage): 39 | response: AnyResponse 40 | stats: Stats 41 | page_params: PageParams 42 | _ARTICLE_HEURISTIC = {"name": "article", "dummy probability": 0.5} 43 | _NAVIGATION_HEURISTIC = {"name": "subCategories", "dummy probability": 0.5} 44 | _FEED_HEURISTIC = {"name": "feed", "dummy probability": 1.0} 45 | _FEED_ITEMS_HEURISTIC = {"name": "feed items", "dummy probability": 0.99} 46 | 47 | @field 48 | def url(self) -> str: 49 | return str(self.response.url) 50 | 51 | @field 52 | def subCategories(self) -> Iterable[ProbabilityRequest]: 53 | if self._is_response_feed(): 54 | return 55 | 56 | feeds = self._get_feed_links() 57 | feed_urls = {link.url for link in feeds} 58 | for link in feeds: 59 | yield self._get_request(link, self._FEED_HEURISTIC) 60 | 61 | if self.skip_subcategories() or self.is_only_feeds(): 62 | return 63 | 64 | sub_categories = [ 65 | link 66 | for link in self._get_article_or_navigation_links() 67 | if link.url not in feed_urls 68 | ] 69 | for link in sub_categories: 70 | yield self._get_request(link, self._NAVIGATION_HEURISTIC) 71 | 72 | @field 73 | def items(self) -> Iterable[ProbabilityRequest]: 74 | if self._is_response_feed(): 75 | links = self._get_feed_items_links() 76 | heuristic = self._FEED_ITEMS_HEURISTIC 77 | elif not self.is_only_feeds(): 78 | links = self._get_article_or_navigation_links() 79 | heuristic = self._ARTICLE_HEURISTIC 80 | else: 81 | return 82 | 83 | for link in links: 84 | yield self._get_request(link, heuristic) 85 | 86 | @cached_method 87 | def _get_article_or_navigation_links(self) -> List[Link]: 88 | """Extract links from an HTML web page.""" 89 | response = TextResponse( 90 | url=str(self.response.url), body=self.response.text.encode() 91 | ) 92 | link_extractor = LinkExtractor() 93 | links = link_extractor.extract_links(response) 94 | allowed_links, disallowed_links = classify_article_crawling_links(links) 95 | 96 | _log_and_stats( 97 | self, 98 | "heuristic_navigation_or_article", 99 | links, 100 | allowed_links, 101 | disallowed_links, 102 | ) 103 | return allowed_links 104 | 105 | @cached_method 106 | def _get_feed_items_links(self) -> List[Link]: 107 | """Extract links from an RSS/Atom feed.""" 108 
| links = [Link(url) for url in parse_feed(self.response)] 109 | allowed_links, disallowed_links = classify_article_crawling_links(links) 110 | 111 | _log_and_stats( 112 | self, "heuristic_feed_items", links, allowed_links, disallowed_links 113 | ) 114 | return allowed_links 115 | 116 | @cached_method 117 | def _get_feed_links(self) -> List[Link]: 118 | """Extract links to RSS/Atom feeds from an HTML web page.""" 119 | links = [Link(url) for url in get_feed_urls(self.response)] 120 | allowed_links, disallowed_links = classify_article_feed_links(links) 121 | 122 | _log_and_stats(self, "heuristic_feed", links, allowed_links, disallowed_links) 123 | return allowed_links 124 | 125 | @cached_method 126 | def _is_response_feed(self) -> bool: 127 | """Return True if a response is an RSS or Atom feed.""" 128 | 129 | content_type = "" 130 | if isinstance(self.response.response, HttpResponse): 131 | content_type = self.response.response.headers.get("Content-Type", "") 132 | elif is_feed_content(self.response.response): 133 | logger.warning( 134 | "It is likely that the spider is using BrowserHtml to extract the RSS feed. " 135 | "Please note that using HttpResponse is more efficient." 136 | ) 137 | return True 138 | 139 | mime_type = xtractmime.extract_mime( 140 | self.response.text.encode(), 141 | content_types=(content_type.encode(),), 142 | ) 143 | 144 | return xtractmime.mimegroups.is_xml_mime_type( 145 | mime_type 146 | ) or xtractmime.mimegroups.is_json_mime_type(mime_type) 147 | 148 | def _get_request(self, link, heuristic) -> ProbabilityRequest: 149 | return ProbabilityRequest( 150 | url=link.url, 151 | name=f"[heuristics][articleNavigation][{heuristic['name']}] {link.text.strip()}", 152 | metadata=ProbabilityMetadata(probability=heuristic["dummy probability"]), 153 | ) 154 | 155 | def skip_subcategories(self) -> bool: 156 | return self.page_params.get("skip_subcategories", False) 157 | 158 | def is_only_feeds(self) -> bool: 159 | return self.page_params.get("only_feeds", False) 160 | 161 | 162 | def _log_and_stats(self, urls_type, links, allowed_links, disallowed_links): 163 | _logs(self, urls_type, links, allowed_links, disallowed_links) 164 | _stats(self, urls_type, links, allowed_links, disallowed_links) 165 | 166 | 167 | def _stats(page, urls_type, urls, allowed_urls, disallowed_urls): 168 | page.stats.inc(f"article_spider/{urls_type}/visited", 1) 169 | page.stats.inc(f"article_spider/{urls_type}/no_links", 0 if urls else 1) 170 | page.stats.inc(f"article_spider/{urls_type}/with_links", 1 if urls else 0) 171 | page.stats.inc(f"article_spider/{urls_type}/links/total", len(urls)) 172 | page.stats.inc(f"article_spider/{urls_type}/links/allow", len(allowed_urls)) 173 | page.stats.inc(f"article_spider/{urls_type}/links/disallow", len(disallowed_urls)) 174 | 175 | 176 | def _logs(page, urls_type, urls, allowed_urls, disallowed_urls): 177 | page_name = page.item_cls.__name__ 178 | data = { 179 | "page": page_name, 180 | "page url": page.url, 181 | "urls type": urls_type, 182 | "urls found": len(urls), 183 | "allowed urls": len(allowed_urls), 184 | "urls to skip": len(disallowed_urls), 185 | "list of urls to skip": [ 186 | url.url if isinstance(url, Link) else url for url in disallowed_urls 187 | ], 188 | } 189 | logger.debug(f"Article Heuristic Logs:\n{json.dumps(data, indent=2)}") 190 | -------------------------------------------------------------------------------- /zyte_spider_templates/pages/product_navigation_heuristics.py: 
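A minimal usage sketch for HeuristicsArticleNavigationPage and is_feed_request() from article_heuristics.py above; it is not taken from the repository itself, and the URLs and link text are made-up examples. It shows the request shape that _get_request() builds and how the feed name prefix is later recognized.

from zyte_common_items import ProbabilityMetadata, ProbabilityRequest

from zyte_spider_templates.pages.article_heuristics import is_feed_request

# Requests shaped like the ones _get_request() builds above.
feed_request = ProbabilityRequest(
    url="https://example.com/rss.xml",
    name="[heuristics][articleNavigation][feed] RSS",
    metadata=ProbabilityMetadata(probability=1.0),  # _FEED_HEURISTIC "dummy probability"
)
article_request = ProbabilityRequest(
    url="https://example.com/blog/post-1",
    name="[heuristics][articleNavigation][article] A post",
    metadata=ProbabilityMetadata(probability=0.5),  # _ARTICLE_HEURISTIC "dummy probability"
)

# is_feed_request() only inspects the request name prefix.
assert is_feed_request(feed_request) is True
assert is_feed_request(article_request) is False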
-------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | import attrs 4 | from scrapy.http import TextResponse 5 | from scrapy.linkextractors import LinkExtractor 6 | from web_poet import AnyResponse, PageParams, field, handle_urls 7 | from zyte_common_items import AutoProductNavigationPage, ProbabilityRequest 8 | 9 | from zyte_spider_templates.heuristics import might_be_category 10 | 11 | 12 | @handle_urls("") 13 | @attrs.define 14 | class HeuristicsProductNavigationPage(AutoProductNavigationPage): 15 | response: AnyResponse 16 | page_params: PageParams 17 | 18 | @field 19 | def subCategories(self) -> Optional[List[ProbabilityRequest]]: 20 | if self.page_params.get("full_domain"): 21 | return ( 22 | self.product_navigation.subCategories or [] 23 | ) + self._probably_category_links() 24 | return self.product_navigation.subCategories 25 | 26 | def _urls_for_category(self) -> List[str]: 27 | """Return a list of all URLs in the ProductNavigation item: 28 | - items 29 | - next page 30 | - subcategories 31 | """ 32 | 33 | category_urls = [] 34 | if self.product_navigation.items: 35 | category_urls.extend( 36 | [r.url for r in self.product_navigation.subCategories or []] 37 | ) 38 | category_urls.extend([r.url for r in self.product_navigation.items or []]) 39 | if self.product_navigation.nextPage: 40 | category_urls.append(self.product_navigation.nextPage.url) 41 | return category_urls 42 | 43 | def _probably_category_links(self) -> List[ProbabilityRequest]: 44 | # TODO: This should be tuned later 45 | default_probability = 0.1 46 | 47 | link_extractor = LinkExtractor( 48 | allow_domains=self.page_params.get("full_domain", []) 49 | ) 50 | ignore_urls = set(self._urls_for_category()) 51 | 52 | links = [] 53 | response = TextResponse( 54 | url=str(self.response.url), body=self.response.text.encode() 55 | ) 56 | for link in link_extractor.extract_links(response): 57 | if link.url in ignore_urls: 58 | continue 59 | 60 | # TODO: Convert to a configurable parameter like 'obey_nofollow_links' 61 | # some time after the MVP launch. 62 | if link.nofollow: 63 | continue 64 | 65 | if not might_be_category(link.url): 66 | continue 67 | 68 | name = (link.text or "").strip() 69 | request = ProbabilityRequest.from_dict( 70 | { 71 | "url": link.url, 72 | "name": f"[heuristics] {name}", 73 | "metadata": {"probability": default_probability}, 74 | } 75 | ) 76 | links.append(request) 77 | 78 | return links 79 | -------------------------------------------------------------------------------- /zyte_spider_templates/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zytedata/zyte-spider-templates/d87e3e4c23b83fba5860ae3428e6ff4a49c3f536/zyte_spider_templates/spiders/__init__.py -------------------------------------------------------------------------------- /zyte_spider_templates/spiders/_google_domains.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | # https://www.google.com/supported_domains 5 | # Sorted alphabetically, except for keeping the main domain first. 
6 | class GoogleDomain(str, Enum): 7 | google_com: str = "google.com" 8 | google_ad: str = "google.ad" 9 | google_ae: str = "google.ae" 10 | google_al: str = "google.al" 11 | google_am: str = "google.am" 12 | google_as: str = "google.as" 13 | google_at: str = "google.at" 14 | google_az: str = "google.az" 15 | google_ba: str = "google.ba" 16 | google_be: str = "google.be" 17 | google_bf: str = "google.bf" 18 | google_bg: str = "google.bg" 19 | google_bi: str = "google.bi" 20 | google_bj: str = "google.bj" 21 | google_bs: str = "google.bs" 22 | google_bt: str = "google.bt" 23 | google_by: str = "google.by" 24 | google_ca: str = "google.ca" 25 | google_cat: str = "google.cat" 26 | google_cd: str = "google.cd" 27 | google_cf: str = "google.cf" 28 | google_cg: str = "google.cg" 29 | google_ch: str = "google.ch" 30 | google_ci: str = "google.ci" 31 | google_cl: str = "google.cl" 32 | google_cm: str = "google.cm" 33 | google_cn: str = "google.cn" 34 | google_co_ao: str = "google.co.ao" 35 | google_co_bw: str = "google.co.bw" 36 | google_co_ck: str = "google.co.ck" 37 | google_co_cr: str = "google.co.cr" 38 | google_co_id: str = "google.co.id" 39 | google_co_il: str = "google.co.il" 40 | google_co_in: str = "google.co.in" 41 | google_co_jp: str = "google.co.jp" 42 | google_co_ke: str = "google.co.ke" 43 | google_co_kr: str = "google.co.kr" 44 | google_co_ls: str = "google.co.ls" 45 | google_co_ma: str = "google.co.ma" 46 | google_co_mz: str = "google.co.mz" 47 | google_co_nz: str = "google.co.nz" 48 | google_co_th: str = "google.co.th" 49 | google_co_tz: str = "google.co.tz" 50 | google_co_ug: str = "google.co.ug" 51 | google_co_uk: str = "google.co.uk" 52 | google_co_uz: str = "google.co.uz" 53 | google_co_ve: str = "google.co.ve" 54 | google_co_vi: str = "google.co.vi" 55 | google_co_za: str = "google.co.za" 56 | google_co_zm: str = "google.co.zm" 57 | google_co_zw: str = "google.co.zw" 58 | google_com_af: str = "google.com.af" 59 | google_com_ag: str = "google.com.ag" 60 | google_com_ar: str = "google.com.ar" 61 | google_com_au: str = "google.com.au" 62 | google_com_bd: str = "google.com.bd" 63 | google_com_bh: str = "google.com.bh" 64 | google_com_bn: str = "google.com.bn" 65 | google_com_bo: str = "google.com.bo" 66 | google_com_br: str = "google.com.br" 67 | google_com_bz: str = "google.com.bz" 68 | google_com_co: str = "google.com.co" 69 | google_com_cu: str = "google.com.cu" 70 | google_com_cy: str = "google.com.cy" 71 | google_com_do: str = "google.com.do" 72 | google_com_ec: str = "google.com.ec" 73 | google_com_eg: str = "google.com.eg" 74 | google_com_et: str = "google.com.et" 75 | google_com_fj: str = "google.com.fj" 76 | google_com_gh: str = "google.com.gh" 77 | google_com_gi: str = "google.com.gi" 78 | google_com_gt: str = "google.com.gt" 79 | google_com_hk: str = "google.com.hk" 80 | google_com_jm: str = "google.com.jm" 81 | google_com_kh: str = "google.com.kh" 82 | google_com_kw: str = "google.com.kw" 83 | google_com_lb: str = "google.com.lb" 84 | google_com_ly: str = "google.com.ly" 85 | google_com_mm: str = "google.com.mm" 86 | google_com_mt: str = "google.com.mt" 87 | google_com_mx: str = "google.com.mx" 88 | google_com_my: str = "google.com.my" 89 | google_com_na: str = "google.com.na" 90 | google_com_ng: str = "google.com.ng" 91 | google_com_ni: str = "google.com.ni" 92 | google_com_np: str = "google.com.np" 93 | google_com_om: str = "google.com.om" 94 | google_com_pa: str = "google.com.pa" 95 | google_com_pe: str = "google.com.pe" 96 | google_com_pg: str = "google.com.pg" 97 
| google_com_ph: str = "google.com.ph" 98 | google_com_pk: str = "google.com.pk" 99 | google_com_pr: str = "google.com.pr" 100 | google_com_py: str = "google.com.py" 101 | google_com_qa: str = "google.com.qa" 102 | google_com_sa: str = "google.com.sa" 103 | google_com_sb: str = "google.com.sb" 104 | google_com_sg: str = "google.com.sg" 105 | google_com_sl: str = "google.com.sl" 106 | google_com_sv: str = "google.com.sv" 107 | google_com_tj: str = "google.com.tj" 108 | google_com_tr: str = "google.com.tr" 109 | google_com_tw: str = "google.com.tw" 110 | google_com_ua: str = "google.com.ua" 111 | google_com_uy: str = "google.com.uy" 112 | google_com_vc: str = "google.com.vc" 113 | google_com_vn: str = "google.com.vn" 114 | google_cv: str = "google.cv" 115 | google_cz: str = "google.cz" 116 | google_de: str = "google.de" 117 | google_dj: str = "google.dj" 118 | google_dk: str = "google.dk" 119 | google_dm: str = "google.dm" 120 | google_dz: str = "google.dz" 121 | google_ee: str = "google.ee" 122 | google_es: str = "google.es" 123 | google_fi: str = "google.fi" 124 | google_fm: str = "google.fm" 125 | google_fr: str = "google.fr" 126 | google_ga: str = "google.ga" 127 | google_ge: str = "google.ge" 128 | google_gg: str = "google.gg" 129 | google_gl: str = "google.gl" 130 | google_gm: str = "google.gm" 131 | google_gr: str = "google.gr" 132 | google_gy: str = "google.gy" 133 | google_hn: str = "google.hn" 134 | google_hr: str = "google.hr" 135 | google_ht: str = "google.ht" 136 | google_hu: str = "google.hu" 137 | google_ie: str = "google.ie" 138 | google_im: str = "google.im" 139 | google_iq: str = "google.iq" 140 | google_is: str = "google.is" 141 | google_it: str = "google.it" 142 | google_je: str = "google.je" 143 | google_jo: str = "google.jo" 144 | google_kg: str = "google.kg" 145 | google_ki: str = "google.ki" 146 | google_kz: str = "google.kz" 147 | google_la: str = "google.la" 148 | google_li: str = "google.li" 149 | google_lk: str = "google.lk" 150 | google_lt: str = "google.lt" 151 | google_lu: str = "google.lu" 152 | google_lv: str = "google.lv" 153 | google_md: str = "google.md" 154 | google_me: str = "google.me" 155 | google_mg: str = "google.mg" 156 | google_mk: str = "google.mk" 157 | google_ml: str = "google.ml" 158 | google_mn: str = "google.mn" 159 | google_mu: str = "google.mu" 160 | google_mv: str = "google.mv" 161 | google_mw: str = "google.mw" 162 | google_ne: str = "google.ne" 163 | google_nl: str = "google.nl" 164 | google_no: str = "google.no" 165 | google_nr: str = "google.nr" 166 | google_nu: str = "google.nu" 167 | google_pl: str = "google.pl" 168 | google_pn: str = "google.pn" 169 | google_ps: str = "google.ps" 170 | google_pt: str = "google.pt" 171 | google_ro: str = "google.ro" 172 | google_rs: str = "google.rs" 173 | google_ru: str = "google.ru" 174 | google_rw: str = "google.rw" 175 | google_sc: str = "google.sc" 176 | google_se: str = "google.se" 177 | google_sh: str = "google.sh" 178 | google_si: str = "google.si" 179 | google_sk: str = "google.sk" 180 | google_sm: str = "google.sm" 181 | google_sn: str = "google.sn" 182 | google_so: str = "google.so" 183 | google_sr: str = "google.sr" 184 | google_st: str = "google.st" 185 | google_td: str = "google.td" 186 | google_tg: str = "google.tg" 187 | google_tl: str = "google.tl" 188 | google_tm: str = "google.tm" 189 | google_tn: str = "google.tn" 190 | google_to: str = "google.to" 191 | google_tt: str = "google.tt" 192 | google_vu: str = "google.vu" 193 | google_ws: str = "google.ws" 194 | 
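A minimal usage sketch, not taken from the repository itself: because GoogleDomain subclasses both str and Enum, members compare equal to their domain strings and can be interpolated straight into URLs; member names are the domains with dots replaced by underscores. The search URL below is only an illustration.

from zyte_spider_templates.spiders._google_domains import GoogleDomain

domain = GoogleDomain.google_co_uk
assert domain == "google.co.uk"  # str-valued Enum members equal their values
assert GoogleDomain("google.com") is GoogleDomain.google_com  # lookup by value

# Example of interpolating the selected domain into a URL.
search_url = f"https://www.{domain.value}/search?q=zyte"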
-------------------------------------------------------------------------------- /zyte_spider_templates/spiders/_google_hl.py: -------------------------------------------------------------------------------- 1 | # _google_gl.py counterpart for 2 | # https://developers.google.com/custom-search/docs/json_api_reference#interfaceLanguages 3 | # 4 | # Built automatically with ../../utils/google-hl-updater 5 | 6 | from enum import Enum 7 | 8 | GOOGLE_HL_OPTIONS = { 9 | "af": "Afrikaans", 10 | "sq": "Albanian", 11 | "sm": "Amharic", 12 | "ar": "Arabic", 13 | "az": "Azerbaijani", 14 | "eu": "Basque", 15 | "be": "Belarusian", 16 | "bn": "Bengali", 17 | "bh": "Bihari", 18 | "bs": "Bosnian", 19 | "bg": "Bulgarian", 20 | "ca": "Catalan", 21 | "zh-CN": "Chinese (Simplified)", 22 | "zh-TW": "Chinese (Traditional)", 23 | "hr": "Croatian", 24 | "cs": "Czech", 25 | "da": "Danish", 26 | "nl": "Dutch", 27 | "en": "English", 28 | "eo": "Esperanto", 29 | "et": "Estonian", 30 | "fo": "Faroese", 31 | "fi": "Finnish", 32 | "fr": "French", 33 | "fy": "Frisian", 34 | "gl": "Galician", 35 | "ka": "Georgian", 36 | "de": "German", 37 | "el": "Greek", 38 | "gu": "Gujarati", 39 | "iw": "Hebrew", 40 | "hi": "Hindi", 41 | "hu": "Hungarian", 42 | "is": "Icelandic", 43 | "id": "Indonesian", 44 | "ia": "Interlingua", 45 | "ga": "Irish", 46 | "it": "Italian", 47 | "ja": "Japanese", 48 | "jw": "Javanese", 49 | "kn": "Kannada", 50 | "ko": "Korean", 51 | "la": "Latin", 52 | "lv": "Latvian", 53 | "lt": "Lithuanian", 54 | "mk": "Macedonian", 55 | "ms": "Malay", 56 | "ml": "Malayam", 57 | "mt": "Maltese", 58 | "mr": "Marathi", 59 | "ne": "Nepali", 60 | "no": "Norwegian", 61 | "nn": "Norwegian (Nynorsk)", 62 | "oc": "Occitan", 63 | "fa": "Persian", 64 | "pl": "Polish", 65 | "pt-BR": "Portuguese (Brazil)", 66 | "pt-PT": "Portuguese (Portugal)", 67 | "pa": "Punjabi", 68 | "ro": "Romanian", 69 | "ru": "Russian", 70 | "gd": "Scots Gaelic", 71 | "sr": "Serbian", 72 | "si": "Sinhalese", 73 | "sk": "Slovak", 74 | "sl": "Slovenian", 75 | "es": "Spanish", 76 | "su": "Sudanese", 77 | "sw": "Swahili", 78 | "sv": "Swedish", 79 | "tl": "Tagalog", 80 | "ta": "Tamil", 81 | "te": "Telugu", 82 | "th": "Thai", 83 | "ti": "Tigrinya", 84 | "tr": "Turkish", 85 | "uk": "Ukrainian", 86 | "ur": "Urdu", 87 | "uz": "Uzbek", 88 | "vi": "Vietnamese", 89 | "cy": "Welsh", 90 | "xh": "Xhosa", 91 | "zu": "Zulu", 92 | } 93 | GOOGLE_HL_OPTIONS_WITH_CODE = { 94 | code: f"{name} ({code})" for code, name in GOOGLE_HL_OPTIONS.items() 95 | } 96 | 97 | 98 | class GoogleHl(str, Enum): 99 | af: str = "af" 100 | sq: str = "sq" 101 | sm: str = "sm" 102 | ar: str = "ar" 103 | az: str = "az" 104 | eu: str = "eu" 105 | be: str = "be" 106 | bn: str = "bn" 107 | bh: str = "bh" 108 | bs: str = "bs" 109 | bg: str = "bg" 110 | ca: str = "ca" 111 | zh_CN: str = "zh-CN" 112 | zh_TW: str = "zh-TW" 113 | hr: str = "hr" 114 | cs: str = "cs" 115 | da: str = "da" 116 | nl: str = "nl" 117 | en: str = "en" 118 | eo: str = "eo" 119 | et: str = "et" 120 | fo: str = "fo" 121 | fi: str = "fi" 122 | fr: str = "fr" 123 | fy: str = "fy" 124 | gl: str = "gl" 125 | ka: str = "ka" 126 | de: str = "de" 127 | el: str = "el" 128 | gu: str = "gu" 129 | iw: str = "iw" 130 | hi: str = "hi" 131 | hu: str = "hu" 132 | is_: str = "is" 133 | id: str = "id" 134 | ia: str = "ia" 135 | ga: str = "ga" 136 | it: str = "it" 137 | ja: str = "ja" 138 | jw: str = "jw" 139 | kn: str = "kn" 140 | ko: str = "ko" 141 | la: str = "la" 142 | lv: str = "lv" 143 | lt: str = "lt" 144 | mk: str = "mk" 145 | ms: str = "ms" 146 | 
ml: str = "ml" 147 | mt: str = "mt" 148 | mr: str = "mr" 149 | ne: str = "ne" 150 | no: str = "no" 151 | nn: str = "nn" 152 | oc: str = "oc" 153 | fa: str = "fa" 154 | pl: str = "pl" 155 | pt_BR: str = "pt-BR" 156 | pt_PT: str = "pt-PT" 157 | pa: str = "pa" 158 | ro: str = "ro" 159 | ru: str = "ru" 160 | gd: str = "gd" 161 | sr: str = "sr" 162 | si: str = "si" 163 | sk: str = "sk" 164 | sl: str = "sl" 165 | es: str = "es" 166 | su: str = "su" 167 | sw: str = "sw" 168 | sv: str = "sv" 169 | tl: str = "tl" 170 | ta: str = "ta" 171 | te: str = "te" 172 | th: str = "th" 173 | ti: str = "ti" 174 | tr: str = "tr" 175 | uk: str = "uk" 176 | ur: str = "ur" 177 | uz: str = "uz" 178 | vi: str = "vi" 179 | cy: str = "cy" 180 | xh: str = "xh" 181 | zu: str = "zu" 182 | -------------------------------------------------------------------------------- /zyte_spider_templates/spiders/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from importlib.metadata import version 4 | from typing import TYPE_CHECKING, Annotated, Any, Dict 5 | from warnings import warn 6 | 7 | import scrapy 8 | from pydantic import BaseModel, ConfigDict, model_validator 9 | from scrapy.crawler import Crawler 10 | from scrapy_zyte_api import custom_attrs 11 | from zyte_common_items import CustomAttributes 12 | 13 | from ..params import ( 14 | INPUT_GROUP, 15 | ExtractFromParam, 16 | GeolocationParam, 17 | MaxRequestsParam, 18 | SearchQueriesParam, 19 | UrlParam, 20 | UrlsFileParam, 21 | UrlsParam, 22 | ) 23 | 24 | if TYPE_CHECKING: 25 | # typing.Self requires Python 3.11 26 | from typing_extensions import Self 27 | 28 | 29 | class _LogExceptionsContextManager: 30 | def __init__(self, spider, exc_info): 31 | self._spider = spider 32 | self._exc_info = exc_info 33 | 34 | def __enter__(self): 35 | return 36 | 37 | def __exit__(self, exc_type, exc_value, exc_traceback): 38 | if exc_type is None: 39 | return True 40 | if issubclass(exc_type, self._exc_info): 41 | self._spider.logger.exception(exc_value) 42 | return True 43 | return False 44 | 45 | 46 | # Higher priority than command-line-defined settings (40). 47 | ARG_SETTING_PRIORITY: int = 50 48 | 49 | 50 | class BaseSpiderParams( 51 | ExtractFromParam, 52 | MaxRequestsParam, 53 | GeolocationParam, 54 | SearchQueriesParam, 55 | UrlsFileParam, 56 | UrlsParam, 57 | UrlParam, 58 | BaseModel, 59 | ): 60 | model_config = ConfigDict( 61 | json_schema_extra={ 62 | "groups": [ 63 | INPUT_GROUP, 64 | ], 65 | }, 66 | ) 67 | 68 | @model_validator(mode="after") 69 | def deprecated(self): 70 | warn( 71 | ( 72 | "BaseSpiderParams is deprecated, use pydantic.BaseModel and " 73 | "your desired combination of classes from " 74 | "zyte_spider_templates.params instead." 
75 | ), 76 | DeprecationWarning, 77 | ) 78 | return self 79 | 80 | 81 | class BaseSpider(scrapy.Spider): 82 | custom_settings: Dict[str, Any] = { # type: ignore[assignment] 83 | "ZYTE_API_TRANSPARENT_MODE": True, 84 | "_ZYTE_API_USER_AGENT": f"zyte-spider-templates/{version('zyte-spider-templates')}", 85 | } 86 | 87 | metadata: Dict[str, Any] = { 88 | "template": True, 89 | "title": "Base", 90 | "description": "Base template.", 91 | } 92 | 93 | _NEXT_PAGE_PRIORITY: int = 100 94 | 95 | _custom_attrs_dep = None 96 | _log_request_exception: _LogExceptionsContextManager = None # type: ignore[assignment] 97 | 98 | @classmethod 99 | def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self: 100 | spider = super().from_crawler(crawler, *args, **kwargs) 101 | 102 | # all subclasses of this need to also have Args as a subclass 103 | # this may be possible to express in type hints instead 104 | assert hasattr(spider, "args") 105 | 106 | if geolocation := getattr(spider.args, "geolocation", None): 107 | # We set the geolocation in ZYTE_API_PROVIDER_PARAMS for injected 108 | # dependencies, and in ZYTE_API_AUTOMAP_PARAMS for page object 109 | # additional requests. 110 | for component in ("AUTOMAP", "PROVIDER"): 111 | default_params = spider.settings.getdict(f"ZYTE_API_{component}_PARAMS") 112 | default_params["geolocation"] = geolocation 113 | spider.settings.set( 114 | f"ZYTE_API_{component}_PARAMS", 115 | default_params, 116 | priority=ARG_SETTING_PRIORITY, 117 | ) 118 | 119 | if spider.args.max_requests: 120 | spider.settings.set( 121 | "ZYTE_API_MAX_REQUESTS", 122 | spider.args.max_requests, 123 | priority=ARG_SETTING_PRIORITY, 124 | ) 125 | 126 | if custom_attrs_input := getattr(spider.args, "custom_attrs_input", None): 127 | custom_attrs_options = { 128 | "method": spider.args.custom_attrs_method, 129 | } 130 | if max_input_tokens := crawler.settings.getint("ZYTE_API_MAX_INPUT_TOKENS"): 131 | custom_attrs_options["maxInputTokens"] = max_input_tokens 132 | if max_output_tokens := crawler.settings.getint( 133 | "ZYTE_API_MAX_OUTPUT_TOKENS" 134 | ): 135 | custom_attrs_options["maxOutputTokens"] = max_output_tokens 136 | 137 | spider._custom_attrs_dep = Annotated[ 138 | CustomAttributes, 139 | custom_attrs(custom_attrs_input, custom_attrs_options), 140 | ] 141 | 142 | spider._log_request_exception = _LogExceptionsContextManager(spider, ValueError) 143 | 144 | return spider 145 | -------------------------------------------------------------------------------- /zyte_spider_templates/spiders/job_posting.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from enum import Enum 4 | from typing import ( 5 | TYPE_CHECKING, 6 | Any, 7 | Callable, 8 | Dict, 9 | Iterable, 10 | List, 11 | Optional, 12 | Union, 13 | cast, 14 | ) 15 | 16 | import scrapy 17 | from pydantic import BaseModel, ConfigDict, Field 18 | from scrapy.crawler import Crawler 19 | from scrapy_poet import DummyResponse, DynamicDeps 20 | from scrapy_spider_metadata import Args 21 | from web_poet import BrowserResponse 22 | from zyte_common_items import ( 23 | CustomAttributes, 24 | JobPosting, 25 | JobPostingNavigation, 26 | ProbabilityRequest, 27 | SearchRequestTemplate, 28 | ) 29 | 30 | from zyte_spider_templates.spiders.base import ( 31 | ARG_SETTING_PRIORITY, 32 | INPUT_GROUP, 33 | BaseSpider, 34 | ) 35 | 36 | from ..documentation import document_enum 37 | from ..params import ( 38 | CustomAttrsInputParam, 39 | CustomAttrsMethodParam, 40 | ExtractFrom, 
41 | ExtractFromParam, 42 | GeolocationParam, 43 | MaxRequestsParam, 44 | SearchQueriesParam, 45 | UrlParam, 46 | UrlsFileParam, 47 | UrlsParam, 48 | parse_input_params, 49 | ) 50 | 51 | if TYPE_CHECKING: 52 | # typing.Self requires Python 3.11 53 | from typing_extensions import Self 54 | 55 | 56 | @document_enum 57 | class JobPostingCrawlStrategy(str, Enum): 58 | navigation: str = "navigation" 59 | """Follow pagination and job posting detail pages.""" 60 | 61 | direct_item: str = "direct_item" 62 | """Treat input URLs as direct links to job posting detail pages, and extract a 63 | job posting from each.""" 64 | 65 | 66 | class JobPostingCrawlStrategyParam(BaseModel): 67 | crawl_strategy: JobPostingCrawlStrategy = Field( 68 | title="Crawl strategy", 69 | description="Determines how input URLs and follow-up URLs are crawled.", 70 | default=JobPostingCrawlStrategy.navigation, 71 | json_schema_extra={ 72 | "enumMeta": { 73 | JobPostingCrawlStrategy.navigation: { 74 | "title": "Navigation", 75 | "description": "Follow pagination and job posting detail pages.", 76 | }, 77 | JobPostingCrawlStrategy.direct_item: { 78 | "title": "Direct URLs to job postings", 79 | "description": ( 80 | "Treat input URLs as direct links to job posting detail pages, and " 81 | "extract a job posting from each." 82 | ), 83 | }, 84 | }, 85 | }, 86 | ) 87 | 88 | 89 | class JobPostingSearchQueriesParam(SearchQueriesParam): 90 | search_queries: List[str] = Field( 91 | title="Search Queries", 92 | description=( 93 | "A list of search queries, one per line, to submit using the " 94 | "search form found on each input URL. Only works for input URLs " 95 | "that support search. May not work on every website." 96 | ), 97 | default_factory=list, 98 | json_schema_extra={ 99 | "default": [], 100 | "widget": "textarea", 101 | }, 102 | ) 103 | 104 | 105 | class JobPostingSpiderParams( 106 | CustomAttrsMethodParam, 107 | CustomAttrsInputParam, 108 | ExtractFromParam, 109 | MaxRequestsParam, 110 | GeolocationParam, 111 | JobPostingCrawlStrategyParam, 112 | JobPostingSearchQueriesParam, 113 | UrlsFileParam, 114 | UrlsParam, 115 | UrlParam, 116 | BaseModel, 117 | ): 118 | model_config = ConfigDict( 119 | json_schema_extra={ 120 | "groups": [ 121 | INPUT_GROUP, 122 | ], 123 | }, 124 | ) 125 | 126 | 127 | class JobPostingSpider(Args[JobPostingSpiderParams], BaseSpider): 128 | """Yield job postings from a job website. 129 | 130 | See :class:`~zyte_spider_templates.spiders.job_posting.JobPostingSpiderParams` 131 | for supported parameters. 132 | 133 | .. seealso:: :ref:`job-posting`. 
134 | """ 135 | 136 | name = "job_posting" 137 | 138 | metadata: Dict[str, Any] = { 139 | **BaseSpider.metadata, 140 | "title": "Job posting", 141 | "description": "[Experimental] Template for spiders that extract job posting data from websites.", 142 | } 143 | 144 | @classmethod 145 | def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self: 146 | spider = super().from_crawler(crawler, *args, **kwargs) 147 | parse_input_params(spider) 148 | spider._init_extract_from() 149 | return spider 150 | 151 | def _init_extract_from(self): 152 | if self.args.extract_from is not None: 153 | self.settings.set( 154 | "ZYTE_API_PROVIDER_PARAMS", 155 | { 156 | "jobPostingOptions": {"extractFrom": self.args.extract_from}, 157 | "jobPostingNavigationOptions": { 158 | "extractFrom": self.args.extract_from 159 | }, 160 | **self.settings.get("ZYTE_API_PROVIDER_PARAMS", {}), 161 | }, 162 | priority=ARG_SETTING_PRIORITY, 163 | ) 164 | 165 | def get_start_request(self, url): 166 | callback = ( 167 | self.parse_job_posting 168 | if self.args.crawl_strategy == JobPostingCrawlStrategy.direct_item 169 | else self.parse_navigation 170 | ) 171 | meta: Dict[str, Any] = { 172 | "crawling_logs": { 173 | "page_type": "jobPosting" 174 | if self.args.crawl_strategy == JobPostingCrawlStrategy.direct_item 175 | else "jobPostingNavigation" 176 | }, 177 | } 178 | if ( 179 | self.args.crawl_strategy == JobPostingCrawlStrategy.direct_item 180 | and self._custom_attrs_dep 181 | ): 182 | meta["inject"] = [ 183 | self._custom_attrs_dep, 184 | ] 185 | return scrapy.Request( 186 | url=url, 187 | callback=callback, 188 | meta=meta, 189 | ) 190 | 191 | def start_requests(self) -> Iterable[scrapy.Request]: 192 | if self.args.search_queries: 193 | for url in self.start_urls: 194 | meta: Dict[str, Any] = { 195 | "crawling_logs": {"page_type": "searchRequestTemplate"}, 196 | } 197 | if self.args.extract_from == ExtractFrom.browserHtml: 198 | meta["inject"] = [BrowserResponse] 199 | with self._log_request_exception: 200 | yield scrapy.Request( 201 | url=url, 202 | callback=self.parse_search_request_template, 203 | meta=meta, 204 | ) 205 | else: 206 | for url in self.start_urls: 207 | with self._log_request_exception: 208 | yield self.get_start_request(url) 209 | 210 | def parse_search_request_template( 211 | self, 212 | response: DummyResponse, 213 | search_request_template: SearchRequestTemplate, 214 | dynamic: DynamicDeps, 215 | ) -> Iterable[scrapy.Request]: 216 | probability = search_request_template.get_probability() 217 | if probability is not None and probability <= 0: 218 | return 219 | for query in self.args.search_queries: 220 | meta: Dict[str, Any] = { 221 | "crawling_logs": {"page_type": "jobPostingNavigation"}, 222 | } 223 | with self._log_request_exception: 224 | yield search_request_template.request(query=query).to_scrapy( 225 | callback=self.parse_navigation, 226 | meta=meta, 227 | ) 228 | 229 | def parse_navigation( 230 | self, response: DummyResponse, navigation: JobPostingNavigation 231 | ) -> Iterable[scrapy.Request]: 232 | job_postings = navigation.items or [] 233 | for request in job_postings: 234 | with self._log_request_exception: 235 | yield self.get_parse_job_posting_request(request) 236 | 237 | if navigation.nextPage: 238 | if not job_postings: 239 | self.logger.info( 240 | f"Ignoring nextPage link {navigation.nextPage} since there " 241 | f"are no job posting links found in {navigation.url}" 242 | ) 243 | else: 244 | with self._log_request_exception: 245 | yield self.get_nextpage_request( 246 | 
cast(ProbabilityRequest, navigation.nextPage) 247 | ) 248 | 249 | def parse_job_posting( 250 | self, response: DummyResponse, job_posting: JobPosting, dynamic: DynamicDeps 251 | ) -> Iterable[ 252 | Union[JobPosting, Dict[str, Union[JobPosting, Optional[CustomAttributes]]]] 253 | ]: 254 | probability = job_posting.get_probability() 255 | 256 | # TODO: convert to a configurable parameter later on after the launch 257 | if probability is None or probability >= 0.1: 258 | if self.args.custom_attrs_input: 259 | yield { 260 | "jobPosting": job_posting, 261 | "customAttributes": dynamic.get(CustomAttributes), 262 | } 263 | else: 264 | yield job_posting 265 | else: 266 | assert self.crawler.stats 267 | self.crawler.stats.inc_value("drop_item/job_posting/low_probability") 268 | self.logger.info( 269 | f"Ignoring item from {response.url} since its probability is " 270 | f"less than threshold of 0.1:\n{job_posting}" 271 | ) 272 | 273 | def get_parse_navigation_request( 274 | self, 275 | request: ProbabilityRequest, 276 | callback: Optional[Callable] = None, 277 | page_params: Optional[Dict[str, Any]] = None, 278 | page_type: str = "jobPostingNavigation", 279 | ) -> scrapy.Request: 280 | callback = callback or self.parse_navigation 281 | 282 | return request.to_scrapy( 283 | callback=callback, 284 | meta={ 285 | "page_params": page_params or {}, 286 | "crawling_logs": { 287 | "name": request.name or "", 288 | "probability": request.get_probability(), 289 | "page_type": page_type, 290 | }, 291 | }, 292 | ) 293 | 294 | def get_nextpage_request( 295 | self, 296 | request: ProbabilityRequest, 297 | callback: Optional[Callable] = None, 298 | page_params: Optional[Dict[str, Any]] = None, 299 | ): 300 | return self.get_parse_navigation_request( 301 | request, callback, page_params, "nextPage" 302 | ) 303 | 304 | def get_parse_job_posting_request( 305 | self, request: ProbabilityRequest, callback: Optional[Callable] = None 306 | ) -> scrapy.Request: 307 | callback = callback or self.parse_job_posting 308 | 309 | probability = request.get_probability() 310 | meta: Dict[str, Any] = { 311 | "crawling_logs": { 312 | "name": request.name, 313 | "probability": probability, 314 | "page_type": "jobPosting", 315 | }, 316 | } 317 | if self._custom_attrs_dep: 318 | meta["inject"] = [ 319 | self._custom_attrs_dep, 320 | ] 321 | 322 | scrapy_request = request.to_scrapy( 323 | callback=callback, 324 | meta=meta, 325 | ) 326 | scrapy_request.meta["allow_offsite"] = True 327 | return scrapy_request 328 | -------------------------------------------------------------------------------- /zyte_spider_templates/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | import os 4 | import re 5 | from typing import List, Optional 6 | 7 | import scrapinghub 8 | import tldextract 9 | from scrapy.crawler import Crawler 10 | from scrapy.http import Request 11 | from scrapy.utils.url import parse_url 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | _URL_PATTERN = r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$" 16 | 17 | 18 | def get_domain(url: str, include_port: bool = True) -> str: 19 | return re.sub( 20 | r"^www\d*\.", 21 | "", 22 | parse_url(url).netloc if include_port else parse_url(url).hostname or "", 23 | ) 24 | 25 | 26 | def load_url_list(urls: str) -> List[str]: 27 | result = [] 28 | bad_urls = [] 29 | for url in urls.split("\n"): 30 | if not (url := url.strip()): 31 | continue 32 | if not re.search(_URL_PATTERN, url): 33 | 
bad_urls.append(url) 34 | elif not bad_urls: 35 | result.append(url) 36 | if bad_urls: 37 | bad_url_list = "\n".join(bad_urls) 38 | raise ValueError( 39 | f"URL list contained the following invalid URLs:\n{bad_url_list}" 40 | ) 41 | return result 42 | 43 | 44 | def get_domain_fingerprint(url: str) -> str: 45 | """ 46 | Create a consistent 2-byte domain fingerprint by combining partial hashes 47 | of the main domain (without TLD) and the subdomain components. 48 | """ 49 | extracted = tldextract.extract(url) 50 | main_domain = extracted.domain 51 | subdomains = extracted.subdomain 52 | 53 | # Calculate partial hashes for each component 54 | main_domain_hash = hashlib.sha1(main_domain.encode("utf-8")).hexdigest()[:2] 55 | subdomain_hash = ( 56 | hashlib.sha1(subdomains.encode("utf-8")).hexdigest()[:2] if subdomains else "00" 57 | ) 58 | 59 | return main_domain_hash + subdomain_hash 60 | 61 | 62 | def get_request_fingerprint(crawler: Crawler, request: Request) -> str: 63 | """Create a fingerprint by including a domain-specific part.""" 64 | 65 | # Calculate domain fingerprint 66 | domain_fingerprint = get_domain_fingerprint(request.url) 67 | 68 | # Calculate request fingerprint 69 | request_fingerprint = crawler.request_fingerprinter.fingerprint(request).hex() # type: ignore[union-attr] 70 | 71 | # Combine the fingerprints by taking the 2-bytes (4 chars) domain fingerprint 72 | # to create a domain-specific identifier. 73 | # This optimization aids in efficient read/write operations in the Collection. 74 | 75 | return domain_fingerprint + request_fingerprint 76 | 77 | 78 | def get_project_id(crawler: Crawler) -> Optional[str]: 79 | """ 80 | Retrieve the project ID required for IncrementalCrawlMiddleware. 81 | 82 | The function attempts to obtain the project ID in the following order: 83 | 1. For Scrapy Cloud deployments, the project ID is automatically set as SCRAPY_PROJECT_ID 84 | in the environment variables. 85 | 2. Otherwise, it checks the ZYTE_PROJECT_ID environment variable. 86 | 3. If still not found, it checks the spider setting named ZYTE_PROJECT_ID. 87 | 88 | """ 89 | 90 | if project_id := os.environ.get("SCRAPY_PROJECT_ID"): 91 | logger.info( 92 | f"Picked project id {project_id} from SCRAPY_PROJECT_ID env variable." 93 | ) 94 | return project_id 95 | # Try to pick from manually set environmental variable 96 | if project_id := os.environ.get("ZYTE_PROJECT_ID"): 97 | logger.info( 98 | f"Picked project id {project_id} from ZYTE_PROJECT_ID env variable." 99 | ) 100 | return project_id 101 | # Try to pick from settings 102 | if project_id := crawler.settings.get("ZYTE_PROJECT_ID"): 103 | logger.info( 104 | f"Picked project id {project_id} from the spider's ZYTE_PROJECT_ID setting." 105 | ) 106 | return project_id 107 | raise ValueError( 108 | "Zyte project id wasn't found in job data, env, or settings. " 109 | "The env variable SCRAPY_PROJECT_ID or settings property ZYTE_PROJECT_ID was expected." 110 | ) 111 | 112 | 113 | def get_spider_name(crawler: Crawler) -> str: 114 | if spider_name := os.environ.get("SHUB_VIRTUAL_SPIDER"): 115 | logger.info( 116 | f"Picked virtual spider name {spider_name} from the spider's SHUB_VIRTUAL_SPIDER setting." 
117 | ) 118 | return spider_name 119 | 120 | logger.info(f"Picked spider name {crawler.spider.name} from the spider.") # type: ignore[union-attr] 121 | return crawler.spider.name # type: ignore[union-attr] 122 | 123 | 124 | def get_client() -> scrapinghub.ScrapinghubClient: 125 | # auth is taken from SH_APIKEY or SHUB_JOBAUTH 126 | return scrapinghub.ScrapinghubClient( 127 | dash_endpoint=os.getenv("SHUB_APIURL"), 128 | endpoint=os.getenv("SHUB_STORAGE"), 129 | ) 130 | --------------------------------------------------------------------------------
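A minimal usage sketch, not taken from the repository itself, exercising the URL helpers defined in zyte_spider_templates/utils.py above; the input URLs are made-up examples.

from zyte_spider_templates.utils import (
    get_domain,
    get_domain_fingerprint,
    load_url_list,
)

# get_domain() strips a leading "www"/"www2"/... label and keeps the port by default.
assert get_domain("https://www.example.com/shop") == "example.com"
assert get_domain("https://www2.example.com:8080/shop") == "example.com:8080"

# load_url_list() skips blank lines, strips whitespace, and raises ValueError
# if any non-blank line is not a valid http(s) URL.
assert load_url_list("https://a.example\n\n  https://b.example/page  ") == [
    "https://a.example",
    "https://b.example/page",
]

# get_domain_fingerprint() returns a 2-byte (4 hex character) domain-level prefix.
assert len(get_domain_fingerprint("https://blog.example.com/post")) == 4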