├── .github
│   └── workflows
│       ├── main.yml
│       └── publish.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGES.rst
├── README.rst
├── pyproject.toml
├── scrapy_deltafetch
│   ├── __init__.py
│   └── middleware.py
├── tests
│   ├── __init__.py
│   ├── benchmark.py
│   └── test_deltafetch.py
└── tox.ini

--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on:
3 |   push:
4 |     branches: [ master ]
5 |   pull_request:
6 | jobs:
7 |   test:
8 |     runs-on: ubuntu-latest
9 |     strategy:
10 |       fail-fast: false
11 |       matrix:
12 |         include:
13 |           - python-version: "3.9"
14 |             toxenv: min
15 |           - python-version: "3.9"
16 |           - python-version: "3.10"
17 |           - python-version: "3.11"
18 |           - python-version: "3.12"
19 |           - python-version: "3.13"
20 |           - python-version: "3.13"
21 |             toxenv: pre-commit
22 |           - python-version: "3.13"
23 |             toxenv: mypy
24 |           - python-version: "3.13"
25 |             toxenv: pylint
26 |           - python-version: "3.13"
27 |             toxenv: twinecheck
28 |     steps:
29 |       - uses: actions/checkout@v4
30 |       - name: Install libdb
31 |         run: |
32 |           sudo apt-get install libdb-dev
33 |       - name: Set up Python ${{ matrix.python-version }}
34 |         uses: actions/setup-python@v5
35 |         with:
36 |           python-version: ${{ matrix.python-version }}
37 |       - name: Run
38 |         env:
39 |           TOXENV: ${{ matrix.toxenv }}
40 |         run: |
41 |           pip install -U tox
42 |           tox
43 |       - name: Upload coverage report
44 |         uses: codecov/codecov-action@v5
45 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish
2 | on:
3 |   push:
4 |     tags:
5 |       - 'v[0-9]+.[0-9]+.[0-9]+'
6 | jobs:
7 |   publish:
8 |     runs-on: ubuntu-latest
9 |     environment:
10 |       name: pypi
11 |       url: https://pypi.org/p/${{ github.event.repository.name }}
12 |     permissions:
13 |       id-token: write
14 |     steps:
15 |       - uses: actions/checkout@v4
16 |       - uses: actions/setup-python@v5
17 |         with:
18 |           python-version: 3.13
19 |       - run: |
20 |           python -m pip install --upgrade build
21 |           python -m build
22 |       - name: Publish to PyPI
23 |         uses: pypa/gh-action-pypi-publish@release/v1
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | .idea 91 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.9.7 4 | hooks: 5 | - id: ruff 6 | args: [ --fix ] 7 | - id: ruff-format 8 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | Changes 2 | ======= 3 | 4 | 2.1.0 (2025-02-26) 5 | ------------------ 6 | 7 | * Drop support for Python 3.8 and lower, add support for Python 3.9 and higher. 8 | * Add support for Scrapy 2.12. 9 | * Use the ``REQUEST_FINGERPRINTER_CLASS`` setting introduced in Scrapy 2.7. 10 | * Support new item types introduced in Scrapy 2.2. 11 | * Support ``Path`` instances in the ``DELTAFETCH_DIR`` setting. 12 | 13 | 2.0.0 (2021-09-20) 14 | ------------------ 15 | * drop Python 2 support 16 | * replace bsddb3 with Python's dbm for storing request fingerprints 17 | * minor README fix 18 | * option to disable deltafetch for some requests with deltafetch_enabled=False request meta key 19 | * dev workflow: changed from Travis to Github Actions 20 | 21 | 1.2.1 (2017-02-09) 22 | ------------------ 23 | 24 | * Use python idiom to check key in dict 25 | * Update minimum Scrapy version supported by this extension to v1.1.0 26 | 27 | 1.2.0 (2016-12-07) 28 | ------------------ 29 | 30 | * Log through ``logging`` module instead of (deprecated) scrapy's spider.log(). 31 | * Fix README on passing ``deltafetch_reset`` argument on the command line. 32 | 33 | 34 | 1.1.0 (2016-06-29) 35 | ------------------ 36 | 37 | Adds support for callbacks returning dict items. 38 | 39 | 40 | 1.0.1 (2016-06-27) 41 | ------------------ 42 | 43 | Fix package URL in setup.py 44 | 45 | 46 | 1.0.0 (2016-06-27) 47 | ------------------ 48 | 49 | Initial release. 50 | 51 | This version is functionally equivalent to scrapylib's v1.7.0 52 | ``scrapylib.deltafetch.DeltaFetch``. 53 | 54 | The only (and major) difference is that support for ``bsddb`` is dropped 55 | in favor of ``bsddb3``, which is a new required dependency. 56 | 57 | .. note:: 58 | `bsddb`_ has been deprecated since Python 2.6, 59 | and even removed in Python 3 60 | 61 | 62 | .. 
_bsddb: https://docs.python.org/2/library/bsddb.html
63 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | =================
2 | scrapy-deltafetch
3 | =================
4 |
5 | .. image:: https://github.com/scrapy-plugins/scrapy-deltafetch/workflows/CI/badge.svg
6 |    :target: https://github.com/scrapy-plugins/scrapy-deltafetch/actions
7 |
8 | .. image:: https://img.shields.io/pypi/pyversions/scrapy-deltafetch.svg
9 |    :target: https://pypi.python.org/pypi/scrapy-deltafetch
10 |
11 | .. image:: https://img.shields.io/pypi/v/scrapy-deltafetch.svg
12 |    :target: https://pypi.python.org/pypi/scrapy-deltafetch
13 |
14 | .. image:: https://img.shields.io/pypi/l/scrapy-deltafetch.svg
15 |    :target: https://pypi.python.org/pypi/scrapy-deltafetch
16 |
17 | .. image:: https://img.shields.io/pypi/dm/scrapy-deltafetch.svg
18 |    :target: https://pypistats.org/packages/scrapy-deltafetch
19 |    :alt: Downloads count
20 |
21 | This is a Scrapy spider middleware to ignore requests
22 | to pages seen in previous crawls of the same spider,
23 | thus producing a "delta crawl" containing only new requests.
24 |
25 | This also speeds up the crawl by reducing the number of requests that need
26 | to be crawled and processed (typically, item requests are the most CPU
27 | intensive).
28 |
29 | The DeltaFetch middleware uses Python's dbm_ package to store request fingerprints.
30 |
31 | .. _dbm: https://docs.python.org/3/library/dbm.html
32 |
33 |
34 | Installation
35 | ============
36 |
37 | Install scrapy-deltafetch using ``pip``::
38 |
39 |     $ pip install scrapy-deltafetch
40 |
41 |
42 | Configuration
43 | =============
44 |
45 | 1. Add the DeltaFetch middleware by including it in ``SPIDER_MIDDLEWARES``
46 |    in your ``settings.py`` file::
47 |
48 |        SPIDER_MIDDLEWARES = {
49 |            'scrapy_deltafetch.DeltaFetch': 100,
50 |        }
51 |
52 |    Here, priority ``100`` is just an example.
53 |    Set its value depending on the other middlewares you already have enabled.
54 |
55 | 2. Enable the middleware using ``DELTAFETCH_ENABLED`` in your ``settings.py``::
56 |
57 |        DELTAFETCH_ENABLED = True
58 |
59 |
60 | Usage
61 | =====
62 |
63 | The following options control the behavior of the
64 | DeltaFetch middleware.
65 |
66 | Supported Scrapy settings
67 | -------------------------
68 |
69 | * ``DELTAFETCH_ENABLED`` — to enable (or disable) this extension
70 | * ``DELTAFETCH_DIR`` — directory where the state is stored
71 | * ``DELTAFETCH_RESET`` — reset the state, clearing out all seen requests
72 |
73 | These usually go in your Scrapy project's ``settings.py``.
74 |
75 |
76 | Supported Scrapy spider arguments
77 | ---------------------------------
78 |
79 | * ``deltafetch_reset`` — same effect as the DELTAFETCH_RESET setting
80 |
81 | Example::
82 |
83 |     $ scrapy crawl example -a deltafetch_reset=1
84 |
85 |
86 | Supported Scrapy request meta keys
87 | ----------------------------------
88 |
89 | * ``deltafetch_key`` — defines the lookup key for that request. By default it
90 |   is the request fingerprint computed by Scrapy, but it can be changed to
91 |   contain an item ID, for example (see the example after this list).
92 |   This requires support from the spider, but makes the extension more
93 |   efficient for sites that have many URLs for the same item.
94 |
95 | * ``deltafetch_enabled`` — if set to ``False``, DeltaFetch is disabled for that
96 |   specific request
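
Example (an illustrative sketch; the selectors, URLs, and the way the item ID is
derived below are hypothetical and will differ per site)::

    import scrapy


    class ExampleSpider(scrapy.Spider):
        name = "example"
        start_urls = ["https://example.com/catalog"]

        def parse(self, response):
            for href in response.css("a.item::attr(href)").getall():
                # Use the site's item ID as the DeltaFetch lookup key, so that
                # different URLs pointing to the same item count as already seen.
                item_id = href.rstrip("/").split("/")[-1]
                yield response.follow(
                    href,
                    callback=self.parse_item,
                    meta={"deltafetch_key": item_id},
                )

            # This page also yields items, but it must be re-scraped on every
            # run, so DeltaFetch is told never to skip it.
            yield scrapy.Request(
                "https://example.com/featured",
                callback=self.parse_item,
                meta={"deltafetch_enabled": False},
            )

        def parse_item(self, response):
            yield {"url": response.url, "title": response.css("title::text").get()}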
97 |
98 |
99 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.2"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "scrapy-deltafetch"
7 | version = "2.1.0"
8 | authors = [{name = "Zyte", email = "opensource@zyte.com"}]
9 | license = {text = "BSD"}
10 | description = "Scrapy middleware to ignore previously crawled pages"
11 | readme = "README.rst"
12 | classifiers = [
13 |     "Development Status :: 4 - Beta",
14 |     "License :: OSI Approved :: BSD License",
15 |     "Operating System :: OS Independent",
16 |     "Programming Language :: Python",
17 |     "Programming Language :: Python :: 3",
18 |     "Programming Language :: Python :: 3.9",
19 |     "Programming Language :: Python :: 3.10",
20 |     "Programming Language :: Python :: 3.11",
21 |     "Programming Language :: Python :: 3.12",
22 |     "Programming Language :: Python :: 3.13",
23 | ]
24 | requires-python = ">=3.9"
25 | dependencies = ["Scrapy>=1.1.0"]
26 |
27 | [project.urls]
28 | Homepage = "http://github.com/scrapy-plugins/scrapy-deltafetch"
29 |
30 | [tool.bumpversion]
31 | current_version = "2.1.0"
32 | commit = true
33 | tag = true
34 |
35 | [[tool.bumpversion.files]]
36 | filename = 'CHANGES.rst'
37 | search = "\\(unreleased\\)$"
38 | replace = "({now:%Y-%m-%d})"
39 | regex = true
40 |
41 | [[tool.bumpversion.files]]
42 | search = "version = \"{current_version}\""
43 | replace = "version = \"{new_version}\""
44 | filename = "pyproject.toml"
45 |
46 | [[tool.bumpversion.files]]
47 | filename = "scrapy_deltafetch/__init__.py"
48 |
49 | [tool.coverage.run]
50 | branch = true
51 | include = ["scrapy_deltafetch/*"]
52 | omit = ["tests/*"]
53 | disable_warnings = ["include-ignored"]
54 |
55 | [tool.coverage.paths]
56 | source = [
57 |     "scrapy_deltafetch",
58 |     ".tox/**/site-packages/scrapy-deltafetch"
59 | ]
60 |
61 | [tool.coverage.report]
62 | # https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185
63 | exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:"]
64 |
65 | [tool.pylint.MASTER]
66 | persistent = "no"
67 | jobs = 1  # >1 hides results
68 |
69 | [tool.pylint."MESSAGES CONTROL"]
70 | enable = [
71 |     "useless-suppression",
72 | ]
73 | disable = [
74 |     # Ones we want to ignore
75 |     "attribute-defined-outside-init",
76 |     "broad-exception-caught",
77 |     "consider-using-with",
78 |     "cyclic-import",
79 |     "disallowed-name",
80 |     "duplicate-code",  # https://github.com/pylint-dev/pylint/issues/214
81 |     "fixme",
82 |     "import-outside-toplevel",
83 |     "inherit-non-class",  # false positives with create_deprecated_class()
84 |     "invalid-name",
85 |     "invalid-overridden-method",
86 |     "isinstance-second-argument-not-valid-type",  # false positives with create_deprecated_class()
87 |     "line-too-long",
88 |     "logging-format-interpolation",
89 |     "logging-fstring-interpolation",
90 |     "logging-not-lazy",
91 |     "missing-docstring",
92 |     "no-member",
93 |     "no-name-in-module",  # caught by mypy already
94 |     "no-value-for-parameter",  # https://github.com/pylint-dev/pylint/issues/3268
95 |     "not-callable",
96 |     "protected-access",
97 |     "redefined-builtin",
98 |     "redefined-outer-name",
99 |     "too-few-public-methods",
100 |     "too-many-ancestors",
101 |     "too-many-arguments",
102 |     "too-many-branches",
103 |     "too-many-function-args",
104 |
"too-many-instance-attributes", 105 | "too-many-lines", 106 | "too-many-locals", 107 | "too-many-positional-arguments", 108 | "too-many-public-methods", 109 | "too-many-return-statements", 110 | "unused-argument", 111 | "unused-import", 112 | "unused-variable", 113 | "useless-import-alias", # used as a hint to mypy 114 | "useless-return", # https://github.com/pylint-dev/pylint/issues/6530 115 | "wrong-import-position", 116 | ] 117 | 118 | [tool.ruff.lint] 119 | extend-select = [ 120 | # flake8-bugbear 121 | "B", 122 | # flake8-comprehensions 123 | "C4", 124 | # pydocstyle 125 | "D", 126 | # flake8-future-annotations 127 | "FA", 128 | # flynt 129 | "FLY", 130 | # refurb 131 | "FURB", 132 | # isort 133 | "I", 134 | # flake8-implicit-str-concat 135 | "ISC", 136 | # flake8-logging 137 | "LOG", 138 | # Perflint 139 | "PERF", 140 | # pygrep-hooks 141 | "PGH", 142 | # flake8-pie 143 | "PIE", 144 | # pylint 145 | "PL", 146 | # flake8-pytest-style 147 | "PT", 148 | # flake8-use-pathlib 149 | "PTH", 150 | # flake8-pyi 151 | "PYI", 152 | # flake8-quotes 153 | "Q", 154 | # flake8-return 155 | "RET", 156 | # flake8-raise 157 | "RSE", 158 | # Ruff-specific rules 159 | "RUF", 160 | # flake8-bandit 161 | "S", 162 | # flake8-simplify 163 | "SIM", 164 | # flake8-slots 165 | "SLOT", 166 | # flake8-debugger 167 | "T10", 168 | # flake8-type-checking 169 | "TC", 170 | # pyupgrade 171 | "UP", 172 | # pycodestyle warnings 173 | "W", 174 | # flake8-2020 175 | "YTT", 176 | ] 177 | ignore = [ 178 | # Missing docstring in public module 179 | "D100", 180 | # Missing docstring in public class 181 | "D101", 182 | # Missing docstring in public function 183 | "D103", 184 | # Missing docstring in public package 185 | "D104", 186 | # Missing docstring in magic method 187 | "D105", 188 | # Missing docstring in __init__ 189 | "D107", 190 | # One-line docstring should fit on one line with quotes 191 | "D200", 192 | # No blank lines allowed after function docstring 193 | "D202", 194 | # 1 blank line required between summary line and description 195 | "D205", 196 | # Multi-line docstring closing quotes should be on a separate line 197 | "D209", 198 | # First line should end with a period 199 | "D400", 200 | # First line should be in imperative mood; try rephrasing 201 | "D401", 202 | # First line should not be the function's "signature" 203 | "D402", 204 | # Too many return statements 205 | "PLR0911", 206 | # Too many branches 207 | "PLR0912", 208 | # Too many arguments in function definition 209 | "PLR0913", 210 | # Too many statements 211 | "PLR0915", 212 | # Magic value used in comparison 213 | "PLR2004", 214 | # Mutable class attributes should be annotated with `typing.ClassVar` 215 | "RUF012", 216 | # Use of `assert` detected 217 | "S101", 218 | ] 219 | 220 | [tool.ruff.lint.per-file-ignores] 221 | # D102: Missing docstring in public method 222 | "tests/**" = ["D102"] 223 | 224 | [tool.ruff.lint.pydocstyle] 225 | convention = "pep257" 226 | 227 | [tool.setuptools] 228 | packages = ["scrapy_deltafetch"] 229 | -------------------------------------------------------------------------------- /scrapy_deltafetch/__init__.py: -------------------------------------------------------------------------------- 1 | from .middleware import DeltaFetch 2 | 3 | __all__ = ["DeltaFetch"] 4 | __version__ = "2.1.0" 5 | -------------------------------------------------------------------------------- /scrapy_deltafetch/middleware.py: -------------------------------------------------------------------------------- 1 | import dbm 2 | import logging 
3 | import time 4 | from pathlib import Path 5 | 6 | from scrapy import signals 7 | from scrapy.exceptions import NotConfigured 8 | from scrapy.http import Request 9 | from scrapy.utils.project import data_path 10 | from scrapy.utils.python import to_bytes 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class DeltaFetch: 16 | """Spider middleware to ignore requests to pages containing items seen in 17 | previous crawls of the same spider, thus producing a "delta crawl" 18 | containing only new items. 19 | 20 | This also speeds up the crawl, by reducing the number of requests that need 21 | to be crawled, and processed (typically, item requests are the most cpu 22 | intensive). 23 | """ 24 | 25 | def __init__(self, dir, reset=False, stats=None): 26 | self.dir = dir 27 | self.reset = reset 28 | self.stats = stats 29 | 30 | @classmethod 31 | def from_crawler(cls, crawler): # noqa: D102 32 | s = crawler.settings 33 | if not s.getbool("DELTAFETCH_ENABLED"): 34 | raise NotConfigured 35 | dir = data_path(s.get("DELTAFETCH_DIR", "deltafetch")) 36 | reset = s.getbool("DELTAFETCH_RESET") 37 | o = cls(dir, reset, crawler.stats) 38 | if o.stats is None: 39 | o.stats = crawler.stats 40 | crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) 41 | crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) 42 | 43 | try: 44 | o.fingerprint = crawler.request_fingerprinter.fingerprint 45 | except AttributeError: 46 | from scrapy.utils.request import request_fingerprint 47 | 48 | o.fingerprint = request_fingerprint 49 | 50 | return o 51 | 52 | def spider_opened(self, spider): # noqa: D102 53 | dir = Path(self.dir) 54 | dir.mkdir(parents=True, exist_ok=True) 55 | # TODO may be tricky, as there may be different paths on systems 56 | dbpath = dir / f"{spider.name}.db" 57 | reset = self.reset or getattr(spider, "deltafetch_reset", False) 58 | flag = "n" if reset else "c" 59 | try: 60 | self.db = dbm.open(str(dbpath), flag=flag) # noqa: SIM115 61 | except Exception: 62 | logger.warning( 63 | f"Failed to open DeltaFetch database at {dbpath}, trying to recreate it" 64 | ) 65 | if dbpath.exists(): 66 | dbpath.unlink() 67 | self.db = dbm.open(str(dbpath), "c") # noqa: SIM115 68 | 69 | def spider_closed(self, spider): # noqa: D102 70 | self.db.close() 71 | 72 | def process_spider_output(self, response, result, spider): # noqa: D102 73 | for r in result: 74 | if isinstance(r, Request): 75 | key = self._get_key(r) 76 | if key in self.db and self._is_enabled_for_request(r): 77 | logger.info(f"Ignoring already visited: {r}") 78 | if self.stats: 79 | self.stats.inc_value("deltafetch/skipped", spider=spider) 80 | continue 81 | else: 82 | key = self._get_key(response.request) 83 | self.db[key] = str(time.time()) 84 | if self.stats: 85 | self.stats.inc_value("deltafetch/stored", spider=spider) 86 | yield r 87 | 88 | def _get_key(self, request): 89 | key = request.meta.get("deltafetch_key") or self.fingerprint(request) 90 | return to_bytes(key) 91 | 92 | def _is_enabled_for_request(self, request): 93 | # Gives you option to disable deltafetch for some requests 94 | return request.meta.get("deltafetch_enabled", True) 95 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-deltafetch/a8caf39de50ce44abe1950e6d3f3f94a56df393f/tests/__init__.py 
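
As middleware.py above shows, DeltaFetch keeps one dbm entry per request whose
response yielded items: the key is the request fingerprint (or the
``deltafetch_key`` meta value) and the value is the time at which it was stored.
A quick way to inspect that state, assuming the default ``.scrapy/deltafetch``
directory of a Scrapy project and a spider named ``example`` (the exact on-disk
file name can vary with the dbm backend in use)::

    import dbm
    from pathlib import Path

    # Assumed default location: <project>/.scrapy/deltafetch/<spider name>.db
    db_path = Path(".scrapy") / "deltafetch" / "example.db"

    with dbm.open(str(db_path), "r") as db:  # read-only; fails if missing
        for key in db.keys():
            # Keys are fingerprints or deltafetch_key values (bytes); values
            # are the str(time.time()) recorded when items were yielded.
            print(key, db[key])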
-------------------------------------------------------------------------------- /tests/benchmark.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from unittest import mock 3 | 4 | from scrapy import Request, Spider 5 | from scrapy.statscollectors import StatsCollector 6 | from scrapy.utils.test import get_crawler 7 | 8 | from scrapy_deltafetch import DeltaFetch 9 | 10 | 11 | def benchmark_middleware(result): 12 | spider_name = "df_tests" 13 | spider = Spider(spider_name) 14 | temp_dir = tempfile.gettempdir() 15 | crawler = get_crawler(Spider) 16 | stats = StatsCollector(crawler) 17 | mw = DeltaFetch(temp_dir, reset=False, stats=stats) 18 | mw.spider_opened(spider) 19 | response = mock.Mock() 20 | response.request = Request("http://url", meta={"deltafetch_key": "key"}) 21 | 22 | for _x in mw.process_spider_output(response, result, spider): 23 | pass 24 | 25 | 26 | def test_middleware(benchmark): 27 | result = [] 28 | for x in range(50000): 29 | request = Request(f"https://{x}") 30 | result.append(request) 31 | result = benchmark(benchmark_middleware, result) 32 | -------------------------------------------------------------------------------- /tests/test_deltafetch.py: -------------------------------------------------------------------------------- 1 | import dbm 2 | import tempfile 3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | from unittest import TestCase, mock 6 | 7 | import pytest 8 | from scrapy import Request 9 | from scrapy.exceptions import NotConfigured 10 | from scrapy.item import Item 11 | from scrapy.settings import Settings 12 | from scrapy.spiders import Spider 13 | from scrapy.utils.python import to_bytes 14 | from scrapy.utils.test import get_crawler 15 | 16 | from scrapy_deltafetch.middleware import DeltaFetch 17 | 18 | 19 | class DeltaFetchTestCase(TestCase): 20 | mwcls = DeltaFetch 21 | 22 | def setUp(self): 23 | self.spider_name = "df_tests" 24 | self.spider = Spider(self.spider_name) 25 | 26 | # DeltaFetch creates .db files named after the spider's name 27 | self.temp_dir = Path(tempfile.gettempdir()) 28 | self.db_path = self.temp_dir / f"{self.spider.name}.db" 29 | 30 | def get_mw(self, dir=None, reset=None, cls=DeltaFetch): 31 | settings = { 32 | "DELTAFETCH_ENABLED": True, 33 | } 34 | if dir is not None: 35 | settings["DELTAFETCH_DIR"] = dir 36 | if reset is not None: 37 | settings["DELTAFETCH_RESET"] = reset 38 | crawler = get_crawler(Spider, settings_dict=settings) 39 | return cls.from_crawler(crawler) 40 | 41 | def test_init(self): 42 | # path format is any, the folder is not created 43 | instance = self.get_mw("/any/dir", reset=True) 44 | assert isinstance(instance, self.mwcls) 45 | assert instance.dir == "/any/dir" 46 | assert instance.stats.get_stats() == {} 47 | assert instance.reset is True 48 | 49 | def test_init_from_crawler(self): 50 | crawler = mock.Mock() 51 | # void settings 52 | crawler.settings = Settings({}) 53 | with pytest.raises(NotConfigured): 54 | self.mwcls.from_crawler(crawler) 55 | with ( 56 | mock.patch("scrapy.utils.project.project_data_dir") as data_dir, 57 | mock.patch("scrapy.utils.project.inside_project") as in_project, 58 | ): 59 | data_dir.return_value = self.temp_dir 60 | in_project.return_value = True 61 | 62 | # simple project_data_dir mock with based settings 63 | crawler.settings = Settings({"DELTAFETCH_ENABLED": True}) 64 | instance = self.mwcls.from_crawler(crawler) 65 | assert isinstance(instance, self.mwcls) 66 | assert instance.dir == 
str(self.temp_dir / "deltafetch") 67 | assert instance.reset is False 68 | 69 | # project_data_dir mock with advanced settings 70 | crawler.settings = Settings( 71 | { 72 | "DELTAFETCH_ENABLED": True, 73 | "DELTAFETCH_DIR": "other", 74 | "DELTAFETCH_RESET": True, 75 | } 76 | ) 77 | instance = self.mwcls.from_crawler(crawler) 78 | assert isinstance(instance, self.mwcls) 79 | assert instance.dir == str(self.temp_dir / "other") 80 | assert instance.reset is True 81 | 82 | def test_spider_opened_new(self): 83 | """Middleware should create a .db file if not found.""" 84 | if self.db_path.exists(): 85 | self.db_path.unlink() 86 | mw = self.get_mw(dir=self.temp_dir, reset=False) 87 | assert not hasattr(self.mwcls, "db") 88 | mw.spider_opened(self.spider) 89 | assert self.temp_dir.is_dir() 90 | assert self.db_path.exists() 91 | assert hasattr(mw, "db") 92 | assert mw.db.keys() == [] 93 | 94 | def test_spider_opened_existing(self): 95 | """Middleware should open and use existing and valid .db files.""" 96 | self._create_test_db() 97 | mw = self.get_mw(dir=self.temp_dir, reset=False) 98 | assert not hasattr(self.mwcls, "db") 99 | mw.spider_opened(self.spider) 100 | assert hasattr(mw, "db") 101 | for k, v in [(b"test_key_1", b"test_v_1"), (b"test_key_2", b"test_v_2")]: 102 | assert mw.db.get(k) == v 103 | 104 | def test_spider_opened_corrupt_dbfile(self): 105 | """Middleware should create a new .db if it cannot open it.""" 106 | # create an invalid .db file 107 | with self.db_path.open("wb") as dbfile: 108 | dbfile.write(b"bad") 109 | mw = self.get_mw(dir=self.temp_dir, reset=False) 110 | assert not hasattr(self.mwcls, "db") 111 | 112 | # file corruption is only detected when opening spider 113 | mw.spider_opened(self.spider) 114 | assert Path(self.temp_dir).is_dir() 115 | assert Path(self.db_path).exists() 116 | assert hasattr(mw, "db") 117 | 118 | # and db should be empty (it was re-created) 119 | assert mw.db.keys() == [] 120 | 121 | def test_spider_opened_existing_spider_reset(self): 122 | self._create_test_db() 123 | mw = self.get_mw(self.temp_dir, reset=False) 124 | assert not hasattr(self.mwcls, "db") 125 | self.spider.deltafetch_reset = True 126 | mw.spider_opened(self.spider) 127 | assert mw.db.keys() == [] 128 | 129 | def test_spider_opened_reset_non_existing_db(self): 130 | mw = self.get_mw(dir=self.temp_dir, reset=True) 131 | assert not hasattr(self.mwcls, "db") 132 | self.spider.deltafetch_reset = True 133 | mw.spider_opened(self.spider) 134 | assert mw.db.get(b"random") is None 135 | 136 | def test_spider_opened_recreate(self): 137 | self._create_test_db() 138 | mw = self.get_mw(dir=self.temp_dir, reset=True) 139 | assert not hasattr(self.mwcls, "db") 140 | mw.spider_opened(self.spider) 141 | assert hasattr(mw, "db") 142 | assert mw.db.keys() == [] 143 | 144 | def test_spider_closed(self): 145 | self._create_test_db() 146 | mw = self.get_mw(dir=self.temp_dir, reset=True) 147 | mw.spider_opened(self.spider) 148 | assert mw.db.get("random") is None 149 | mw.spider_closed(self.spider) 150 | with pytest.raises(dbm.error): 151 | mw.db.get("radom") 152 | 153 | def test_process_spider_output(self): 154 | self._create_test_db() 155 | settings = { 156 | "DELTAFETCH_DIR": self.temp_dir, 157 | "DELTAFETCH_ENABLED": True, 158 | } 159 | crawler = get_crawler(Spider, settings_dict=settings) 160 | mw = self.mwcls.from_crawler(crawler) 161 | mw.spider_opened(self.spider) 162 | response = mock.Mock() 163 | response.request = Request("http://url", meta={"deltafetch_key": "key"}) 164 | result = [] 165 | 
assert not list(mw.process_spider_output(response, result, self.spider)) 166 | result = [ 167 | # same URL but with new key --> it should be processed 168 | Request("http://url", meta={"deltafetch_key": "key1"}), 169 | # 'test_key_1' is already in the test db --> it should be skipped 170 | Request("http://url1", meta={"deltafetch_key": "test_key_1"}), 171 | ] 172 | # so only the 1 request should go through 173 | assert list(mw.process_spider_output(response, result, self.spider)) == [ 174 | result[0] 175 | ] 176 | 177 | # the skipped "http://url1" should be counted in stats 178 | assert crawler.stats.get_stats() == {"deltafetch/skipped": 1} 179 | 180 | # b'key' should not be in the db yet as no item was collected yet 181 | assert set(mw.db.keys()) == {b"test_key_1", b"test_key_2"} 182 | 183 | # if the spider returns items, the request's key is added in db 184 | result = [Item(), "not a base item"] 185 | assert list(mw.process_spider_output(response, result, self.spider)) == result 186 | assert set(mw.db.keys()) == {b"key", b"test_key_1", b"test_key_2"} 187 | assert mw.db[b"key"] 188 | 189 | def test_process_spider_output_with_ignored_request(self): 190 | self._create_test_db() 191 | settings = { 192 | "DELTAFETCH_DIR": self.temp_dir, 193 | "DELTAFETCH_ENABLED": True, 194 | } 195 | crawler = get_crawler(Spider, settings_dict=settings) 196 | mw = self.mwcls.from_crawler(crawler) 197 | mw.spider_opened(self.spider) 198 | response = mock.Mock() 199 | response.request = Request("http://url") 200 | result = [] 201 | assert not list(mw.process_spider_output(response, result, self.spider)) 202 | result = [ 203 | Request("http://url1"), 204 | # 'url1' is already in the db, but deltafetch_enabled=False 205 | # flag is set, URL should be processed. 206 | Request("http://url1", meta={"deltafetch_enabled": False}), 207 | ] 208 | # so 2 requests should go through 209 | assert list(mw.process_spider_output(response, result, self.spider)) == [ 210 | result[0], 211 | result[1], 212 | ] 213 | 214 | def test_process_spider_output_dict(self): 215 | self._create_test_db() 216 | mw = self.get_mw(dir=self.temp_dir, reset=False) 217 | mw.spider_opened(self.spider) 218 | response = mock.Mock() 219 | response.request = Request("http://url", meta={"deltafetch_key": "key"}) 220 | result = [{"somekey": "somevalue"}] 221 | assert list(mw.process_spider_output(response, result, self.spider)) == result 222 | assert set(mw.db.keys()) == {b"key", b"test_key_1", b"test_key_2"} 223 | assert mw.db[b"key"] 224 | 225 | def test_process_spider_output_stats(self): 226 | self._create_test_db() 227 | mw = self.get_mw(dir=self.temp_dir) 228 | mw.spider_opened(self.spider) 229 | response = mock.Mock() 230 | response.request = Request("http://url", meta={"deltafetch_key": "key"}) 231 | result = [] 232 | assert not list(mw.process_spider_output(response, result, self.spider)) 233 | assert mw.stats.get_stats() == {} 234 | result = [ 235 | Request("http://url", meta={"deltafetch_key": "key"}), 236 | Request("http://url1", meta={"deltafetch_key": "test_key_1"}), 237 | ] 238 | assert list(mw.process_spider_output(response, result, self.spider)) == [ 239 | result[0] 240 | ] 241 | assert mw.stats.get_value("deltafetch/skipped") == 1 242 | 243 | @dataclass 244 | class TestItem: 245 | foo: str 246 | 247 | result = [Item(), TestItem("bar")] 248 | assert list(mw.process_spider_output(response, result, self.spider)) == result 249 | assert mw.stats.get_value("deltafetch/stored") == 2 250 | 251 | def test_init_from_crawler_legacy(self): 252 | # 
test with subclass not handling passed stats 253 | class LegacyDeltaFetchSubClass(self.mwcls): 254 | def __init__(self, dir, reset, *args, **kwargs): 255 | super().__init__(dir=dir, reset=reset) 256 | self.something = True 257 | 258 | crawler = mock.Mock() 259 | # void settings 260 | crawler.settings = Settings({}) 261 | with pytest.raises(NotConfigured): 262 | self.mwcls.from_crawler(crawler) 263 | 264 | with ( 265 | mock.patch("scrapy.utils.project.project_data_dir") as data_dir, 266 | mock.patch("scrapy.utils.project.inside_project") as in_project, 267 | ): 268 | data_dir.return_value = self.temp_dir 269 | in_project.return_value = True 270 | 271 | # simple project_data_dir mock with based settings 272 | crawler.settings = Settings({"DELTAFETCH_ENABLED": True}) 273 | instance = LegacyDeltaFetchSubClass.from_crawler(crawler) 274 | assert isinstance(instance, self.mwcls) 275 | assert instance.dir == str(Path(self.temp_dir) / "deltafetch") 276 | assert instance.reset is False 277 | 278 | # project_data_dir mock with advanced settings 279 | crawler.settings = Settings( 280 | { 281 | "DELTAFETCH_ENABLED": True, 282 | "DELTAFETCH_DIR": "other", 283 | "DELTAFETCH_RESET": True, 284 | } 285 | ) 286 | instance = LegacyDeltaFetchSubClass.from_crawler(crawler) 287 | assert isinstance(instance, self.mwcls) 288 | assert instance.dir == str(Path(self.temp_dir) / "other") 289 | assert instance.reset is True 290 | 291 | def test_process_spider_output_stats_legacy(self): 292 | # testing the subclass not handling stats works at runtime 293 | # (i.e. that trying to update stats does not trigger exception) 294 | class LegacyDeltaFetchSubClass(self.mwcls): 295 | def __init__(self, dir, *args, reset=False, **kwargs): 296 | super().__init__(dir=dir, reset=reset) 297 | self.something = True 298 | 299 | self._create_test_db() 300 | mw = self.get_mw(dir=self.temp_dir, reset=False, cls=LegacyDeltaFetchSubClass) 301 | mw.spider_opened(self.spider) 302 | response = mock.Mock() 303 | response.request = Request("http://url", meta={"deltafetch_key": "key"}) 304 | result = [] 305 | assert not list(mw.process_spider_output(response, result, self.spider)) 306 | assert mw.stats.get_stats() == {} 307 | result = [ 308 | Request("http://url", meta={"deltafetch_key": "key"}), 309 | Request("http://url1", meta={"deltafetch_key": "test_key_1"}), 310 | ] 311 | 312 | assert list(mw.process_spider_output(response, result, self.spider)) == [ 313 | result[0] 314 | ] 315 | assert mw.stats.get_value("deltafetch/skipped") == 1 316 | 317 | @dataclass 318 | class TestItem: 319 | foo: str 320 | 321 | result = [Item(), TestItem("bar")] 322 | assert list(mw.process_spider_output(response, result, self.spider)) == result 323 | assert mw.stats.get_value("deltafetch/stored") == 2 324 | 325 | def test_get_key(self): 326 | settings = { 327 | "DELTAFETCH_DIR": self.temp_dir, 328 | "DELTAFETCH_ENABLED": True, 329 | "DELTAFETCH_RESET": True, 330 | } 331 | crawler = get_crawler(Spider, settings_dict=settings) 332 | mw = self.mwcls.from_crawler(crawler) 333 | test_req1 = Request("http://url1") 334 | try: 335 | fingerprint = crawler.request_fingerprinter.fingerprint 336 | except AttributeError: # Scrapy < 2.7.0 337 | from scrapy.utils.request import request_fingerprint 338 | 339 | fingerprint = request_fingerprint 340 | assert mw._get_key(test_req1) == to_bytes(fingerprint(test_req1)) 341 | test_req2 = Request("http://url2", meta={"deltafetch_key": b"dfkey1"}) 342 | assert mw._get_key(test_req2) == b"dfkey1" 343 | 344 | test_req3 = 
Request("http://url2", meta={"deltafetch_key": "dfkey1"}) 345 | # key will be converted to bytes 346 | assert mw._get_key(test_req3) == b"dfkey1" 347 | 348 | def _create_test_db(self): 349 | # truncate test db if there were failed tests 350 | with dbm.open(str(self.db_path), "n") as db: 351 | db[b"test_key_1"] = b"test_v_1" 352 | db[b"test_key_2"] = b"test_v_2" 353 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = pre-commit,mypy,pylint,twinecheck,min,py39,py310,py311,py312,py313 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | pytest-cov 8 | pytest-benchmark 9 | commands = 10 | pytest \ 11 | --cov=scrapy_deltafetch \ 12 | --cov-config=pyproject.toml \ 13 | --cov-report=xml \ 14 | --cov-report= \ 15 | {posargs:scrapy_deltafetch tests} 16 | 17 | [testenv:min] 18 | basepython = python3.9 19 | deps = 20 | {[testenv]deps} 21 | scrapy==1.1.0 22 | 23 | [testenv:pre-commit] 24 | deps = 25 | pre-commit 26 | commands = 27 | pre-commit run {posargs:--all-files} 28 | 29 | [testenv:mypy] 30 | deps = 31 | {[testenv]deps} 32 | mypy==1.15.0 33 | commands = 34 | mypy {posargs:scrapy_deltafetch tests} 35 | 36 | # https://github.com/astral-sh/ruff/issues/970 37 | [testenv:pylint] 38 | deps = 39 | {[testenv]deps} 40 | pylint==3.3.4 41 | commands = 42 | pylint {posargs:scrapy_deltafetch tests} 43 | 44 | [testenv:twinecheck] 45 | deps = 46 | twine==6.1.0 47 | build==1.2.2.post1 48 | commands = 49 | python -m build --sdist 50 | twine check dist/* 51 | --------------------------------------------------------------------------------