├── .github
│   └── workflows
│       ├── main.yml
│       └── publish.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGES.rst
├── README.rst
├── pyproject.toml
├── scrapy_deltafetch
│   ├── __init__.py
│   └── middleware.py
├── tests
│   ├── __init__.py
│   ├── benchmark.py
│   └── test_deltafetch.py
└── tox.ini

--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on:
3 |   push:
4 |     branches: [ master ]
5 |   pull_request:
6 | jobs:
7 |   test:
8 |     runs-on: ubuntu-latest
9 |     strategy:
10 |       fail-fast: false
11 |       matrix:
12 |         include:
13 |           - python-version: "3.9"
14 |             toxenv: min
15 |           - python-version: "3.9"
16 |           - python-version: "3.10"
17 |           - python-version: "3.11"
18 |           - python-version: "3.12"
19 |           - python-version: "3.13"
20 |           - python-version: "3.13"
21 |             toxenv: pre-commit
22 |           - python-version: "3.13"
23 |             toxenv: mypy
24 |           - python-version: "3.13"
25 |             toxenv: pylint
26 |           - python-version: "3.13"
27 |             toxenv: twinecheck
28 |     steps:
29 |       - uses: actions/checkout@v4
30 |       - name: Install libdb
31 |         run: |
32 |           sudo apt-get install libdb-dev
33 |       - name: Set up Python ${{ matrix.python-version }}
34 |         uses: actions/setup-python@v5
35 |         with:
36 |           python-version: ${{ matrix.python-version }}
37 |       - name: Run
38 |         env:
39 |           TOXENV: ${{ matrix.toxenv }}
40 |         run: |
41 |           pip install -U tox
42 |           tox
43 |       - name: Upload coverage report
44 |         uses: codecov/codecov-action@v5
45 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish
2 | on:
3 |   push:
4 |     tags:
5 |       - 'v[0-9]+.[0-9]+.[0-9]+'
6 | jobs:
7 |   publish:
8 |     runs-on: ubuntu-latest
9 |     environment:
10 |       name: pypi
11 |       url: https://pypi.org/p/${{ github.event.repository.name }}
12 |     permissions:
13 |       id-token: write
14 |     steps:
15 |       - uses: actions/checkout@v4
16 |       - uses: actions/setup-python@v5
17 |         with:
18 |           python-version: 3.13
19 |       - run: |
20 |           python -m pip install --upgrade build
21 |           python -m build
22 |       - name: Publish to PyPI
23 |         uses: pypa/gh-action-pypi-publish@release/v1
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | .idea 91 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.9.7 4 | hooks: 5 | - id: ruff 6 | args: [ --fix ] 7 | - id: ruff-format 8 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | Changes 2 | ======= 3 | 4 | 2.1.0 (2025-02-26) 5 | ------------------ 6 | 7 | * Drop support for Python 3.8 and lower, add support for Python 3.9 and higher. 8 | * Add support for Scrapy 2.12. 9 | * Use the ``REQUEST_FINGERPRINTER_CLASS`` setting introduced in Scrapy 2.7. 10 | * Support new item types introduced in Scrapy 2.2. 11 | * Support ``Path`` instances in the ``DELTAFETCH_DIR`` setting. 12 | 13 | 2.0.0 (2021-09-20) 14 | ------------------ 15 | * drop Python 2 support 16 | * replace bsddb3 with Python's dbm for storing request fingerprints 17 | * minor README fix 18 | * option to disable deltafetch for some requests with deltafetch_enabled=False request meta key 19 | * dev workflow: changed from Travis to Github Actions 20 | 21 | 1.2.1 (2017-02-09) 22 | ------------------ 23 | 24 | * Use python idiom to check key in dict 25 | * Update minimum Scrapy version supported by this extension to v1.1.0 26 | 27 | 1.2.0 (2016-12-07) 28 | ------------------ 29 | 30 | * Log through ``logging`` module instead of (deprecated) scrapy's spider.log(). 31 | * Fix README on passing ``deltafetch_reset`` argument on the command line. 32 | 33 | 34 | 1.1.0 (2016-06-29) 35 | ------------------ 36 | 37 | Adds support for callbacks returning dict items. 38 | 39 | 40 | 1.0.1 (2016-06-27) 41 | ------------------ 42 | 43 | Fix package URL in setup.py 44 | 45 | 46 | 1.0.0 (2016-06-27) 47 | ------------------ 48 | 49 | Initial release. 50 | 51 | This version is functionally equivalent to scrapylib's v1.7.0 52 | ``scrapylib.deltafetch.DeltaFetch``. 53 | 54 | The only (and major) difference is that support for ``bsddb`` is dropped 55 | in favor of ``bsddb3``, which is a new required dependency. 56 | 57 | .. note:: 58 | `bsddb`_ has been deprecated since Python 2.6, 59 | and even removed in Python 3 60 | 61 | 62 | .. 
_bsddb: https://docs.python.org/2/library/bsddb.html
63 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | =================
2 | scrapy-deltafetch
3 | =================
4 |
5 | .. image:: https://github.com/scrapy-plugins/scrapy-deltafetch/workflows/CI/badge.svg
6 |    :target: https://github.com/scrapy-plugins/scrapy-deltafetch/actions
7 |
8 | .. image:: https://img.shields.io/pypi/pyversions/scrapy-deltafetch.svg
9 |    :target: https://pypi.python.org/pypi/scrapy-deltafetch
10 |
11 | .. image:: https://img.shields.io/pypi/v/scrapy-deltafetch.svg
12 |    :target: https://pypi.python.org/pypi/scrapy-deltafetch
13 |
14 | .. image:: https://img.shields.io/pypi/l/scrapy-deltafetch.svg
15 |    :target: https://pypi.python.org/pypi/scrapy-deltafetch
16 |
17 | .. image:: https://img.shields.io/pypi/dm/scrapy-deltafetch.svg
18 |    :target: https://pypistats.org/packages/scrapy-deltafetch
19 |    :alt: Downloads count
20 |
21 | This is a Scrapy spider middleware to ignore requests
22 | to pages seen in previous crawls of the same spider,
23 | thus producing a "delta crawl" containing only new requests.
24 |
25 | This also speeds up the crawl by reducing the number of requests that need
26 | to be crawled and processed (typically, item requests are the most CPU
27 | intensive).
28 |
29 | The DeltaFetch middleware uses Python's dbm_ package to store request fingerprints.
30 |
31 | .. _dbm: https://docs.python.org/3/library/dbm.html
32 |
33 |
34 | Installation
35 | ============
36 |
37 | Install scrapy-deltafetch using ``pip``::
38 |
39 |     $ pip install scrapy-deltafetch
40 |
41 |
42 | Configuration
43 | =============
44 |
45 | 1. Add the DeltaFetch middleware by including it in ``SPIDER_MIDDLEWARES``
46 |    in your ``settings.py`` file::
47 |
48 |        SPIDER_MIDDLEWARES = {
49 |            'scrapy_deltafetch.DeltaFetch': 100,
50 |        }
51 |
52 |    Here, priority ``100`` is just an example.
53 |    Set its value depending on the other middlewares you already have enabled.
54 |
55 | 2. Enable the middleware using ``DELTAFETCH_ENABLED`` in your ``settings.py``::
56 |
57 |        DELTAFETCH_ENABLED = True
58 |
59 |
60 | Usage
61 | =====
62 |
63 | The following options control the behavior of the
64 | DeltaFetch middleware.
65 |
66 | Supported Scrapy settings
67 | -------------------------
68 |
69 | * ``DELTAFETCH_ENABLED`` — to enable (or disable) this extension
70 | * ``DELTAFETCH_DIR`` — directory where the state is stored
71 | * ``DELTAFETCH_RESET`` — reset the state, clearing out all seen requests
72 |
73 | These usually go in your Scrapy project's ``settings.py``.
74 |
75 |
76 | Supported Scrapy spider arguments
77 | ---------------------------------
78 |
79 | * ``deltafetch_reset`` — same effect as the DELTAFETCH_RESET setting
80 |
81 | Example::
82 |
83 |     $ scrapy crawl example -a deltafetch_reset=1
84 |
85 |
86 | Supported Scrapy request meta keys
87 | ----------------------------------
88 |
89 | * ``deltafetch_key`` — defines the lookup key for that request. By default it
90 |   is the request fingerprint computed by Scrapy, but it can be changed to
91 |   contain an item ID, for example (see the example after this list).
92 |   This requires support from the spider, but makes the extension more
93 |   efficient for sites that have many URLs for the same item.
94 |
95 | * ``deltafetch_enabled`` — if set to ``False``, DeltaFetch is disabled for that
96 |   specific request
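
Example (an illustrative sketch; the selectors, URLs, and the way the item ID is
derived below are hypothetical and will differ per site)::

    import scrapy


    class ExampleSpider(scrapy.Spider):
        name = "example"
        start_urls = ["https://example.com/catalog"]

        def parse(self, response):
            for href in response.css("a.item::attr(href)").getall():
                # Use the site's item ID as the DeltaFetch lookup key, so that
                # different URLs pointing to the same item count as already seen.
                item_id = href.rstrip("/").split("/")[-1]
                yield response.follow(
                    href,
                    callback=self.parse_item,
                    meta={"deltafetch_key": item_id},
                )

            # This page also yields items, but it must be re-scraped on every
            # run, so DeltaFetch is told never to skip it.
            yield scrapy.Request(
                "https://example.com/featured",
                callback=self.parse_item,
                meta={"deltafetch_enabled": False},
            )

        def parse_item(self, response):
            yield {"url": response.url, "title": response.css("title::text").get()}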
97 |
98 |
99 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.2"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "scrapy-deltafetch"
7 | version = "2.1.0"
8 | authors = [{name = "Zyte", email = "opensource@zyte.com"}]
9 | license = {text = "BSD"}
10 | description = "Scrapy middleware to ignore previously crawled pages"
11 | readme = "README.rst"
12 | classifiers = [
13 |     "Development Status :: 4 - Beta",
14 |     "License :: OSI Approved :: BSD License",
15 |     "Operating System :: OS Independent",
16 |     "Programming Language :: Python",
17 |     "Programming Language :: Python :: 3",
18 |     "Programming Language :: Python :: 3.9",
19 |     "Programming Language :: Python :: 3.10",
20 |     "Programming Language :: Python :: 3.11",
21 |     "Programming Language :: Python :: 3.12",
22 |     "Programming Language :: Python :: 3.13",
23 | ]
24 | requires-python = ">=3.9"
25 | dependencies = ["Scrapy>=1.1.0"]
26 |
27 | [project.urls]
28 | Homepage = "http://github.com/scrapy-plugins/scrapy-deltafetch"
29 |
30 | [tool.bumpversion]
31 | current_version = "2.1.0"
32 | commit = true
33 | tag = true
34 |
35 | [[tool.bumpversion.files]]
36 | filename = 'CHANGES.rst'
37 | search = "\\(unreleased\\)$"
38 | replace = "({now:%Y-%m-%d})"
39 | regex = true
40 |
41 | [[tool.bumpversion.files]]
42 | search = "version = \"{current_version}\""
43 | replace = "version = \"{new_version}\""
44 | filename = "pyproject.toml"
45 |
46 | [[tool.bumpversion.files]]
47 | filename = "scrapy_deltafetch/__init__.py"
48 |
49 | [tool.coverage.run]
50 | branch = true
51 | include = ["scrapy_deltafetch/*"]
52 | omit = ["tests/*"]
53 | disable_warnings = ["include-ignored"]
54 |
55 | [tool.coverage.paths]
56 | source = [
57 |     "scrapy_deltafetch",
58 |     ".tox/**/site-packages/scrapy-deltafetch"
59 | ]
60 |
61 | [tool.coverage.report]
62 | # https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185
63 | exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:"]
64 |
65 | [tool.pylint.MASTER]
66 | persistent = "no"
67 | jobs = 1  # >1 hides results
68 |
69 | [tool.pylint."MESSAGES CONTROL"]
70 | enable = [
71 |     "useless-suppression",
72 | ]
73 | disable = [
74 |     # Ones we want to ignore
75 |     "attribute-defined-outside-init",
76 |     "broad-exception-caught",
77 |     "consider-using-with",
78 |     "cyclic-import",
79 |     "disallowed-name",
80 |     "duplicate-code",  # https://github.com/pylint-dev/pylint/issues/214
81 |     "fixme",
82 |     "import-outside-toplevel",
83 |     "inherit-non-class",  # false positives with create_deprecated_class()
84 |     "invalid-name",
85 |     "invalid-overridden-method",
86 |     "isinstance-second-argument-not-valid-type",  # false positives with create_deprecated_class()
87 |     "line-too-long",
88 |     "logging-format-interpolation",
89 |     "logging-fstring-interpolation",
90 |     "logging-not-lazy",
91 |     "missing-docstring",
92 |     "no-member",
93 |     "no-name-in-module",  # caught by mypy already
94 |     "no-value-for-parameter",  # https://github.com/pylint-dev/pylint/issues/3268
95 |     "not-callable",
96 |     "protected-access",
97 |     "redefined-builtin",
98 |     "redefined-outer-name",
99 |     "too-few-public-methods",
100 |     "too-many-ancestors",
101 |     "too-many-arguments",
102 |     "too-many-branches",
103 |     "too-many-function-args",
104 |
"too-many-instance-attributes", 105 | "too-many-lines", 106 | "too-many-locals", 107 | "too-many-positional-arguments", 108 | "too-many-public-methods", 109 | "too-many-return-statements", 110 | "unused-argument", 111 | "unused-import", 112 | "unused-variable", 113 | "useless-import-alias", # used as a hint to mypy 114 | "useless-return", # https://github.com/pylint-dev/pylint/issues/6530 115 | "wrong-import-position", 116 | ] 117 | 118 | [tool.ruff.lint] 119 | extend-select = [ 120 | # flake8-bugbear 121 | "B", 122 | # flake8-comprehensions 123 | "C4", 124 | # pydocstyle 125 | "D", 126 | # flake8-future-annotations 127 | "FA", 128 | # flynt 129 | "FLY", 130 | # refurb 131 | "FURB", 132 | # isort 133 | "I", 134 | # flake8-implicit-str-concat 135 | "ISC", 136 | # flake8-logging 137 | "LOG", 138 | # Perflint 139 | "PERF", 140 | # pygrep-hooks 141 | "PGH", 142 | # flake8-pie 143 | "PIE", 144 | # pylint 145 | "PL", 146 | # flake8-pytest-style 147 | "PT", 148 | # flake8-use-pathlib 149 | "PTH", 150 | # flake8-pyi 151 | "PYI", 152 | # flake8-quotes 153 | "Q", 154 | # flake8-return 155 | "RET", 156 | # flake8-raise 157 | "RSE", 158 | # Ruff-specific rules 159 | "RUF", 160 | # flake8-bandit 161 | "S", 162 | # flake8-simplify 163 | "SIM", 164 | # flake8-slots 165 | "SLOT", 166 | # flake8-debugger 167 | "T10", 168 | # flake8-type-checking 169 | "TC", 170 | # pyupgrade 171 | "UP", 172 | # pycodestyle warnings 173 | "W", 174 | # flake8-2020 175 | "YTT", 176 | ] 177 | ignore = [ 178 | # Missing docstring in public module 179 | "D100", 180 | # Missing docstring in public class 181 | "D101", 182 | # Missing docstring in public function 183 | "D103", 184 | # Missing docstring in public package 185 | "D104", 186 | # Missing docstring in magic method 187 | "D105", 188 | # Missing docstring in __init__ 189 | "D107", 190 | # One-line docstring should fit on one line with quotes 191 | "D200", 192 | # No blank lines allowed after function docstring 193 | "D202", 194 | # 1 blank line required between summary line and description 195 | "D205", 196 | # Multi-line docstring closing quotes should be on a separate line 197 | "D209", 198 | # First line should end with a period 199 | "D400", 200 | # First line should be in imperative mood; try rephrasing 201 | "D401", 202 | # First line should not be the function's "signature" 203 | "D402", 204 | # Too many return statements 205 | "PLR0911", 206 | # Too many branches 207 | "PLR0912", 208 | # Too many arguments in function definition 209 | "PLR0913", 210 | # Too many statements 211 | "PLR0915", 212 | # Magic value used in comparison 213 | "PLR2004", 214 | # Mutable class attributes should be annotated with `typing.ClassVar` 215 | "RUF012", 216 | # Use of `assert` detected 217 | "S101", 218 | ] 219 | 220 | [tool.ruff.lint.per-file-ignores] 221 | # D102: Missing docstring in public method 222 | "tests/**" = ["D102"] 223 | 224 | [tool.ruff.lint.pydocstyle] 225 | convention = "pep257" 226 | 227 | [tool.setuptools] 228 | packages = ["scrapy_deltafetch"] 229 | -------------------------------------------------------------------------------- /scrapy_deltafetch/__init__.py: -------------------------------------------------------------------------------- 1 | from .middleware import DeltaFetch 2 | 3 | __all__ = ["DeltaFetch"] 4 | __version__ = "2.1.0" 5 | -------------------------------------------------------------------------------- /scrapy_deltafetch/middleware.py: -------------------------------------------------------------------------------- 1 | import dbm 2 | import logging 
3 | import time 4 | from pathlib import Path 5 | 6 | from scrapy import signals 7 | from scrapy.exceptions import NotConfigured 8 | from scrapy.http import Request 9 | from scrapy.utils.project import data_path 10 | from scrapy.utils.python import to_bytes 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class DeltaFetch: 16 | """Spider middleware to ignore requests to pages containing items seen in 17 | previous crawls of the same spider, thus producing a "delta crawl" 18 | containing only new items. 19 | 20 | This also speeds up the crawl, by reducing the number of requests that need 21 | to be crawled, and processed (typically, item requests are the most cpu 22 | intensive). 23 | """ 24 | 25 | def __init__(self, dir, reset=False, stats=None): 26 | self.dir = dir 27 | self.reset = reset 28 | self.stats = stats 29 | 30 | @classmethod 31 | def from_crawler(cls, crawler): # noqa: D102 32 | s = crawler.settings 33 | if not s.getbool("DELTAFETCH_ENABLED"): 34 | raise NotConfigured 35 | dir = data_path(s.get("DELTAFETCH_DIR", "deltafetch")) 36 | reset = s.getbool("DELTAFETCH_RESET") 37 | o = cls(dir, reset, crawler.stats) 38 | if o.stats is None: 39 | o.stats = crawler.stats 40 | crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) 41 | crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) 42 | 43 | try: 44 | o.fingerprint = crawler.request_fingerprinter.fingerprint 45 | except AttributeError: 46 | from scrapy.utils.request import request_fingerprint 47 | 48 | o.fingerprint = request_fingerprint 49 | 50 | return o 51 | 52 | def spider_opened(self, spider): # noqa: D102 53 | dir = Path(self.dir) 54 | dir.mkdir(parents=True, exist_ok=True) 55 | # TODO may be tricky, as there may be different paths on systems 56 | dbpath = dir / f"{spider.name}.db" 57 | reset = self.reset or getattr(spider, "deltafetch_reset", False) 58 | flag = "n" if reset else "c" 59 | try: 60 | self.db = dbm.open(str(dbpath), flag=flag) # noqa: SIM115 61 | except Exception: 62 | logger.warning( 63 | f"Failed to open DeltaFetch database at {dbpath}, trying to recreate it" 64 | ) 65 | if dbpath.exists(): 66 | dbpath.unlink() 67 | self.db = dbm.open(str(dbpath), "c") # noqa: SIM115 68 | 69 | def spider_closed(self, spider): # noqa: D102 70 | self.db.close() 71 | 72 | def process_spider_output(self, response, result, spider): # noqa: D102 73 | for r in result: 74 | if isinstance(r, Request): 75 | key = self._get_key(r) 76 | if key in self.db and self._is_enabled_for_request(r): 77 | logger.info(f"Ignoring already visited: {r}") 78 | if self.stats: 79 | self.stats.inc_value("deltafetch/skipped", spider=spider) 80 | continue 81 | else: 82 | key = self._get_key(response.request) 83 | self.db[key] = str(time.time()) 84 | if self.stats: 85 | self.stats.inc_value("deltafetch/stored", spider=spider) 86 | yield r 87 | 88 | def _get_key(self, request): 89 | key = request.meta.get("deltafetch_key") or self.fingerprint(request) 90 | return to_bytes(key) 91 | 92 | def _is_enabled_for_request(self, request): 93 | # Gives you option to disable deltafetch for some requests 94 | return request.meta.get("deltafetch_enabled", True) 95 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-deltafetch/a8caf39de50ce44abe1950e6d3f3f94a56df393f/tests/__init__.py 
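
As middleware.py above shows, DeltaFetch keeps one dbm entry per request whose
response yielded items: the key is the request fingerprint (or the
``deltafetch_key`` meta value) and the value is the time at which it was stored.
A quick way to inspect that state, assuming the default ``.scrapy/deltafetch``
directory of a Scrapy project and a spider named ``example`` (the exact on-disk
file name can vary with the dbm backend in use)::

    import dbm
    from pathlib import Path

    # Assumed default location: <project>/.scrapy/deltafetch/<spider name>.db
    db_path = Path(".scrapy") / "deltafetch" / "example.db"

    with dbm.open(str(db_path), "r") as db:  # read-only; fails if missing
        for key in db.keys():
            # Keys are fingerprints or deltafetch_key values (bytes); values
            # are the str(time.time()) recorded when items were yielded.
            print(key, db[key])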
-------------------------------------------------------------------------------- /tests/benchmark.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from unittest import mock 3 | 4 | from scrapy import Request, Spider 5 | from scrapy.statscollectors import StatsCollector 6 | from scrapy.utils.test import get_crawler 7 | 8 | from scrapy_deltafetch import DeltaFetch 9 | 10 | 11 | def benchmark_middleware(result): 12 | spider_name = "df_tests" 13 | spider = Spider(spider_name) 14 | temp_dir = tempfile.gettempdir() 15 | crawler = get_crawler(Spider) 16 | stats = StatsCollector(crawler) 17 | mw = DeltaFetch(temp_dir, reset=False, stats=stats) 18 | mw.spider_opened(spider) 19 | response = mock.Mock() 20 | response.request = Request("http://url", meta={"deltafetch_key": "key"}) 21 | 22 | for _x in mw.process_spider_output(response, result, spider): 23 | pass 24 | 25 | 26 | def test_middleware(benchmark): 27 | result = [] 28 | for x in range(50000): 29 | request = Request(f"https://{x}") 30 | result.append(request) 31 | result = benchmark(benchmark_middleware, result) 32 | -------------------------------------------------------------------------------- /tests/test_deltafetch.py: -------------------------------------------------------------------------------- 1 | import dbm 2 | import tempfile 3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | from unittest import TestCase, mock 6 | 7 | import pytest 8 | from scrapy import Request 9 | from scrapy.exceptions import NotConfigured 10 | from scrapy.item import Item 11 | from scrapy.settings import Settings 12 | from scrapy.spiders import Spider 13 | from scrapy.utils.python import to_bytes 14 | from scrapy.utils.test import get_crawler 15 | 16 | from scrapy_deltafetch.middleware import DeltaFetch 17 | 18 | 19 | class DeltaFetchTestCase(TestCase): 20 | mwcls = DeltaFetch 21 | 22 | def setUp(self): 23 | self.spider_name = "df_tests" 24 | self.spider = Spider(self.spider_name) 25 | 26 | # DeltaFetch creates .db files named after the spider's name 27 | self.temp_dir = Path(tempfile.gettempdir()) 28 | self.db_path = self.temp_dir / f"{self.spider.name}.db" 29 | 30 | def get_mw(self, dir=None, reset=None, cls=DeltaFetch): 31 | settings = { 32 | "DELTAFETCH_ENABLED": True, 33 | } 34 | if dir is not None: 35 | settings["DELTAFETCH_DIR"] = dir 36 | if reset is not None: 37 | settings["DELTAFETCH_RESET"] = reset 38 | crawler = get_crawler(Spider, settings_dict=settings) 39 | return cls.from_crawler(crawler) 40 | 41 | def test_init(self): 42 | # path format is any, the folder is not created 43 | instance = self.get_mw("/any/dir", reset=True) 44 | assert isinstance(instance, self.mwcls) 45 | assert instance.dir == "/any/dir" 46 | assert instance.stats.get_stats() == {} 47 | assert instance.reset is True 48 | 49 | def test_init_from_crawler(self): 50 | crawler = mock.Mock() 51 | # void settings 52 | crawler.settings = Settings({}) 53 | with pytest.raises(NotConfigured): 54 | self.mwcls.from_crawler(crawler) 55 | with ( 56 | mock.patch("scrapy.utils.project.project_data_dir") as data_dir, 57 | mock.patch("scrapy.utils.project.inside_project") as in_project, 58 | ): 59 | data_dir.return_value = self.temp_dir 60 | in_project.return_value = True 61 | 62 | # simple project_data_dir mock with based settings 63 | crawler.settings = Settings({"DELTAFETCH_ENABLED": True}) 64 | instance = self.mwcls.from_crawler(crawler) 65 | assert isinstance(instance, self.mwcls) 66 | assert instance.dir == 
str(self.temp_dir / "deltafetch") 67 | assert instance.reset is False 68 | 69 | # project_data_dir mock with advanced settings 70 | crawler.settings = Settings( 71 | { 72 | "DELTAFETCH_ENABLED": True, 73 | "DELTAFETCH_DIR": "other", 74 | "DELTAFETCH_RESET": True, 75 | } 76 | ) 77 | instance = self.mwcls.from_crawler(crawler) 78 | assert isinstance(instance, self.mwcls) 79 | assert instance.dir == str(self.temp_dir / "other") 80 | assert instance.reset is True 81 | 82 | def test_spider_opened_new(self): 83 | """Middleware should create a .db file if not found.""" 84 | if self.db_path.exists(): 85 | self.db_path.unlink() 86 | mw = self.get_mw(dir=self.temp_dir, reset=False) 87 | assert not hasattr(self.mwcls, "db") 88 | mw.spider_opened(self.spider) 89 | assert self.temp_dir.is_dir() 90 | assert self.db_path.exists() 91 | assert hasattr(mw, "db") 92 | assert mw.db.keys() == [] 93 | 94 | def test_spider_opened_existing(self): 95 | """Middleware should open and use existing and valid .db files.""" 96 | self._create_test_db() 97 | mw = self.get_mw(dir=self.temp_dir, reset=False) 98 | assert not hasattr(self.mwcls, "db") 99 | mw.spider_opened(self.spider) 100 | assert hasattr(mw, "db") 101 | for k, v in [(b"test_key_1", b"test_v_1"), (b"test_key_2", b"test_v_2")]: 102 | assert mw.db.get(k) == v 103 | 104 | def test_spider_opened_corrupt_dbfile(self): 105 | """Middleware should create a new .db if it cannot open it.""" 106 | # create an invalid .db file 107 | with self.db_path.open("wb") as dbfile: 108 | dbfile.write(b"bad") 109 | mw = self.get_mw(dir=self.temp_dir, reset=False) 110 | assert not hasattr(self.mwcls, "db") 111 | 112 | # file corruption is only detected when opening spider 113 | mw.spider_opened(self.spider) 114 | assert Path(self.temp_dir).is_dir() 115 | assert Path(self.db_path).exists() 116 | assert hasattr(mw, "db") 117 | 118 | # and db should be empty (it was re-created) 119 | assert mw.db.keys() == [] 120 | 121 | def test_spider_opened_existing_spider_reset(self): 122 | self._create_test_db() 123 | mw = self.get_mw(self.temp_dir, reset=False) 124 | assert not hasattr(self.mwcls, "db") 125 | self.spider.deltafetch_reset = True 126 | mw.spider_opened(self.spider) 127 | assert mw.db.keys() == [] 128 | 129 | def test_spider_opened_reset_non_existing_db(self): 130 | mw = self.get_mw(dir=self.temp_dir, reset=True) 131 | assert not hasattr(self.mwcls, "db") 132 | self.spider.deltafetch_reset = True 133 | mw.spider_opened(self.spider) 134 | assert mw.db.get(b"random") is None 135 | 136 | def test_spider_opened_recreate(self): 137 | self._create_test_db() 138 | mw = self.get_mw(dir=self.temp_dir, reset=True) 139 | assert not hasattr(self.mwcls, "db") 140 | mw.spider_opened(self.spider) 141 | assert hasattr(mw, "db") 142 | assert mw.db.keys() == [] 143 | 144 | def test_spider_closed(self): 145 | self._create_test_db() 146 | mw = self.get_mw(dir=self.temp_dir, reset=True) 147 | mw.spider_opened(self.spider) 148 | assert mw.db.get("random") is None 149 | mw.spider_closed(self.spider) 150 | with pytest.raises(dbm.error): 151 | mw.db.get("radom") 152 | 153 | def test_process_spider_output(self): 154 | self._create_test_db() 155 | settings = { 156 | "DELTAFETCH_DIR": self.temp_dir, 157 | "DELTAFETCH_ENABLED": True, 158 | } 159 | crawler = get_crawler(Spider, settings_dict=settings) 160 | mw = self.mwcls.from_crawler(crawler) 161 | mw.spider_opened(self.spider) 162 | response = mock.Mock() 163 | response.request = Request("http://url", meta={"deltafetch_key": "key"}) 164 | result = [] 165 | 
assert not list(mw.process_spider_output(response, result, self.spider)) 166 | result = [ 167 | # same URL but with new key --> it should be processed 168 | Request("http://url", meta={"deltafetch_key": "key1"}), 169 | # 'test_key_1' is already in the test db --> it should be skipped 170 | Request("http://url1", meta={"deltafetch_key": "test_key_1"}), 171 | ] 172 | # so only the 1 request should go through 173 | assert list(mw.process_spider_output(response, result, self.spider)) == [ 174 | result[0] 175 | ] 176 | 177 | # the skipped "http://url1" should be counted in stats 178 | assert crawler.stats.get_stats() == {"deltafetch/skipped": 1} 179 | 180 | # b'key' should not be in the db yet as no item was collected yet 181 | assert set(mw.db.keys()) == {b"test_key_1", b"test_key_2"} 182 | 183 | # if the spider returns items, the request's key is added in db 184 | result = [Item(), "not a base item"] 185 | assert list(mw.process_spider_output(response, result, self.spider)) == result 186 | assert set(mw.db.keys()) == {b"key", b"test_key_1", b"test_key_2"} 187 | assert mw.db[b"key"] 188 | 189 | def test_process_spider_output_with_ignored_request(self): 190 | self._create_test_db() 191 | settings = { 192 | "DELTAFETCH_DIR": self.temp_dir, 193 | "DELTAFETCH_ENABLED": True, 194 | } 195 | crawler = get_crawler(Spider, settings_dict=settings) 196 | mw = self.mwcls.from_crawler(crawler) 197 | mw.spider_opened(self.spider) 198 | response = mock.Mock() 199 | response.request = Request("http://url") 200 | result = [] 201 | assert not list(mw.process_spider_output(response, result, self.spider)) 202 | result = [ 203 | Request("http://url1"), 204 | # 'url1' is already in the db, but deltafetch_enabled=False 205 | # flag is set, URL should be processed. 206 | Request("http://url1", meta={"deltafetch_enabled": False}), 207 | ] 208 | # so 2 requests should go through 209 | assert list(mw.process_spider_output(response, result, self.spider)) == [ 210 | result[0], 211 | result[1], 212 | ] 213 | 214 | def test_process_spider_output_dict(self): 215 | self._create_test_db() 216 | mw = self.get_mw(dir=self.temp_dir, reset=False) 217 | mw.spider_opened(self.spider) 218 | response = mock.Mock() 219 | response.request = Request("http://url", meta={"deltafetch_key": "key"}) 220 | result = [{"somekey": "somevalue"}] 221 | assert list(mw.process_spider_output(response, result, self.spider)) == result 222 | assert set(mw.db.keys()) == {b"key", b"test_key_1", b"test_key_2"} 223 | assert mw.db[b"key"] 224 | 225 | def test_process_spider_output_stats(self): 226 | self._create_test_db() 227 | mw = self.get_mw(dir=self.temp_dir) 228 | mw.spider_opened(self.spider) 229 | response = mock.Mock() 230 | response.request = Request("http://url", meta={"deltafetch_key": "key"}) 231 | result = [] 232 | assert not list(mw.process_spider_output(response, result, self.spider)) 233 | assert mw.stats.get_stats() == {} 234 | result = [ 235 | Request("http://url", meta={"deltafetch_key": "key"}), 236 | Request("http://url1", meta={"deltafetch_key": "test_key_1"}), 237 | ] 238 | assert list(mw.process_spider_output(response, result, self.spider)) == [ 239 | result[0] 240 | ] 241 | assert mw.stats.get_value("deltafetch/skipped") == 1 242 | 243 | @dataclass 244 | class TestItem: 245 | foo: str 246 | 247 | result = [Item(), TestItem("bar")] 248 | assert list(mw.process_spider_output(response, result, self.spider)) == result 249 | assert mw.stats.get_value("deltafetch/stored") == 2 250 | 251 | def test_init_from_crawler_legacy(self): 252 | # 
test with subclass not handling passed stats 253 | class LegacyDeltaFetchSubClass(self.mwcls): 254 | def __init__(self, dir, reset, *args, **kwargs): 255 | super().__init__(dir=dir, reset=reset) 256 | self.something = True 257 | 258 | crawler = mock.Mock() 259 | # void settings 260 | crawler.settings = Settings({}) 261 | with pytest.raises(NotConfigured): 262 | self.mwcls.from_crawler(crawler) 263 | 264 | with ( 265 | mock.patch("scrapy.utils.project.project_data_dir") as data_dir, 266 | mock.patch("scrapy.utils.project.inside_project") as in_project, 267 | ): 268 | data_dir.return_value = self.temp_dir 269 | in_project.return_value = True 270 | 271 | # simple project_data_dir mock with based settings 272 | crawler.settings = Settings({"DELTAFETCH_ENABLED": True}) 273 | instance = LegacyDeltaFetchSubClass.from_crawler(crawler) 274 | assert isinstance(instance, self.mwcls) 275 | assert instance.dir == str(Path(self.temp_dir) / "deltafetch") 276 | assert instance.reset is False 277 | 278 | # project_data_dir mock with advanced settings 279 | crawler.settings = Settings( 280 | { 281 | "DELTAFETCH_ENABLED": True, 282 | "DELTAFETCH_DIR": "other", 283 | "DELTAFETCH_RESET": True, 284 | } 285 | ) 286 | instance = LegacyDeltaFetchSubClass.from_crawler(crawler) 287 | assert isinstance(instance, self.mwcls) 288 | assert instance.dir == str(Path(self.temp_dir) / "other") 289 | assert instance.reset is True 290 | 291 | def test_process_spider_output_stats_legacy(self): 292 | # testing the subclass not handling stats works at runtime 293 | # (i.e. that trying to update stats does not trigger exception) 294 | class LegacyDeltaFetchSubClass(self.mwcls): 295 | def __init__(self, dir, *args, reset=False, **kwargs): 296 | super().__init__(dir=dir, reset=reset) 297 | self.something = True 298 | 299 | self._create_test_db() 300 | mw = self.get_mw(dir=self.temp_dir, reset=False, cls=LegacyDeltaFetchSubClass) 301 | mw.spider_opened(self.spider) 302 | response = mock.Mock() 303 | response.request = Request("http://url", meta={"deltafetch_key": "key"}) 304 | result = [] 305 | assert not list(mw.process_spider_output(response, result, self.spider)) 306 | assert mw.stats.get_stats() == {} 307 | result = [ 308 | Request("http://url", meta={"deltafetch_key": "key"}), 309 | Request("http://url1", meta={"deltafetch_key": "test_key_1"}), 310 | ] 311 | 312 | assert list(mw.process_spider_output(response, result, self.spider)) == [ 313 | result[0] 314 | ] 315 | assert mw.stats.get_value("deltafetch/skipped") == 1 316 | 317 | @dataclass 318 | class TestItem: 319 | foo: str 320 | 321 | result = [Item(), TestItem("bar")] 322 | assert list(mw.process_spider_output(response, result, self.spider)) == result 323 | assert mw.stats.get_value("deltafetch/stored") == 2 324 | 325 | def test_get_key(self): 326 | settings = { 327 | "DELTAFETCH_DIR": self.temp_dir, 328 | "DELTAFETCH_ENABLED": True, 329 | "DELTAFETCH_RESET": True, 330 | } 331 | crawler = get_crawler(Spider, settings_dict=settings) 332 | mw = self.mwcls.from_crawler(crawler) 333 | test_req1 = Request("http://url1") 334 | try: 335 | fingerprint = crawler.request_fingerprinter.fingerprint 336 | except AttributeError: # Scrapy < 2.7.0 337 | from scrapy.utils.request import request_fingerprint 338 | 339 | fingerprint = request_fingerprint 340 | assert mw._get_key(test_req1) == to_bytes(fingerprint(test_req1)) 341 | test_req2 = Request("http://url2", meta={"deltafetch_key": b"dfkey1"}) 342 | assert mw._get_key(test_req2) == b"dfkey1" 343 | 344 | test_req3 = 
Request("http://url2", meta={"deltafetch_key": "dfkey1"}) 345 | # key will be converted to bytes 346 | assert mw._get_key(test_req3) == b"dfkey1" 347 | 348 | def _create_test_db(self): 349 | # truncate test db if there were failed tests 350 | with dbm.open(str(self.db_path), "n") as db: 351 | db[b"test_key_1"] = b"test_v_1" 352 | db[b"test_key_2"] = b"test_v_2" 353 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = pre-commit,mypy,pylint,twinecheck,min,py39,py310,py311,py312,py313 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | pytest-cov 8 | pytest-benchmark 9 | commands = 10 | pytest \ 11 | --cov=scrapy_deltafetch \ 12 | --cov-config=pyproject.toml \ 13 | --cov-report=xml \ 14 | --cov-report= \ 15 | {posargs:scrapy_deltafetch tests} 16 | 17 | [testenv:min] 18 | basepython = python3.9 19 | deps = 20 | {[testenv]deps} 21 | scrapy==1.1.0 22 | 23 | [testenv:pre-commit] 24 | deps = 25 | pre-commit 26 | commands = 27 | pre-commit run {posargs:--all-files} 28 | 29 | [testenv:mypy] 30 | deps = 31 | {[testenv]deps} 32 | mypy==1.15.0 33 | commands = 34 | mypy {posargs:scrapy_deltafetch tests} 35 | 36 | # https://github.com/astral-sh/ruff/issues/970 37 | [testenv:pylint] 38 | deps = 39 | {[testenv]deps} 40 | pylint==3.3.4 41 | commands = 42 | pylint {posargs:scrapy_deltafetch tests} 43 | 44 | [testenv:twinecheck] 45 | deps = 46 | twine==6.1.0 47 | build==1.2.2.post1 48 | commands = 49 | python -m build --sdist 50 | twine check dist/* 51 | --------------------------------------------------------------------------------