├── .bandit.yml ├── .bumpversion.cfg ├── .cookiecutterrc ├── .coveragerc ├── .dockerignore ├── .editorconfig ├── .flake8 ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── builds.yml │ ├── checks.yml │ ├── docs.yml │ └── tests.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── .python-version ├── .readthedocs.yml ├── .travis.yml ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── Dockerfile ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── TODO.rst ├── VERSION ├── coverage.xml ├── docker-compose.yaml ├── docs ├── Makefile ├── authors.rst ├── conf.py ├── contributing.rst ├── history.rst ├── index.rst ├── installation.rst ├── make.bat ├── modules.rst ├── readme.rst ├── requirements.txt └── scrapy_redis.rst ├── example-project ├── Dockerfile ├── README.rst ├── docker-compose.yml ├── example │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── dmoz.py │ │ ├── mycrawler_redis.py │ │ └── myspider_redis.py ├── process_items.py ├── requirements.txt └── scrapy.cfg ├── pylintrc ├── pytest.ini ├── requirements-tests.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── src └── scrapy_redis │ ├── __init__.py │ ├── connection.py │ ├── defaults.py │ ├── dupefilter.py │ ├── picklecompat.py │ ├── pipelines.py │ ├── queue.py │ ├── scheduler.py │ ├── spiders.py │ ├── stats.py │ └── utils.py ├── tests ├── test_connection.py ├── test_dupefilter.py ├── test_package_import.py ├── test_picklecompat.py ├── test_queue.py ├── test_scrapy_redis.py ├── test_spiders.py └── test_utils.py └── tox.ini /.bandit.yml: -------------------------------------------------------------------------------- 1 | skips: 2 | - B101 3 | - B105 4 | - B301 5 | - B303 6 | - B306 7 | - B307 8 | - B311 9 | - B320 10 | - B321 11 | - B324 12 | - B403 13 | - B404 14 | - B406 15 | - B410 16 | - B503 17 | - B603 18 | - B605 -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.9.1 3 | commit = False 4 | tag = False 5 | parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P\w+))? 6 | serialize = 7 | {major}.{minor}.{patch}-{release} 8 | {major}.{minor}.{patch} 9 | 10 | [bumpversion:part:release] 11 | optional_value = placeholder 12 | values = 13 | a1 14 | b1 15 | rc1 16 | placeholder 17 | 18 | [bumpversion:file:VERSION] 19 | search = {current_version} 20 | replace = {new_version} 21 | 22 | [bumpversion:file:src/scrapy_redis/__init__.py] 23 | search = __version__ = "{current_version}" 24 | replace = __version__ = "{new_version}" 25 | 26 | [bumpversion:file:.cookiecutterrc] 27 | search = version: {current_version} 28 | replace = version: {new_version} 29 | 30 | [bumpversion:file:HISTORY.rst] 31 | search = .. bumpversion marker 32 | replace = .. bumpversion marker 33 | 34 | {new_version} ({now:%Y-%m-%d}) 35 | ------------------ 36 | -------------------------------------------------------------------------------- /.cookiecutterrc: -------------------------------------------------------------------------------- 1 | # Generated by cookiepatcher, a small shim around cookiecutter (pip install cookiepatcher) 2 | 3 | cookiecutter: 4 | email: rolando at rmax.io 5 | full_name: Rolando Espinoza 6 | github_username: rolando 7 | project_name: Scrapy-Redis 8 | project_package: scrapy_redis 9 | project_short_description: Redis-based components for Scrapy. 
10 | project_slug: scrapy-redis 11 | pypi_username: rolando 12 | use_codecov: y 13 | use_cython: n 14 | use_landscape: y 15 | use_pypi_deployment_with_travis: n 16 | use_pytest: y 17 | use_requiresio: y 18 | version: 0.9.1 19 | year: 2011-2022 20 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [paths] 2 | source = 3 | src 4 | 5 | [run] 6 | omit = setup.py 7 | branch = true 8 | source = 9 | scrapy_redis 10 | tests 11 | parallel = true 12 | 13 | [report] 14 | show_missing = true 15 | precision = 2 16 | omit = */__init__.py 17 | exclude_lines = 18 | pragma: no cover 19 | def __repr__ 20 | if self.debug: 21 | if settings.DEBUG 22 | raise AssertionError 23 | raise NotImplementedError 24 | if 0: 25 | if __name__ == .__main__.: 26 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.swp 3 | *~ 4 | 5 | .ropeproject 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Packages 11 | *.egg 12 | *.egg-info 13 | dist 14 | build 15 | eggs 16 | parts 17 | bin 18 | var 19 | sdist 20 | develop-eggs 21 | .installed.cfg 22 | lib 23 | lib64 24 | __pycache__ 25 | 26 | # Installer logs 27 | pip-log.txt 28 | 29 | # Unit test / coverage reports 30 | .coverage 31 | .tox 32 | nosetests.xml 33 | 34 | # Translations 35 | *.mo 36 | 37 | # Mr Developer 38 | .mr.developer.cfg 39 | .project 40 | .pydevproject 41 | 42 | # JetBrains PyCharm IDE 43 | /.idea/ 44 | 45 | .venv 46 | .tags 47 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | 2 | [flake8] 3 | 4 | max-line-length = 119 5 | ignore = 6 | W503 7 | P102 8 | P103 9 | 10 | exclude = 11 | tests/test_spiders.py E731 12 | docs/conf.py E265 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # GitHub syntax highlighting 2 | pixi.lock linguist-language=YAML 3 | 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please describe your problem/feature request/bug 4 | 5 | # Step to Reproduce 6 | 7 | Please offer the steps to reproduce your problem/bug 8 | 9 | # Error log 10 | 11 | Please provide error message or screen shot for better understanding. 12 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include a summary of the changes and the related issue. 
Please also include relevant motivation and context. List any dependencies that are required for this change. 4 | 5 | Fixes #(issue) 6 | 7 | # How Has This Been Tested? 8 | 9 | Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration 10 | - [] pytest 11 | - [] Other test (please specify) 12 | 13 | # Test Configuration: 14 | - OS version: 15 | - Necessary Libraries (optional): 16 | 17 | # Checklist: 18 | - [] My code follows the style guidelines of this project 19 | - [] I have performed a self-review of my code 20 | - [] I have commented my code, particularly in hard-to-understand areas 21 | - [] I have made corresponding changes to the documentation 22 | - [] My changes generate no new warnings 23 | - [] I have added tests that prove my fix is effective or that my feature works 24 | - [] New and existing unit tests pass locally with my changes 25 | - [] Any dependent changes have been merged and published in downstream modules 26 | -------------------------------------------------------------------------------- /.github/workflows/builds.yml: -------------------------------------------------------------------------------- 1 | # This is GitHub Action for cross platform building 2 | name: build 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | 9 | jobs: 10 | builds: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | os: [ubuntu-latest, macos-latest, windows-latest] 16 | python-version: ["3.12"] 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Run build 27 | env: 28 | TOXENV: build 29 | run: | 30 | pip install -r requirements-tests.txt 31 | tox 32 | -------------------------------------------------------------------------------- /.github/workflows/checks.yml: -------------------------------------------------------------------------------- 1 | # This is GitHub Action for linting and security check 2 | name: check 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | 9 | concurrency: 10 | group: ${{github.workflow}}-${{ github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | checks: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.12"] 20 | env: [security, flake8] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Run check 31 | env: 32 | TOXENV: ${{ matrix.env }} 33 | run: | 34 | pip install -r requirements-tests.txt 35 | tox 36 | 37 | pre-commit: 38 | runs-on: ubuntu-latest 39 | steps: 40 | - uses: actions/checkout@v4 41 | - uses: pre-commit/action@v3.0.0 42 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | # This is GitHub Action for cross platform building 2 | name: docs 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | 9 | jobs: 10 | builds: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: ["3.12"] 16 | 17 | steps: 18 | - uses: 
actions/checkout@v4 19 | 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Build docs 26 | env: 27 | TOXENV: docs 28 | run: | 29 | pip install -r requirements-tests.txt 30 | tox 31 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This is GitHub Action for tests 2 | name: test 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | 9 | jobs: 10 | tests: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: ["3.12"] 16 | 17 | services: 18 | redis: 19 | image: redis 20 | options: >- 21 | --health-cmd "redis-cli ping" 22 | --health-interval 10s 23 | --health-timeout 5s 24 | --health-retries 5 25 | 26 | container: python:${{ matrix.python-version }} 27 | 28 | steps: 29 | - uses: actions/checkout@v4 30 | 31 | - name: Set up Python ${{ matrix.python-version }} 32 | uses: actions/setup-python@v5 33 | with: 34 | python-version: ${{ matrix.python-version }} 35 | 36 | - name: Run pytest 37 | env: 38 | REDIS_HOST: redis 39 | TOXENV: pytest 40 | TOX_TESTENV_PASSENV: REDIS_HOST 41 | run: | 42 | pip install -r requirements-tests.txt 43 | tox 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | .venv 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # PyBuilder 60 | target/ 61 | 62 | # rope-vim 63 | .ropeproject 64 | 65 | # Extra 66 | .DS_Store 67 | .vscode 68 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile = black 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/bandit 3 | rev: 1.7.7 4 | hooks: 5 | - id: bandit 6 | args: [-r, -c, .bandit.yml] 7 | - repo: https://github.com/PyCQA/flake8 8 | rev: 7.0.0 9 | hooks: 10 | - id: flake8 11 | additional_dependencies: 12 | - flake8-bugbear 13 | - flake8-comprehensions 14 | - flake8-debugger 15 | #- flake8-docstrings 16 | - flake8-string-format 17 | - flake8-type-checking 18 | - repo: https://github.com/psf/black.git 19 | rev: 24.2.0 20 | hooks: 21 | - id: black 22 | - repo: https://github.com/pycqa/isort 23 | rev: 5.13.2 24 | hooks: 25 | - id: isort 26 | - repo: https://github.com/adamchainz/blacken-docs 27 | rev: 1.16.0 28 | hooks: 29 | - id: blacken-docs 30 | additional_dependencies: 31 | - black==24.2.0 32 | - repo: https://github.com/asottile/pyupgrade 33 | rev: v3.15.2 34 | hooks: 35 | - id: pyupgrade 36 | args: [--py38-plus, --keep-runtime-typing] 37 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10.13 2 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: all 3 | sphinx: 4 | configuration: docs/conf.py 5 | fail_on_warning: true 6 | 7 | build: 8 | os: ubuntu-22.04 9 | tools: 10 | # For available versions, see: 11 | # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python 12 | python: "3.12" 13 | 14 | python: 15 | install: 16 | - requirements: docs/requirements.txt 17 | - path: . 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3.5 3 | sudo: false 4 | 5 | services: 6 | - redis-server 7 | 8 | env: 9 | - TOXENV=py27-scrapyrel 10 | - TOXENV=py34-scrapyrel 11 | - TOXENV=py35-scrapyrel 12 | 13 | matrix: 14 | fast_finish: true 15 | 16 | before_install: 17 | - python --version 18 | - uname -a 19 | - lsb_release -a 20 | 21 | # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors 22 | install: 23 | - pip install -U pip wheel 24 | - pip install -U tox twine coverage 25 | - virtualenv --version 26 | - pip --version 27 | - tox --version 28 | 29 | # command to run tests, e.g. 
python setup.py test 30 | script: 31 | - tox -e $TOXENV --workdir $HOME/.tox 32 | 33 | after_success: 34 | # Codecov requires a single .coverage and will run 'coverage xml' to 35 | # generate the report. 36 | - coverage combine 37 | - bash <(curl -s https://codecov.io/bash) 38 | 39 | after_failure: 40 | - more $HOME/.tox/log/* | cat 41 | - more $HOME/.tox/*/log/* | cat 42 | 43 | before_cache: 44 | - rm -fr $HOME/.cache/pip/log 45 | - rm -fr $HOME/.tox/log/* 46 | - rm -fr $HOME/.tox/*/log/* 47 | 48 | cache: 49 | directories: 50 | - $HOME/.cache/pip 51 | - $HOME/.tox/ 52 | 53 | notifications: 54 | email: 55 | on_success: never 56 | on_failure: always 57 | 58 | deploy: 59 | provider: pypi 60 | distributions: "sdist bdist_wheel" 61 | user: darkrho 62 | password: 63 | secure: "Pgcj+Otx9o2MxOuXibvz9LUd5DqlW0jaKDScVOAcFT+//U0esjRqY08bRFQlrSTXokJa6X/dVZlb2mQE8L4vr7mLFspRGO4FByK34L089/ETwsLKI2rks2zVbmPSyweL3sz88EXLKmYs7WsKtCnET67qra6hreKbO67ALAh5WWk=" 64 | on: 65 | tags: true 66 | all_branches: true 67 | repo: rolando/scrapy-redis 68 | condition: "$TOXENV == py35-scrapyrel" 69 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * R Max Espinoza 9 | 10 | Contributors 11 | ------------ 12 | 13 | None yet. Why not be the first? 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contribution 5 | ============ 6 | 7 | Contributions are welcome, and they are greatly appreciated! Every 8 | little bit helps, and credit will always be given. 9 | 10 | You can contribute in many ways: 11 | 12 | Types of Contributions 13 | ---------------------- 14 | 15 | New here 16 | ~~~~~~~~~~~ 17 | 18 | Any issue with the good first issue tag on it is a great place to start! Feel free to ask any questions there. 19 | 20 | Don't know how to start 21 | ~~~~~~~~~~~~~~~~~~~~~~~ 22 | 23 | Reviewing the codebase and PRs can give you a good sense of what's going on here! 24 | 25 | Report Bugs 26 | ~~~~~~~~~~~ 27 | 28 | Report bugs at https://github.com/rmax/scrapy-redis/issues. 29 | 30 | If you are reporting a bug, please include: 31 | 32 | * Your operating system name and version. 33 | * Any details about your local setup that might be helpful in troubleshooting. 34 | * Detailed steps to reproduce the bug. 35 | 36 | Fix Bugs 37 | ~~~~~~~~ 38 | 39 | Look through the GitHub issues for bugs. Anything tagged with "bug" 40 | is open to whoever wants to implement it. 41 | 42 | Implement Features & Improvements 43 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 44 | 45 | Look through the GitHub issues for features. Anything tagged with "feature" or "improvements" 46 | is open to whoever wants to implement it. 47 | 48 | Write Documentation 49 | ~~~~~~~~~~~~~~~~~~~ 50 | 51 | Scrapy-Redis could always use more documentation, whether as part of the 52 | official Scrapy-Redis docs, in docstrings, or even on the web in blog posts, 53 | articles, and such. 54 | 55 | Submit Feedback 56 | ~~~~~~~~~~~~~~~ 57 | 58 | The best way to send feedback is to file an issue at https://github.com/rmax/scrapy-redis/issues. 59 | 60 | If you are proposing a feature: 61 | 62 | * Explain in detail how it would work.
63 | * Keep the scope as narrow as possible, to make it easier to implement. 64 | * Remember that this is a volunteer-driven project, and that contributions 65 | are welcome :) 66 | 67 | Get Started! 68 | ------------ 69 | 70 | Ready to contribute? Here's how to set up `scrapy-redis` for local development. 71 | 72 | Setup environment 73 | ~~~~~~~~~~~~~~~~~ 74 | 75 | 1. Fork the `scrapy-redis` repo on GitHub. 76 | 2. Clone your fork locally:: 77 | 78 | git clone git@github.com:your_name_here/scrapy-redis.git 79 | 80 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 81 | 82 | pip install virtualenv==20.0.23 83 | virtualenv --python=/usr/bin/python3 ~/scrapy_redis 84 | source ~/scrapy_redis/bin/activate 85 | cd scrapy-redis/ 86 | pip install -r requirements-install.txt 87 | pip install . 88 | 89 | 4. Create a branch for local development:: 90 | 91 | git checkout -b name-of-your-bugfix-or-feature 92 | 93 | Now you can make your changes locally. 94 | 95 | Setup testing environment 96 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 97 | 98 | 1. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: 99 | 100 | pip install -r requirements-tests.txt 101 | flake8 src/ tests/ 102 | python -m pytest --ignore=setup.py 103 | tox 104 | 105 | 2. Note that if the error of `No module named scrapy_redis` shows, please check the install `scrapy-redis` of your branch by:: 106 | 107 | pip install . 108 | 109 | 3. Or change the import lines:: 110 | 111 | from scrapy_redis import xxx # from this 112 | from src.scrapy_redis import xxx # to this 113 | 114 | 4. Commit your changes and push your branch to GitHub:: 115 | 116 | git add . 117 | git commit -m "Your detailed description of your changes." 118 | git push origin name-of-your-bugfix-or-feature 119 | 120 | 5. Submit a pull request through the GitHub website. 121 | 122 | Pull Request Guidelines 123 | ----------------------- 124 | 125 | Before you submit a pull request, check that it meets these guidelines: 126 | 127 | 1. The pull request should include tests. 128 | 2. If the pull request adds functionality, the docs should be updated. Put 129 | your new functionality into a function with a docstring, and add the 130 | feature to the list in README.rst. 131 | 3. Make sure that the tests pass for all supported Python versions. 132 | 133 | Tips 134 | ---- 135 | 136 | To run a subset of tests:: 137 | 138 | pytest tests/test_scrapy_redis 139 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | # Set working directory 4 | WORKDIR /app 5 | 6 | # Install tox and dependencies (replace 'your-requirements.txt' with your actual file) 7 | COPY requirements.txt . 8 | COPY requirements-tests.txt . 9 | RUN pip install -r requirements.txt -r requirements-tests.txt 10 | 11 | # Copy your project code 12 | COPY . . 13 | 14 | # Run Tox tests 15 | CMD ["tox"] 16 | 17 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | .. bumpversion marker 6 | 7 | 0.9.1 (2024-07-06) 8 | ------------------ 9 | * Fixed docs build. 
10 | 11 | 0.9.0 (2024-07-06) 12 | ------------------ 13 | * Fixed ``Scheduler`` not compatible with BaseDupeFilter (#294) 14 | * Added precommit hooks. 15 | * Switched to Python 3.12 as default build version. 16 | 17 | 0.8.0 (2024-07-03) 18 | ------------------ 19 | * Fixed request fingerprint method. 20 | * Fixed support for Scrapy 2.6+. 21 | * Fixed tox tests and github workflow. 22 | * Deprecated ``REDIS_START_URLS_BATCH_SIZE``. 23 | 24 | 0.7.3 (2022-07-21) 25 | ------------------ 26 | * Move docs to GitHub Wiki 27 | * Update tox and support dynamic tests 28 | * Update support for json data 29 | * Refactor max idle time 30 | * Add support for python3.7~python3.10 31 | * Deprecate python2.x support 32 | 33 | 0.7.2 (2021-12-27) 34 | ------------------ 35 | * Fix RedisStatsCollector._get_key() 36 | * Fix redis-py dependency version 37 | * Added maximum idle waiting time MAX_IDLE_TIME_BEFORE_CLOSE 38 | 39 | 0.7.1 (2021-03-27) 40 | ------------------ 41 | * Fixes datetime parse error for redis-py 3.x. 42 | * Add support for stats extensions. 43 | 44 | 0.7.1-rc1 (2021-03-27) 45 | ---------------------- 46 | * Fixes datetime parse error for redis-py 3.x. 47 | 48 | 0.7.1-b1 (2021-03-22) 49 | --------------------- 50 | * Add support for stats extensions. 51 | 52 | 0.7.0-dev (unreleased) 53 | ---------------------- 54 | * Unreleased. 55 | 56 | 0.6.8 (2017-02-14) 57 | ------------------ 58 | * Fixed automated release due to not matching registered email. 59 | 60 | 0.6.7 (2016-12-27) 61 | ------------------ 62 | * Fixes bad formatting in logging message. 63 | 64 | 0.6.6 (2016-12-20) 65 | ------------------ 66 | * Fixes wrong message on dupefilter duplicates. 67 | 68 | 0.6.5 (2016-12-19) 69 | ------------------ 70 | * Fixed typo in default settings. 71 | 72 | 0.6.4 (2016-12-18) 73 | ------------------ 74 | * Fixed data decoding in Python 3.x. 75 | * Added ``REDIS_ENCODING`` setting (default ``utf-8``). 76 | * Default to ``CONCURRENT_REQUESTS`` value for ``REDIS_START_URLS_BATCH_SIZE``. 77 | * Renamed queue classes to a proper naming conventiong (backwards compatible). 78 | 79 | 0.6.3 (2016-07-03) 80 | ------------------ 81 | * Added ``REDIS_START_URLS_KEY`` setting. 82 | * Fixed spider method ``from_crawler`` signature. 83 | 84 | 0.6.2 (2016-06-26) 85 | ------------------ 86 | * Support ``redis_cls`` parameter in ``REDIS_PARAMS`` setting. 87 | * Python 3.x compatibility fixed. 88 | * Added ``SCHEDULER_SERIALIZER`` setting. 89 | 90 | 0.6.1 (2016-06-25) 91 | ------------------ 92 | * **Backwards incompatible change:** Require explicit ``DUPEFILTER_CLASS`` 93 | setting. 94 | * Added ``SCHEDULER_FLUSH_ON_START`` setting. 95 | * Added ``REDIS_START_URLS_AS_SET`` setting. 96 | * Added ``REDIS_ITEMS_KEY`` setting. 97 | * Added ``REDIS_ITEMS_SERIALIZER`` setting. 98 | * Added ``REDIS_PARAMS`` setting. 99 | * Added ``REDIS_START_URLS_BATCH_SIZE`` spider attribute to read start urls 100 | in batches. 101 | * Added ``RedisCrawlSpider``. 102 | 103 | 0.6.0 (2015-07-05) 104 | ------------------ 105 | * Updated code to be compatible with Scrapy 1.0. 106 | * Added `-a domain=...` option for example spiders. 107 | 108 | 0.5.0 (2013-09-02) 109 | ------------------ 110 | * Added `REDIS_URL` setting to support Redis connection string. 111 | * Added `SCHEDULER_IDLE_BEFORE_CLOSE` setting to prevent the spider closing too 112 | quickly when the queue is empty. Default value is zero keeping the previous 113 | behavior. 114 | * Schedule preemptively requests on item scraped. 
115 | * This version is the latest release compatible with Scrapy 0.24.x. 116 | 117 | 0.4.0 (2013-04-19) 118 | ------------------ 119 | * Added `RedisSpider` and `RedisMixin` classes as building blocks for spiders 120 | to be fed through a redis queue. 121 | * Added redis queue stats. 122 | * Let the encoder handle the item as it comes instead converting it to a dict. 123 | 124 | 0.3.0 (2013-02-18) 125 | ------------------ 126 | * Added support for different queue classes. 127 | * Changed requests serialization from `marshal` to `cPickle`. 128 | 129 | 0.2.0 (2013-02-17) 130 | ------------------ 131 | * Improved backward compatibility. 132 | * Added example project. 133 | 134 | 0.1.0 (2011-09-01) 135 | ------------------ 136 | * First release on PyPI. 137 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2024, R Max Espinoza 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft docs 2 | graft src 3 | graft tests 4 | graft example-project 5 | 6 | include *.in 7 | include *.ini 8 | include *.rst 9 | include *.txt 10 | 11 | include LICENSE 12 | include VERSION 13 | include Makefile 14 | 15 | global-exclude __pycache__ *.py[cod] 16 | global-exclude *.so *.dylib 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-so clean-test clean-pyc clean-build clean-docs clean 2 | .PHONY: docs check check-manifest check-setup check-history lint 3 | .PHONY: test test-all coverage 4 | .PHONY: compile-reqs install-reqs 5 | .PHONY: release dist install build-inplace 6 | define BROWSER_PYSCRIPT 7 | import os, webbrowser, sys 8 | FAIL = "\033[91m" 9 | ENDC = "\033[0m" 10 | 11 | try: 12 | from urllib.request import pathname2url 13 | except: 14 | print(FAIL + "Python2 is deprecated, please upgrade your python >= 3.7" + ENDC) 15 | 16 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 17 | endef 18 | export BROWSER_PYSCRIPT 19 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 20 | 21 | SPHINX_BUILD := html 22 | 23 | help: 24 | @echo "check - check setup, code style, setup, etc" 25 | @echo "check-manifest - check manifest" 26 | @echo "check-setup - check setup" 27 | @echo "check-history - check history" 28 | @echo "clean - remove all build, test, coverage and Python artifacts" 29 | @echo "clean-build - remove build artifacts" 30 | @echo "clean-docs - remove docs artifacts" 31 | @echo "clean-pyc - remove Python file artifacts" 32 | @echo "clean-test - remove test and coverage artifacts" 33 | @echo "clean-so - remove compiled extensions" 34 | @echo "lint - check style with flake8" 35 | @echo "test - run tests quickly with the default Python" 36 | @echo "test-all - run tests on every Python version with tox" 37 | @echo "coverage - check code coverage quickly with the default Python" 38 | @echo "compile-reqs - compile requirements" 39 | @echo "install-reqs - install requirements" 40 | @echo "docs - generate Sphinx HTML documentation, including API docs" 41 | @echo "dist-upload - package and upload a release" 42 | @echo "release - bump release and push changes" 43 | @echo "dist - package" 44 | @echo "develop - install package in develop mode" 45 | @echo "install - install the package to the active Python's site-packages" 46 | 47 | check: check-setup check-manifest check-history lint 48 | 49 | check-setup: 50 | @echo "Checking package metadata (name, description, etc)" 51 | python setup.py check --strict --metadata --restructuredtext 52 | 53 | check-manifest: 54 | @echo "Checking MANIFEST.in" 55 | check-manifest --ignore ".*" 56 | 57 | check-history: 58 | @echo "Checking latest version in HISTORY" 59 | VERSION=`cat VERSION`; grep "^$${VERSION}\b" HISTORY.rst 60 | 61 | clean: clean-build clean-docs clean-pyc clean-test clean-so 62 | 63 | clean-build: 64 | rm -fr build/ 65 | rm -fr dist/ 66 | rm -fr .eggs/ 67 | find . -name '*.egg-info' -exec rm -fr {} + 68 | find . -name '*.egg' -exec rm -f {} + 69 | 70 | clean-docs: 71 | $(MAKE) -C docs clean 72 | 73 | clean-pyc: 74 | find . -name '*.pyc' -exec rm -f {} + 75 | find . -name '*.pyo' -exec rm -f {} + 76 | find . -name '*~' -exec rm -f {} + 77 | find . 
-name '__pycache__' -exec rm -fr {} + 78 | 79 | clean-test: 80 | rm -fr .tox/ 81 | rm -f .coverage 82 | rm -fr htmlcov/ 83 | 84 | clean-so: 85 | find . -name '*.so' -exec rm -f {} + 86 | 87 | lint: 88 | flake8 src tests 89 | 90 | build-inplace: 91 | python setup.py build_ext --inplace 92 | 93 | develop: clean 94 | pip install -e . 95 | 96 | test: develop 97 | pytest --ignore=setup.py 98 | 99 | test-all: 100 | tox -v 101 | 102 | coverage: develop 103 | coverage run -m pytest --ignore=setup.py 104 | coverage combine 105 | coverage report 106 | coverage html 107 | $(BROWSER) htmlcov/index.html 108 | 109 | docs-build: develop 110 | rm -f docs/scrapy_redis.rst 111 | rm -f docs/modules.rst 112 | sphinx-apidoc -o docs/ src/scrapy_redis 113 | $(MAKE) -C docs clean 114 | $(MAKE) -C docs $(SPHINX_BUILD) 115 | 116 | docs: docs-build 117 | $(BROWSER) docs/_build/$(SPHINX_BUILD)/index.html 118 | 119 | servedocs: docs 120 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 121 | 122 | release: 123 | @echo "To do a release, follow the steps:" 124 | @echo "- bumpversion release" 125 | @echo "- Review and commit" 126 | @echo "- git tag -a \`cat VERSION\`" 127 | @echo "- git push --follow-tags" 128 | 129 | dist-upload: clean check dist 130 | twine upload dist/* 131 | 132 | dist: clean 133 | python setup.py sdist 134 | python setup.py bdist_wheel 135 | ls -l dist 136 | 137 | install: clean 138 | pip install . 139 | 140 | REQUIREMENTS_IN := $(wildcard requirements*.in) 141 | .PHONY: $(REQUIREMENTS_IN) 142 | 143 | requirements%.txt: requirements%.in 144 | pip-compile -v $< -o $@ 145 | 146 | REQUIREMENTS_TXT := $(REQUIREMENTS_IN:.in=.txt) 147 | ifndef REQUIREMENTS_TXT 148 | REQUIREMENTS_TXT := $(wildcard requirements*.txt) 149 | endif 150 | 151 | compile-reqs: $(REQUIREMENTS_TXT) 152 | @test -z "$$REQUIREMENTS_TXT" && echo "No 'requirements*.in' files. Nothing to do" 153 | 154 | install-reqs: 155 | @test -z "$$REQUIREMENTS_TXT" && echo "No 'requirements*.txt' files. Nothing to do" 156 | $(foreach req,$(REQUIREMENTS_TXT),pip install -r $(req);) 157 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Scrapy-Redis 3 | ============ 4 | 5 | .. image:: https://readthedocs.org/projects/scrapy-redis/badge/?version=latest 6 | :alt: Documentation Status 7 | :target: https://readthedocs.org/projects/scrapy-redis/?badge=latest 8 | 9 | .. image:: https://img.shields.io/pypi/v/scrapy-redis.svg 10 | :target: https://pypi.python.org/pypi/scrapy-redis 11 | 12 | .. image:: https://img.shields.io/pypi/pyversions/scrapy-redis.svg 13 | :target: https://pypi.python.org/pypi/scrapy-redis 14 | 15 | .. image:: https://github.com/rmax/scrapy-redis/actions/workflows/builds.yml/badge.svg 16 | :target: https://github.com/rmax/scrapy-redis/actions/workflows/builds.yml 17 | 18 | .. image:: https://github.com/rmax/scrapy-redis/actions/workflows/checks.yml/badge.svg 19 | :target: https://github.com/rmax/scrapy-redis/actions/workflows/checks.yml 20 | 21 | .. image:: https://github.com/rmax/scrapy-redis/actions/workflows/tests.yml/badge.svg 22 | :target: https://github.com/rmax/scrapy-redis/actions/workflows/tests.yml 23 | 24 | .. image:: https://codecov.io/github/rmax/scrapy-redis/coverage.svg?branch=master 25 | :alt: Coverage Status 26 | :target: https://codecov.io/github/rmax/scrapy-redis 27 | 28 | .. 
image:: https://img.shields.io/badge/security-bandit-green.svg 29 | :alt: Security Status 30 | :target: https://github.com/rmax/scrapy-redis 31 | 32 | Redis-based components for Scrapy. 33 | 34 | * Usage: https://github.com/rmax/scrapy-redis/wiki/Usage 35 | * Documentation: https://github.com/rmax/scrapy-redis/wiki 36 | * Release: https://github.com/rmax/scrapy-redis/wiki/History 37 | * Contribution: https://github.com/rmax/scrapy-redis/wiki/Getting-Started 38 | * LICENSE: MIT license 39 | 40 | Features 41 | -------- 42 | 43 | * Distributed crawling/scraping 44 | 45 | You can start multiple spider instances that share a single redis queue. 46 | Best suited for broad multi-domain crawls. 47 | 48 | * Distributed post-processing 49 | 50 | Scraped items get pushed into a redis queue, meaning that you can start as 51 | many post-processing processes as needed, all sharing the items queue. 52 | 53 | * Scrapy plug-and-play components 54 | 55 | Scheduler + Duplication Filter, Item Pipeline, Base Spiders. 56 | 57 | * In this forked version: added ``json`` supported data in Redis 58 | 59 | The data contains ``url``, ``meta`` and other optional parameters. ``meta`` is a nested JSON object which contains sub-data. 60 | This feature extracts the data and sends another FormRequest with ``url``, ``meta`` and additional ``formdata``. 61 | 62 | For example: 63 | 64 | .. code-block:: json 65 | 66 | { "url": "https://example.com", "meta": {"job-id":"123xsd", "start-date":"dd/mm/yy"}, "url_cookie_key":"fertxsas" } 67 | 68 | This data can be accessed in a `scrapy spider` through the response, 69 | e.g.: `request.url`, `request.meta`, `request.cookies` 70 | 71 | .. note:: These features cover the basic case of distributing the workload across multiple workers. If you need more features, such as URL expiration or advanced URL prioritization, we suggest you take a look at the Frontera_ project. 72 | 73 | Requirements 74 | ------------ 75 | 76 | * Python 3.7+ 77 | * Redis >= 5.0 78 | * ``Scrapy`` >= 2.0 79 | * ``redis-py`` >= 4.0 80 | 81 | Installation 82 | ------------ 83 | 84 | From pip 85 | 86 | .. code-block:: bash 87 | 88 | pip install scrapy-redis 89 | 90 | From GitHub 91 | 92 | .. code-block:: bash 93 | 94 | git clone https://github.com/darkrho/scrapy-redis.git 95 | cd scrapy-redis 96 | python setup.py install 97 | 98 | .. note:: To use the JSON-supported data feature, please make sure you have not installed scrapy-redis through pip. If you already did, uninstall it first. 99 | 100 | .. code-block:: bash 101 | 102 | pip uninstall scrapy-redis 103 | 104 | Alternative Choice 105 | --------------------------- 106 | 107 | Frontera_ is a web crawling framework consisting of a `crawl frontier`_ and distribution/scaling primitives, allowing you to build a large-scale online web crawler. 108 | 109 | .. _Frontera: https://github.com/scrapinghub/frontera 110 | .. _crawl frontier: http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html 111 | -------------------------------------------------------------------------------- /TODO.rst: -------------------------------------------------------------------------------- 1 | TODO 2 | ==== 3 | 4 | * Add SCRAPY_JOB global support (jobs sharing same SCRAPY_JOB share same queues). 5 | * Use a spider middleware instead of spider mixin. This will avoid the spider 6 | idle signal hack. 7 | * Allow to use pubsub whenever appropriate. 8 | * Move example project to its own repository. Include different crawling use 9 | cases (i.e.: producer/consumer).
10 | * Add pyrebloom dupefilter. 11 | * Warn and pass unserializable requests. -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.9.1 2 | -------------------------------------------------------------------------------- /coverage.xml: --------------------------------------------------------------------------------
-------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | python: 5 | build: . 6 | command: tox -e security,flake8,pytest 7 | environment: 8 | REDIS_HOST: redis # Use service name for hostname within docker network 9 | REDIS_PORT: 6379 10 | TOX_TESTENV_PASSENV: "REDIS_HOST REDIS_PORT" 11 | volumes: 12 | - ./:/app # Mount your project directory into the container 13 | depends_on: 14 | - redis 15 | 16 | redis: 17 | image: redis:6.2-alpine 18 | ports: 19 | - "6379:6379" # Map Redis port to host port 20 | 21 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/scrapy-redis.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/scrapy-redis.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/scrapy-redis" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/scrapy-redis" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # scrapy-redis documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Jul 9 22:26:36 2013. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 
8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import os 16 | import re 17 | 18 | # If extensions (or modules to document with autodoc) are in another 19 | # directory, add these directories to sys.path here. If the directory is 20 | # relative to the documentation root, use os.path.abspath to make it 21 | # absolute, like shown here. 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | # Get the project root dir, which is the parent dir of this 25 | project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 26 | 27 | # -- General configuration --------------------------------------------- 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 30 | # needs_sphinx = '1.0' 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 34 | extensions = [ 35 | "sphinx.ext.autodoc", 36 | "sphinx.ext.napoleon", 37 | "sphinx.ext.viewcode", 38 | ] 39 | 40 | # Add any paths that contain templates here, relative to this directory. 41 | templates_path = ["_templates"] 42 | 43 | # The suffix of source filenames. 44 | source_suffix = ".rst" 45 | 46 | # The encoding of source files. 47 | # source_encoding = 'utf-8-sig' 48 | 49 | # The master toctree document. 50 | master_doc = "index" 51 | 52 | # General information about the project. 53 | project = "Scrapy-Redis" 54 | copyright = "2011-2024, R Max Espinoza" 55 | 56 | # The version info for the project you're documenting, acts as replacement 57 | # for |version| and |release|, also used in various other places throughout 58 | # the built documents. 59 | # 60 | # The full version, including alpha/beta/rc tags. 61 | release = open(os.path.join(project_root, "VERSION")).read().strip() 62 | # The short X.Y version. 63 | version = re.findall(r"\d+\.\d+\.\d+", release)[0] 64 | 65 | # The language for content autogenerated by Sphinx. Refer to documentation 66 | # for a list of supported languages. 67 | # language = None 68 | 69 | # There are two options for replacing |today|: either, you set today to 70 | # some non-false value, then it is used: 71 | # today = '' 72 | # Else, today_fmt is used as the format for a strftime call. 73 | # today_fmt = '%B %d, %Y' 74 | 75 | # List of patterns, relative to source directory, that match files and 76 | # directories to ignore when looking for source files. 77 | exclude_patterns = ["_build"] 78 | 79 | # The reST default role (used for this markup: `text`) to use for all 80 | # documents. 81 | # default_role = None 82 | 83 | # If true, '()' will be appended to :func: etc. cross-reference text. 84 | # add_function_parentheses = True 85 | 86 | # If true, the current module name will be prepended to all description 87 | # unit titles (such as .. function::). 88 | # add_module_names = True 89 | 90 | # If true, sectionauthor and moduleauthor directives will be shown in the 91 | # output. They are ignored by default. 92 | # show_authors = False 93 | 94 | # The name of the Pygments (syntax highlighting) style to use. 95 | pygments_style = "sphinx" 96 | 97 | # A list of ignored prefixes for module index sorting. 98 | # modindex_common_prefix = [] 99 | 100 | # If true, keep warnings as "system message" paragraphs in the built 101 | # documents. 
102 | # keep_warnings = False 103 | 104 | 105 | # -- Options for HTML output ------------------------------------------- 106 | 107 | # The theme to use for HTML and HTML Help pages. See the documentation for 108 | # a list of builtin themes. 109 | html_theme = "default" 110 | 111 | # Theme options are theme-specific and customize the look and feel of a 112 | # theme further. For a list of options available for each theme, see the 113 | # documentation. 114 | # html_theme_options = {} 115 | 116 | # Add any paths that contain custom themes here, relative to this directory. 117 | # html_theme_path = [] 118 | 119 | # The name for this set of Sphinx documents. If None, it defaults to 120 | # " v documentation". 121 | # html_title = None 122 | 123 | # A shorter title for the navigation bar. Default is the same as 124 | # html_title. 125 | # html_short_title = None 126 | 127 | # The name of an image file (relative to this directory) to place at the 128 | # top of the sidebar. 129 | # html_logo = None 130 | 131 | # The name of an image file (within the static path) to use as favicon 132 | # of the docs. This file should be a Windows icon file (.ico) being 133 | # 16x16 or 32x32 pixels large. 134 | # html_favicon = None 135 | 136 | # Add any paths that contain custom static files (such as style sheets) 137 | # here, relative to this directory. They are copied after the builtin 138 | # static files, so a file named "default.css" will overwrite the builtin 139 | # "default.css". 140 | # html_static_path = ["_static"] 141 | 142 | # If not '', a 'Last updated on:' timestamp is inserted at every page 143 | # bottom, using the given strftime format. 144 | # html_last_updated_fmt = '%b %d, %Y' 145 | 146 | # If true, SmartyPants will be used to convert quotes and dashes to 147 | # typographically correct entities. 148 | # html_use_smartypants = True 149 | 150 | # Custom sidebar templates, maps document names to template names. 151 | # html_sidebars = {} 152 | 153 | # Additional templates that should be rendered to pages, maps page names 154 | # to template names. 155 | # html_additional_pages = {} 156 | 157 | # If false, no module index is generated. 158 | # html_domain_indices = True 159 | 160 | # If false, no index is generated. 161 | # html_use_index = True 162 | 163 | # If true, the index is split into individual pages for each letter. 164 | # html_split_index = False 165 | 166 | # If true, links to the reST sources are added to the pages. 167 | # html_show_sourcelink = True 168 | 169 | # If true, "Created using Sphinx" is shown in the HTML footer. 170 | # Default is True. 171 | # html_show_sphinx = True 172 | 173 | # If true, "(C) Copyright ..." is shown in the HTML footer. 174 | # Default is True. 175 | # html_show_copyright = True 176 | 177 | # If true, an OpenSearch description file will be output, and all pages 178 | # will contain a tag referring to it. The value of this option 179 | # must be the base URL from which the finished HTML is served. 180 | # html_use_opensearch = '' 181 | 182 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 183 | # html_file_suffix = None 184 | 185 | # Output file base name for HTML help builder. 186 | htmlhelp_basename = "scrapy_redisdoc" 187 | 188 | 189 | # -- Options for LaTeX output ------------------------------------------ 190 | 191 | latex_elements = { 192 | # The paper size ('letterpaper' or 'a4paper'). 193 | # 'papersize': 'letterpaper', 194 | # The font size ('10pt', '11pt' or '12pt'). 
195 | # 'pointsize': '10pt', 196 | # Additional stuff for the LaTeX preamble. 197 | # 'preamble': '', 198 | } 199 | 200 | # Grouping the document tree into LaTeX files. List of tuples 201 | # (source start file, target name, title, author, documentclass 202 | # [howto/manual]). 203 | latex_documents = [ 204 | ( 205 | "index", 206 | "scrapy_redis.tex", 207 | "Scrapy-Redis Documentation", 208 | "R Max Espinoza", 209 | "manual", 210 | ), 211 | ] 212 | 213 | # The name of an image file (relative to this directory) to place at 214 | # the top of the title page. 215 | # latex_logo = None 216 | 217 | # For "manual" documents, if this is true, then toplevel headings 218 | # are parts, not chapters. 219 | # latex_use_parts = False 220 | 221 | # If true, show page references after internal links. 222 | # latex_show_pagerefs = False 223 | 224 | # If true, show URL addresses after external links. 225 | # latex_show_urls = False 226 | 227 | # Documents to append as an appendix to all manuals. 228 | # latex_appendices = [] 229 | 230 | # If false, no module index is generated. 231 | # latex_domain_indices = True 232 | 233 | 234 | # -- Options for manual page output ------------------------------------ 235 | 236 | # One entry per manual page. List of tuples 237 | # (source start file, name, description, authors, manual section). 238 | man_pages = [ 239 | ("index", "scrapy_redis", "Scrapy-Redis Documentation", ["R Max Espinoza"], 1) 240 | ] 241 | 242 | # If true, show URL addresses after external links. 243 | # man_show_urls = False 244 | 245 | 246 | # -- Options for Texinfo output ---------------------------------------- 247 | 248 | # Grouping the document tree into Texinfo files. List of tuples 249 | # (source start file, target name, title, author, 250 | # dir menu entry, description, category) 251 | texinfo_documents = [ 252 | ( 253 | "index", 254 | "scrapy_redis", 255 | "Scrapy-Redis Documentation", 256 | "R Max Espinoza", 257 | "scrapy-redis", 258 | "One line description of project.", 259 | "Miscellaneous", 260 | ), 261 | ] 262 | 263 | # Documents to append as an appendix to all manuals. 264 | # texinfo_appendices = [] 265 | 266 | # If false, no module index is generated. 267 | # texinfo_domain_indices = True 268 | 269 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 270 | # texinfo_show_urls = 'footnote' 271 | 272 | # If true, do not generate a @detailmenu in the "Top" node's menu. 273 | # texinfo_no_detailmenu = False 274 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. scrapy-redis documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Scrapy-Redis's documentation! 7 | ======================================== 8 | 9 | Contents: 10 | 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | readme 15 | installation 16 | modules 17 | contributing 18 | history 19 | authors 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | 28 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | 8 | Stable release 9 | -------------- 10 | 11 | To install Scrapy-Redis, run this command in your terminal: 12 | 13 | .. code-block:: console 14 | 15 | pip install scrapy-redis 16 | 17 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide 18 | you through the process. 19 | 20 | .. _pip: https://pip.pypa.io 21 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ 22 | 23 | 24 | From sources 25 | ------------ 26 | 27 | The sources for Scrapy-Redis can be downloaded from the `Github repo`_. 28 | 29 | You can either clone the public repository: 30 | 31 | .. code-block:: console 32 | 33 | git clone git://github.com/rolando/scrapy-redis 34 | 35 | Or download the `tarball`_: 36 | 37 | .. code-block:: console 38 | 39 | curl -OL https://github.com/rolando/scrapy-redis/tarball/master 40 | 41 | Once you have a copy of the source, you can install it with: 42 | 43 | .. code-block:: console 44 | 45 | pip install -e . 46 | 47 | 48 | .. _Github repo: https://github.com/rolando/scrapy-redis 49 | .. _tarball: https://github.com/rolando/scrapy-redis/tarball/master 50 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. 
doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\scrapy-redis.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\scrapy-redis.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | scrapy_redis 8 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # This packages are requires only for development and release management. 
2 | Sphinx 3 | bumpversion 4 | check-manifest 5 | pip-tools 6 | twine 7 | watchdog 8 | wheel 9 | -------------------------------------------------------------------------------- /docs/scrapy_redis.rst: -------------------------------------------------------------------------------- 1 | scrapy_redis package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | scrapy_redis.connection module 8 | ------------------------------ 9 | 10 | .. automodule:: scrapy_redis.connection 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | scrapy_redis.dupefilter module 16 | ------------------------------ 17 | 18 | .. automodule:: scrapy_redis.dupefilter 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | scrapy_redis.pipelines module 24 | ----------------------------- 25 | 26 | .. automodule:: scrapy_redis.pipelines 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | scrapy_redis.queue module 32 | ------------------------- 33 | 34 | .. automodule:: scrapy_redis.queue 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | scrapy_redis.scheduler module 40 | ----------------------------- 41 | 42 | .. automodule:: scrapy_redis.scheduler 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | scrapy_redis.spiders module 48 | --------------------------- 49 | 50 | .. automodule:: scrapy_redis.spiders 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | 56 | Module contents 57 | --------------- 58 | 59 | .. automodule:: scrapy_redis 60 | :members: 61 | :undoc-members: 62 | :show-inheritance: 63 | -------------------------------------------------------------------------------- /example-project/Dockerfile: -------------------------------------------------------------------------------- 1 | #@IgnoreInspection BashAddShebang 2 | FROM python:2.7-onbuild 3 | 4 | ENTRYPOINT ["scrapy"] 5 | CMD ["crawl", "dmoz"] 6 | -------------------------------------------------------------------------------- /example-project/README.rst: -------------------------------------------------------------------------------- 1 | ============================ 2 | Scrapy Redis Example Project 3 | ============================ 4 | 5 | 6 | This directory contains an example Scrapy project integrated with scrapy-redis. 7 | By default, all items are sent to redis (key ``:items``). All spiders 8 | schedule requests through redis, so you can start additional spiders to speed 9 | up the crawling. 10 | 11 | Spiders 12 | ------- 13 | 14 | * **dmoz** 15 | 16 | This spider simply scrapes dmoz.org. 17 | 18 | * **myspider_redis** 19 | 20 | This spider uses redis as a shared requests queue and uses 21 | ``myspider:start_urls`` as start URLs seed. For each URL, the spider outputs 22 | one item. 23 | 24 | * **mycrawler_redis** 25 | 26 | This spider uses redis as a shared requests queue and uses 27 | ``mycrawler:start_urls`` as start URLs seed. For each URL, the spider follows 28 | are links. 29 | 30 | 31 | .. note:: 32 | 33 | All requests are persisted by default. You can clear the queue by using the 34 | ``SCHEDULER_FLUSH_ON_START`` setting. For example: ``scrapy crawl dmoz -s 35 | SCHEDULER_FLUSH_ON_START=1``. 36 | 37 | 38 | Running the example project 39 | --------------------------- 40 | 41 | This example illustrates how to share a spider's requests queue 42 | across multiple spider instances, highly suitable for broad crawls. 43 | 44 | 1. Check scrapy_redis package in your ``PYTHONPATH`` 45 | 46 | 2. 
Run the crawler for first time then stop it 47 | 48 | .. code-block:: bash 49 | 50 | cd example-project 51 | scrapy crawl dmoz 52 | ... [dmoz] ... 53 | ^C 54 | 55 | 3. Run the crawler again to resume stopped crawling 56 | 57 | .. code-block:: bash 58 | 59 | scrapy crawl dmoz 60 | ... [dmoz] DEBUG: Resuming crawl (9019 requests scheduled) 61 | 62 | 4. Start one or more additional scrapy crawlers 63 | 64 | .. code-block:: bash 65 | 66 | scrapy crawl dmoz 67 | ... [dmoz] DEBUG: Resuming crawl (8712 requests scheduled) 68 | 69 | 5. Start one or more post-processing workers 70 | 71 | .. code-block:: bash 72 | 73 | python process_items.py dmoz:items -v 74 | ... 75 | Processing: Kilani Giftware (http://www.dmoz.org/Computers/Shopping/Gifts/) 76 | Processing: NinjaGizmos.com (http://www.dmoz.org/Computers/Shopping/Gifts/) 77 | ... 78 | 79 | 80 | Feeding a Spider from Redis 81 | --------------------------- 82 | 83 | The class ``scrapy_redis.spiders.RedisSpider`` enables a spider to read the 84 | urls from redis. The urls in the redis queue will be processed one 85 | after another, if the first request yields more requests, the spider 86 | will process those requests before fetching another url from redis. 87 | 88 | For example, create a file ``myspider.py`` with the code below: 89 | 90 | .. code-block:: python 91 | 92 | from scrapy_redis.spiders import RedisSpider 93 | 94 | 95 | class MySpider(RedisSpider): 96 | name = "myspider" 97 | 98 | def parse(self, response): 99 | # do stuff 100 | pass 101 | 102 | 103 | Then: 104 | 105 | 1. run the spider 106 | 107 | .. code-block:: bash 108 | 109 | scrapy runspider myspider.py 110 | 111 | 2. push json data to redis 112 | 113 | .. code-block:: bash 114 | 115 | redis-cli lpush myspider '{"url": "https://exaple.com", "meta": {"job-id":"123xsd", "start-date":"dd/mm/yy"}, "url_cookie_key":"fertxsas" }' 116 | 117 | 118 | .. note:: 119 | 120 | * These spiders rely on the spider idle signal to fetch start urls, hence it 121 | may have a few seconds of delay between the time you push a new url and the 122 | spider starts crawling it. 123 | 124 | * Also please pay attention to json formatting. 125 | 126 | 127 | Processing items 128 | ---------------- 129 | 130 | The ``process_items.py`` provides an example of consuming the items queue:: 131 | 132 | .. code-block:: bash 133 | 134 | python process_items.py --help 135 | 136 | 137 | Run via Docker 138 | -------------- 139 | 140 | You require the following applications: 141 | 142 | * docker (https://docs.docker.com/installation/) 143 | * docker-compose (https://docs.docker.com/compose/install/) 144 | 145 | For implementation details see `Dockerfile` and `docker-compose.yml` and read 146 | official docker documentation. 147 | 148 | 1. To start sample `example-project` (`-d` for daemon):: 149 | 150 | docker-compose up 151 | 152 | 2. To scale `crawler` (4 instances for example):: 153 | 154 | docker-compose scale crawler=4 155 | -------------------------------------------------------------------------------- /example-project/docker-compose.yml: -------------------------------------------------------------------------------- 1 | redis: 2 | image: redis 3 | ports: 4 | - "6379:6379" # added port for external db provisioning 5 | 6 | crawler: 7 | build: . 
8 | links: 9 | - redis:localhost 10 | -------------------------------------------------------------------------------- /example-project/example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmax/scrapy-redis/c3064c2fa74e623bf14448d82cc07ca2da8e183d/example-project/example/__init__.py -------------------------------------------------------------------------------- /example-project/example/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/topics/items.html 5 | 6 | from scrapy.item import Field, Item 7 | from scrapy.loader import ItemLoader 8 | from scrapy.loader.processors import Join, MapCompose, TakeFirst 9 | 10 | 11 | class ExampleItem(Item): 12 | name = Field() 13 | description = Field() 14 | link = Field() 15 | crawled = Field() 16 | spider = Field() 17 | url = Field() 18 | 19 | 20 | class ExampleLoader(ItemLoader): 21 | default_item_class = ExampleItem 22 | default_input_processor = MapCompose(lambda s: s.strip()) 23 | default_output_processor = TakeFirst() 24 | description_out = Join() 25 | -------------------------------------------------------------------------------- /example-project/example/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/topics/item-pipeline.html 5 | from datetime import datetime 6 | 7 | 8 | class ExamplePipeline: 9 | def process_item(self, item, spider): 10 | item["crawled"] = datetime.utcnow() 11 | item["spider"] = spider.name 12 | return item 13 | -------------------------------------------------------------------------------- /example-project/example/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for example project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/topics/settings.html 7 | # 8 | SPIDER_MODULES = ["example.spiders"] 9 | NEWSPIDER_MODULE = "example.spiders" 10 | 11 | USER_AGENT = "scrapy-redis (+https://github.com/rolando/scrapy-redis)" 12 | 13 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 14 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 15 | SCHEDULER_PERSIST = True 16 | # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue" 17 | # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue" 18 | # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack" 19 | 20 | ITEM_PIPELINES = { 21 | "example.pipelines.ExamplePipeline": 300, 22 | "scrapy_redis.pipelines.RedisPipeline": 400, 23 | } 24 | 25 | LOG_LEVEL = "DEBUG" 26 | 27 | # Introduce an artifical delay to make use of parallelism. to speed up the 28 | # crawl. 
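# Each crawler process applies this delay on its own; because every process
# shares the same Redis queue, the usual way to speed up the crawl is to start
# more processes rather than to remove the delay.
# The example talks to a Redis server on localhost:6379 by default. To point it
# at a different server, scrapy-redis also reads REDIS_HOST/REDIS_PORT or
# REDIS_URL, for example (commented out; adjust to your environment):
# REDIS_URL = "redis://localhost:6379/0"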
29 | DOWNLOAD_DELAY = 1 30 | -------------------------------------------------------------------------------- /example-project/example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # To create the first spider for your project use this command: 4 | # 5 | # scrapy genspider myspider myspider-domain.com 6 | # 7 | # For more info see: 8 | # http://doc.scrapy.org/topics/spiders.html 9 | -------------------------------------------------------------------------------- /example-project/example/spiders/dmoz.py: -------------------------------------------------------------------------------- 1 | from scrapy.linkextractors import LinkExtractor 2 | from scrapy.spiders import CrawlSpider, Rule 3 | 4 | 5 | class DmozSpider(CrawlSpider): 6 | """Follow categories and extract links.""" 7 | 8 | name = "dmoz" 9 | allowed_domains = ["dmoz-odp.org"] 10 | start_urls = ["http://www.dmoz-odp.org/"] 11 | 12 | rules = [ 13 | Rule( 14 | LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")), 15 | callback="parse_directory", 16 | follow=True, 17 | ), 18 | ] 19 | 20 | def parse_directory(self, response): 21 | for div in response.css(".title-and-desc"): 22 | yield { 23 | "name": div.css(".site-title::text").extract_first(), 24 | "description": div.css(".site-descr::text").extract_first().strip(), 25 | "link": div.css("a::attr(href)").extract_first(), 26 | } 27 | -------------------------------------------------------------------------------- /example-project/example/spiders/mycrawler_redis.py: -------------------------------------------------------------------------------- 1 | from scrapy.linkextractors import LinkExtractor 2 | from scrapy.spiders import Rule 3 | 4 | from scrapy_redis.spiders import RedisCrawlSpider 5 | 6 | 7 | class MyCrawler(RedisCrawlSpider): 8 | """Spider that reads urls from redis queue (myspider:start_urls).""" 9 | 10 | name = "mycrawler_redis" 11 | redis_key = "mycrawler:start_urls" 12 | 13 | rules = ( 14 | # follow all links 15 | Rule(LinkExtractor(), callback="parse_page", follow=True), 16 | ) 17 | 18 | def __init__(self, *args, **kwargs): 19 | # Dynamically define the allowed domains list. 20 | domain = kwargs.pop("domain", "") 21 | self.allowed_domains = filter(None, domain.split(",")) 22 | super().__init__(*args, **kwargs) 23 | 24 | def parse_page(self, response): 25 | return { 26 | "name": response.css("title::text").extract_first(), 27 | "url": response.url, 28 | } 29 | -------------------------------------------------------------------------------- /example-project/example/spiders/myspider_redis.py: -------------------------------------------------------------------------------- 1 | from scrapy_redis.spiders import RedisSpider 2 | 3 | 4 | class MySpider(RedisSpider): 5 | """Spider that reads urls from redis queue (myspider:start_urls).""" 6 | 7 | name = "myspider_redis" 8 | redis_key = "myspider:start_urls" 9 | 10 | def __init__(self, *args, **kwargs): 11 | # Dynamically define the allowed domains list. 
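        # The "domain" value comes in as a regular Scrapy spider argument,
        # e.g. (hypothetical domains):
        #   scrapy crawl myspider_redis -a domain=example.com,example.org
        # If the argument is missing or empty, allowed_domains ends up empty
        # and no offsite filtering is applied.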
12 | domain = kwargs.pop("domain", "") 13 | self.allowed_domains = filter(None, domain.split(",")) 14 | super().__init__(*args, **kwargs) 15 | 16 | def parse(self, response): 17 | return { 18 | "name": response.css("title::text").extract_first(), 19 | "url": response.url, 20 | } 21 | -------------------------------------------------------------------------------- /example-project/process_items.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -*- coding: utf-8 -*- 4 | """A script to process items from a redis queue.""" 5 | 6 | import argparse 7 | import json 8 | import logging 9 | import pprint 10 | import sys 11 | import time 12 | 13 | from scrapy_redis import get_redis 14 | 15 | logger = logging.getLogger("process_items") 16 | 17 | 18 | def process_items(r, keys, timeout, limit=0, log_every=1000, wait=0.1): 19 | """Process items from a redis queue. 20 | 21 | Parameters 22 | ---------- 23 | r : Redis 24 | Redis connection instance. 25 | keys : list 26 | List of keys to read the items from. 27 | timeout: int 28 | Read timeout. 29 | 30 | """ 31 | limit = limit or float("inf") 32 | processed = 0 33 | while processed < limit: 34 | # Change ``blpop`` to ``brpop`` to process as LIFO. 35 | ret = r.blpop(keys, timeout) 36 | # If data is found before the timeout then we consider we are done. 37 | if ret is None: 38 | time.sleep(wait) 39 | continue 40 | 41 | source, data = ret 42 | try: 43 | item = json.loads(data) 44 | except Exception: 45 | logger.exception("Failed to load item:\n%r", pprint.pformat(data)) 46 | continue 47 | 48 | try: 49 | name = item.get("name") or item.get("title") 50 | url = item.get("url") or item.get("link") 51 | logger.debug("[%s] Processing item: %s <%s>", source, name, url) 52 | except KeyError: 53 | logger.exception( 54 | "[%s] Failed to process item:\n%r", source, pprint.pformat(item) 55 | ) 56 | continue 57 | 58 | processed += 1 59 | if processed % log_every == 0: 60 | logger.info("Processed %s items", processed) 61 | 62 | 63 | def main(): 64 | parser = argparse.ArgumentParser(description=__doc__) 65 | parser.add_argument("key", help="Redis key where items are stored") 66 | parser.add_argument("--host") 67 | parser.add_argument("--port") 68 | parser.add_argument("--timeout", type=int, default=5) 69 | parser.add_argument("--limit", type=int, default=0) 70 | parser.add_argument("--progress-every", type=int, default=100) 71 | parser.add_argument("-v", "--verbose", action="store_true") 72 | 73 | args = parser.parse_args() 74 | 75 | params = {} 76 | if args.host: 77 | params["host"] = args.host 78 | if args.port: 79 | params["port"] = args.port 80 | 81 | logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) 82 | 83 | r = get_redis(**params) 84 | host = r.connection_pool.get_connection("info").host 85 | logger.info("Waiting for items in '%s' (server: %s)", args.key, host) 86 | kwargs = { 87 | "keys": [args.key], 88 | "timeout": args.timeout, 89 | "limit": args.limit, 90 | "log_every": args.progress_every, 91 | } 92 | try: 93 | process_items(r, **kwargs) 94 | retcode = 0 # ok 95 | except KeyboardInterrupt: 96 | retcode = 0 # ok 97 | except Exception: 98 | logger.exception("Unhandled exception") 99 | retcode = 2 100 | 101 | return retcode 102 | 103 | 104 | if __name__ == "__main__": 105 | sys.exit(main()) 106 | -------------------------------------------------------------------------------- /example-project/requirements.txt: 
-------------------------------------------------------------------------------- 1 | scrapy 2 | scrapy-redis 3 | -------------------------------------------------------------------------------- /example-project/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/topics/scrapyd.html 5 | 6 | [settings] 7 | default = example.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = example 12 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | persistent=no 3 | jobs=1 # >1 hides results 4 | suggestion-mode=yes # guess common misconfiguration and emit user-friendly hints 5 | py-version = 3.11.3 6 | 7 | [MESSAGES CONTROL] 8 | disable=abstract-method, 9 | anomalous-backslash-in-string, 10 | arguments-differ, 11 | arguments-renamed, 12 | attribute-defined-outside-init, 13 | bad-classmethod-argument, 14 | bad-continuation, 15 | bad-indentation, 16 | bad-mcs-classmethod-argument, 17 | bad-super-call, 18 | bad-whitespace, 19 | bare-except, 20 | blacklisted-name, 21 | broad-except, 22 | c-extension-no-member, 23 | catching-non-exception, 24 | cell-var-from-loop, 25 | comparison-with-callable, 26 | consider-iterating-dictionary, 27 | consider-using-dict-items, 28 | consider-using-from-import, 29 | consider-using-in, 30 | consider-using-set-comprehension, 31 | consider-using-sys-exit, 32 | consider-using-with, 33 | cyclic-import, 34 | dangerous-default-value, 35 | deprecated-method, 36 | deprecated-module, 37 | duplicate-code, # https://github.com/PyCQA/pylint/issues/214 38 | eval-used, 39 | expression-not-assigned, 40 | fixme, 41 | function-redefined, 42 | global-statement, 43 | import-error, 44 | import-outside-toplevel, 45 | import-self, 46 | inconsistent-return-statements, 47 | inherit-non-class, 48 | invalid-name, 49 | invalid-overridden-method, 50 | isinstance-second-argument-not-valid-type, 51 | keyword-arg-before-vararg, 52 | line-too-long, 53 | logging-format-interpolation, 54 | logging-not-lazy, 55 | lost-exception, 56 | method-hidden, 57 | misplaced-comparison-constant, 58 | missing-docstring, 59 | missing-final-newline, 60 | multiple-imports, 61 | multiple-statements, 62 | no-else-continue, 63 | no-else-raise, 64 | no-else-return, 65 | no-init, 66 | no-member, 67 | no-method-argument, 68 | no-name-in-module, 69 | no-self-argument, 70 | no-self-use, 71 | no-value-for-parameter, 72 | not-an-iterable, 73 | not-callable, 74 | pointless-statement, 75 | pointless-string-statement, 76 | protected-access, 77 | raise-missing-from, 78 | redefined-argument-from-local, 79 | redefined-builtin, 80 | redefined-outer-name, 81 | reimported, 82 | signature-differs, 83 | singleton-comparison, 84 | super-init-not-called, 85 | super-with-arguments, 86 | superfluous-parens, 87 | too-few-public-methods, 88 | too-many-ancestors, 89 | too-many-arguments, 90 | too-many-branches, 91 | too-many-format-args, 92 | too-many-function-args, 93 | too-many-instance-attributes, 94 | too-many-lines, 95 | too-many-locals, 96 | too-many-public-methods, 97 | too-many-return-statements, 98 | trailing-newlines, 99 | trailing-whitespace, 100 | unbalanced-tuple-unpacking, 101 | undefined-variable, 102 | undefined-loop-variable, 103 | unexpected-special-method-signature, 104 | ungrouped-imports, 
105 | unidiomatic-typecheck, 106 | unnecessary-comprehension, 107 | unnecessary-lambda, 108 | unnecessary-pass, 109 | unreachable, 110 | unspecified-encoding, 111 | unsupported-assignment-operation, 112 | unsubscriptable-object, 113 | unused-argument, 114 | unused-import, 115 | unused-private-member, 116 | unused-variable, 117 | unused-wildcard-import, 118 | use-implicit-booleaness-not-comparison, 119 | used-before-assignment, 120 | useless-object-inheritance, # Required for Python 2 support 121 | useless-return, 122 | useless-super-delegation, 123 | wildcard-import, 124 | wrong-import-order, 125 | wrong-import-position 126 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | norecursedirs = 3 | .* 4 | dist 5 | build 6 | python_files = 7 | test_*.py 8 | *_test.py 9 | tests.py 10 | addopts = 11 | -rxEfsw -v 12 | -------------------------------------------------------------------------------- /requirements-tests.txt: -------------------------------------------------------------------------------- 1 | # This packages are required to run all the tests. 2 | flake8 3 | mock 4 | pytest>=6.0,<7 5 | pytest-cov 6 | tox>=4.0,<5 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy>=2.6.0 2 | redis>=4.2 3 | six>=1.15 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | 4 | [flake8] 5 | exclude = docs, tests 6 | max-line-length = 120 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import io 3 | from pkgutil import walk_packages 4 | 5 | from setuptools import setup 6 | 7 | 8 | def find_packages(path): 9 | # This method returns packages and subpackages as well. 10 | return [name for _, name, is_pkg in walk_packages([path]) if is_pkg] 11 | 12 | 13 | def read_file(filename): 14 | with open(filename) as fp: 15 | return fp.read().strip() 16 | 17 | 18 | def read_rst(filename): 19 | # Ignore unsupported directives by pypi. 20 | content = read_file(filename) 21 | return "".join( 22 | line for line in io.StringIO(content) if not line.startswith(".. 
comment::") 23 | ) 24 | 25 | 26 | def read_requirements(filename): 27 | return [ 28 | line.strip() 29 | for line in read_file(filename).splitlines() 30 | if not line.startswith("#") 31 | ] 32 | 33 | 34 | setup( 35 | name="scrapy-redis", 36 | version=read_file("VERSION"), 37 | description="Redis-based components for Scrapy.", 38 | long_description=read_rst("README.rst") + "\n\n" + read_rst("HISTORY.rst"), 39 | author="R Max Espinoza", 40 | author_email="hey@rmax.dev", 41 | url="https://github.com/rmax/scrapy-redis", 42 | packages=list(find_packages("src")), 43 | package_dir={"": "src"}, 44 | install_requires=read_requirements("requirements.txt"), 45 | include_package_data=True, 46 | license="MIT", 47 | keywords="scrapy-redis", 48 | classifiers=[ 49 | "Development Status :: 4 - Beta", 50 | "Intended Audience :: Developers", 51 | "License :: OSI Approved :: MIT License", 52 | "Natural Language :: English", 53 | "Programming Language :: Python :: 3", 54 | "Programming Language :: Python :: 3.7", 55 | "Programming Language :: Python :: 3.8", 56 | "Programming Language :: Python :: 3.9", 57 | "Programming Language :: Python :: 3.10", 58 | ], 59 | ) 60 | -------------------------------------------------------------------------------- /src/scrapy_redis/__init__.py: -------------------------------------------------------------------------------- 1 | from .connection import get_redis, get_redis_from_settings # NOQA 2 | 3 | __author__ = "R Max Espinoza" 4 | __email__ = "hey at rmax.dev" 5 | __version__ = "0.9.1" 6 | -------------------------------------------------------------------------------- /src/scrapy_redis/connection.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.misc import load_object 2 | 3 | from . import defaults 4 | 5 | # Shortcut maps 'setting name' -> 'parmater name'. 6 | SETTINGS_PARAMS_MAP = { 7 | "REDIS_URL": "url", 8 | "REDIS_HOST": "host", 9 | "REDIS_PORT": "port", 10 | "REDIS_DB": "db", 11 | "REDIS_ENCODING": "encoding", 12 | } 13 | 14 | SETTINGS_PARAMS_MAP["REDIS_DECODE_RESPONSES"] = "decode_responses" 15 | 16 | 17 | def get_redis_from_settings(settings): 18 | """Returns a redis client instance from given Scrapy settings object. 19 | 20 | This function uses ``get_client`` to instantiate the client and uses 21 | ``defaults.REDIS_PARAMS`` global as defaults values for the parameters. You 22 | can override them using the ``REDIS_PARAMS`` setting. 23 | 24 | Parameters 25 | ---------- 26 | settings : Settings 27 | A scrapy settings object. See the supported settings below. 28 | 29 | Returns 30 | ------- 31 | server 32 | Redis client instance. 33 | 34 | Other Parameters 35 | ---------------- 36 | REDIS_URL : str, optional 37 | Server connection URL. 38 | REDIS_HOST : str, optional 39 | Server host. 40 | REDIS_PORT : str, optional 41 | Server port. 42 | REDIS_DB : int, optional 43 | Server database 44 | REDIS_ENCODING : str, optional 45 | Data encoding. 46 | REDIS_PARAMS : dict, optional 47 | Additional client parameters. 48 | 49 | Python 3 Only 50 | ---------------- 51 | REDIS_DECODE_RESPONSES : bool, optional 52 | Sets the `decode_responses` kwarg in Redis cls ctor 53 | 54 | """ 55 | params = defaults.REDIS_PARAMS.copy() 56 | params.update(settings.getdict("REDIS_PARAMS")) 57 | # XXX: Deprecate REDIS_* settings. 58 | for source, dest in SETTINGS_PARAMS_MAP.items(): 59 | val = settings.get(source) 60 | if val: 61 | params[dest] = val 62 | 63 | # Allow ``redis_cls`` to be a path to a class. 
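    # For example, a project could set (hypothetical class path):
    #   REDIS_PARAMS = {"redis_cls": "myproject.connection.CustomRedisClient"}
    # and the dotted path is resolved right below via scrapy.utils.misc.load_object.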
64 | if isinstance(params.get("redis_cls"), str): 65 | params["redis_cls"] = load_object(params["redis_cls"]) 66 | 67 | return get_redis(**params) 68 | 69 | 70 | # Backwards compatible alias. 71 | from_settings = get_redis_from_settings 72 | 73 | 74 | def get_redis(**kwargs): 75 | """Returns a redis client instance. 76 | 77 | Parameters 78 | ---------- 79 | redis_cls : class, optional 80 | Defaults to ``redis.StrictRedis``. 81 | url : str, optional 82 | If given, ``redis_cls.from_url`` is used to instantiate the class. 83 | **kwargs 84 | Extra parameters to be passed to the ``redis_cls`` class. 85 | 86 | Returns 87 | ------- 88 | server 89 | Redis client instance. 90 | 91 | """ 92 | redis_cls = kwargs.pop("redis_cls", defaults.REDIS_CLS) 93 | url = kwargs.pop("url", None) 94 | if url: 95 | return redis_cls.from_url(url, **kwargs) 96 | else: 97 | return redis_cls(**kwargs) 98 | -------------------------------------------------------------------------------- /src/scrapy_redis/defaults.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | # For standalone use. 4 | DUPEFILTER_KEY = "dupefilter:%(timestamp)s" 5 | 6 | PIPELINE_KEY = "%(spider)s:items" 7 | 8 | STATS_KEY = "%(spider)s:stats" 9 | 10 | REDIS_CLS = redis.StrictRedis 11 | REDIS_ENCODING = "utf-8" 12 | # Sane connection defaults. 13 | REDIS_PARAMS = { 14 | "socket_timeout": 30, 15 | "socket_connect_timeout": 30, 16 | "retry_on_timeout": True, 17 | "encoding": REDIS_ENCODING, 18 | } 19 | REDIS_CONCURRENT_REQUESTS = 16 20 | 21 | SCHEDULER_QUEUE_KEY = "%(spider)s:requests" 22 | SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.PriorityQueue" 23 | SCHEDULER_DUPEFILTER_KEY = "%(spider)s:dupefilter" 24 | SCHEDULER_DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 25 | SCHEDULER_PERSIST = False 26 | START_URLS_KEY = "%(name)s:start_urls" 27 | START_URLS_AS_SET = False 28 | START_URLS_AS_ZSET = False 29 | MAX_IDLE_TIME = 0 30 | -------------------------------------------------------------------------------- /src/scrapy_redis/dupefilter.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import logging 4 | import time 5 | 6 | from scrapy.dupefilters import BaseDupeFilter 7 | from scrapy.utils.python import to_unicode 8 | from w3lib.url import canonicalize_url 9 | 10 | from . import defaults 11 | from .connection import get_redis_from_settings 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | # TODO: Rename class to RedisDupeFilter. 17 | class RFPDupeFilter(BaseDupeFilter): 18 | """Redis-based request duplicates filter. 19 | 20 | This class can also be used with default Scrapy's scheduler. 21 | 22 | """ 23 | 24 | logger = logger 25 | 26 | def __init__(self, server, key, debug=False): 27 | """Initialize the duplicates filter. 28 | 29 | Parameters 30 | ---------- 31 | server : redis.StrictRedis 32 | The redis server instance. 33 | key : str 34 | Redis key Where to store fingerprints. 35 | debug : bool, optional 36 | Whether to log filtered requests. 37 | 38 | """ 39 | self.server = server 40 | self.key = key 41 | self.debug = debug 42 | self.logdupes = True 43 | 44 | @classmethod 45 | def from_settings(cls, settings): 46 | """Returns an instance from given settings. 47 | 48 | This uses by default the key ``dupefilter:``. When using the 49 | ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as 50 | it needs to pass the spider name in the key. 
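
        As a rough sketch (project settings module, otherwise default Scrapy
        project), standalone usage only needs::

            DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

        together with the usual ``REDIS_*`` connection settings.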
51 | 52 | Parameters 53 | ---------- 54 | settings : scrapy.settings.Settings 55 | 56 | Returns 57 | ------- 58 | RFPDupeFilter 59 | A RFPDupeFilter instance. 60 | 61 | 62 | """ 63 | server = get_redis_from_settings(settings) 64 | # XXX: This creates one-time key. needed to support to use this 65 | # class as standalone dupefilter with scrapy's default scheduler 66 | # if scrapy passes spider on open() method this wouldn't be needed 67 | # TODO: Use SCRAPY_JOB env as default and fallback to timestamp. 68 | key = defaults.DUPEFILTER_KEY % {"timestamp": int(time.time())} 69 | debug = settings.getbool("DUPEFILTER_DEBUG") 70 | return cls(server, key=key, debug=debug) 71 | 72 | @classmethod 73 | def from_crawler(cls, crawler): 74 | """Returns instance from crawler. 75 | 76 | Parameters 77 | ---------- 78 | crawler : scrapy.crawler.Crawler 79 | 80 | Returns 81 | ------- 82 | RFPDupeFilter 83 | Instance of RFPDupeFilter. 84 | 85 | """ 86 | return cls.from_settings(crawler.settings) 87 | 88 | def request_seen(self, request): 89 | """Returns True if request was already seen. 90 | 91 | Parameters 92 | ---------- 93 | request : scrapy.http.Request 94 | 95 | Returns 96 | ------- 97 | bool 98 | 99 | """ 100 | fp = self.request_fingerprint(request) 101 | # This returns the number of values added, zero if already exists. 102 | added = self.server.sadd(self.key, fp) 103 | return added == 0 104 | 105 | def request_fingerprint(self, request): 106 | """Returns a fingerprint for a given request. 107 | 108 | Parameters 109 | ---------- 110 | request : scrapy.http.Request 111 | 112 | Returns 113 | ------- 114 | str 115 | 116 | """ 117 | fingerprint_data = { 118 | "method": to_unicode(request.method), 119 | "url": canonicalize_url(request.url), 120 | "body": (request.body or b"").hex(), 121 | } 122 | fingerprint_json = json.dumps(fingerprint_data, sort_keys=True) 123 | return hashlib.sha1(fingerprint_json.encode()).hexdigest() 124 | 125 | @classmethod 126 | def from_spider(cls, spider): 127 | settings = spider.settings 128 | server = get_redis_from_settings(settings) 129 | dupefilter_key = settings.get( 130 | "SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY 131 | ) 132 | key = dupefilter_key % {"spider": spider.name} 133 | debug = settings.getbool("DUPEFILTER_DEBUG") 134 | return cls(server, key=key, debug=debug) 135 | 136 | def close(self, reason=""): 137 | """Delete data on close. Called by Scrapy's scheduler. 138 | 139 | Parameters 140 | ---------- 141 | reason : str, optional 142 | 143 | """ 144 | self.clear() 145 | 146 | def clear(self): 147 | """Clears fingerprints data.""" 148 | self.server.delete(self.key) 149 | 150 | def log(self, request, spider): 151 | """Logs given request. 
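
        Unless ``DUPEFILTER_DEBUG`` is enabled, only the first filtered
        duplicate is reported; later duplicates are silenced through the
        ``logdupes`` flag handled below.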
152 | 153 | Parameters 154 | ---------- 155 | request : scrapy.http.Request 156 | spider : scrapy.spiders.Spider 157 | 158 | """ 159 | if self.debug: 160 | msg = "Filtered duplicate request: %(request)s" 161 | self.logger.debug(msg, {"request": request}, extra={"spider": spider}) 162 | elif self.logdupes: 163 | msg = ( 164 | "Filtered duplicate request %(request)s" 165 | " - no more duplicates will be shown" 166 | " (see DUPEFILTER_DEBUG to show all duplicates)" 167 | ) 168 | self.logger.debug(msg, {"request": request}, extra={"spider": spider}) 169 | self.logdupes = False 170 | -------------------------------------------------------------------------------- /src/scrapy_redis/picklecompat.py: -------------------------------------------------------------------------------- 1 | """A pickle wrapper module with protocol=-1 by default.""" 2 | 3 | try: 4 | import cPickle as pickle # PY2 5 | except ImportError: 6 | import pickle 7 | 8 | 9 | def loads(s): 10 | return pickle.loads(s) 11 | 12 | 13 | def dumps(obj): 14 | return pickle.dumps(obj, protocol=-1) 15 | -------------------------------------------------------------------------------- /src/scrapy_redis/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.misc import load_object 2 | from scrapy.utils.serialize import ScrapyJSONEncoder 3 | from twisted.internet.threads import deferToThread 4 | 5 | from . import connection, defaults 6 | 7 | default_serialize = ScrapyJSONEncoder().encode 8 | 9 | 10 | class RedisPipeline: 11 | """Pushes serialized item into a redis list/queue 12 | 13 | Settings 14 | -------- 15 | REDIS_ITEMS_KEY : str 16 | Redis key where to store items. 17 | REDIS_ITEMS_SERIALIZER : str 18 | Object path to serializer function. 19 | 20 | """ 21 | 22 | def __init__( 23 | self, server, key=defaults.PIPELINE_KEY, serialize_func=default_serialize 24 | ): 25 | """Initialize pipeline. 26 | 27 | Parameters 28 | ---------- 29 | server : StrictRedis 30 | Redis client instance. 31 | key : str 32 | Redis key where to store items. 33 | serialize_func : callable 34 | Items serializer function. 35 | 36 | """ 37 | self.server = server 38 | self.key = key 39 | self.serialize = serialize_func 40 | 41 | @classmethod 42 | def from_settings(cls, settings): 43 | params = { 44 | "server": connection.from_settings(settings), 45 | } 46 | if settings.get("REDIS_ITEMS_KEY"): 47 | params["key"] = settings["REDIS_ITEMS_KEY"] 48 | if settings.get("REDIS_ITEMS_SERIALIZER"): 49 | params["serialize_func"] = load_object(settings["REDIS_ITEMS_SERIALIZER"]) 50 | 51 | return cls(**params) 52 | 53 | @classmethod 54 | def from_crawler(cls, crawler): 55 | return cls.from_settings(crawler.settings) 56 | 57 | def process_item(self, item, spider): 58 | return deferToThread(self._process_item, item, spider) 59 | 60 | def _process_item(self, item, spider): 61 | key = self.item_key(item, spider) 62 | data = self.serialize(item) 63 | self.server.rpush(key, data) 64 | return item 65 | 66 | def item_key(self, item, spider): 67 | """Returns redis key based on given spider. 68 | 69 | Override this function to use a different key depending on the item 70 | and/or spider. 
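
        For instance, a subclass might key items per spider and per item type
        (hypothetical ``type`` field, dict-like items assumed)::

            class PerTypeRedisPipeline(RedisPipeline):
                def item_key(self, item, spider):
                    return f"{spider.name}:{item.get('type', 'items')}"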
71 | 72 | """ 73 | return self.key % {"spider": spider.name} 74 | -------------------------------------------------------------------------------- /src/scrapy_redis/queue.py: -------------------------------------------------------------------------------- 1 | try: 2 | from scrapy.utils.request import request_from_dict 3 | except ImportError: 4 | from scrapy.utils.reqser import request_to_dict, request_from_dict 5 | 6 | from . import picklecompat 7 | 8 | 9 | class Base: 10 | """Per-spider base queue class""" 11 | 12 | def __init__(self, server, spider, key, serializer=None): 13 | """Initialize per-spider redis queue. 14 | 15 | Parameters 16 | ---------- 17 | server : StrictRedis 18 | Redis client instance. 19 | spider : Spider 20 | Scrapy spider instance. 21 | key: str 22 | Redis key where to put and get messages. 23 | serializer : object 24 | Serializer object with ``loads`` and ``dumps`` methods. 25 | 26 | """ 27 | if serializer is None: 28 | # Backward compatibility. 29 | # TODO: deprecate pickle. 30 | serializer = picklecompat 31 | if not hasattr(serializer, "loads"): 32 | raise TypeError( 33 | f"serializer does not implement 'loads' function: {serializer}" 34 | ) 35 | if not hasattr(serializer, "dumps"): 36 | raise TypeError( 37 | f"serializer does not implement 'dumps' function: {serializer}" 38 | ) 39 | 40 | self.server = server 41 | self.spider = spider 42 | self.key = key % {"spider": spider.name} 43 | self.serializer = serializer 44 | 45 | def _encode_request(self, request): 46 | """Encode a request object""" 47 | try: 48 | obj = request.to_dict(spider=self.spider) 49 | except AttributeError: 50 | obj = request_to_dict(request, self.spider) 51 | return self.serializer.dumps(obj) 52 | 53 | def _decode_request(self, encoded_request): 54 | """Decode an request previously encoded""" 55 | obj = self.serializer.loads(encoded_request) 56 | return request_from_dict(obj, spider=self.spider) 57 | 58 | def __len__(self): 59 | """Return the length of the queue""" 60 | raise NotImplementedError 61 | 62 | def push(self, request): 63 | """Push a request""" 64 | raise NotImplementedError 65 | 66 | def pop(self, timeout=0): 67 | """Pop a request""" 68 | raise NotImplementedError 69 | 70 | def clear(self): 71 | """Clear queue/stack""" 72 | self.server.delete(self.key) 73 | 74 | 75 | class FifoQueue(Base): 76 | """Per-spider FIFO queue""" 77 | 78 | def __len__(self): 79 | """Return the length of the queue""" 80 | return self.server.llen(self.key) 81 | 82 | def push(self, request): 83 | """Push a request""" 84 | self.server.lpush(self.key, self._encode_request(request)) 85 | 86 | def pop(self, timeout=0): 87 | """Pop a request""" 88 | if timeout > 0: 89 | data = self.server.brpop(self.key, timeout) 90 | if isinstance(data, tuple): 91 | data = data[1] 92 | else: 93 | data = self.server.rpop(self.key) 94 | if data: 95 | return self._decode_request(data) 96 | 97 | 98 | class PriorityQueue(Base): 99 | """Per-spider priority queue abstraction using redis' sorted set""" 100 | 101 | def __len__(self): 102 | """Return the length of the queue""" 103 | return self.server.zcard(self.key) 104 | 105 | def push(self, request): 106 | """Push a request""" 107 | data = self._encode_request(request) 108 | score = -request.priority 109 | # We don't use zadd method as the order of arguments change depending on 110 | # whether the class is Redis or StrictRedis, and the option of using 111 | # kwargs only accepts strings, not bytes. 
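        # The score is the negated Scrapy priority: a request with priority 10
        # gets score -10 and sorts before priority 0 (score 0) in the sorted
        # set; pop() reads index 0, so higher-priority requests come out first.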
112 | self.server.execute_command("ZADD", self.key, score, data) 113 | 114 | def pop(self, timeout=0): 115 | """ 116 | Pop a request 117 | timeout not support in this queue class 118 | """ 119 | # use atomic range/remove using multi/exec 120 | pipe = self.server.pipeline() 121 | pipe.multi() 122 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 123 | results, count = pipe.execute() 124 | if results: 125 | return self._decode_request(results[0]) 126 | 127 | 128 | class LifoQueue(Base): 129 | """Per-spider LIFO queue.""" 130 | 131 | def __len__(self): 132 | """Return the length of the stack""" 133 | return self.server.llen(self.key) 134 | 135 | def push(self, request): 136 | """Push a request""" 137 | self.server.lpush(self.key, self._encode_request(request)) 138 | 139 | def pop(self, timeout=0): 140 | """Pop a request""" 141 | if timeout > 0: 142 | data = self.server.blpop(self.key, timeout) 143 | if isinstance(data, tuple): 144 | data = data[1] 145 | else: 146 | data = self.server.lpop(self.key) 147 | 148 | if data: 149 | return self._decode_request(data) 150 | 151 | 152 | # TODO: Deprecate the use of these names. 153 | SpiderQueue = FifoQueue 154 | SpiderStack = LifoQueue 155 | SpiderPriorityQueue = PriorityQueue 156 | -------------------------------------------------------------------------------- /src/scrapy_redis/scheduler.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | from scrapy.utils.misc import load_object 4 | 5 | from . import connection, defaults 6 | 7 | 8 | # TODO: add SCRAPY_JOB support. 9 | class Scheduler: 10 | """Redis-based scheduler 11 | 12 | Settings 13 | -------- 14 | SCHEDULER_PERSIST : bool (default: False) 15 | Whether to persist or clear redis queue. 16 | SCHEDULER_FLUSH_ON_START : bool (default: False) 17 | Whether to flush redis queue on start. 18 | SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0) 19 | How many seconds to wait before closing if no message is received. 20 | SCHEDULER_QUEUE_KEY : str 21 | Scheduler redis key. 22 | SCHEDULER_QUEUE_CLASS : str 23 | Scheduler queue class. 24 | SCHEDULER_DUPEFILTER_KEY : str 25 | Scheduler dupefilter redis key. 26 | SCHEDULER_DUPEFILTER_CLASS : str 27 | Scheduler dupefilter class. 28 | SCHEDULER_SERIALIZER : str 29 | Scheduler serializer. 30 | 31 | """ 32 | 33 | def __init__( 34 | self, 35 | server, 36 | persist=False, 37 | flush_on_start=False, 38 | queue_key=defaults.SCHEDULER_QUEUE_KEY, 39 | queue_cls=defaults.SCHEDULER_QUEUE_CLASS, 40 | dupefilter=None, 41 | dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, 42 | dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, 43 | idle_before_close=0, 44 | serializer=None, 45 | ): 46 | """Initialize scheduler. 47 | 48 | Parameters 49 | ---------- 50 | server : Redis 51 | The redis server instance. 52 | persist : bool 53 | Whether to flush requests when closing. Default is False. 54 | flush_on_start : bool 55 | Whether to flush requests on start. Default is False. 56 | queue_key : str 57 | Requests queue key. 58 | queue_cls : str 59 | Importable path to the queue class. 60 | dupefilter: Dupefilter 61 | Custom dupefilter instance. 62 | dupefilter_key : str 63 | Duplicates filter key. 64 | dupefilter_cls : str 65 | Importable path to the dupefilter class. 66 | idle_before_close : int 67 | Timeout before giving up. 
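        serializer : object, optional
            Serializer object with ``loads`` and ``dumps`` methods used for
            requests in the queue; when omitted, the queue class falls back to
            its pickle-based default.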
68 | 69 | """ 70 | if idle_before_close < 0: 71 | raise TypeError("idle_before_close cannot be negative") 72 | 73 | self.server = server 74 | self.persist = persist 75 | self.flush_on_start = flush_on_start 76 | self.queue_key = queue_key 77 | self.queue_cls = queue_cls 78 | self.df = dupefilter 79 | self.dupefilter_cls = dupefilter_cls 80 | self.dupefilter_key = dupefilter_key 81 | self.idle_before_close = idle_before_close 82 | self.serializer = serializer 83 | self.stats = None 84 | 85 | def __len__(self): 86 | return len(self.queue) 87 | 88 | @classmethod 89 | def from_settings(cls, settings): 90 | kwargs = { 91 | "persist": settings.getbool("SCHEDULER_PERSIST"), 92 | "flush_on_start": settings.getbool("SCHEDULER_FLUSH_ON_START"), 93 | "idle_before_close": settings.getint("SCHEDULER_IDLE_BEFORE_CLOSE"), 94 | } 95 | 96 | # If these values are missing, it means we want to use the defaults. 97 | optional = { 98 | # TODO: Use custom prefixes for this settings to note that are 99 | # specific to scrapy-redis. 100 | "queue_key": "SCHEDULER_QUEUE_KEY", 101 | "queue_cls": "SCHEDULER_QUEUE_CLASS", 102 | "dupefilter_key": "SCHEDULER_DUPEFILTER_KEY", 103 | # We use the default setting name to keep compatibility. 104 | "dupefilter_cls": "DUPEFILTER_CLASS", 105 | "serializer": "SCHEDULER_SERIALIZER", 106 | } 107 | for name, setting_name in optional.items(): 108 | val = settings.get(setting_name) 109 | if val: 110 | kwargs[name] = val 111 | 112 | dupefilter_cls = load_object(kwargs["dupefilter_cls"]) 113 | if not hasattr(dupefilter_cls, "from_spider"): 114 | kwargs["dupefilter"] = dupefilter_cls.from_settings(settings) 115 | 116 | # Support serializer as a path to a module. 117 | if isinstance(kwargs.get("serializer"), str): 118 | kwargs["serializer"] = importlib.import_module(kwargs["serializer"]) 119 | 120 | server = connection.from_settings(settings) 121 | # Ensure the connection is working. 
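        # ping() raises redis.exceptions.ConnectionError here if the server is
        # unreachable, so a misconfigured connection fails at crawler start-up
        # rather than on the first scheduled request.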
122 | server.ping() 123 | 124 | return cls(server=server, **kwargs) 125 | 126 | @classmethod 127 | def from_crawler(cls, crawler): 128 | instance = cls.from_settings(crawler.settings) 129 | # FIXME: for now, stats are only supported from this constructor 130 | instance.stats = crawler.stats 131 | return instance 132 | 133 | def open(self, spider): 134 | self.spider = spider 135 | 136 | try: 137 | self.queue = load_object(self.queue_cls)( 138 | server=self.server, 139 | spider=spider, 140 | key=self.queue_key % {"spider": spider.name}, 141 | serializer=self.serializer, 142 | ) 143 | except TypeError as e: 144 | raise ValueError( 145 | f"Failed to instantiate queue class '{self.queue_cls}': {e}" 146 | ) 147 | 148 | if not self.df: 149 | self.df = load_object(self.dupefilter_cls).from_spider(spider) 150 | 151 | if self.flush_on_start: 152 | self.flush() 153 | # notice if there are requests already in the queue to resume the crawl 154 | if len(self.queue): 155 | spider.log(f"Resuming crawl ({len(self.queue)} requests scheduled)") 156 | 157 | def close(self, reason): 158 | if not self.persist: 159 | self.flush() 160 | 161 | def flush(self): 162 | self.df.clear() 163 | self.queue.clear() 164 | 165 | def enqueue_request(self, request): 166 | if not request.dont_filter and self.df.request_seen(request): 167 | self.df.log(request, self.spider) 168 | return False 169 | if self.stats: 170 | self.stats.inc_value("scheduler/enqueued/redis", spider=self.spider) 171 | self.queue.push(request) 172 | return True 173 | 174 | def next_request(self): 175 | block_pop_timeout = self.idle_before_close 176 | request = self.queue.pop(block_pop_timeout) 177 | if request and self.stats: 178 | self.stats.inc_value("scheduler/dequeued/redis", spider=self.spider) 179 | return request 180 | 181 | def has_pending_requests(self): 182 | return len(self) > 0 183 | -------------------------------------------------------------------------------- /src/scrapy_redis/spiders.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from collections.abc import Iterable 4 | 5 | from scrapy import FormRequest, signals 6 | from scrapy import version_info as scrapy_version 7 | from scrapy.exceptions import DontCloseSpider 8 | from scrapy.spiders import CrawlSpider, Spider 9 | 10 | from scrapy_redis.utils import TextColor 11 | 12 | from . import connection, defaults 13 | from .utils import bytes_to_str, is_dict 14 | 15 | 16 | class RedisMixin: 17 | """Mixin class to implement reading urls from a redis queue.""" 18 | 19 | redis_key = None 20 | redis_batch_size = None 21 | redis_encoding = None 22 | 23 | # Redis client placeholder. 24 | server = None 25 | 26 | # Idle start time 27 | spider_idle_start_time = int(time.time()) 28 | max_idle_time = None 29 | 30 | def start_requests(self): 31 | """Returns a batch of start requests from redis.""" 32 | return self.next_requests() 33 | 34 | def setup_redis(self, crawler=None): 35 | """Setup redis connection and idle signal. 36 | 37 | This should be called after the spider has set its crawler object. 38 | """ 39 | if self.server is not None: 40 | return 41 | 42 | if crawler is None: 43 | # We allow optional crawler argument to keep backwards 44 | # compatibility. 45 | # XXX: Raise a deprecation warning. 
46 | crawler = getattr(self, "crawler", None) 47 | 48 | if crawler is None: 49 | raise ValueError("crawler is required") 50 | 51 | settings = crawler.settings 52 | 53 | if self.redis_key is None: 54 | self.redis_key = settings.get( 55 | "REDIS_START_URLS_KEY", 56 | defaults.START_URLS_KEY, 57 | ) 58 | 59 | self.redis_key = self.redis_key % {"name": self.name} 60 | 61 | if not self.redis_key.strip(): 62 | raise ValueError("redis_key must not be empty") 63 | 64 | if self.redis_batch_size is None: 65 | self.redis_batch_size = settings.getint( 66 | "CONCURRENT_REQUESTS", defaults.REDIS_CONCURRENT_REQUESTS 67 | ) 68 | 69 | try: 70 | self.redis_batch_size = int(self.redis_batch_size) 71 | except (TypeError, ValueError): 72 | raise ValueError("redis_batch_size must be an integer") 73 | 74 | if self.redis_encoding is None: 75 | self.redis_encoding = settings.get( 76 | "REDIS_ENCODING", defaults.REDIS_ENCODING 77 | ) 78 | 79 | self.logger.info( 80 | "Reading start URLs from redis key '%(redis_key)s' " 81 | "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)", 82 | self.__dict__, 83 | ) 84 | 85 | self.server = connection.from_settings(crawler.settings) 86 | 87 | if settings.getbool("REDIS_START_URLS_AS_SET", defaults.START_URLS_AS_SET): 88 | self.fetch_data = self.server.spop 89 | self.count_size = self.server.scard 90 | elif settings.getbool("REDIS_START_URLS_AS_ZSET", defaults.START_URLS_AS_ZSET): 91 | self.fetch_data = self.pop_priority_queue 92 | self.count_size = self.server.zcard 93 | else: 94 | self.fetch_data = self.pop_list_queue 95 | self.count_size = self.server.llen 96 | 97 | if self.max_idle_time is None: 98 | self.max_idle_time = settings.get( 99 | "MAX_IDLE_TIME_BEFORE_CLOSE", defaults.MAX_IDLE_TIME 100 | ) 101 | 102 | try: 103 | self.max_idle_time = int(self.max_idle_time) 104 | except (TypeError, ValueError): 105 | raise ValueError("max_idle_time must be an integer") 106 | 107 | # The idle signal is called when the spider has no requests left, 108 | # that's when we will schedule new requests from redis queue 109 | crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 110 | 111 | def pop_list_queue(self, redis_key, batch_size): 112 | with self.server.pipeline() as pipe: 113 | pipe.lrange(redis_key, 0, batch_size - 1) 114 | pipe.ltrim(redis_key, batch_size, -1) 115 | datas, _ = pipe.execute() 116 | return datas 117 | 118 | def pop_priority_queue(self, redis_key, batch_size): 119 | with self.server.pipeline() as pipe: 120 | pipe.zrevrange(redis_key, 0, batch_size - 1) 121 | pipe.zremrangebyrank(redis_key, -batch_size, -1) 122 | datas, _ = pipe.execute() 123 | return datas 124 | 125 | def next_requests(self): 126 | """Returns a request to be scheduled or none.""" 127 | # XXX: Do we need to use a timeout here? 128 | found = 0 129 | datas = self.fetch_data(self.redis_key, self.redis_batch_size) 130 | for data in datas: 131 | reqs = self.make_request_from_data(data) 132 | if isinstance(reqs, Iterable): 133 | for req in reqs: 134 | yield req 135 | # XXX: should be here? 136 | found += 1 137 | self.logger.info(f"start req url:{req.url}") 138 | elif reqs: 139 | yield reqs 140 | found += 1 141 | else: 142 | self.logger.debug(f"Request not made from data: {data}") 143 | 144 | if found: 145 | self.logger.debug(f"Read {found} requests from '{self.redis_key}'") 146 | 147 | def make_request_from_data(self, data): 148 | """Returns a `Request` instance for data coming from Redis. 
149 | 150 | This implementation supports JSON `data` that contains `url`, `meta` and other 151 | optional parameters; `meta` is a nested JSON object carrying sub-data. 152 | 153 | After parsing the data, a FormRequest is built from `url` and `meta`, with `method` 154 | used as the HTTP method and any remaining keys passed as `formdata`. 155 | 156 | For example: 157 | 158 | .. code:: json 159 | 160 | { 161 | "url": "https://example.com", 162 | "meta": { 163 | "job-id": "123xsd", 164 | "start-date": "dd/mm/yy" 165 | }, 166 | "url_cookie_key": "fertxsas", 167 | "method": "POST" 168 | } 169 | 170 | If `url` is missing, an empty list is returned, so make sure the data includes a `url` key. 171 | If `method` is missing, the request method defaults to 'GET'. 172 | If `meta` is missing, the request `meta` defaults to an empty dictionary. 173 | 174 | The resulting values can be accessed in spider callbacks through `response.request`: 175 | `request.url`, `request.meta`, `request.cookies`, `request.method` 176 | 177 | Parameters 178 | ---------- 179 | data : bytes 180 | Message from redis. 181 | 182 | """ 183 | formatted_data = bytes_to_str(data, self.redis_encoding) 184 | 185 | if is_dict(formatted_data): 186 | parameter = json.loads(formatted_data) 187 | else: 188 | self.logger.warning( 189 | f"{TextColor.WARNING}WARNING: String request is deprecated, please use JSON data format. " 190 | f"Detail information, please check https://github.com/rmax/scrapy-redis#features{TextColor.ENDC}" 191 | ) 192 | return FormRequest(formatted_data, dont_filter=True) 193 | 194 | if parameter.get("url", None) is None: 195 | self.logger.warning( 196 | f"{TextColor.WARNING}The data from Redis has no url key in push data{TextColor.ENDC}" 197 | ) 198 | return [] 199 | 200 | url = parameter.pop("url") 201 | method = parameter.pop("method").upper() if "method" in parameter else "GET" 202 | metadata = parameter.pop("meta") if "meta" in parameter else {} 203 | 204 | return FormRequest( 205 | url, dont_filter=True, method=method, formdata=parameter, meta=metadata 206 | ) 207 | 208 | def schedule_next_requests(self): 209 | """Schedules a request if available""" 210 | # TODO: While there is capacity, schedule a batch of redis requests. 211 | for req in self.next_requests(): 212 | # see https://github.com/scrapy/scrapy/issues/5994 213 | if scrapy_version >= (2, 6): 214 | self.crawler.engine.crawl(req) 215 | else: 216 | self.crawler.engine.crawl(req, spider=self) 217 | 218 | def spider_idle(self): 219 | """ 220 | Schedules a request if available, otherwise waits, 221 | and closes the spider once the idle time exceeds MAX_IDLE_TIME_BEFORE_CLOSE. 222 | MAX_IDLE_TIME_BEFORE_CLOSE does not affect SCHEDULER_IDLE_BEFORE_CLOSE. 223 | """ 224 | if self.server is not None and self.count_size(self.redis_key) > 0: 225 | self.spider_idle_start_time = int(time.time()) 226 | 227 | self.schedule_next_requests() 228 | 229 | idle_time = int(time.time()) - self.spider_idle_start_time 230 | if self.max_idle_time != 0 and idle_time >= self.max_idle_time: 231 | return 232 | raise DontCloseSpider 233 | 234 | 235 | class RedisSpider(RedisMixin, Spider): 236 | """Spider that reads urls from redis queue when idle. 237 | 238 | Attributes 239 | ---------- 240 | redis_key : str (default: REDIS_START_URLS_KEY) 241 | Redis key to fetch start URLs from. 242 | redis_batch_size : int (default: CONCURRENT_REQUESTS) 243 | Number of messages to fetch from redis on each attempt.
244 | redis_encoding : str (default: REDIS_ENCODING) 245 | Encoding to use when decoding messages from redis queue. 246 | 247 | Settings 248 | -------- 249 | REDIS_START_URLS_KEY : str (default: ":start_urls") 250 | Default Redis key to fetch start URLs from. 251 | REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS) 252 | Default number of messages to fetch from redis on each attempt. 253 | REDIS_START_URLS_AS_SET : bool (default: False) 254 | Use SET operations to retrieve messages from the redis queue. If False, 255 | the messages are retrieved using the LPOP command. 256 | REDIS_ENCODING : str (default: "utf-8") 257 | Default encoding to use when decoding messages from redis queue. 258 | 259 | """ 260 | 261 | @classmethod 262 | def from_crawler(cls, crawler, *args, **kwargs): 263 | obj = super().from_crawler(crawler, *args, **kwargs) 264 | obj.setup_redis(crawler) 265 | return obj 266 | 267 | 268 | class RedisCrawlSpider(RedisMixin, CrawlSpider): 269 | """Spider that reads urls from redis queue when idle. 270 | 271 | Attributes 272 | ---------- 273 | redis_key : str (default: REDIS_START_URLS_KEY) 274 | Redis key to fetch start URLs from. 275 | redis_batch_size : int (default: CONCURRENT_REQUESTS) 276 | Number of messages to fetch from redis on each attempt. 277 | redis_encoding : str (default: REDIS_ENCODING) 278 | Encoding to use when decoding messages from redis queue. 279 | 280 | Settings 281 | -------- 282 | REDIS_START_URLS_KEY : str (default: ":start_urls") 283 | Default Redis key to fetch start URLs from. 284 | REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS) 285 | Default number of messages to fetch from redis on each attempt. 286 | REDIS_START_URLS_AS_SET : bool (default: True) 287 | Use SET operations to retrieve messages from the redis queue. 288 | REDIS_ENCODING : str (default: "utf-8") 289 | Default encoding to use when decoding messages from redis queue.
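REDIS_START_URLS_AS_ZSET : bool
    Use sorted-set (ZSET) operations to retrieve messages from the redis
    queue instead of a list or set (see `setup_redis` in RedisMixin).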
290 | 291 | """ 292 | 293 | @classmethod 294 | def from_crawler(cls, crawler, *args, **kwargs): 295 | obj = super().from_crawler(crawler, *args, **kwargs) 296 | obj.setup_redis(crawler) 297 | return obj 298 | -------------------------------------------------------------------------------- /src/scrapy_redis/stats.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from scrapy.statscollectors import StatsCollector 4 | 5 | from .connection import from_settings as redis_from_settings 6 | from .defaults import SCHEDULER_PERSIST, STATS_KEY 7 | from .utils import convert_bytes_to_str 8 | 9 | 10 | class RedisStatsCollector(StatsCollector): 11 | """ 12 | Stats Collector based on Redis 13 | """ 14 | 15 | def __init__(self, crawler, spider=None): 16 | super().__init__(crawler) 17 | self.server = redis_from_settings(crawler.settings) 18 | self.spider = spider 19 | self.spider_name = spider.name if spider else crawler.spidercls.name 20 | self.stats_key = crawler.settings.get("STATS_KEY", STATS_KEY) 21 | self.persist = crawler.settings.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST) 22 | 23 | def _get_key(self, spider=None): 24 | """Return the hash name of stats""" 25 | if spider: 26 | return self.stats_key % {"spider": spider.name} 27 | if self.spider: 28 | return self.stats_key % {"spider": self.spider.name} 29 | return self.stats_key % {"spider": self.spider_name or "scrapy"} 30 | 31 | @classmethod 32 | def from_crawler(cls, crawler): 33 | return cls(crawler) 34 | 35 | @classmethod 36 | def from_spider(cls, spider): 37 | return cls(spider.crawler) 38 | 39 | def get_value(self, key, default=None, spider=None): 40 | """Return the value of hash stats""" 41 | if self.server.hexists(self._get_key(spider), key): 42 | return int(self.server.hget(self._get_key(spider), key)) 43 | else: 44 | return default 45 | 46 | def get_stats(self, spider=None): 47 | """Return the all of the values of hash stats""" 48 | stats = self.server.hgetall(self._get_key(spider)) 49 | if stats: 50 | return convert_bytes_to_str(stats) 51 | return {} 52 | 53 | def set_value(self, key, value, spider=None): 54 | """Set the value according to hash key of stats""" 55 | if isinstance(value, datetime): 56 | value = value.timestamp() 57 | self.server.hset(self._get_key(spider), key, value) 58 | 59 | def set_stats(self, stats, spider=None): 60 | """Set all the hash stats""" 61 | self.server.hmset(self._get_key(spider), stats) 62 | 63 | def inc_value(self, key, count=1, start=0, spider=None): 64 | """Set increment of value according to key""" 65 | if not self.server.hexists(self._get_key(spider), key): 66 | self.set_value(key, start) 67 | self.server.hincrby(self._get_key(spider), key, count) 68 | 69 | def max_value(self, key, value, spider=None): 70 | """Set max value between current and new value""" 71 | self.set_value(key, max(self.get_value(key, value), value)) 72 | 73 | def min_value(self, key, value, spider=None): 74 | """Set min value between current and new value""" 75 | self.set_value(key, min(self.get_value(key, value), value)) 76 | 77 | def clear_stats(self, spider=None): 78 | """Clear all the hash stats""" 79 | self.server.delete(self._get_key(spider)) 80 | 81 | def open_spider(self, spider): 82 | """Set spider to self""" 83 | if spider: 84 | self.spider = spider 85 | 86 | def close_spider(self, spider, reason): 87 | """Clear spider and clear stats""" 88 | self.spider = None 89 | if not self.persist: 90 | self.clear_stats(spider) 91 | 
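The scheduler, dupefilter and stats collector above are enabled purely through Scrapy settings. The sketch below is a minimal, illustrative `settings.py` fragment: the `scrapy_redis.*` class paths come from this repository, `SCHEDULER`, `DUPEFILTER_CLASS` and `STATS_CLASS` are standard Scrapy settings, and the Redis URL is only an example value::

    # Route request scheduling and duplicate filtering through Redis.
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

    # Keep the queue, dupefilter and stats between runs instead of clearing them.
    SCHEDULER_PERSIST = True

    # Collect crawl stats in a per-spider Redis hash (RedisStatsCollector above).
    STATS_CLASS = "scrapy_redis.stats.RedisStatsCollector"

    # Connection: REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT.
    REDIS_URL = "redis://localhost:6379/0"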
-------------------------------------------------------------------------------- /src/scrapy_redis/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from json import JSONDecodeError 3 | 4 | import six 5 | 6 | 7 | class TextColor: 8 | HEADER = "\033[95m" 9 | OKBLUE = "\033[94m" 10 | OKCYAN = "\033[96m" 11 | OKGREEN = "\033[92m" 12 | WARNING = "\033[93m" 13 | FAIL = "\033[91m" 14 | ENDC = "\033[0m" 15 | BOLD = "\033[1m" 16 | UNDERLINE = "\033[4m" 17 | 18 | 19 | def bytes_to_str(s, encoding="utf-8"): 20 | """Returns a str if a bytes object is given.""" 21 | if six.PY3 and isinstance(s, bytes): 22 | return s.decode(encoding) 23 | return s 24 | 25 | 26 | def is_dict(string_content): 27 | """Try load string_content as json, if failed, return False, else return True.""" 28 | try: 29 | json.loads(string_content) 30 | except JSONDecodeError: 31 | return False 32 | return True 33 | 34 | 35 | def convert_bytes_to_str(data, encoding="utf-8"): 36 | """Convert a dict's keys & values from `bytes` to `str` 37 | or convert bytes to str""" 38 | if isinstance(data, bytes): 39 | return data.decode(encoding) 40 | if isinstance(data, dict): 41 | return dict(map(convert_bytes_to_str, data.items())) 42 | elif isinstance(data, tuple): 43 | return map(convert_bytes_to_str, data) 44 | return data 45 | -------------------------------------------------------------------------------- /tests/test_connection.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | from scrapy.settings import Settings 4 | 5 | from scrapy_redis import defaults 6 | from scrapy_redis.connection import from_settings, get_redis, get_redis_from_settings 7 | 8 | 9 | class TestGetRedis: 10 | 11 | def test_default_instance(self): 12 | server = get_redis() 13 | assert isinstance(server, defaults.REDIS_CLS) 14 | 15 | def test_custom_class(self): 16 | client_cls = mock.Mock() 17 | server = get_redis(param="foo", redis_cls=client_cls) 18 | assert server is client_cls.return_value 19 | client_cls.assert_called_with(param="foo") 20 | 21 | def test_from_url(self): 22 | client_cls = mock.Mock() 23 | url = "redis://localhost" 24 | server = get_redis(redis_cls=client_cls, url=url, param="foo") 25 | assert server is client_cls.from_url.return_value 26 | client_cls.from_url.assert_called_with(url, param="foo") 27 | 28 | 29 | class TestFromSettings: 30 | 31 | def setup(self): 32 | self.redis_cls = mock.Mock() 33 | self.expected_params = { 34 | "timeout": 0, 35 | "flag": False, 36 | } 37 | self.settings = Settings( 38 | { 39 | "REDIS_PARAMS": dict(self.expected_params, redis_cls=self.redis_cls), 40 | } 41 | ) 42 | 43 | def test_redis_cls_default(self): 44 | server = from_settings(Settings()) 45 | assert isinstance(server, defaults.REDIS_CLS) 46 | 47 | def test_redis_cls_custom_path(self): 48 | self.settings["REDIS_PARAMS"]["redis_cls"] = "unittest.mock.Mock" 49 | server = from_settings(self.settings) 50 | assert isinstance(server, mock.Mock) 51 | 52 | def test_default_params(self): 53 | server = from_settings(self.settings) 54 | assert server is self.redis_cls.return_value 55 | self.redis_cls.assert_called_with( 56 | **dict(defaults.REDIS_PARAMS, **self.expected_params) 57 | ) 58 | 59 | def test_override_default_params(self): 60 | for key, _ in defaults.REDIS_PARAMS.items(): 61 | self.expected_params[key] = self.settings["REDIS_PARAMS"][key] = object() 62 | 63 | server = from_settings(self.settings) 64 | assert server is 
self.redis_cls.return_value 65 | self.redis_cls.assert_called_with(**self.expected_params) 66 | 67 | 68 | def test_get_server_from_settings_alias(): 69 | assert from_settings is get_redis_from_settings 70 | -------------------------------------------------------------------------------- /tests/test_dupefilter.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | from scrapy.http import Request 4 | from scrapy.settings import Settings 5 | 6 | from scrapy_redis.dupefilter import RFPDupeFilter 7 | 8 | 9 | def get_redis_mock(): 10 | server = mock.Mock() 11 | 12 | def sadd(key, fp, added=0, db={}): # noqa: mutable db 13 | fingerprints = db.setdefault(key, set()) 14 | if fp not in fingerprints: 15 | fingerprints.add(fp) 16 | added += 1 17 | return added 18 | 19 | server.sadd = sadd 20 | 21 | return server 22 | 23 | 24 | class TestRFPDupeFilter: 25 | 26 | def setup(self): 27 | self.server = get_redis_mock() 28 | self.key = "dupefilter:1" 29 | self.df = RFPDupeFilter(self.server, self.key) 30 | 31 | def test_request_seen(self): 32 | req = Request("http://example.com") 33 | 34 | def same_request(): 35 | assert not self.df.request_seen(req) 36 | assert self.df.request_seen(req) 37 | 38 | def diff_method(): 39 | diff_method = Request("http://example.com", method="POST") 40 | assert self.df.request_seen(req) 41 | assert not self.df.request_seen(diff_method) 42 | 43 | def diff_url(): 44 | diff_url = Request("http://example2.com") 45 | assert self.df.request_seen(req) 46 | assert not self.df.request_seen(diff_url) 47 | 48 | same_request() 49 | diff_method() 50 | diff_url() 51 | 52 | def test_overridable_request_fingerprinter(self): 53 | req = Request("http://example.com") 54 | self.df.request_fingerprint = mock.Mock(wraps=self.df.request_fingerprint) 55 | assert not self.df.request_seen(req) 56 | self.df.request_fingerprint.assert_called_with(req) 57 | 58 | def test_clear_deletes(self): 59 | self.df.clear() 60 | self.server.delete.assert_called_with(self.key) 61 | 62 | def test_close_calls_clear(self): 63 | self.df.clear = mock.Mock(wraps=self.df.clear) 64 | self.df.close() 65 | self.df.close(reason="foo") 66 | assert self.df.clear.call_count == 2 67 | 68 | 69 | def test_log_dupes(): 70 | def _test(df, dupes, logcount): 71 | df.logger.debug = mock.Mock(wraps=df.logger.debug) 72 | for _ in range(dupes): 73 | req = Request("http://example") 74 | df.log(req, spider=mock.Mock()) 75 | assert df.logger.debug.call_count == logcount 76 | 77 | server = get_redis_mock() 78 | 79 | df_quiet = RFPDupeFilter(server, "foo") # debug=False 80 | _test(df_quiet, 5, 1) 81 | 82 | df_debug = RFPDupeFilter(server, "foo", debug=True) 83 | _test(df_debug, 5, 5) 84 | 85 | 86 | @mock.patch("scrapy_redis.dupefilter.get_redis_from_settings") 87 | class TestFromMethods: 88 | 89 | def setup(self): 90 | self.settings = Settings( 91 | { 92 | "DUPEFILTER_DEBUG": True, 93 | } 94 | ) 95 | 96 | def test_from_settings(self, get_redis_from_settings): 97 | df = RFPDupeFilter.from_settings(self.settings) 98 | self.assert_dupefilter(df, get_redis_from_settings) 99 | 100 | def test_from_crawler(self, get_redis_from_settings): 101 | crawler = mock.Mock(settings=self.settings) 102 | df = RFPDupeFilter.from_crawler(crawler) 103 | self.assert_dupefilter(df, get_redis_from_settings) 104 | 105 | def assert_dupefilter(self, df, get_redis_from_settings): 106 | assert df.server is get_redis_from_settings.return_value 107 | assert df.key.startswith("dupefilter:") 108 | assert df.debug # 
true 109 | -------------------------------------------------------------------------------- /tests/test_package_import.py: -------------------------------------------------------------------------------- 1 | import scrapy_redis 2 | 3 | 4 | def test_package_metadata(): 5 | assert scrapy_redis.__author__ 6 | assert scrapy_redis.__email__ 7 | assert scrapy_redis.__version__ 8 | -------------------------------------------------------------------------------- /tests/test_picklecompat.py: -------------------------------------------------------------------------------- 1 | from scrapy_redis import picklecompat 2 | 3 | 4 | def test_picklecompat(): 5 | obj = { 6 | "_encoding": "utf-8", 7 | "body": "", 8 | "callback": "_response_downloaded", 9 | "cookies": {}, 10 | "dont_filter": False, 11 | "errback": None, 12 | "headers": {"Referer": ["http://www.dmoz.org/"]}, 13 | "meta": {"depth": 1, "link_text": "Fran\xe7ais", "rule": 0}, 14 | "method": "GET", 15 | "priority": 0, 16 | "url": "http://www.dmoz.org/World/Fran%C3%A7ais/", 17 | } 18 | assert obj == picklecompat.loads(picklecompat.dumps(obj)) 19 | -------------------------------------------------------------------------------- /tests/test_queue.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | from scrapy import Spider 4 | from scrapy.http import Request 5 | 6 | from scrapy_redis.queue import Base 7 | 8 | 9 | class TestBaseQueue: 10 | 11 | queue_cls = Base 12 | 13 | def setup(self): 14 | self.server = mock.Mock() 15 | self.spider = Spider(name="foo") 16 | self.spider.parse_method = lambda x: x 17 | self.key = "key" 18 | self.q = self.queue_cls(self.server, self.spider, self.key) 19 | 20 | def test_encode_decode_requests(self, q=None): 21 | if q is None: 22 | q = self.q 23 | req = Request( 24 | "http://example.com", callback=self.spider.parse, meta={"foo": "bar"} 25 | ) 26 | out = q._decode_request(q._encode_request(req)) 27 | assert req.url == out.url 28 | assert req.meta == out.meta 29 | assert req.callback == out.callback 30 | 31 | def test_custom_serializer(self): 32 | serializer = mock.Mock() 33 | serializer.dumps = mock.Mock(side_effect=lambda x: x) 34 | serializer.loads = mock.Mock(side_effect=lambda x: x) 35 | q = Base(self.server, self.spider, self.key, serializer=serializer) 36 | self.test_encode_decode_requests(q) 37 | assert serializer.dumps.call_count == 1 38 | assert serializer.loads.call_count == 1 39 | -------------------------------------------------------------------------------- /tests/test_scrapy_redis.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase, mock 3 | 4 | import redis 5 | from scrapy import Request, Spider 6 | from scrapy.settings import Settings 7 | from scrapy.utils.test import get_crawler 8 | 9 | from scrapy_redis import connection 10 | from scrapy_redis.dupefilter import RFPDupeFilter 11 | from scrapy_redis.queue import FifoQueue, LifoQueue, PriorityQueue 12 | from scrapy_redis.scheduler import Scheduler 13 | 14 | # allow test settings from environment 15 | REDIS_HOST = os.environ.get("REDIS_HOST", "localhost") 16 | REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379)) 17 | 18 | 19 | def get_spider(*args, **kwargs): 20 | crawler = get_crawler( 21 | spidercls=kwargs.pop("spidercls", None), 22 | settings_dict=kwargs.pop("settings_dict", None), 23 | ) 24 | return crawler._create_spider(*args, **kwargs) 25 | 26 | 27 | class RedisTestMixin: 28 | 29 | @property 30 | def 
server(self): 31 | if not hasattr(self, "_redis"): 32 | self._redis = redis.Redis(REDIS_HOST, REDIS_PORT) 33 | return self._redis 34 | 35 | def clear_keys(self, prefix): 36 | keys = self.server.keys(prefix + "*") 37 | if keys: 38 | self.server.delete(*keys) 39 | 40 | 41 | class DupeFilterTest(RedisTestMixin, TestCase): 42 | 43 | def setUp(self): 44 | self.key = "scrapy_redis:tests:dupefilter:" 45 | self.df = RFPDupeFilter(self.server, self.key) 46 | 47 | def tearDown(self): 48 | self.clear_keys(self.key) 49 | 50 | def test_dupe_filter(self): 51 | req = Request("http://example.com") 52 | 53 | self.assertFalse(self.df.request_seen(req)) 54 | self.assertTrue(self.df.request_seen(req)) 55 | 56 | self.df.close("nothing") 57 | 58 | 59 | class QueueTestMixin(RedisTestMixin): 60 | 61 | queue_cls = None 62 | 63 | def setUp(self): 64 | self.spider = get_spider(name="myspider") 65 | self.key = f"scrapy_redis:tests:{self.spider.name}:queue" 66 | self.q = self.queue_cls(self.server, Spider("myspider"), self.key) 67 | 68 | def tearDown(self): 69 | self.clear_keys(self.key) 70 | 71 | def test_clear(self): 72 | self.assertEqual(len(self.q), 0) 73 | 74 | for i in range(10): 75 | # XXX: can't use same url for all requests as SpiderPriorityQueue 76 | # uses redis' set implemention and we will end with only one 77 | # request in the set and thus failing the test. It should be noted 78 | # that when using SpiderPriorityQueue it acts as a request 79 | # duplication filter whenever the serielized requests are the same. 80 | # This might be unwanted on repetitive requests to the same page 81 | # even with dont_filter=True flag. 82 | req = Request(f"http://example.com/?page={i}") 83 | self.q.push(req) 84 | self.assertEqual(len(self.q), 10) 85 | 86 | self.q.clear() 87 | self.assertEqual(len(self.q), 0) 88 | 89 | 90 | class FifoQueueTest(QueueTestMixin, TestCase): 91 | 92 | queue_cls = FifoQueue 93 | 94 | def test_queue(self): 95 | req1 = Request("http://example.com/page1") 96 | req2 = Request("http://example.com/page2") 97 | 98 | self.q.push(req1) 99 | self.q.push(req2) 100 | 101 | out1 = self.q.pop() 102 | out2 = self.q.pop(timeout=1) 103 | 104 | self.assertEqual(out1.url, req1.url) 105 | self.assertEqual(out2.url, req2.url) 106 | 107 | 108 | class PriorityQueueTest(QueueTestMixin, TestCase): 109 | 110 | queue_cls = PriorityQueue 111 | 112 | def test_queue(self): 113 | req1 = Request("http://example.com/page1", priority=100) 114 | req2 = Request("http://example.com/page2", priority=50) 115 | req3 = Request("http://example.com/page2", priority=200) 116 | 117 | self.q.push(req1) 118 | self.q.push(req2) 119 | self.q.push(req3) 120 | 121 | out1 = self.q.pop() 122 | out2 = self.q.pop(timeout=0) 123 | out3 = self.q.pop(timeout=1) 124 | 125 | self.assertEqual(out1.url, req3.url) 126 | self.assertEqual(out2.url, req1.url) 127 | self.assertEqual(out3.url, req2.url) 128 | 129 | 130 | class LifoQueueTest(QueueTestMixin, TestCase): 131 | 132 | queue_cls = LifoQueue 133 | 134 | def test_queue(self): 135 | req1 = Request("http://example.com/page1") 136 | req2 = Request("http://example.com/page2") 137 | 138 | self.q.push(req1) 139 | self.q.push(req2) 140 | 141 | out1 = self.q.pop() 142 | out2 = self.q.pop(timeout=1) 143 | 144 | self.assertEqual(out1.url, req2.url) 145 | self.assertEqual(out2.url, req1.url) 146 | 147 | 148 | class SchedulerTest(RedisTestMixin, TestCase): 149 | 150 | def setUp(self): 151 | self.key_prefix = "scrapy_redis:tests:" 152 | self.queue_key = self.key_prefix + "%(spider)s:requests" 153 | 
self.dupefilter_key = self.key_prefix + "%(spider)s:dupefilter" 154 | self.spider = get_spider( 155 | name="myspider", 156 | settings_dict={ 157 | "REDIS_HOST": REDIS_HOST, 158 | "REDIS_PORT": REDIS_PORT, 159 | "SCHEDULER_QUEUE_KEY": self.queue_key, 160 | "SCHEDULER_DUPEFILTER_KEY": self.dupefilter_key, 161 | "SCHEDULER_FLUSH_ON_START": False, 162 | "SCHEDULER_PERSIST": False, 163 | "SCHEDULER_SERIALIZER": "pickle", 164 | "DUPEFILTER_CLASS": "scrapy_redis.dupefilter.RFPDupeFilter", 165 | }, 166 | ) 167 | self.scheduler = Scheduler.from_crawler(self.spider.crawler) 168 | 169 | def tearDown(self): 170 | self.clear_keys(self.key_prefix) 171 | 172 | def test_scheduler(self): 173 | # default no persist 174 | self.assertFalse(self.scheduler.persist) 175 | 176 | self.scheduler.open(self.spider) 177 | self.assertEqual(len(self.scheduler), 0) 178 | 179 | req = Request("http://example.com") 180 | self.scheduler.enqueue_request(req) 181 | self.assertTrue(self.scheduler.has_pending_requests()) 182 | self.assertEqual(len(self.scheduler), 1) 183 | 184 | # dupefilter in action 185 | self.scheduler.enqueue_request(req) 186 | self.assertEqual(len(self.scheduler), 1) 187 | 188 | out = self.scheduler.next_request() 189 | self.assertEqual(out.url, req.url) 190 | 191 | self.assertFalse(self.scheduler.has_pending_requests()) 192 | self.assertEqual(len(self.scheduler), 0) 193 | 194 | self.scheduler.close("finish") 195 | 196 | def test_scheduler_persistent(self): 197 | # TODO: Improve this test to avoid the need to check for log messages. 198 | self.spider.log = mock.Mock(spec=self.spider.log) 199 | 200 | self.scheduler.persist = True 201 | self.scheduler.open(self.spider) 202 | 203 | self.assertEqual(self.spider.log.call_count, 0) 204 | 205 | self.scheduler.enqueue_request(Request("http://example.com/page1")) 206 | self.scheduler.enqueue_request(Request("http://example.com/page2")) 207 | 208 | self.assertTrue(self.scheduler.has_pending_requests()) 209 | self.scheduler.close("finish") 210 | 211 | self.scheduler.open(self.spider) 212 | self.spider.log.assert_has_calls( 213 | [ 214 | mock.call("Resuming crawl (2 requests scheduled)"), 215 | ] 216 | ) 217 | self.assertEqual(len(self.scheduler), 2) 218 | 219 | self.scheduler.persist = False 220 | self.scheduler.close("finish") 221 | 222 | self.assertEqual(len(self.scheduler), 0) 223 | 224 | 225 | class ConnectionTest(TestCase): 226 | 227 | # We can get a connection from just REDIS_URL. 228 | def test_redis_url(self): 229 | settings = Settings( 230 | { 231 | "REDIS_URL": "redis://foo:bar@localhost:9001/42", 232 | } 233 | ) 234 | 235 | server = connection.from_settings(settings) 236 | connect_args = server.connection_pool.connection_kwargs 237 | 238 | self.assertEqual(connect_args["host"], "localhost") 239 | self.assertEqual(connect_args["port"], 9001) 240 | self.assertEqual(connect_args["password"], "bar") 241 | self.assertEqual(connect_args["db"], 42) 242 | 243 | # We can get a connection from REDIS_HOST/REDIS_PORT. 244 | def test_redis_host_port(self): 245 | settings = Settings( 246 | { 247 | "REDIS_HOST": "localhost", 248 | "REDIS_PORT": 9001, 249 | } 250 | ) 251 | 252 | server = connection.from_settings(settings) 253 | connect_args = server.connection_pool.connection_kwargs 254 | 255 | self.assertEqual(connect_args["host"], "localhost") 256 | self.assertEqual(connect_args["port"], 9001) 257 | 258 | # REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT. 
259 | def test_redis_url_precedence(self): 260 | settings = Settings( 261 | { 262 | "REDIS_HOST": "baz", 263 | "REDIS_PORT": 1337, 264 | "REDIS_URL": "redis://foo:bar@localhost:9001/42", 265 | } 266 | ) 267 | 268 | server = connection.from_settings(settings) 269 | connect_args = server.connection_pool.connection_kwargs 270 | 271 | self.assertEqual(connect_args["host"], "localhost") 272 | self.assertEqual(connect_args["port"], 9001) 273 | self.assertEqual(connect_args["password"], "bar") 274 | self.assertEqual(connect_args["db"], 42) 275 | 276 | # We fallback to REDIS_HOST/REDIS_PORT if REDIS_URL is None. 277 | def test_redis_host_port_fallback(self): 278 | settings = Settings( 279 | {"REDIS_HOST": "baz", "REDIS_PORT": 1337, "REDIS_URL": None} 280 | ) 281 | 282 | server = connection.from_settings(settings) 283 | connect_args = server.connection_pool.connection_kwargs 284 | 285 | self.assertEqual(connect_args["host"], "baz") 286 | self.assertEqual(connect_args["port"], 1337) 287 | 288 | # We use default values for REDIS_HOST/REDIS_PORT. 289 | def test_redis_default(self): 290 | settings = Settings() 291 | 292 | server = connection.from_settings(settings) 293 | connect_args = server.connection_pool.connection_kwargs 294 | 295 | self.assertEqual(connect_args["host"], "localhost") 296 | self.assertEqual(connect_args["port"], 6379) 297 | -------------------------------------------------------------------------------- /tests/test_spiders.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | from unittest import mock 4 | 5 | import pytest 6 | from scrapy import signals 7 | from scrapy.exceptions import DontCloseSpider 8 | from scrapy.settings import Settings 9 | 10 | from scrapy_redis.spiders import RedisCrawlSpider, RedisSpider 11 | 12 | REDIS_HOST = os.environ.get("REDIS_HOST", "localhost") 13 | REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379)) 14 | 15 | 16 | @contextlib.contextmanager 17 | def flushall(server): 18 | try: 19 | yield 20 | finally: 21 | server.flushall() 22 | 23 | 24 | class MySpider(RedisSpider): 25 | name = "myspider" 26 | 27 | 28 | class MyCrawlSpider(RedisCrawlSpider): 29 | name = "myspider" 30 | 31 | 32 | def get_crawler(**kwargs): 33 | return mock.Mock( 34 | settings=Settings( 35 | { 36 | "REDIS_HOST": REDIS_HOST, 37 | "REDIS_PORT": REDIS_PORT, 38 | } 39 | ), 40 | **kwargs, 41 | ) 42 | 43 | 44 | class TestRedisMixin_setup_redis: 45 | 46 | def setup(self): 47 | self.myspider = MySpider() 48 | 49 | def test_crawler_required(self): 50 | with pytest.raises(ValueError) as excinfo: 51 | self.myspider.setup_redis() 52 | assert "crawler" in str(excinfo.value) 53 | 54 | def test_requires_redis_key(self): 55 | self.myspider.crawler = get_crawler() 56 | self.myspider.redis_key = "" 57 | with pytest.raises(ValueError) as excinfo: 58 | self.myspider.setup_redis() 59 | assert "redis_key" in str(excinfo.value) 60 | 61 | def test_invalid_batch_size(self): 62 | self.myspider.redis_batch_size = "x" 63 | self.myspider.crawler = get_crawler() 64 | with pytest.raises(ValueError) as excinfo: 65 | self.myspider.setup_redis() 66 | assert "redis_batch_size" in str(excinfo.value) 67 | 68 | def test_invalid_idle_time(self): 69 | self.myspider.max_idle_time = "x" 70 | self.myspider.crawler = get_crawler() 71 | with pytest.raises(ValueError) as excinfo: 72 | self.myspider.setup_redis() 73 | assert "max_idle_time" in str(excinfo.value) 74 | 75 | @mock.patch("scrapy_redis.spiders.connection") 76 | def test_via_from_crawler(self, 
connection): 77 | server = connection.from_settings.return_value = mock.Mock() 78 | crawler = get_crawler() 79 | myspider = MySpider.from_crawler(crawler) 80 | assert myspider.server is server 81 | connection.from_settings.assert_called_with(crawler.settings) 82 | crawler.signals.connect.assert_called_with( 83 | myspider.spider_idle, signal=signals.spider_idle 84 | ) 85 | # Second call does nothing. 86 | server = myspider.server 87 | crawler.signals.connect.reset_mock() 88 | myspider.setup_redis() 89 | assert myspider.server is server 90 | assert crawler.signals.connect.call_count == 0 91 | 92 | 93 | @pytest.mark.parametrize( 94 | "spider_cls", 95 | [ 96 | MySpider, 97 | MyCrawlSpider, 98 | ], 99 | ) 100 | def test_from_crawler_with_spider_arguments(spider_cls): 101 | crawler = get_crawler() 102 | spider = spider_cls.from_crawler( 103 | crawler, 104 | "foo", 105 | redis_key="key:%(name)s", 106 | redis_batch_size="2000", 107 | max_idle_time="100", 108 | ) 109 | assert spider.name == "foo" 110 | assert spider.redis_key == "key:foo" 111 | assert spider.redis_batch_size == 2000 112 | assert spider.max_idle_time == 100 113 | 114 | 115 | class MockRequest(mock.Mock): 116 | def __init__(self, url, **kwargs): 117 | super().__init__() 118 | self.url = url 119 | 120 | def __eq__(self, other): 121 | return self.url == other.url 122 | 123 | def __hash__(self): 124 | return hash(self.url) 125 | 126 | def __repr__(self): 127 | return f"<{self.__class__.__name__}({self.url})>" 128 | 129 | 130 | @pytest.mark.parametrize( 131 | "spider_cls", 132 | [ 133 | MySpider, 134 | MyCrawlSpider, 135 | ], 136 | ) 137 | @pytest.mark.parametrize("start_urls_as_zset", [False, True]) 138 | @pytest.mark.parametrize("start_urls_as_set", [False, True]) 139 | @mock.patch("scrapy.spiders.Request", MockRequest) 140 | def test_consume_urls_from_redis(start_urls_as_zset, start_urls_as_set, spider_cls): 141 | batch_size = 5 142 | redis_key = "start:urls" 143 | crawler = get_crawler() 144 | crawler.settings.setdict( 145 | { 146 | "REDIS_HOST": REDIS_HOST, 147 | "REDIS_PORT": REDIS_PORT, 148 | "REDIS_START_URLS_KEY": redis_key, 149 | "REDIS_START_URLS_AS_ZSET": start_urls_as_zset, 150 | "REDIS_START_URLS_AS_SET": start_urls_as_set, 151 | "CONCURRENT_REQUESTS": batch_size, 152 | } 153 | ) 154 | spider = spider_cls.from_crawler(crawler) 155 | with flushall(spider.server): 156 | urls = [f"http://example.com/{i}" for i in range(batch_size * 2)] 157 | reqs = [] 158 | if start_urls_as_set: 159 | server_put = spider.server.sadd 160 | elif start_urls_as_zset: 161 | 162 | def server_put(key, value): 163 | spider.server.zadd(key, {value: 0}) 164 | 165 | else: 166 | server_put = spider.server.rpush 167 | for url in urls: 168 | server_put(redis_key, url) 169 | reqs.append(MockRequest(url)) 170 | 171 | # First call is to start requests. 172 | start_requests = list(spider.start_requests()) 173 | if start_urls_as_zset or start_urls_as_set: 174 | assert len(start_requests) == batch_size 175 | assert {r.url for r in start_requests}.issubset(r.url for r in reqs) 176 | else: 177 | assert start_requests == reqs[:batch_size] 178 | 179 | # Second call is to spider idle method. 180 | with pytest.raises(DontCloseSpider): 181 | spider.spider_idle() 182 | # Process remaining requests in the queue. 183 | with pytest.raises(DontCloseSpider): 184 | spider.spider_idle() 185 | 186 | # Last batch was passed to crawl. 
187 | assert crawler.engine.crawl.call_count == batch_size 188 | 189 | if start_urls_as_zset or start_urls_as_set: 190 | crawler.engine.crawl.assert_has_calls( 191 | [mock.call(req) for req in reqs if req not in start_requests], 192 | any_order=True, 193 | ) 194 | else: 195 | crawler.engine.crawl.assert_has_calls( 196 | [mock.call(req) for req in reqs[batch_size:]] 197 | ) 198 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from scrapy_redis.utils import bytes_to_str 2 | 3 | 4 | def test_bytes_to_str(): 5 | assert bytes_to_str(b"foo") == "foo" 6 | # This char is the same in bytes or latin1. 7 | assert bytes_to_str(b"\xc1", "latin1") == "\xc1" 8 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | requires = 3 | tox>=4 4 | envlist = 5 | docs 6 | security 7 | flake8 8 | py{38,39,310,311,312}-scrapy{26,27,28,29,210,211}-redis{42,43,44,45,46,50} 9 | minversion = 3.0.0 10 | 11 | [base] 12 | deps = 13 | -r requirements-tests.txt 14 | -r requirements.txt 15 | setuptools 16 | 17 | [testenv] 18 | basepython = 19 | py38: python3.8 20 | py39: python3.9 21 | py310: python3.10 22 | py311: python3.11 23 | py312: python3.12 24 | deps = 25 | {[base]deps} 26 | scrapy26: scrapy~=2.6.0 27 | scrapy27: scrapy~=2.7.0 28 | scrapy28: scrapy~=2.8.0 29 | scrapy29: scrapy~=2.9.0 30 | scrapy210: scrapy~=2.10.0 31 | scrapy211: scrapy~=2.11.0 32 | redis42: redis~=4.2.0 33 | redis43: redis~=4.3.0 34 | redis44: redis~=4.4.0 35 | redis45: redis~=4.5.0 36 | redis46: redis~=4.6.0 37 | redis50: redis~=5.0.0 38 | passenv = 39 | REDIS_HOST 40 | REDIS_PORT 41 | commands = 42 | python -m pytest # --cov-report term --cov=scrapy_redis 43 | 44 | [testenv:flake8] 45 | basepython = 46 | python3.12 47 | deps = 48 | {[base]deps} 49 | commands = 50 | flake8 --ignore=W503,E265,E731 docs src tests 51 | 52 | [testenv:security] 53 | basepython = 54 | python3.12 55 | deps = 56 | bandit~=1.7.3 57 | commands = 58 | bandit -r -c .bandit.yml src/ tests/ 59 | 60 | [testenv:pytest] 61 | basepython = 62 | python3.12 63 | deps = 64 | {[testenv]deps} 65 | passenv = 66 | REDIS_HOST 67 | REDIS_PORT 68 | commands = 69 | python -m pytest --cov-report term --cov=scrapy_redis 70 | 71 | [testenv:build] 72 | basepython = 73 | python3.12 74 | deps = 75 | {[base]deps} 76 | build 77 | commands = 78 | python -m build 79 | 80 | [testenv:docs] 81 | basepython = 82 | python3.12 83 | deps = 84 | {[base]deps} 85 | -r docs/requirements.txt 86 | allowlist_externals = 87 | make 88 | commands = 89 | # Same command as readthedocs 90 | make -C docs html SPHINXOPTS="-T -W --keep-going -D language=en" 91 | --------------------------------------------------------------------------------
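As a closing illustration (not part of the repository), the sketch below shows a minimal RedisSpider together with a one-off producer that feeds it. It assumes the default start-urls key template resolves to "demo:start_urls" for a spider named "demo", a Redis server on localhost:6379, and list-backed start URLs (neither REDIS_START_URLS_AS_SET nor REDIS_START_URLS_AS_ZSET enabled)::

    import json

    import redis
    from scrapy_redis.spiders import RedisSpider


    class DemoSpider(RedisSpider):
        """Idle-time consumer: builds requests from the 'demo:start_urls' key."""

        name = "demo"

        def parse(self, response):
            # Minimal callback: record the page title for each crawled URL.
            yield {"url": response.url, "title": response.css("title::text").get()}


    if __name__ == "__main__":
        # Producer side: push a plain URL (deprecated) or a JSON payload that
        # RedisMixin.make_request_from_data() understands.
        r = redis.Redis(host="localhost", port=6379)
        r.lpush("demo:start_urls", "https://example.com/page1")
        r.lpush(
            "demo:start_urls",
            json.dumps(
                {"url": "https://example.com/search", "method": "POST", "meta": {"job-id": "123"}}
            ),
        )

Run the producer while the spider is idle (for example, `scrapy crawl demo` in another terminal with scrapy-redis enabled in the project settings) and the `spider_idle` handler above will pull the entries in batches of CONCURRENT_REQUESTS.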