├── .git-blame-ignore-revs ├── .github ├── dependabot.yml └── workflows │ ├── lint.yml │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── api.rst ├── cli.rst ├── conf.py ├── config.rst ├── contributing │ ├── api.rst │ └── index.rst ├── deploy.rst ├── index.rst ├── news.rst ├── overview.rst └── requirements.txt ├── integration_tests ├── __init__.py ├── test_webservice.py └── test_website.py ├── pyproject.toml ├── scrapyd ├── __init__.py ├── __main__.py ├── app.py ├── basicauth.py ├── config.py ├── default_scrapyd.conf ├── eggstorage.py ├── environ.py ├── exceptions.py ├── interfaces.py ├── jobstorage.py ├── launcher.py ├── poller.py ├── runner.py ├── scheduler.py ├── spiderqueue.py ├── sqlite.py ├── txapp.py ├── utils.py ├── webservice.py └── website.py └── tests ├── __init__.py ├── conftest.py ├── fixtures ├── entrypoint_missing.egg ├── filesystem │ ├── localproject │ │ ├── __init__.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── example.py │ └── scrapy.cfg ├── mybot.egg ├── mybot2.egg ├── quotesbot.egg ├── settings_asyncioreactor.egg ├── settings_log_stdout.egg ├── settings_raise.egg └── spiders_utf8.egg ├── mockapp.py ├── mockserver.py ├── test_config.py ├── test_eggstorage.py ├── test_environ.py ├── test_interfaces.py ├── test_jobstorage.py ├── test_launcher.py ├── test_main.py ├── test_poller.py ├── test_runner.py ├── test_scheduler.py ├── test_server.py ├── test_spiderqueue.py ├── test_sqlite.py ├── test_webservice.py └── test_website.py /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Example: git blame --ignore-revs-file .git-blame-ignore-revs file 2 | 3 | # Migrate code style to Black 4 | 51521eed7216eb7545028e2be0de5a2c3e5049f6 5 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v4 9 | - uses: actions/setup-python@v5 10 | with: 11 | python-version: 3.9 12 | cache: pip 13 | - run: pip install --upgrade pre-commit 14 | - run: pre-commit run --all-files 15 | - run: pip install --upgrade check-manifest setuptools 16 | - run: check-manifest 17 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | on: push 3 | jobs: 4 | publish: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - uses: actions/setup-python@v5 9 | with: 10 | python-version: 3.9 11 | - run: pip install --upgrade build 12 | - run: python -m build --sdist --wheel 13 | - name: Publish to TestPyPI 14 | uses: pypa/gh-action-pypi-publish@release/v1 15 | with: 16 | password: ${{ secrets.TEST_PYPI_TOKEN }} 17 | repository-url: 
https://test.pypi.org/legacy/ 18 | skip-existing: true 19 | - name: Publish to PyPI 20 | if: startsWith(github.ref, 'refs/tags') 21 | uses: pypa/gh-action-pypi-publish@release/v1 22 | with: 23 | password: ${{ secrets.PYPI_TOKEN }} 24 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: [push, pull_request] 3 | jobs: 4 | tests: 5 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 6 | runs-on: ${{ matrix.os }} 7 | strategy: 8 | matrix: 9 | os: [macos-latest, windows-latest, ubuntu-latest] 10 | python-version: [3.9, "3.10", "3.11", "3.12", "3.13"] 11 | steps: 12 | - uses: actions/checkout@v4 13 | - uses: actions/setup-python@v5 14 | with: 15 | python-version: ${{ matrix.python-version }} 16 | cache: pip 17 | - run: pip install -e .[test] 18 | # Python 3.12 deprecates pkg_resources (also used by py-html-checker). 19 | # https://github.com/pytest-dev/pytest-twisted/issues/183 20 | # https://github.com/sveetch/py-html-checker/issues/26 21 | - run: | 22 | pytest -W error -W ignore::ResourceWarning -W ignore::DeprecationWarning:scrapyd.runner -W ignore::DeprecationWarning:pytest_twisted -W ignore::DeprecationWarning:html_checker -W ignore::DeprecationWarning:pkg_resources tests --cov scrapyd 23 | # Occasional "ConnectionRefusedError: [Errno 111] Connection refused". 24 | - name: Run integration tests 25 | run: | 26 | printf "[scrapyd]\nusername = hello12345\npassword = 67890world\n" > scrapyd.conf 27 | mkdir logs 28 | scrapyd > scrapyd.log 2>&1 & 29 | sleep 1 30 | pytest -W error -W ignore::ResourceWarning -W ignore::DeprecationWarning:pytest_twisted integration_tests 31 | cat scrapyd.log 32 | - env: 33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | run: coveralls --service=github 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Build 2 | /*.egg-info 3 | /dist 4 | *.pyc 5 | 6 | # Development 7 | venv 8 | .vscode 9 | .idea 10 | /myproject 11 | 12 | # Docs 13 | /docs/_build 14 | 15 | # Tests 16 | /.coverage* 17 | /htmlcov 18 | /_trial_temp 19 | /tests.test_* 20 | 21 | # CLI 22 | /scrapyd.conf 23 | /twistd.pid 24 | /dbs 25 | /eggs 26 | /items 27 | /logs 28 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autoupdate_schedule: quarterly 3 | repos: 4 | - repo: https://github.com/astral-sh/ruff-pre-commit 5 | rev: v0.5.0 6 | hooks: 7 | - id: ruff 8 | - id: ruff-format 9 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | os: ubuntu-20.04 4 | tools: 5 | python: "3.9" 6 | python: 7 | install: 8 | - path: . 9 | - requirements: docs/requirements.txt 10 | sphinx: 11 | configuration: docs/conf.py 12 | fail_on_warning: true 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Scrapy developers. 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of Scrapy nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include scrapyd/default_scrapyd.conf 3 | recursive-include docs *.py 4 | recursive-include docs *.rst 5 | recursive-include docs *.txt 6 | recursive-include docs Makefile 7 | recursive-include scrapyd *.py 8 | recursive-include tests *.cfg 9 | recursive-include tests *.egg 10 | recursive-include tests *.py 11 | recursive-include integration_tests *.py 12 | exclude .git-blame-ignore-revs 13 | exclude .pre-commit-config.yaml 14 | exclude .readthedocs.yaml 15 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |PyPI Version| |Build Status| |Coverage Status| |Python Version| |Pypi Downloads| 2 | 3 | Scrapyd is a service for deploying and running `Scrapy `__ spiders. 4 | 5 | It allows you to upload Scrapy projects and control their spiders using a JSON API. 6 | 7 | (If you are viewing this on GitHub, open the `full documentation `__ for additional details.) 8 | 9 | .. |PyPI Version| image:: https://img.shields.io/pypi/v/scrapyd.svg 10 | :target: https://pypi.org/project/scrapyd/ 11 | .. |Build Status| image:: https://github.com/scrapy/scrapyd/workflows/Tests/badge.svg 12 | .. |Coverage Status| image:: https://coveralls.io/repos/github/scrapy/scrapyd/badge.svg?branch=master 13 | :target: https://coveralls.io/github/scrapy/scrapyd?branch=master 14 | .. |Python Version| image:: https://img.shields.io/pypi/pyversions/scrapyd.svg 15 | :target: https://pypi.org/project/scrapyd/ 16 | .. 
|Pypi Downloads| image:: https://img.shields.io/pypi/dm/scrapyd.svg 17 | :target: https://pypi.python.org/pypi/scrapyd/ 18 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | PYTHON = python 6 | SPHINXOPTS = 7 | SPHINXBUILD = sphinx-build 8 | PAPER = 9 | BUILDDIR = _build 10 | 11 | # Internal variables. 12 | PAPEROPT_a4 = -D latex_paper_size=a4 13 | PAPEROPT_letter = -D latex_paper_size=letter 14 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 15 | # the i18n builder cannot share the environment and doctrees with the others 16 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 17 | 18 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 19 | 20 | help: 21 | @echo "Please use \`make ' where is one of" 22 | @echo " html to make standalone HTML files" 23 | @echo " dirhtml to make HTML files named index.html in directories" 24 | @echo " singlehtml to make a single large HTML file" 25 | @echo " pickle to make pickle files" 26 | @echo " json to make JSON files" 27 | @echo " htmlhelp to make HTML files and a HTML help project" 28 | @echo " qthelp to make HTML files and a qthelp project" 29 | @echo " devhelp to make HTML files and a Devhelp project" 30 | @echo " epub to make an epub" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " text to make text files" 34 | @echo " man to make manual pages" 35 | @echo " texinfo to make Texinfo files" 36 | @echo " info to make Texinfo files and run them through makeinfo" 37 | @echo " gettext to make PO message catalogs" 38 | @echo " changes to make an overview of all changed/added/deprecated items" 39 | @echo " linkcheck to check all external links for integrity" 40 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 41 | 42 | clean: 43 | -rm -rf $(BUILDDIR)/* 44 | 45 | html: 46 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 47 | @echo 48 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 49 | 50 | dirhtml: 51 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 52 | @echo 53 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 54 | 55 | singlehtml: 56 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 57 | @echo 58 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 59 | 60 | pickle: 61 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 62 | @echo 63 | @echo "Build finished; now you can process the pickle files." 64 | 65 | json: 66 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 67 | @echo 68 | @echo "Build finished; now you can process the JSON files." 69 | 70 | htmlhelp: 71 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 72 | @echo 73 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 74 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
75 | 76 | qthelp: 77 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 78 | @echo 79 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 80 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 81 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Scrapyd.qhcp" 82 | @echo "To view the help file:" 83 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Scrapyd.qhc" 84 | 85 | devhelp: 86 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 87 | @echo 88 | @echo "Build finished." 89 | @echo "To view the help file:" 90 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Scrapyd" 91 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Scrapyd" 92 | @echo "# devhelp" 93 | 94 | epub: 95 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 96 | @echo 97 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 98 | 99 | latex: 100 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 101 | @echo 102 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 103 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 104 | "(use \`make latexpdf' here to do that automatically)." 105 | 106 | latexpdf: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo "Running LaTeX files through pdflatex..." 109 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 110 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 111 | 112 | text: 113 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 114 | @echo 115 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 116 | 117 | man: 118 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 119 | @echo 120 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 121 | 122 | texinfo: 123 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 124 | @echo 125 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 126 | @echo "Run \`make' in that directory to run these through makeinfo" \ 127 | "(use \`make info' here to do that automatically)." 128 | 129 | info: 130 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 131 | @echo "Running Texinfo files through makeinfo..." 132 | make -C $(BUILDDIR)/texinfo info 133 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 134 | 135 | gettext: 136 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 137 | @echo 138 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 139 | 140 | changes: 141 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 142 | @echo 143 | @echo "The overview file is in $(BUILDDIR)/changes." 144 | 145 | linkcheck: 146 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 147 | @echo 148 | @echo "Link check complete; look for any errors in the above output " \ 149 | "or in $(BUILDDIR)/linkcheck/output.txt." 150 | 151 | doctest: 152 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 153 | @echo "Testing of doctests in the sources finished, look at the " \ 154 | "results in $(BUILDDIR)/doctest/output.txt." 
155 | 156 | htmlview: html 157 | $(PYTHON) -c "import webbrowser; webbrowser.open('_build/html/index.html')" 158 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | If :ref:`basic authentication` is enabled, you can use ``curl``'s ``-u`` option in the examples below, for example: 5 | 6 | .. code-block:: shell 7 | 8 | curl -u yourusername:yourpassword http://localhost:6800/daemonstatus.json 9 | 10 | .. _daemonstatus.json: 11 | 12 | daemonstatus.json 13 | ----------------- 14 | 15 | .. versionadded:: 1.2.0 16 | 17 | To check the load status of a service. 18 | 19 | Supported request methods 20 | ``GET`` 21 | 22 | Example: 23 | 24 | .. code-block:: shell-session 25 | 26 | $ curl http://localhost:6800/daemonstatus.json 27 | {"node_name": "mynodename", "status": "ok", "pending": 0, "running": 0, "finished": 0} 28 | 29 | .. _addversion.json: 30 | 31 | addversion.json 32 | --------------- 33 | 34 | Add a version to a project in :ref:`eggstorage`, creating the project if needed. 35 | 36 | Supported request methods 37 | ``POST`` 38 | Parameters 39 | ``project`` (required) 40 | the project name 41 | ``version`` (required) 42 | the project version 43 | 44 | Scrapyd uses the packaging `Version `__ to interpret the version numbers you provide. 45 | ``egg`` (required) 46 | a Python egg containing the project's code 47 | 48 | The egg must set an entry point to its Scrapy settings. For example, with a ``setup.py`` file: 49 | 50 | .. code-block:: python 51 | :emphasize-lines: 5 52 | 53 | setup( 54 | name = 'project', 55 | version = '1.0', 56 | packages = find_packages(), 57 | entry_points = {'scrapy': ['settings = projectname.settings']}, 58 | ) 59 | 60 | Do this easily with the ``scrapyd-deploy`` command from the `scrapyd-client `__ package. 61 | 62 | Example: 63 | 64 | .. code-block:: shell-session 65 | 66 | $ curl http://localhost:6800/addversion.json -F project=myproject -F version=r23 -F egg=@myproject.egg 67 | {"node_name": "mynodename", "status": "ok", "spiders": 3} 68 | 69 | .. _schedule.json: 70 | 71 | schedule.json 72 | ------------- 73 | 74 | Schedule a job. (A job is a `Scrapy crawl `__.) 75 | 76 | If the :ref:`logs_dir` setting is set, log files are written to ``{logs_dir}/{project}/{spider}/{jobid}.log``. Set the ``jobid`` parameter to configure the basename of the log file. 77 | 78 | .. important:: Like Scrapy's ``scrapy.Spider`` class, spiders should allow an arbitrary number of keyword arguments in their ``__init__`` method, because Scrapyd sets internally-generated spider arguments when starting crawls. 79 | 80 | Supported request methods 81 | ``POST`` 82 | Parameters 83 | ``project`` (required) 84 | the project name 85 | ``spider`` (required) 86 | the spider name 87 | ``_version`` 88 | the project version (the latest project version by default) 89 | ``jobid`` 90 | the job's ID (a hexadecimal UUID v1 by default) 91 | ``priority`` 92 | the job's priority in the project's spider queue (0 by default, higher number, higher priority) 93 | ``setting`` 94 | a Scrapy setting 95 | 96 | For example, using `DOWNLOAD_DELAY `__: 97 | 98 | .. code-block:: shell 99 | 100 | curl http://localhost:6800/schedule.json -d setting=DOWNLOAD_DELAY=2 -d project=myproject -d spider=somespider 101 | Any other parameter 102 | a spider argument 103 | 104 | For example, using ``arg1``: 105 | 106 | .. 
code-block:: shell 107 | 108 | curl http://localhost:6800/schedule.json -d arg1=val1 -d project=myproject -d spider=somespider 109 | 110 | .. warning:: 111 | 112 | When such parameters are set multiple times, only the first value is sent to the spider. 113 | 114 | To change this behavior, please `open an issue `__. 115 | 116 | Example: 117 | 118 | .. code-block:: shell-session 119 | 120 | $ curl http://localhost:6800/schedule.json -d project=myproject -d spider=somespider 121 | {"node_name": "mynodename", "status": "ok", "jobid": "6487ec79947edab326d6db28a2d86511e8247444"} 122 | 123 | .. _status.json: 124 | 125 | status.json 126 | ----------- 127 | 128 | .. versionadded:: 1.5.0 129 | 130 | Get the status of a job. 131 | 132 | Supported request methods 133 | ``GET`` 134 | Parameters 135 | ``job`` (required) 136 | the job ID 137 | ``project`` 138 | the project name 139 | 140 | Example: 141 | 142 | .. code-block:: shell-session 143 | 144 | $ curl http://localhost:6800/status.json?job=6487ec79947edab326d6db28a2d86511e8247444 145 | {"node_name": "mynodename", "status": "ok", "currstate": "running"} 146 | 147 | .. _cancel.json: 148 | 149 | cancel.json 150 | ----------- 151 | 152 | Cancel a job. 153 | 154 | - If the job is pending, it is removed from the project's spider queue. 155 | - If the job is running, the process is sent a signal to terminate. 156 | 157 | Supported request methods 158 | ``POST`` 159 | Parameters 160 | ``project`` (required) 161 | the project name 162 | ``job`` (required) 163 | the job ID 164 | ``signal`` 165 | the `signal `__ to send to the Scrapy process (``BREAK`` by default on Windows and ``INT`` by default, otherwise) 166 | 167 | Example: 168 | 169 | .. code-block:: shell-session 170 | 171 | $ curl http://localhost:6800/cancel.json -d project=myproject -d job=6487ec79947edab326d6db28a2d86511e8247444 172 | {"node_name": "mynodename", "status": "ok", "prevstate": "running"} 173 | 174 | .. _listprojects.json: 175 | 176 | listprojects.json 177 | ----------------- 178 | 179 | Get the projects. 180 | 181 | Supported request methods 182 | ``GET`` 183 | 184 | Example: 185 | 186 | .. code-block:: shell-session 187 | 188 | $ curl http://localhost:6800/listprojects.json 189 | {"node_name": "mynodename", "status": "ok", "projects": ["myproject", "otherproject"]} 190 | 191 | .. _listversions.json: 192 | 193 | listversions.json 194 | ----------------- 195 | 196 | Get the versions of a project in :ref:`eggstorage`, in :ref:`order`, with the latest version last. 197 | 198 | Supported request methods 199 | ``GET`` 200 | Parameters 201 | ``project`` (required) 202 | the project name 203 | 204 | Example: 205 | 206 | .. code-block:: shell-session 207 | 208 | $ curl http://localhost:6800/listversions.json?project=myproject 209 | {"node_name": "mynodename", "status": "ok", "versions": ["r99", "r156"]} 210 | 211 | .. _listspiders.json: 212 | 213 | listspiders.json 214 | ---------------- 215 | 216 | Get the spiders in a version of a project. 217 | 218 | .. note:: If the project is configured via a :ref:`scrapy.cfg` file rather than uploaded via the :ref:`addversion.json` webservice, don't set the ``version`` parameter. 219 | 220 | Supported request methods 221 | ``GET`` 222 | Parameters 223 | ``project`` (required) 224 | the project name 225 | ``_version`` 226 | the project version (the latest project version by default) 227 | 228 | Example: 229 | 230 | .. 
code-block:: shell-session 231 | 232 | $ curl http://localhost:6800/listspiders.json?project=myproject 233 | {"node_name": "mynodename", "status": "ok", "spiders": ["spider1", "spider2", "spider3"]} 234 | 235 | .. _listjobs.json: 236 | 237 | listjobs.json 238 | ------------- 239 | 240 | Get the pending, running and finished jobs of a project. 241 | 242 | - Pending jobs are in :ref:`spider queues`. 243 | - Running jobs have Scrapy processes. 244 | - Finished jobs are in :ref:job storage`. 245 | 246 | .. note:: 247 | 248 | - The default :ref:`jobstorage` setting stores jobs in memory, such that jobs are lost when the Scrapyd process ends. 249 | - ``log_url`` is ``null`` in the response if :ref:`logs_dir` is disabled or the file doesn't exist. 250 | - ``items_url`` is ``null`` in the response if :ref:`items_dir` is disabled or the file doesn't exist. 251 | 252 | Supported request methods 253 | ``GET`` 254 | Parameters 255 | ``project`` 256 | filter results by project name 257 | 258 | Example: 259 | 260 | .. code-block:: shell-session 261 | 262 | $ curl http://localhost:6800/listjobs.json?project=myproject | python -m json.tool 263 | { 264 | "node_name": "mynodename", 265 | "status": "ok", 266 | "pending": [ 267 | { 268 | "id": "78391cc0fcaf11e1b0090800272a6d06", 269 | "project": "myproject", 270 | "spider": "spider1", 271 | "version": "0.1", 272 | "settings": {"DOWNLOAD_DELAY=2"}, 273 | "args": {"arg1": "val1"}, 274 | } 275 | ], 276 | "running": [ 277 | { 278 | "id": "422e608f9f28cef127b3d5ef93fe9399", 279 | "project": "myproject", 280 | "spider": "spider2", 281 | "pid": 93956, 282 | "start_time": "2012-09-12 10:14:03.594664", 283 | "log_url": "/logs/myproject/spider3/2f16646cfcaf11e1b0090800272a6d06.log", 284 | "items_url": "/items/myproject/spider3/2f16646cfcaf11e1b0090800272a6d06.jl" 285 | } 286 | ], 287 | "finished": [ 288 | { 289 | "id": "2f16646cfcaf11e1b0090800272a6d06", 290 | "project": "myproject", 291 | "spider": "spider3", 292 | "start_time": "2012-09-12 10:14:03.594664", 293 | "end_time": "2012-09-12 10:24:03.594664", 294 | "log_url": "/logs/myproject/spider3/2f16646cfcaf11e1b0090800272a6d06.log", 295 | "items_url": "/items/myproject/spider3/2f16646cfcaf11e1b0090800272a6d06.jl" 296 | } 297 | ] 298 | } 299 | 300 | .. _delversion.json: 301 | 302 | delversion.json 303 | --------------- 304 | 305 | Delete a version of a project from :ref:`eggstorage`. If no versions of the project remain, delete the project, too. 306 | 307 | Supported request methods 308 | ``POST`` 309 | Parameters 310 | ``project`` (required) 311 | the project name 312 | ``version`` (required) 313 | the project version 314 | 315 | Example: 316 | 317 | .. code-block:: shell-session 318 | 319 | $ curl http://localhost:6800/delversion.json -d project=myproject -d version=r99 320 | {"node_name": "mynodename", "status": "ok"} 321 | 322 | .. _delproject.json: 323 | 324 | delproject.json 325 | --------------- 326 | 327 | Delete a project and its versions from :ref:`eggstorage`. 328 | 329 | Supported request methods 330 | ``POST`` 331 | Parameters 332 | ``project`` (required) 333 | the project name 334 | 335 | Example: 336 | 337 | .. 
code-block:: shell-session 338 | 339 | $ curl http://localhost:6800/delproject.json -d project=myproject 340 | {"node_name": "mynodename", "status": "ok"} 341 | -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | Command-line interface 2 | ====================== 3 | 4 | The CLI is simply a wrapper around `twistd `__. 5 | 6 | The most relevant option is ``--logfile`` (``-l``). The ``--nodaemon`` option is always enabled by Scrapyd. 7 | 8 | .. code-block:: none 9 | 10 | Usage: scrapyd [options] 11 | Options: 12 | -b, --debug Run the application in the Python Debugger (implies 13 | nodaemon), sending SIGUSR2 will drop into 14 | debugger 15 | --chroot= Chroot to a supplied directory before running 16 | -e, --encrypted The specified tap/aos file is encrypted. 17 | --euid Set only effective user-id rather than real user-id. 18 | (This option has no effect unless the server is running 19 | as root, in which case it means not to shed all 20 | privileges after binding ports, retaining the option to 21 | regain privileges in cases such as spawning processes. 22 | Use with caution.) 23 | -f, --file= read the given .tap file [default: twistd.tap] 24 | -g, --gid= The gid to run as. If not specified, the default gid 25 | associated with the specified --uid is used. 26 | --help Display this help and exit. 27 | --help-reactors Display a list of possibly available reactor names. 28 | -l, --logfile= log to a specified file, - for stdout 29 | --logger= A fully-qualified name to a log observer factory to use 30 | for the initial log observer. Takes precedence over 31 | --logfile and --syslog (when available). 32 | -n, --nodaemon don't daemonize, don't use default umask of 0077 33 | -o, --no_save do not save state on shutdown 34 | --originalname Don't try to change the process name 35 | -p, --profile= Run in profile mode, dumping results to specified file. 36 | --pidfile= Name of the pidfile [default: twistd.pid] 37 | --prefix= use the given prefix when syslogging [default: twisted] 38 | --profiler= Name of the profiler to use (profile, cprofile). 39 | [default: cprofile] 40 | -r, --reactor= Which reactor to use (see --help-reactors for a list of 41 | possibilities) 42 | -s, --source= Read an application from a .tas file (AOT format). 43 | --savestats save the Stats object rather than the text output of the 44 | profiler. 45 | --spew Print an insanely verbose log of everything that happens. 46 | Useful when debugging freezes or locks in complex code. 47 | --syslog Log to syslog, not to file 48 | -u, --uid= The uid to run as. 49 | --umask= The (octal) file creation mask to apply. 50 | --version Print version information and exit. 51 | 52 | Scrapyd is an application for deploying and running Scrapy spiders. 53 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. 
If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | import os.path 13 | import sys 14 | 15 | sys.path.insert(0, os.path.abspath("..")) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = "Scrapyd" 21 | copyright = "2013-2023, Scrapy group" 22 | author = "Scrapy group" 23 | 24 | # The short X.Y version 25 | version = "1.5.0" 26 | # The full version, including alpha/beta/rc tags 27 | release = version 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | "sphinx.ext.autodoc", 37 | "sphinx.ext.extlinks", 38 | "sphinx.ext.viewcode", 39 | "sphinxcontrib.zopeext.autointerface", 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ["_templates"] 44 | 45 | # List of patterns, relative to source directory, that match files and 46 | # directories to ignore when looking for source files. 47 | # This pattern also affects html_static_path and html_extra_path. 48 | exclude_patterns = ["_build"] 49 | 50 | 51 | # -- Options for HTML output ------------------------------------------------- 52 | 53 | # The theme to use for HTML and HTML Help pages. See the documentation for 54 | # a list of builtin themes. 55 | # 56 | html_theme = "furo" 57 | 58 | # Add any paths that contain custom static files (such as style sheets) here, 59 | # relative to this directory. They are copied after the builtin static files, 60 | # so a file named "default.css" will overwrite the builtin "default.css". 61 | html_static_path = [] 62 | 63 | 64 | # -- Extension configuration ------------------------------------------------- 65 | 66 | autodoc_default_options = { 67 | "members": None, 68 | "member-order": "bysource", 69 | } 70 | autodoc_typehints = "description" 71 | autodoc_type_aliases = {} 72 | 73 | extlinks = { 74 | "issue": ("https://github.com/open-contracting/pelican-frontend/issues/%s", "#%s"), 75 | "commit": ("https://github.com/open-contracting/pelican-frontend/commit/%s", "%s"), 76 | } 77 | -------------------------------------------------------------------------------- /docs/config.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Configuration 3 | ============= 4 | 5 | .. _config-default: 6 | 7 | Default configuration 8 | ===================== 9 | 10 | Scrapyd always loads this configuration file, which can be overridden by :ref:`config-sources`: 11 | 12 | .. literalinclude:: ../scrapyd/default_scrapyd.conf 13 | 14 | .. _config-sources: 15 | 16 | Configuration sources 17 | ===================== 18 | 19 | Scrapyd reads these configuration files in this order. Values in later files take priority. 20 | 21 | #. ``c:\scrapyd\scrapyd.conf`` 22 | #. ``/etc/scrapyd/scrapyd.conf`` 23 | #. ``/etc/scrapyd/conf.d/*`` in alphabetical order 24 | #. ``scrapyd.conf`` in the current directory 25 | #. ``~/.scrapyd.conf`` in the home directory of the user that invoked the ``scrapyd`` command 26 | #. the closest ``scrapy.cfg`` file, starting in the current directory and traversing upward 27 | 28 | .. _config-envvars: 29 | 30 | Environment variables 31 | ===================== 32 | 33 | .. 
versionadded:: 1.5.0 34 | 35 | These environment variables override corresponding options: 36 | 37 | * ``SCRAPYD_BIND_ADDRESS`` (:ref:`bind_address`) 38 | * ``SCRAPYD_HTTP_PORT`` (:ref:`http_port`) 39 | * ``SCRAPYD_USERNAME`` (:ref:`username`) 40 | * ``SCRAPYD_PASSWORD`` (:ref:`password`) 41 | * ``SCRAPYD_UNIX_SOCKET_PATH`` (:ref:`unix_socket_path`) 42 | 43 | scrapyd section 44 | =============== 45 | 46 | Application options 47 | ------------------- 48 | 49 | .. _application: 50 | 51 | application 52 | ~~~~~~~~~~~ 53 | 54 | The function that returns the Twisted Application to use. 55 | 56 | If necessary, override this to fully control how Scrapyd works. 57 | 58 | Default 59 | ``scrapyd.app.application`` 60 | Options 61 | Any Twisted `Application `__ 62 | 63 | .. _bind_address: 64 | 65 | bind_address 66 | ~~~~~~~~~~~~ 67 | 68 | The IP address on which the :ref:`webui` and :doc:`api` listen for connections. 69 | 70 | Default 71 | ``127.0.0.1`` 72 | Options 73 | Any IP address, including: 74 | 75 | - ``127.0.0.1`` to listen for local IPv4 connections only 76 | - ``0.0.0.0`` to listen for all IPv4 connections 77 | - ``::0`` to listen for all IPv4 and IPv6 connections 78 | 79 | .. note:: If ``sysctl`` sets ``net.ipv6.bindv6only`` to true (default false), then ``::0`` listens for IPv6 connections only. 80 | 81 | .. _http_port: 82 | 83 | http_port 84 | ~~~~~~~~~ 85 | 86 | The TCP port on which the :ref:`webui` and :doc:`api` listen for connections. 87 | 88 | Default 89 | ``6800`` 90 | Options 91 | Any integer 92 | 93 | .. _unix_socket_path: 94 | 95 | unix_socket_path 96 | ---------------- 97 | 98 | .. versionadded:: 1.5.0 99 | 100 | The filesystem path of the Unix socket on which the :ref:`webui` and :doc:`api` listen for connections. 101 | 102 | For example: 103 | 104 | .. code-block:: ini 105 | 106 | unix_socket_path = /var/run/scrapyd/web.socket 107 | 108 | The file's mode is set to 660 (owner and group, read and write) to control access to Scrapyd. 109 | 110 | .. attention:: 111 | 112 | If :ref:`bind_address` and :ref:`http_port` are set, a TCP server will start, in addition to the Unix server. To disable the TCP server, set ``bind_address`` to empty: 113 | 114 | .. code-block:: ini 115 | 116 | bind_address = 117 | 118 | .. _username: 119 | 120 | username 121 | ~~~~~~~~ 122 | 123 | .. versionadded:: 1.3.0 124 | 125 | Enable basic authentication by setting this and :ref:`password` to non-empty values. 126 | 127 | Default 128 | ``""`` (empty) 129 | 130 | .. _password: 131 | 132 | password 133 | ~~~~~~~~ 134 | 135 | .. versionadded:: 1.3.0 136 | 137 | Enable basic authentication by setting this and :ref:`username` to non-empty values. 138 | 139 | Default 140 | ``""`` (empty) 141 | 142 | .. _spiderqueue: 143 | 144 | spiderqueue 145 | ~~~~~~~~~~~ 146 | 147 | .. versionadded:: 1.4.2 148 | 149 | The class that stores pending jobs. 
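If the built-in SQLite queue does not fit your deployment, this option can point to a queue class of your own. The following is a minimal, illustrative sketch of an in-memory queue; the class name is hypothetical, the constructor signature is an assumption (check ``scrapyd/spiderqueue.py`` for the arguments your Scrapyd version actually passes), and the methods are intended to match the :py:interface:`~scrapyd.interfaces.ISpiderQueue` interface listed under Options below:

.. code-block:: python

    from zope.interface import implementer

    from scrapyd.interfaces import ISpiderQueue


    @implementer(ISpiderQueue)
    class MemorySpiderQueue:
        """Hypothetical queue that keeps pending jobs in memory (lost on restart)."""

        def __init__(self, config, project):  # constructor arguments are an assumption
            self.project = project
            self.messages = []

        def add(self, name, priority=0.0, **spider_args):
            # Store the message and keep the highest-priority job first.
            self.messages.append({"name": name, "priority": priority, **spider_args})
            self.messages.sort(key=lambda message: message["priority"], reverse=True)

        def pop(self):
            return self.messages.pop(0) if self.messages else None

        def count(self):
            return len(self.messages)

        def list(self):
            return list(self.messages)

        def remove(self, func):
            # Remove messages for which func(message) is true; return how many were removed.
            before = len(self.messages)
            self.messages = [message for message in self.messages if not func(message)]
            return before - len(self.messages)

        def clear(self):
            self.messages.clear()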
150 | 151 | Default 152 | ``scrapyd.spiderqueue.SqliteSpiderQueue`` 153 | Options 154 | - ``scrapyd.spiderqueue.SqliteSpiderQueue`` stores spider queues in SQLite databases named after each project, in the :ref:`dbs_dir` directory 155 | - Implement your own, using the :py:interface:`~scrapyd.interfaces.ISpiderQueue` interface 156 | Also used by 157 | - :ref:`addversion.json` webservice, to create a queue if the project is new 158 | - :ref:`schedule.json` webservice, to add a pending job 159 | - :ref:`cancel.json` webservice, to remove a pending job 160 | - :ref:`listjobs.json` webservice, to list the pending jobs 161 | - :ref:`daemonstatus.json` webservice, to count the pending jobs 162 | - :ref:`webui`, to list the pending jobs and, if queues are transient, to create the queues per project at startup 163 | 164 | .. Community PostgreSQL and RabbitMQ queues: https://github.com/scrapy/scrapyd/pull/140/files#diff-c479470812a00776da54c3cefc15bb5bb244b4056996ae972f4daba7f6ec5bd5 165 | 166 | Poller options 167 | -------------- 168 | 169 | .. _poller: 170 | 171 | poller 172 | ~~~~~~ 173 | 174 | .. versionadded:: 1.5.0 175 | 176 | The class that tracks capacity for new jobs, and starts jobs when ready. 177 | 178 | Default 179 | ``scrapyd.poller.QueuePoller`` 180 | Options 181 | - ``scrapyd.poller.QueuePoller``. When using the default :ref:`application` and :ref:`launcher` values: 182 | 183 | - The launcher adds :ref:`max_proc` capacity at startup, and one capacity each time a Scrapy process ends. 184 | - The :ref:`application` starts a timer so that, every :ref:`poll_interval` seconds, jobs start if there's capacity: that is, if the number of Scrapy processes that are running is less than the :ref:`max_proc` value. 185 | 186 | - Implement your own, using the :py:interface:`~scrapyd.interfaces.IPoller` interface 187 | 188 | .. _poll_interval: 189 | 190 | poll_interval 191 | ~~~~~~~~~~~~~ 192 | 193 | The number of seconds between capacity checks. 194 | 195 | Default 196 | ``5.0`` 197 | Options 198 | Any floating-point number 199 | 200 | .. _config-launcher: 201 | 202 | Launcher options 203 | ---------------- 204 | 205 | .. _launcher: 206 | 207 | launcher 208 | ~~~~~~~~ 209 | 210 | The class that starts Scrapy processes. 211 | 212 | Default 213 | ``scrapyd.launcher.Launcher`` 214 | Options 215 | Any Twisted `Service `__ 216 | 217 | .. _max_proc: 218 | 219 | max_proc 220 | ~~~~~~~~ 221 | 222 | The maximum number of Scrapy processes to run concurrently. 223 | 224 | Default 225 | ``0`` 226 | Options 227 | Any non-negative integer, including: 228 | 229 | - ``0`` to use :ref:`max_proc_per_cpu` multiplied by the number of CPUs 230 | 231 | .. _max_proc_per_cpu: 232 | 233 | max_proc_per_cpu 234 | ~~~~~~~~~~~~~~~~ 235 | 236 | See :ref:`max_proc`. 237 | 238 | Default 239 | ``4`` 240 | 241 | .. _logs_dir: 242 | 243 | logs_dir 244 | ~~~~~~~~ 245 | 246 | The directory in which to write Scrapy logs. 247 | 248 | A log file is written to ``{logs_dir}/{project}/{spider}/{job}.log``. 249 | 250 | To disable log storage, set this option to empty: 251 | 252 | .. code-block:: ini 253 | 254 | logs_dir = 255 | 256 | To log messages to a remote service, you can, for example, reconfigure Scrapy's logger from your Scrapy project: 257 | 258 | .. 
code-block:: python 259 | 260 | import logging 261 | import logstash 262 | 263 | logger = logging.getLogger("scrapy") 264 | logger.handlers.clear() 265 | logger.addHandler(logstash.LogstashHandler("https://user:pass@id.us-east-1.aws.found.io", 5959, version=1)) 266 | 267 | Default 268 | ``logs`` 269 | Also used by 270 | :ref:`webui`, to link to log files 271 | 272 | .. attention:: Each ``*_dir`` setting must point to a different directory. 273 | 274 | .. _items_dir: 275 | 276 | items_dir 277 | ~~~~~~~~~ 278 | 279 | The directory in which to write Scrapy items. 280 | 281 | An item feed is written to ``{items_dir}/{project}/{spider}/{job}.jl``. 282 | 283 | If this option is non-empty, the `FEEDS `__ Scrapy setting is set as follows, resulting in items being written to the above path as JSON lines: 284 | 285 | .. code-block:: json 286 | 287 | {"file:///path/to/items_dir/project/spider/job.jl": {"format": "jsonlines"}} 288 | 289 | Default 290 | ``""`` (empty), because it is recommended to instead use either: 291 | 292 | - `Feed exports `__, by setting the ``FEEDS`` Scrapy setting in your Scrapy project. See the full list of `storage backends `__. 293 | - `Item pipeline `__, to store the scraped items in a database. See the `MongoDB example `__, which can be adapted to another database. 294 | Also used by 295 | :ref:`webui`, to link to item feeds 296 | 297 | .. attention:: Each ``*_dir`` setting must point to a different directory. 298 | 299 | .. _jobs_to_keep: 300 | 301 | jobs_to_keep 302 | ~~~~~~~~~~~~ 303 | 304 | The number of finished jobs per spider, for which to keep the most recent log files in the :ref:`logs_dir` directory and item feeds in the :ref:`items_dir` directory. 305 | 306 | To "disable" this feature, set this to an arbitrarily large value. For example, on a 64-bit system: 307 | 308 | .. code-block:: ini 309 | 310 | jobs_to_keep = 9223372036854775807 311 | 312 | .. warning:: 313 | 314 | Scrapyd deletes old files in these directories, regardless of origin. 315 | 316 | Default 317 | ``5`` 318 | 319 | .. _runner: 320 | 321 | runner 322 | ~~~~~~ 323 | 324 | The Python script to run Scrapy's `CLI `__. 325 | 326 | If necessary, override this to fully control how the Scrapy CLI is called. 327 | 328 | Default 329 | ``scrapyd.runner`` 330 | Options 331 | Any Python `script `__ 332 | Also used by 333 | :ref:`listspiders.json` webservice, to run Scrapy's `list `__ command 334 | 335 | Web UI and API options 336 | ---------------------- 337 | 338 | .. _webroot: 339 | 340 | webroot 341 | ~~~~~~~ 342 | 343 | .. versionadded:: 1.2.0 344 | 345 | The class that defines the :ref:`webui` and :doc:`api`, as a Twisted Resource. 346 | 347 | If necessary, override this to fully control how the web UI and API work. 348 | 349 | Default 350 | ``scrapyd.website.Root`` 351 | Options 352 | Any Twisted `Resource `__ 353 | 354 | .. _prefix_header: 355 | 356 | prefix_header 357 | ~~~~~~~~~~~~~ 358 | 359 | .. versionadded:: 1.4.2 360 | 361 | The header for the base path of the original request. 362 | 363 | The header is relevant only if Scrapyd is running behind a reverse proxy, and if the public URL contains a base path, before the Scrapyd API path components. 364 | A base path must have a leading slash and no trailing slash, e.g. ``/base/path``. 365 | 366 | Default 367 | ``x-forwarded-prefix`` 368 | 369 | .. _node_name: 370 | 371 | node_name 372 | ~~~~~~~~~ 373 | 374 | .. versionadded:: 1.1.0 375 | 376 | The node name, which appears in :doc:`api` responses. 
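For example, to give each node a stable, human-readable name instead of its hostname (the value shown is only an example):

.. code-block:: ini

    node_name = scrapyd-node-1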
377 | 378 | Default 379 | ``socket.gethostname()`` 380 | 381 | .. _debug: 382 | 383 | debug 384 | ~~~~~ 385 | 386 | Whether debug mode is enabled. 387 | 388 | If enabled, a Python traceback is returned (as a plain-text response) when the :doc:`api` errors. 389 | 390 | Default 391 | ``off`` 392 | 393 | Egg storage options 394 | ------------------- 395 | 396 | .. _eggstorage: 397 | 398 | eggstorage 399 | ~~~~~~~~~~ 400 | 401 | .. versionadded:: 1.3.0 402 | 403 | The class that stores project eggs. 404 | 405 | Default 406 | ``scrapyd.eggstorage.FilesystemEggStorage`` 407 | Options 408 | - ``scrapyd.eggstorage.FilesystemEggStorage`` writes eggs in the :ref:`eggs_dir` directory 409 | 410 | .. note:: Eggs are named after the ``version``, replacing characters other than ``A-Za-z0-9_-`` with underscores. Therefore, if you frequently use non-word, non-hyphen characters, the eggs for different versions can collide. 411 | - Implement your own, using the :py:interface:`~scrapyd.interfaces.IEggStorage` interface: for example, to store eggs remotely 412 | 413 | .. _eggs_dir: 414 | 415 | eggs_dir 416 | ~~~~~~~~ 417 | 418 | The directory in which to write project eggs. 419 | 420 | Default 421 | ``eggs`` 422 | 423 | .. attention:: Each ``*_dir`` setting must point to a different directory. 424 | 425 | Job storage options 426 | ------------------- 427 | 428 | .. _jobstorage: 429 | 430 | jobstorage 431 | ~~~~~~~~~~ 432 | 433 | .. versionadded:: 1.3.0 434 | 435 | The class that stores finished jobs. 436 | 437 | Default 438 | ``scrapyd.jobstorage.MemoryJobStorage`` 439 | Options 440 | - ``scrapyd.jobstorage.MemoryJobStorage`` stores jobs in memory, such that jobs are lost when the Scrapyd process ends 441 | - ``scrapyd.jobstorage.SqliteJobStorage`` stores jobs in a SQLite database named ``jobs.db``, in the :ref:`dbs_dir` directory 442 | - Implement your own, using the :py:interface:`~scrapyd.interfaces.IJobStorage` interface 443 | 444 | .. _finished_to_keep: 445 | 446 | finished_to_keep 447 | ~~~~~~~~~~~~~~~~ 448 | 449 | The number of finished jobs, for which to keep metadata in the :ref:`jobstorage` backend. 450 | 451 | Finished jobs are accessed via the :ref:`webui` and :ref:`listjobs.json` webservice. 452 | 453 | Default 454 | ``100`` 455 | Options 456 | Any non-negative integer 457 | 458 | Directory options 459 | ----------------- 460 | 461 | .. _dbs_dir: 462 | 463 | dbs_dir 464 | ~~~~~~~ 465 | 466 | The directory in which to write SQLite databases. 467 | 468 | Default 469 | ``dbs`` 470 | Options 471 | Any relative or absolute path, or `:memory: `__ 472 | Used by 473 | - :ref:`spiderqueue` (``scrapyd.spiderqueue.SqliteSpiderQueue``) 474 | - :ref:`jobstorage` (``scrapyd.jobstorage.SqliteJobStorage``) 475 | 476 | .. attention:: Each ``*_dir`` setting must point to a different directory. 477 | 478 | .. _config-services: 479 | 480 | services section 481 | ================ 482 | 483 | If you want to add a webservice (endpoint), add, for example: 484 | 485 | .. code-block:: ini 486 | 487 | [services] 488 | mywebservice.json = amodule.anothermodule.MyWebService 489 | 490 | You can use code for webservices in `webservice.py `__ as inspiration. 491 | 492 | To remove a :ref:`default webservice`, set it to empty: 493 | 494 | .. code-block:: ini 495 | 496 | [services] 497 | daemonstatus.json = 498 | 499 | .. 
_config-settings: 500 | 501 | settings section (scrapy.cfg) 502 | ============================= 503 | 504 | Project code is usually stored in a `Python egg `__ and uploaded to Scrapyd via the :ref:`addversion.json` webservice. 505 | 506 | Alternatively, you can invoke Scrapyd within a Scrapy project: that is, you can run the ``scrapyd`` command from a directory containing a ``scrapy.cfg`` file (or from a directory with any parent directory containing a ``scrapy.cfg`` file). 507 | 508 | As described in `Scrapy's documentation `__, the ``scrapy.cfg`` file contains a ``[settings]`` section, which can describe many Scrapy projects. By default, it is: 509 | 510 | .. code-block:: ini 511 | 512 | [settings] 513 | default = projectname.settings 514 | -------------------------------------------------------------------------------- /docs/contributing/api.rst: -------------------------------------------------------------------------------- 1 | Developer API reference 2 | ======================= 3 | 4 | Interfaces 5 | ---------- 6 | 7 | .. automodule:: scrapyd.interfaces 8 | :members: 9 | :undoc-members: 10 | :special-members: 11 | 12 | Config 13 | ------ 14 | 15 | .. automodule:: scrapyd.config 16 | :members: 17 | :undoc-members: 18 | 19 | Exceptions 20 | ---------- 21 | 22 | .. automodule:: scrapyd.exceptions 23 | :members: 24 | :undoc-members: 25 | -------------------------------------------------------------------------------- /docs/contributing/index.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | .. important:: Read through the `Scrapy Contribution Docs `__ for tips relating to writing patches, reporting bugs, and coding style. 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | :caption: Contents 9 | 10 | api 11 | 12 | Issues and bugs 13 | --------------- 14 | 15 | Report on `GitHub `__. 16 | 17 | Tests 18 | ----- 19 | 20 | Include tests in your pull requests. 21 | 22 | To run unit tests: 23 | 24 | .. code-block:: shell 25 | 26 | pytest tests 27 | 28 | To run integration tests: 29 | 30 | .. code-block:: shell 31 | 32 | printf "[scrapyd]\nusername = hello12345\npassword = 67890world\n" > scrapyd.conf 33 | mkdir logs 34 | scrapyd & 35 | pytest integration_tests 36 | 37 | Installation 38 | ------------ 39 | 40 | To install an editable version for development, clone the repository, change to its directory, and run: 41 | 42 | .. code-block:: shell 43 | 44 | pip install -e .[test,docs] 45 | 46 | Developer documentation 47 | ----------------------- 48 | 49 | Configuration 50 | ~~~~~~~~~~~~~ 51 | 52 | Pass the ``config`` object to a class' ``__init__`` method, but don't store it on the instance (:issue:`526`). 53 | 54 | Processes 55 | ~~~~~~~~~ 56 | 57 | Scrapyd starts Scrapy processes. It runs ``scrapy crawl`` in the :ref:`launcher`, and ``scrapy list`` in the :ref:`schedule.json` (to check the spider exists), :ref:`addversion.json` (to return the number of spiders) and :ref:`listspiders.json` (to return the names of spiders) webservices. 58 | 59 | Environment variables 60 | ~~~~~~~~~~~~~~~~~~~~~ 61 | 62 | Scrapyd uses environment variables to communicate between the Scrapyd process and the Scrapy processes that it starts. 63 | 64 | SCRAPY_PROJECT 65 | The project to use. See ``scrapyd/runner.py``. 66 | SCRAPYD_EGG_VERSION 67 | The version of the project, to be retrieved as an egg from :ref:`eggstorage` and activated. 68 | SCRAPY_SETTINGS_MODULE 69 | The Python path to the `settings `__ module of the project. 
70 | 71 | This is usually the module from the `entry points `__ of the egg, but can be the module from the ``[settings]`` section of a :ref:`scrapy.cfg` file. See ``scrapyd/environ.py``. 72 | 73 | Jobs 74 | ~~~~ 75 | 76 | A **pending job** is a ``dict`` object (referred to as a "message"), accessible via an :py:interface:`~scrapyd.interfaces.ISpiderQueue`'s :meth:`~scrapyd.interfaces.ISpiderQueue.pop` or :meth:`~scrapyd.interfaces.ISpiderQueue.list` methods. 77 | 78 | .. note:: The short-lived message returned by :py:interface:`~scrapyd.interfaces.IPoller`'s :meth:`~scrapyd.interfaces.IPoller.poll` method is also referred to as a "message". 79 | 80 | - The :ref:`schedule.json` webservice calls :py:interface:`~scrapyd.interfaces.ISpiderScheduler`'s :meth:`~scrapyd.interfaces.ISpiderScheduler.schedule` method. The ``SpiderScheduler`` implementation of :meth:`~scrapyd.interfaces.ISpiderScheduler.schedule` adds the message to the project's :py:interface:`~scrapyd.interfaces.ISpiderQueue`. 81 | - The default :ref:`application` sets a `TimerService `__ to call :py:interface:`~scrapyd.interfaces.IPoller`'s :meth:`~scrapyd.interfaces.IPoller.poll` method, at :ref:`poll_interval`. 82 | - :py:interface:`~scrapyd.interfaces.IPoller` has a :attr:`~scrapyd.interfaces.IPoller.queues` attribute, that implements a ``__getitem__`` method to get a project's :py:interface:`~scrapyd.interfaces.ISpiderQueue` by project name. 83 | - The ``QueuePoller`` implementation of :meth:`~scrapyd.interfaces.IPoller.poll` calls a project's :py:interface:`~scrapyd.interfaces.ISpiderQueue`'s :meth:`~scrapyd.interfaces.ISpiderQueue.pop` method, adds a ``_project`` key to the message and renames the ``name`` key to ``_spider``, and fires a callback. 84 | - The ``Launcher`` service had added the callback to the `Deferred `__, which had been returned by :py:interface:`~scrapyd.interfaces.IPoller`'s :meth:`~scrapyd.interfaces.IPoller.next` method. 85 | - The ``Launcher`` service adapts the message to instantiate a ``ScrapyProcessProtocol`` (`ProcessProtocol `__) object, adds a callback, and `spawns a process `__. 86 | 87 | A **running job** is a ``ScrapyProcessProtocol`` object, accessible via ``Launcher.processes`` (a ``dict``), in which each key is a slot's number (an ``int``). 88 | 89 | - ``Launcher`` has a ``finished`` attribute, which is an :py:interface:`~scrapyd.interfaces.IJobStorage`. 90 | - When the process ends, the callback fires. The ``Launcher`` service calls :py:interface:`~scrapyd.interfaces.IJobStorage`'s :meth:`~scrapyd.interfaces.IJobStorage.add` method, passing the ``ScrapyProcessProtocol`` as input. 91 | 92 | A **finished job** is an object with the attributes ``project``, ``spider``, ``job``, ``start_time`` and ``end_time``, accessible via an :py:interface:`~scrapyd.interfaces.IJobStorage`'s :meth:`~scrapyd.interfaces.IJobStorage.list` or :meth:`~scrapyd.interfaces.IJobStorage.__iter__` methods. 93 | 94 | .. 
list-table:: 95 | :header-rows: 1 96 | :stub-columns: 1 97 | 98 | * - Concept 99 | - ISpiderQueue 100 | - IPoller 101 | - ScrapyProcessProtocol 102 | - IJobStorage 103 | * - Project 104 | - *not specified* 105 | - _project 106 | - project 107 | - project 108 | * - Spider 109 | - name 110 | - _spider 111 | - spider 112 | - spider 113 | * - Job ID 114 | - _job 115 | - _job 116 | - job 117 | - job 118 | * - Egg version 119 | - _version 120 | - _version 121 | - ✗ 122 | - ✗ 123 | * - Scrapy settings 124 | - settings 125 | - settings 126 | - args (``-s k=v``) 127 | - ✗ 128 | * - Spider arguments 129 | - *remaining keys* 130 | - *remaining keys* 131 | - args (``-a k=v``) 132 | - ✗ 133 | * - Environment variables 134 | - ✗ 135 | - ✗ 136 | - env 137 | - ✗ 138 | * - Process ID 139 | - ✗ 140 | - ✗ 141 | - pid 142 | - ✗ 143 | * - Start time 144 | - ✗ 145 | - ✗ 146 | - start_time 147 | - start_time 148 | * - End time 149 | - ✗ 150 | - ✗ 151 | - end_time 152 | - end_time 153 | -------------------------------------------------------------------------------- /docs/deploy.rst: -------------------------------------------------------------------------------- 1 | Deployment 2 | ========== 3 | 4 | .. _docker: 5 | 6 | Creating a Docker image 7 | ----------------------- 8 | 9 | If you prefer to create a Docker image for the Scrapyd service and your Scrapy projects, you can copy this ``Dockerfile`` template into your Scrapy project, and adapt it. 10 | 11 | .. code-block:: dockerfile 12 | 13 | # Build an egg of your project. 14 | 15 | FROM python as build-stage 16 | 17 | RUN pip install --no-cache-dir scrapyd-client 18 | 19 | WORKDIR /workdir 20 | 21 | COPY . . 22 | 23 | RUN scrapyd-deploy --build-egg=myproject.egg 24 | 25 | # Build the image. 26 | 27 | FROM python:alpine 28 | 29 | # Install Scrapy dependencies - and any others for your project. 30 | 31 | RUN apk --no-cache add --virtual build-dependencies \ 32 | gcc \ 33 | musl-dev \ 34 | libffi-dev \ 35 | libressl-dev \ 36 | libxml2-dev \ 37 | libxslt-dev \ 38 | && pip install --no-cache-dir \ 39 | scrapyd \ 40 | && apk del build-dependencies \ 41 | && apk add \ 42 | libressl \ 43 | libxml2 \ 44 | libxslt 45 | 46 | # Mount two volumes for configuration and runtime. 47 | 48 | VOLUME /etc/scrapyd/ /var/lib/scrapyd/ 49 | 50 | COPY ./scrapyd.conf /etc/scrapyd/ 51 | 52 | RUN mkdir -p /src/eggs/myproject 53 | 54 | COPY --from=build-stage /workdir/myproject.egg /src/eggs/myproject/1.egg 55 | 56 | EXPOSE 6800 57 | 58 | ENTRYPOINT ["scrapyd", "--pidfile="] 59 | 60 | Where your ``scrapy.cfg`` file, used by ``scrapyd-deploy``, might be: 61 | 62 | .. code-block:: ini 63 | 64 | [settings] 65 | default = myproject.settings 66 | 67 | [deploy] 68 | url = http://localhost:6800 69 | project = myproject 70 | 71 | And your ``scrapyd.conf`` file might be: 72 | 73 | .. code-block:: ini 74 | 75 | [scrapyd] 76 | bind_address = 0.0.0.0 77 | logs_dir = /var/lib/scrapyd/logs 78 | items_dir = /var/lib/scrapyd/items 79 | dbs_dir = /var/lib/scrapyd/dbs 80 | eggs_dir = /src/eggs 81 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Scrapyd |release| 3 | ================= 4 | 5 | .. include:: ../README.rst 6 | 7 | Quickstart 8 | ========== 9 | 10 | Install Scrapyd 11 | --------------- 12 | 13 | .. code-block:: shell 14 | 15 | pip install scrapyd 16 | 17 | Start Scrapyd 18 | ------------- 19 | 20 | .. 
code-block:: shell 21 | 22 | scrapyd 23 | 24 | See :doc:`overview` and :doc:`config` for more details. 25 | 26 | Upload a project 27 | ---------------- 28 | 29 | This involves building a `Python egg `__ and uploading it to Scrapyd via the `addversion.json `_ webservice. 30 | 31 | Do this easily with the ``scrapyd-deploy`` command from the `scrapyd-client `__ package. Once configured: 32 | 33 | .. code-block:: shell 34 | 35 | scrapyd-deploy 36 | 37 | Schedule a crawl 38 | ---------------- 39 | 40 | .. code-block:: shell-session 41 | 42 | $ curl http://localhost:6800/schedule.json -d project=myproject -d spider=spider2 43 | {"status": "ok", "jobid": "26d1b1a6d6f111e0be5c001e648c57f8"} 44 | 45 | See :doc:`api` for more details. 46 | 47 | .. toctree:: 48 | :maxdepth: 2 49 | :caption: Contents 50 | 51 | overview 52 | config 53 | api 54 | cli 55 | deploy 56 | contributing/index 57 | news 58 | -------------------------------------------------------------------------------- /docs/overview.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Overview 3 | ======== 4 | 5 | Projects and versions 6 | ===================== 7 | 8 | Scrapyd can manage multiple Scrapy projects. Each project can have multiple versions. The latest version is used by default for starting spiders. 9 | 10 | .. _overview-order: 11 | 12 | Version order 13 | ------------- 14 | 15 | The latest version is the alphabetically greatest, unless all version names are `version specifiers `__ like ``1.0`` or ``1.0rc1``, in which case they are sorted as such. 16 | 17 | How Scrapyd works 18 | ================= 19 | 20 | Scrapyd is a server (typically run as a daemon) that listens for :doc:`api` and :ref:`webui` requests. 21 | 22 | The API is especially used to upload projects and schedule crawls. To start a crawl, Scrapyd spawns a process that essentially runs: 23 | 24 | .. code-block:: shell 25 | 26 | scrapy crawl myspider 27 | 28 | Scrapyd runs multiple processes in parallel, and manages the number of concurrent processes. See :ref:`config-launcher` for details. 29 | 30 | If you are familiar with the `Twisted Application Framework `__, you can essentially reconfigure every part of Scrapyd. See :doc:`config` for details. 31 | 32 | .. _webui: 33 | 34 | Web interface 35 | ============= 36 | 37 | Scrapyd has a minimal web interface for monitoring running processes and accessing log files and item fees. 
By default, it is available at http://localhost:6800/. Other options to manage Scrapyd include: 38 | 39 | - `ScrapydWeb `__ 40 | - `spider-admin-pro `__ 41 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | furo 2 | sphinxcontrib-zopeext 3 | -------------------------------------------------------------------------------- /integration_tests/__init__.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urljoin 2 | 3 | import requests 4 | 5 | 6 | def req(method, path, auth=None, status=200, **kwargs): 7 | url = urljoin("http://127.0.0.1:6800", path) 8 | 9 | for badauth in (None, ("baduser", "badpass")): 10 | response = getattr(requests, method)(url, auth=badauth, **kwargs) 11 | 12 | assert response.status_code == 401, f"401 != {response.status_code}" 13 | assert response.text == "Unauthorized" 14 | 15 | response = getattr(requests, method)(url, auth=("hello12345", "67890world"), **kwargs) 16 | 17 | assert response.status_code == status, f"{status} != {response.status_code}" 18 | 19 | return response 20 | -------------------------------------------------------------------------------- /integration_tests/test_webservice.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | from pathlib import Path 3 | 4 | import pytest 5 | import requests 6 | 7 | from integration_tests import req 8 | 9 | BASEDIR = os.path.realpath(".").replace("\\", "\\\\") 10 | with (Path(__file__).absolute().parent.parent / "tests" / "fixtures" / "quotesbot.egg").open("rb") as f: 11 | EGG = f.read() 12 | 13 | 14 | def assert_response(method, path, expected, **kwargs): 15 | response = req(method, path, **kwargs) 16 | data = response.json() 17 | data.pop("node_name") 18 | 19 | assert data == expected 20 | assert response.content.endswith(b"\n") 21 | 22 | 23 | @pytest.mark.parametrize( 24 | ("method", "basename"), 25 | [ 26 | ("GET", "daemonstatus"), 27 | ("POST", "addversion"), 28 | ("POST", "schedule"), 29 | ("POST", "cancel"), 30 | ("GET", "status"), 31 | ("GET", "listprojects"), 32 | ("GET", "listversions"), 33 | ("GET", "listspiders"), 34 | ("GET", "listjobs"), 35 | ("POST", "delversion"), 36 | ("POST", "delproject"), 37 | ], 38 | ) 39 | def test_options(method, basename): 40 | response = requests.options( 41 | f"http://127.0.0.1:6800/{basename}.json", 42 | auth=("hello12345", "67890world"), 43 | ) 44 | 45 | assert response.status_code == 204, f"204 != {response.status_code}" 46 | assert response.headers["Allow"] == f"OPTIONS, HEAD, {method}" 47 | assert response.content == b"" 48 | 49 | 50 | # ListSpiders, Schedule, Cancel, Status and ListJobs return "project '%b' not found" on directory traversal attempts. 51 | # The egg storage (in get_project_list, called by get_spider_queues, called by QueuePoller, used by these webservices) 52 | # would need to find a project like "../project" (which is impossible with the default eggstorage) to not error.
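# By contrast, the AddVersion, ListVersions, DeleteVersion and DeleteProject webservices access egg storage directly, so the tests below expect a DirectoryTraversalError message rather than "project not found".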
53 | @pytest.mark.parametrize( 54 | ("method", "basename", "params"), 55 | [ 56 | ("post", "addversion", {"version": "v", "egg": EGG}), 57 | ("get", "listversions", {}), 58 | ("post", "delversion", {"version": "v"}), 59 | ("post", "delproject", {}), 60 | ], 61 | ) 62 | def test_project_directory_traversal(method, basename, params): 63 | response = getattr(requests, method)( 64 | f"http://127.0.0.1:6800/{basename}.json", 65 | auth=("hello12345", "67890world"), 66 | **{"params" if method == "get" else "data": {"project": "../p", **params}}, 67 | ) 68 | 69 | data = response.json() 70 | data.pop("node_name") 71 | 72 | assert response.status_code == 200, f"200 != {response.status_code}" 73 | assert data == {"status": "error", "message": "DirectoryTraversalError: ../p"} 74 | 75 | 76 | def test_daemonstatus(): 77 | assert_response("get", "/daemonstatus.json", {"status": "ok", "running": 0, "pending": 0, "finished": 0}) 78 | 79 | 80 | def test_schedule_nonexistent_project(): 81 | assert_response( 82 | "post", 83 | "/schedule.json", 84 | {"status": "error", "message": "project 'nonexistent' not found"}, 85 | data={"project": "nonexistent", "spider": "nospider"}, 86 | ) 87 | 88 | 89 | def test_status_nonexistent_job(): 90 | assert_response( 91 | "get", 92 | "/status.json", 93 | {"status": "ok", "currstate": None}, 94 | params={"job": "sample"}, 95 | ) 96 | 97 | 98 | def test_status_nonexistent_project(): 99 | assert_response( 100 | "get", 101 | "/status.json", 102 | {"status": "error", "message": "project 'nonexistent' not found"}, 103 | params={"job": "sample", "project": "nonexistent"}, 104 | ) 105 | 106 | 107 | def test_cancel_nonexistent_project(): 108 | assert_response( 109 | "post", 110 | "/cancel.json", 111 | {"status": "error", "message": "project 'nonexistent' not found"}, 112 | data={"project": "nonexistent", "job": "nojob"}, 113 | ) 114 | 115 | 116 | def test_listprojects(): 117 | assert_response( 118 | "get", 119 | "/listprojects.json", 120 | {"status": "ok", "projects": []}, 121 | ) 122 | 123 | 124 | def test_listversions(): 125 | assert_response( 126 | "get", 127 | "/listversions.json", 128 | {"status": "ok", "versions": []}, 129 | params={"project": "sample"}, 130 | ) 131 | 132 | 133 | def test_listspiders_nonexistent_project(): 134 | assert_response( 135 | "get", 136 | "/listspiders.json", 137 | {"status": "error", "message": "project 'nonexistent' not found"}, 138 | params={"project": "nonexistent"}, 139 | ) 140 | 141 | 142 | def test_listjobs(): 143 | assert_response( 144 | "get", 145 | "/listjobs.json", 146 | {"status": "ok", "pending": [], "running": [], "finished": []}, 147 | ) 148 | 149 | 150 | def test_listjobs_nonexistent_project(): 151 | assert_response( 152 | "get", 153 | "/listjobs.json", 154 | {"status": "error", "message": "project 'nonexistent' not found"}, 155 | params={"project": "nonexistent"}, 156 | ) 157 | 158 | 159 | def test_delversion_nonexistent_project(): 160 | assert_response( 161 | "post", 162 | "/delversion.json", 163 | {"status": "error", "message": "version 'nonexistent' not found"}, 164 | data={"project": "sample", "version": "nonexistent"}, 165 | ) 166 | 167 | 168 | def test_delproject_nonexistent_project(): 169 | assert_response( 170 | "post", 171 | "/delproject.json", 172 | {"status": "error", "message": "project 'nonexistent' not found"}, 173 | data={"project": "nonexistent"}, 174 | ) 175 | -------------------------------------------------------------------------------- /integration_tests/test_website.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from integration_tests import req 4 | 5 | 6 | def test_root(): 7 | response = req("get", "/") 8 | 9 | assert '"/jobs"' in response.text 10 | assert '"/logs/"' in response.text 11 | 12 | 13 | @pytest.mark.parametrize(("path", "content"), [("jobs", "Cancel"), ("logs", "Last modified")]) 14 | def test_paths(path, content): 15 | response = req("get", f"/{path}") 16 | 17 | assert content in response.text 18 | 19 | 20 | def test_base_path(): 21 | response = req("get", "/", headers={"X-Forwarded-Prefix": "/path/to"}) 22 | 23 | assert '"/path/to/jobs"' in response.text 24 | assert '"/path/to/logs/"' in response.text 25 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "scrapyd" 7 | version = "1.5.0" 8 | authors = [{name = "Scrapy developers", email = "info@scrapy.org"}] 9 | description = "A service for running Scrapy spiders, with an HTTP API" 10 | readme = "README.rst" 11 | license = {text = "BSD"} 12 | urls = {Homepage = "https://github.com/scrapy/scrapyd"} 13 | classifiers = [ 14 | "License :: OSI Approved :: BSD License", 15 | "Operating System :: OS Independent", 16 | "Programming Language :: Python :: 3.9", 17 | "Programming Language :: Python :: 3.10", 18 | "Programming Language :: Python :: 3.11", 19 | "Programming Language :: Python :: 3.12", 20 | "Programming Language :: Python :: 3.13", 21 | "Programming Language :: Python :: Implementation :: CPython", 22 | "Development Status :: 5 - Production/Stable", 23 | "Environment :: Console", 24 | "Environment :: No Input/Output (Daemon)", 25 | "Topic :: Internet :: WWW/HTTP", 26 | ] 27 | dependencies = [ 28 | "packaging", 29 | "pywin32;platform_system=='Windows'", 30 | "scrapy>=2.0.0", 31 | "setuptools", 32 | "twisted>=17.9", 33 | "w3lib", 34 | "zope.interface", 35 | ] 36 | 37 | [project.optional-dependencies] 38 | test = [ 39 | "coveralls", 40 | "py-html-checker", 41 | "pytest", 42 | "pytest-cov", 43 | "pytest-twisted", 44 | "requests", 45 | "twisted>=19.7", # twisted.logger.capturedLogs 46 | ] 47 | docs = [ 48 | "furo", 49 | "sphinx", 50 | "sphinx-autobuild", 51 | "sphinxcontrib-zopeext", 52 | ] 53 | 54 | [project.scripts] 55 | scrapyd = "scrapyd.__main__:main" 56 | 57 | [tool.setuptools] 58 | packages = ["scrapyd"] 59 | zip-safe = false # The scrapyd.__main__ module requires the txapp.py file to be decompressed. 
#49 60 | 61 | [tool.ruff] 62 | line-length = 119 63 | target-version = "py38" 64 | 65 | [tool.ruff.lint] 66 | select = ["ALL"] 67 | ignore = [ 68 | "ANN", "COM", "EM", 69 | # https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules 70 | "W191", "E501", "D206", "Q000", "Q001", "Q002", "Q003", "ISC001", 71 | "D203", "D212", # ignore incompatible rules 72 | "D200", # documentation preferences 73 | "C901", "PLR0912", # complexity preferences 74 | 75 | # Project-specific 76 | "D", 77 | "PTH", # Scrapyd hasn't adopted pathlib 78 | "ARG002", # Unused method argument (txrequest argument isn't always used) 79 | "N802", # Function name should be lowercase (Twisted uses method names like render_GET) 80 | "N803", # Argument name should be lowercase (Twisted uses argument names like avatarId) 81 | "N815", # Variable in class scope should not be mixedCase (Twisted uses class attributes like requestAvatarId) 82 | "PLR0913", # Too many arguments to function call 83 | "S603", # `subprocess` call: check for execution of untrusted input (informative) 84 | 85 | # sqlite3 doesn't have functions like psycopg2.sql.Identifier and psycopg2.sql.SQL.format. 86 | "S608", # Possible SQL injection vector through string-based query construction 87 | 88 | # Scrapyd uses naive datetimes. 89 | "DTZ001", # `datetime.datetime()` called without a `tzinfo` argument" 90 | "DTZ005", # `datetime.datetime.now()` called without a `tz` argument 91 | "DTZ006", # `datetime.datetime.fromtimestamp()` called without a `tz` argument 92 | "DTZ007", # Naive datetime constructed using `datetime.datetime.strptime()` without %z 93 | ] 94 | 95 | [tool.ruff.lint.flake8-builtins] 96 | builtins-ignorelist = ["copyright"] 97 | 98 | [tool.ruff.lint.per-file-ignores] 99 | "docs/conf.py" = ["INP001"] # no __init__.py file 100 | "scrapyd/__main__.py" = ["T201"] # `print` found 101 | "scrapyd/interfaces.py" = ["N805"] # First argument of a method should be named `self` 102 | "{tests,integration_tests}/*" = [ 103 | "D", # docstring 104 | "S101", # assert 105 | "S106", # password 106 | "S113", # requests timeout 107 | "PLR2004", # magic value 108 | "ARG001", "ARG002", "ARG005", # mocks 109 | "PT009", "PT027", # Scrapyd mixes unittest with pytest 110 | ] 111 | -------------------------------------------------------------------------------- /scrapyd/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from scrapyd.config import Config 4 | from scrapyd.exceptions import ConfigError 5 | from scrapyd.utils import initialize_component 6 | 7 | __version__ = "1.5.0" 8 | version_info = tuple(__version__.split(".")[:3]) 9 | 10 | 11 | def get_application(config=None): 12 | if config is None: 13 | config = Config() 14 | try: 15 | return initialize_component(config, "application", "scrapyd.app.application") 16 | except ConfigError as e: 17 | sys.exit(str(e)) 18 | -------------------------------------------------------------------------------- /scrapyd/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from os.path import dirname, join 3 | 4 | from twisted.scripts import twistd 5 | 6 | import scrapyd 7 | 8 | 9 | class ServerOptions(twistd.ServerOptions): 10 | synopsis = "Usage: scrapyd [options]" 11 | longdesc = "Scrapyd is an application for deploying and running Scrapy spiders." 12 | 13 | def __init__(self): 14 | super().__init__() 15 | # main() always sets -n (--nodaemon) and -y (--python=). -y can be set only once. 
-n is okay to leave as a 16 | # no-op. Scrapyd's *_dir settings don't respect --rundir. 17 | self.longOpt = [opt for opt in self.longOpt if opt not in ("python=", "rundir=")] 18 | 19 | @property 20 | def subCommands(self): 21 | return [] # remove alternatives to running txapp.py 22 | 23 | def getUsage(self, width=None): 24 | return super().getUsage(width=width)[:-11] # remove "\nCommands:\n" 25 | 26 | 27 | def main(): 28 | if len(sys.argv) > 1 and "-v" in sys.argv[1:] or "--version" in sys.argv[1:]: 29 | print(f"Scrapyd {scrapyd.__version__}") 30 | else: 31 | sys.argv[1:1] = ["-n", "-y", join(dirname(scrapyd.__file__), "txapp.py")] 32 | twistd.app.run(twistd.runApp, ServerOptions) 33 | 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /scrapyd/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from twisted.application.internet import TCPServer, TimerService, UNIXServer 4 | from twisted.application.service import Application 5 | from twisted.logger import Logger 6 | from twisted.web import server 7 | 8 | from scrapyd.basicauth import wrap_resource 9 | from scrapyd.environ import Environment 10 | from scrapyd.interfaces import IEggStorage, IEnvironment, IJobStorage, IPoller, ISpiderScheduler 11 | from scrapyd.scheduler import SpiderScheduler 12 | from scrapyd.utils import initialize_component 13 | 14 | log = Logger() 15 | 16 | 17 | def application(config): 18 | app = Application("Scrapyd") 19 | bind_address = os.getenv("SCRAPYD_BIND_ADDRESS") or config.get("bind_address", "127.0.0.1") 20 | http_port = int(os.getenv("SCRAPYD_HTTP_PORT") or config.getint("http_port", "6800")) 21 | unix_socket_path = os.getenv("SCRAPYD_UNIX_SOCKET_PATH") or config.get("unix_socket_path", "") 22 | poll_interval = config.getfloat("poll_interval", 5) 23 | 24 | environment = Environment(config) 25 | scheduler = SpiderScheduler(config) 26 | poller = initialize_component(config, "poller", "scrapyd.poller.QueuePoller") 27 | jobstorage = initialize_component(config, "jobstorage", "scrapyd.jobstorage.MemoryJobStorage") 28 | eggstorage = initialize_component(config, "eggstorage", "scrapyd.eggstorage.FilesystemEggStorage") 29 | 30 | app.setComponent(IEnvironment, environment) 31 | app.setComponent(ISpiderScheduler, scheduler) 32 | app.setComponent(IPoller, poller) 33 | app.setComponent(IJobStorage, jobstorage) 34 | app.setComponent(IEggStorage, eggstorage) 35 | 36 | # launcher uses jobstorage in initializer, and uses poller and environment. 37 | launcher = initialize_component(config, "launcher", "scrapyd.launcher.Launcher", app) 38 | 39 | timer = TimerService(poll_interval, poller.poll) 40 | 41 | # webroot uses launcher, poller, scheduler and environment. 
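# Note: if both bind_address/http_port and unix_socket_path are configured, the webservice variable below is reassigned, so only the UNIX server is attached to the application.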
42 | webroot = initialize_component(config, "webroot", "scrapyd.website.Root", app) 43 | resource = server.Site(wrap_resource(webroot, config)) 44 | if bind_address and http_port: 45 | webservice = TCPServer(http_port, resource, interface=bind_address) 46 | log.info( 47 | "Scrapyd web console available at http://{bind_address}:{http_port}/", 48 | bind_address=bind_address, 49 | http_port=http_port, 50 | ) 51 | if unix_socket_path: 52 | unix_socket_path = os.path.abspath(unix_socket_path) 53 | webservice = UNIXServer(unix_socket_path, resource, mode=0o660) 54 | log.info( 55 | "Scrapyd web console available at http+unix://{unix_socket_path}", 56 | unix_socket_path=unix_socket_path, 57 | ) 58 | 59 | launcher.setServiceParent(app) 60 | timer.setServiceParent(app) 61 | webservice.setServiceParent(app) 62 | 63 | return app 64 | -------------------------------------------------------------------------------- /scrapyd/basicauth.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from twisted.cred import credentials, error 4 | from twisted.cred.checkers import ICredentialsChecker 5 | from twisted.cred.portal import IRealm, Portal 6 | from twisted.internet import defer 7 | from twisted.logger import Logger 8 | from twisted.web.guard import BasicCredentialFactory, HTTPAuthSessionWrapper 9 | from twisted.web.resource import IResource 10 | from zope.interface import implementer 11 | 12 | from scrapyd.exceptions import InvalidUsernameError 13 | 14 | log = Logger() 15 | 16 | 17 | # https://docs.twisted.org/en/stable/web/howto/web-in-60/http-auth.html 18 | @implementer(IRealm) 19 | class PublicHTMLRealm: 20 | def __init__(self, resource): 21 | self.resource = resource 22 | 23 | def requestAvatar(self, avatarId, mind, *interfaces): 24 | if IResource in interfaces: 25 | return (IResource, self.resource, lambda: None) 26 | raise NotImplementedError 27 | 28 | 29 | @implementer(ICredentialsChecker) 30 | class StringCredentialsChecker: 31 | credentialInterfaces = (credentials.IUsernamePassword,) 32 | 33 | def __init__(self, username, password): 34 | self.username = username.encode() 35 | self.password = password.encode() 36 | 37 | def requestAvatarId(self, credentials): 38 | if credentials.username == self.username and credentials.password == self.password: 39 | return defer.succeed(credentials.username) 40 | return defer.fail(error.UnauthorizedLogin()) 41 | 42 | 43 | def wrap_resource(resource, config): 44 | username = os.getenv("SCRAPYD_USERNAME") or config.get("username", "") 45 | password = os.getenv("SCRAPYD_PASSWORD") or config.get("password", "") 46 | # https://www.rfc-editor.org/rfc/rfc2617#section-2 47 | if ":" in username: 48 | raise InvalidUsernameError 49 | 50 | if username and password: 51 | log.info("Basic authentication enabled") 52 | return HTTPAuthSessionWrapper( 53 | Portal(PublicHTMLRealm(resource), [StringCredentialsChecker(username, password)]), 54 | [BasicCredentialFactory(b"Scrapyd")], 55 | ) 56 | 57 | log.info("Basic authentication disabled as either `username` or `password` is unset") 58 | return resource 59 | -------------------------------------------------------------------------------- /scrapyd/config.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os.path 3 | from configparser import ConfigParser, NoOptionError, NoSectionError 4 | from pkgutil import get_data 5 | 6 | from scrapy.utils.conf import closest_scrapy_cfg 7 | 8 | 9 | class Config: 10 | """A ConfigParser 
wrapper to support defaults when calling instance 11 | methods, and also tied to a single section""" 12 | 13 | SECTION = "scrapyd" 14 | 15 | def __init__(self, values=None, extra_sources=()): 16 | if values is None: 17 | self.cp = ConfigParser() 18 | self.cp.read_string(get_data(__package__, "default_scrapyd.conf").decode()) 19 | self.cp.read( 20 | [ 21 | "/etc/scrapyd/scrapyd.conf", 22 | "c:\\scrapyd\\scrapyd.conf", 23 | *sorted(glob.glob("/etc/scrapyd/conf.d/*")), 24 | "scrapyd.conf", 25 | os.path.expanduser("~/.scrapyd.conf"), 26 | closest_scrapy_cfg(), 27 | *extra_sources, 28 | ] 29 | ) 30 | else: 31 | self.cp = ConfigParser(values) 32 | self.cp.add_section(self.SECTION) 33 | 34 | def get(self, option, default=None): 35 | return self._get(self.cp.get, option, default) 36 | 37 | def getint(self, option, default=None): 38 | return self._get(self.cp.getint, option, default) 39 | 40 | def getfloat(self, option, default=None): 41 | return self._get(self.cp.getfloat, option, default) 42 | 43 | def getboolean(self, option, default=None): 44 | return self._get(self.cp.getboolean, option, default) 45 | 46 | def _get(self, method, option, default): 47 | try: 48 | return method(self.SECTION, option) 49 | except (NoSectionError, NoOptionError): 50 | if default is not None: 51 | return default 52 | raise 53 | 54 | def items(self, section, default=None): 55 | try: 56 | return self.cp.items(section) 57 | except NoSectionError: 58 | if default is not None: 59 | return default 60 | raise 61 | -------------------------------------------------------------------------------- /scrapyd/default_scrapyd.conf: -------------------------------------------------------------------------------- 1 | [scrapyd] 2 | # Application options 3 | application = scrapyd.app.application 4 | bind_address = 127.0.0.1 5 | http_port = 6800 6 | unix_socket_path = 7 | username = 8 | password = 9 | spiderqueue = scrapyd.spiderqueue.SqliteSpiderQueue 10 | 11 | # Poller options 12 | poller = scrapyd.poller.QueuePoller 13 | poll_interval = 5.0 14 | 15 | # Launcher options 16 | launcher = scrapyd.launcher.Launcher 17 | max_proc = 0 18 | max_proc_per_cpu = 4 19 | logs_dir = logs 20 | items_dir = 21 | jobs_to_keep = 5 22 | runner = scrapyd.runner 23 | 24 | # Web UI and API options 25 | webroot = scrapyd.website.Root 26 | prefix_header = x-forwarded-prefix 27 | debug = off 28 | 29 | # Egg storage options 30 | eggstorage = scrapyd.eggstorage.FilesystemEggStorage 31 | eggs_dir = eggs 32 | 33 | # Job storage options 34 | jobstorage = scrapyd.jobstorage.MemoryJobStorage 35 | finished_to_keep = 100 36 | 37 | # Directory options 38 | dbs_dir = dbs 39 | 40 | [services] 41 | schedule.json = scrapyd.webservice.Schedule 42 | cancel.json = scrapyd.webservice.Cancel 43 | status.json = scrapyd.webservice.Status 44 | addversion.json = scrapyd.webservice.AddVersion 45 | listprojects.json = scrapyd.webservice.ListProjects 46 | listversions.json = scrapyd.webservice.ListVersions 47 | listspiders.json = scrapyd.webservice.ListSpiders 48 | delproject.json = scrapyd.webservice.DeleteProject 49 | delversion.json = scrapyd.webservice.DeleteVersion 50 | listjobs.json = scrapyd.webservice.ListJobs 51 | daemonstatus.json = scrapyd.webservice.DaemonStatus 52 | -------------------------------------------------------------------------------- /scrapyd/eggstorage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | from glob import escape, glob 5 | 6 | from packaging.version import 
InvalidVersion, Version 7 | from twisted.python import filepath 8 | from zope.interface import implementer 9 | 10 | from scrapyd.exceptions import DirectoryTraversalError, EggNotFoundError, ProjectNotFoundError 11 | from scrapyd.interfaces import IEggStorage 12 | 13 | 14 | def sorted_versions(versions): 15 | try: 16 | return sorted(versions, key=Version) 17 | except InvalidVersion: 18 | return sorted(versions) 19 | 20 | 21 | @implementer(IEggStorage) 22 | class FilesystemEggStorage: 23 | def __init__(self, config): 24 | self.basedir = config.get("eggs_dir", "eggs") 25 | 26 | def put(self, eggfile, project, version): 27 | path = self._egg_path(project, version) 28 | 29 | directory = os.path.dirname(path) 30 | if not os.path.exists(directory): 31 | os.makedirs(directory) 32 | 33 | with open(path, "wb") as f: 34 | shutil.copyfileobj(eggfile, f) 35 | 36 | def get(self, project, version=None): 37 | if version is None: 38 | try: 39 | version = self.list(project)[-1] 40 | except IndexError: 41 | return None, None 42 | try: 43 | return version, open(self._egg_path(project, version), "rb") # noqa: SIM115 44 | except FileNotFoundError: 45 | return None, None 46 | 47 | def list(self, project): 48 | return sorted_versions( 49 | [os.path.splitext(os.path.basename(path))[0] for path in glob(self._get_path(escape(project), "*.egg"))] 50 | ) 51 | 52 | def list_projects(self): 53 | if os.path.exists(self.basedir): 54 | return [name for name in os.listdir(self.basedir) if os.path.isdir(os.path.join(self.basedir, name))] 55 | return [] 56 | 57 | def delete(self, project, version=None): 58 | if version is None: 59 | try: 60 | shutil.rmtree(self._get_path(project)) 61 | except FileNotFoundError as e: 62 | raise ProjectNotFoundError from e 63 | else: 64 | try: 65 | os.remove(self._egg_path(project, version)) 66 | if not self.list(project): # remove project if no versions left 67 | self.delete(project) 68 | except FileNotFoundError as e: 69 | raise EggNotFoundError from e 70 | 71 | def _egg_path(self, project, version): 72 | sanitized_version = re.sub(r"[^A-Za-z0-9_-]", "_", version) 73 | return self._get_path(project, f"{sanitized_version}.egg") 74 | 75 | def _get_path(self, project, *trusted): 76 | try: 77 | file = filepath.FilePath(self.basedir).child(project) 78 | except filepath.InsecurePath as e: 79 | raise DirectoryTraversalError(project) from e 80 | 81 | return os.path.join(file.path, *trusted) 82 | -------------------------------------------------------------------------------- /scrapyd/environ.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from contextlib import suppress 4 | from posixpath import join as urljoin 5 | from urllib.parse import urlsplit 6 | 7 | from w3lib.url import path_to_file_uri 8 | from zope.interface import implementer 9 | 10 | from scrapyd.interfaces import IEnvironment 11 | from scrapyd.utils import get_file_path, local_items 12 | 13 | 14 | @implementer(IEnvironment) 15 | class Environment: 16 | def __init__(self, config, initenv=os.environ): 17 | self.dbs_dir = config.get("dbs_dir", "dbs") 18 | self.logs_dir = config.get("logs_dir", "logs") 19 | self.items_dir = config.get("items_dir", "") 20 | self.jobs_to_keep = config.getint("jobs_to_keep", 5) 21 | self.settings = dict(config.items("settings", default=[])) 22 | self.initenv = initenv 23 | 24 | def get_settings(self, message): 25 | settings = {} 26 | if self.logs_dir: 27 | settings["LOG_FILE"] = self._prepare_file(message, self.logs_dir, "log") 28 | if 
self.items_dir: 29 | settings["FEEDS"] = json.dumps({self._get_feeds(message, "jl"): {"format": "jsonlines"}}) 30 | return settings 31 | 32 | def get_environment(self, message, slot): 33 | project = message["_project"] 34 | 35 | env = self.initenv.copy() 36 | env["SCRAPY_PROJECT"] = project 37 | # If the version is not provided, then the runner uses the default version, determined by egg storage. 38 | if "_version" in message: 39 | env["SCRAPYD_EGG_VERSION"] = message["_version"] 40 | # Scrapy discovers the same scrapy.cfg files as Scrapyd. So, this is only needed if users are adding [settings] 41 | # sections to Scrapyd configuration files (which Scrapy doesn't discover). This might lead to strange behavior 42 | # if an egg project and a [settings] project have the same name (unlikely). Preserved, since committed in 2010. 43 | if project in self.settings: 44 | env["SCRAPY_SETTINGS_MODULE"] = self.settings[project] 45 | 46 | return env 47 | 48 | def _get_feeds(self, message, extension): 49 | parsed = urlsplit(self.items_dir) 50 | 51 | if local_items(self.items_dir, parsed): 52 | # File URLs do not have query or fragment components. https://www.rfc-editor.org/rfc/rfc8089#section-2 53 | return path_to_file_uri(self._prepare_file(message, parsed.path, extension)) 54 | 55 | path = urljoin(parsed.path, message["_project"], message["_spider"], f"{message['_job']}.{extension}") 56 | return parsed._replace(path=path).geturl() 57 | 58 | def _prepare_file(self, message, directory, extension): 59 | file_path = get_file_path(directory, message["_project"], message["_spider"], message["_job"], extension) 60 | 61 | parent = file_path.dirname() # returns a str 62 | if not os.path.exists(parent): 63 | os.makedirs(parent) 64 | 65 | to_delete = sorted( 66 | (os.path.join(parent, name) for name in os.listdir(parent)), 67 | key=os.path.getmtime, 68 | )[: -self.jobs_to_keep] 69 | for path in to_delete: 70 | with suppress(OSError): 71 | os.remove(path) 72 | 73 | return file_path.path 74 | -------------------------------------------------------------------------------- /scrapyd/exceptions.py: -------------------------------------------------------------------------------- 1 | class ScrapydError(Exception): 2 | """Base class for exceptions from within this package""" 3 | 4 | 5 | class ConfigError(ScrapydError): 6 | """Raised if a configuration error prevents Scrapyd from starting""" 7 | 8 | 9 | class InvalidUsernameError(ConfigError): 10 | """Raised if the username contains a colon""" 11 | 12 | def __init__(self): 13 | super().__init__( 14 | "The `username` option contains illegal character ':'. Check and update the Scrapyd configuration file." 
15 | ) 16 | 17 | 18 | class BadEggError(ScrapydError): 19 | """Raised if the egg is invalid""" 20 | 21 | 22 | class DirectoryTraversalError(ScrapydError): 23 | """Raised if the resolved path is outside the expected directory""" 24 | 25 | 26 | class ProjectNotFoundError(ScrapydError): 27 | """Raised if a project isn't found in an IEggStorage implementation""" 28 | 29 | 30 | class EggNotFoundError(ScrapydError): 31 | """Raised if an egg isn't found in an IEggStorage implementation""" 32 | 33 | 34 | class RunnerError(ScrapydError): 35 | """Raised if the runner returns an error code""" 36 | -------------------------------------------------------------------------------- /scrapyd/interfaces.py: -------------------------------------------------------------------------------- 1 | from zope.interface import Attribute, Interface 2 | 3 | 4 | class IEggStorage(Interface): 5 | """ 6 | A component to store project eggs. 7 | """ 8 | 9 | def put(eggfile, project, version): 10 | """ 11 | Store the egg (a file object), which represents a ``version`` of the ``project``. 12 | """ 13 | 14 | def get(project, version=None): 15 | """ 16 | Return ``(version, file)`` for the egg matching the ``project`` and ``version``. 17 | 18 | If ``version`` is ``None``, the latest version and corresponding file are returned. 19 | 20 | If no egg is found, ``(None, None)`` is returned. 21 | 22 | .. tip:: Remember to close the ``file`` when done. 23 | """ 24 | 25 | def list(project): 26 | """ 27 | Return all versions of the ``project`` in order, with the latest version last. 28 | """ 29 | 30 | def list_projects(): 31 | """ 32 | Return all projects in storage. 33 | 34 | .. versionadded:: 1.3.0 35 | Move this logic into the interface and its implementations, to allow customization. 36 | """ 37 | 38 | def delete(project, version=None): 39 | """ 40 | Delete the egg matching the ``project`` and ``version``. Delete the ``project``, if no versions remains. 41 | """ 42 | 43 | 44 | class IPoller(Interface): 45 | """ 46 | A component that tracks capacity for new jobs, and starts jobs when ready. 47 | """ 48 | 49 | queues = Attribute( 50 | """ 51 | An object (like a ``dict``) with a ``__getitem__`` method that accepts a project's name and returns its 52 | :py:interface:`spider queue` of pending jobs. 53 | """ 54 | ) 55 | 56 | def poll(): 57 | """ 58 | Called periodically to start jobs if there's capacity. 59 | """ 60 | 61 | def next(): 62 | """ 63 | Return the next pending job. 64 | 65 | It should return a Deferred that will be fired when there's capacity, or already fired if there's capacity. 66 | 67 | The pending job is a ``dict`` containing at least the ``_project`` name, ``_spider`` name and ``_job`` ID. 68 | The job ID is unique, at least within the project. 69 | 70 | The pending job is later passed to :meth:`scrapyd.interfaces.IEnvironment.get_environment`. 71 | 72 | .. seealso:: :meth:`scrapyd.interfaces.ISpiderQueue.pop` 73 | """ 74 | 75 | def update_projects(): 76 | """ 77 | Called when projects may have changed, to refresh the available projects, including at initialization. 78 | """ 79 | 80 | 81 | class ISpiderQueue(Interface): 82 | """ 83 | A component to store pending jobs. 
84 | 85 | The ``dict`` keys used by the chosen ``ISpiderQueue`` implementation must match the chosen: 86 | 87 | - :ref:`launcher` service (which calls :meth:`scrapyd.interfaces.IPoller.next`) 88 | - :py:interface:`~scrapyd.interfaces.IEnvironment` implementation (see :meth:`scrapyd.interfaces.IPoller.next`) 89 | - :ref:`webservices` that schedule, cancel or list pending jobs 90 | """ 91 | 92 | def add(name, priority, **spider_args): 93 | """ 94 | Add a pending job, given the spider ``name``, crawl ``priority`` and keyword arguments, which might include the 95 | ``_job`` ID, egg ``_version`` and Scrapy ``settings`` depending on the implementation, with keyword arguments 96 | that are not recognized by the implementation being treated as spider arguments. 97 | 98 | .. versionchanged:: 1.3.0 99 | Add the ``priority`` parameter. 100 | """ 101 | 102 | def pop(): 103 | """ 104 | Pop the next pending job. The pending job is a ``dict`` containing the spider ``name``. Depending on the 105 | implementation, other keys might include the ``_job`` ID, egg ``_version`` and Scrapy ``settings``, with 106 | keyword arguments that are not recognized by the receiver being treated as spider arguments. 107 | """ 108 | 109 | def list(): 110 | """ 111 | Return the pending jobs. 112 | 113 | .. seealso:: :meth:`scrapyd.interfaces.ISpiderQueue.pop` 114 | """ 115 | 116 | def count(): 117 | """ 118 | Return the number of pending jobs. 119 | """ 120 | 121 | def remove(func): 122 | """ 123 | Remove pending jobs for which ``func(job)`` is true, and return the number of removed pending jobs. 124 | """ 125 | 126 | def clear(): 127 | """ 128 | Remove all pending jobs. 129 | """ 130 | 131 | 132 | class ISpiderScheduler(Interface): 133 | """ 134 | A component to schedule jobs. 135 | """ 136 | 137 | def schedule(project, spider_name, priority, **spider_args): 138 | """ 139 | Schedule a crawl. 140 | 141 | .. versionchanged:: 1.3.0 142 | Add the ``priority`` parameter. 143 | """ 144 | 145 | def list_projects(): 146 | """ 147 | Return all projects that can be scheduled. 148 | """ 149 | 150 | def update_projects(): 151 | """ 152 | Called when projects may have changed, to refresh the available projects, including at initialization. 153 | """ 154 | 155 | 156 | class IEnvironment(Interface): 157 | """ 158 | A component to generate the environment of jobs. 159 | 160 | The chosen ``IEnvironment`` implementation must match the chosen :ref:`launcher` service. 161 | """ 162 | 163 | def get_settings(message): 164 | """ 165 | Return the Scrapy settings to use for running the process. 166 | 167 | Depending on the chosen :ref:`launcher`, this would be one or more of ``LOG_FILE`` or ``FEEDS``. 168 | 169 | .. versionadded:: 1.4.2 170 | Support for overriding Scrapy settings via ``SCRAPY_`` environment variables was removed in Scrapy 2.8. 171 | 172 | :param message: the pending job received from the :meth:`scrapyd.interfaces.IPoller.next` method 173 | """ 174 | 175 | def get_environment(message, slot): 176 | """ 177 | Return the environment variables to use for running the process. 178 | 179 | Depending on the chosen :ref:`launcher`, this would be one or more of ``SCRAPY_PROJECT``, 180 | ``SCRAPYD_EGG_VERSION`` or ``SCRAPY_SETTINGS_MODULE``. 181 | 182 | :param message: the pending job received from the :meth:`scrapyd.interfaces.IPoller.next` method 183 | :param slot: the :ref:`launcher` slot for tracking the process 184 | """ 185 | 186 | 187 | class IJobStorage(Interface): 188 | """ 189 | A component to store finished jobs. 190 | 191 | ..
versionadded:: 1.3.0 192 | """ 193 | 194 | def add(job): 195 | """ 196 | Add a finished job in the storage. 197 | """ 198 | 199 | def list(): 200 | """ 201 | Return the finished jobs. 202 | 203 | .. seealso:: :meth:`scrapyd.interfaces.IJobStorage.__iter__` 204 | """ 205 | 206 | def __len__(): 207 | """ 208 | Return the number of finished jobs. 209 | """ 210 | 211 | def __iter__(): 212 | """ 213 | Iterate over the finished jobs in reverse order by ``end_time``. 214 | 215 | A job has the attributes ``project``, ``spider``, ``job``, ``start_time`` and ``end_time`` and may have the 216 | attributes ``args`` (``scrapy crawl`` CLI arguments) and ``env`` (environment variables). 217 | """ 218 | -------------------------------------------------------------------------------- /scrapyd/jobstorage.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. versionadded:: 1.3.0 3 | Job storage was previously in-memory only and managed by the launcher. 4 | """ 5 | 6 | from zope.interface import implementer 7 | 8 | from scrapyd import sqlite 9 | from scrapyd.interfaces import IJobStorage 10 | from scrapyd.launcher import ScrapyProcessProtocol 11 | 12 | 13 | @implementer(IJobStorage) 14 | class MemoryJobStorage: 15 | def __init__(self, config): 16 | self.jobs = [] 17 | self.finished_to_keep = config.getint("finished_to_keep", 100) 18 | 19 | def add(self, job): 20 | self.jobs.append(job) 21 | del self.jobs[: -self.finished_to_keep] # keep last x finished jobs 22 | 23 | def list(self): 24 | return list(self) 25 | 26 | def __len__(self): 27 | return len(self.jobs) 28 | 29 | def __iter__(self): 30 | yield from reversed(self.jobs) 31 | 32 | 33 | @implementer(IJobStorage) 34 | class SqliteJobStorage: 35 | def __init__(self, config): 36 | self.jobs = sqlite.initialize(sqlite.SqliteFinishedJobs, config, "jobs", "finished_jobs") 37 | self.finished_to_keep = config.getint("finished_to_keep", 100) 38 | 39 | def add(self, job): 40 | self.jobs.add(job) 41 | self.jobs.clear(self.finished_to_keep) 42 | 43 | def list(self): 44 | return list(self) 45 | 46 | def __len__(self): 47 | return len(self.jobs) 48 | 49 | def __iter__(self): 50 | for project, spider, jobid, start_time, end_time in self.jobs: 51 | job = ScrapyProcessProtocol(project, spider, jobid, env={}, args=[]) 52 | job.start_time = start_time 53 | job.end_time = end_time 54 | yield job 55 | -------------------------------------------------------------------------------- /scrapyd/launcher.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import multiprocessing 3 | import sys 4 | from itertools import chain 5 | 6 | from twisted.application.service import Service 7 | from twisted.internet import defer, error, protocol, reactor 8 | from twisted.logger import Logger 9 | 10 | from scrapyd import __version__ 11 | from scrapyd.interfaces import IEnvironment, IJobStorage, IPoller 12 | 13 | log = Logger() 14 | 15 | 16 | def get_crawl_args(message): 17 | """Return the command-line arguments to use for the scrapy crawl process 18 | that will be started for this message 19 | """ 20 | copied = message.copy() 21 | del copied["_project"] 22 | 23 | return [ 24 | copied.pop("_spider"), 25 | *chain.from_iterable(["-s", f"{key}={value}"] for key, value in copied.pop("settings", {}).items()), 26 | *chain.from_iterable(["-a", f"{key}={value}"] for key, value in copied.items()), # spider arguments 27 | ] 28 | 29 | 30 | class Launcher(Service): 31 | name = "launcher" 32 | 33 | def 
__init__(self, config, app): 34 | self.processes = {} 35 | self.finished = app.getComponent(IJobStorage) 36 | self.max_proc = self._get_max_proc(config) 37 | self.runner = config.get("runner", "scrapyd.runner") 38 | self.app = app 39 | 40 | def startService(self): 41 | log.info( 42 | "Scrapyd {version} started: max_proc={max_proc!r}, runner={runner!r}", 43 | version=__version__, 44 | max_proc=self.max_proc, 45 | runner=self.runner, 46 | log_system="Launcher", 47 | ) 48 | for slot in range(self.max_proc): 49 | self._get_message(slot) 50 | 51 | def _get_message(self, slot): 52 | poller = self.app.getComponent(IPoller) 53 | poller.next().addCallback(self._spawn_process, slot) 54 | log.debug("Process slot {slot} ready", slot=slot) 55 | 56 | def _spawn_process(self, message, slot): 57 | project = message["_project"] 58 | environment = self.app.getComponent(IEnvironment) 59 | message.setdefault("settings", {}) 60 | message["settings"].update(environment.get_settings(message)) 61 | 62 | env = environment.get_environment(message, slot) 63 | args = [sys.executable, "-m", self.runner, "crawl", *get_crawl_args(message)] 64 | 65 | process = ScrapyProcessProtocol(project, message["_spider"], message["_job"], env, args) 66 | process.deferred.addBoth(self._process_finished, slot) 67 | 68 | reactor.spawnProcess(process, sys.executable, args=args, env=env) 69 | self.processes[slot] = process 70 | log.debug("Process slot {slot} occupied", slot=slot) 71 | 72 | def _process_finished(self, _, slot): 73 | process = self.processes.pop(slot) 74 | process.end_time = datetime.datetime.now() 75 | self.finished.add(process) 76 | log.debug("Process slot {slot} vacated", slot=slot) 77 | 78 | self._get_message(slot) 79 | 80 | def _get_max_proc(self, config): 81 | max_proc = config.getint("max_proc", 0) 82 | if max_proc: 83 | return max_proc 84 | 85 | try: 86 | cpus = multiprocessing.cpu_count() 87 | except NotImplementedError: # Windows 17520a3 88 | cpus = 1 89 | return cpus * config.getint("max_proc_per_cpu", 4) 90 | 91 | 92 | # https://docs.twisted.org/en/stable/api/twisted.internet.protocol.ProcessProtocol.html 93 | class ScrapyProcessProtocol(protocol.ProcessProtocol): 94 | def __init__(self, project, spider, job, env, args): 95 | self.project = project 96 | self.spider = spider 97 | self.job = job 98 | self.pid = None 99 | self.start_time = datetime.datetime.now() 100 | self.end_time = None 101 | self.args = args 102 | self.env = env 103 | self.deferred = defer.Deferred() 104 | 105 | # For equality assertions in tests. 106 | def __eq__(self, other): 107 | return ( 108 | self.project == other.project 109 | and self.spider == other.spider 110 | and self.job == other.job 111 | and self.pid == other.pid 112 | and self.start_time == other.start_time 113 | and self.end_time == other.end_time 114 | and self.args == other.args 115 | and self.env == other.env 116 | ) 117 | 118 | # For error messages in tests. 
119 | def __repr__(self): 120 | return ( 121 | f"ScrapyProcessProtocol(project={self.project} spider={self.spider} job={self.job} pid={self.pid} " 122 | f"start_time={self.start_time} end_time={self.end_time} args={self.args} env={self.env})" 123 | ) 124 | 125 | def outReceived(self, data): 126 | log.info(data.rstrip(), log_system=f"Launcher,{self.pid}/stdout") 127 | 128 | def errReceived(self, data): 129 | log.error(data.rstrip(), log_system=f"Launcher,{self.pid}/stderr") 130 | 131 | def connectionMade(self): 132 | self.pid = self.transport.pid 133 | self.log("info", "Process started:") 134 | 135 | # https://docs.twisted.org/en/stable/core/howto/process.html#things-that-can-happen-to-your-processprotocol 136 | def processEnded(self, status): 137 | if isinstance(status.value, error.ProcessDone): 138 | self.log("info", "Process finished:") 139 | else: 140 | self.log("error", f"Process died: exitstatus={status.value.exitCode!r}") 141 | self.deferred.callback(self) 142 | 143 | def log(self, level, action): 144 | getattr(log, level)( 145 | "{action} project={project!r} spider={spider!r} job={job!r} pid={pid!r} args={args!r}", 146 | action=action, 147 | project=self.project, 148 | spider=self.spider, 149 | job=self.job, 150 | pid=self.pid, 151 | args=self.args, 152 | ) 153 | -------------------------------------------------------------------------------- /scrapyd/poller.py: -------------------------------------------------------------------------------- 1 | from twisted.internet.defer import DeferredQueue, inlineCallbacks, maybeDeferred 2 | from zope.interface import implementer 3 | 4 | from scrapyd.interfaces import IPoller 5 | from scrapyd.utils import get_spider_queues 6 | 7 | 8 | @implementer(IPoller) 9 | class QueuePoller: 10 | def __init__(self, config): 11 | self.config = config 12 | self.update_projects() 13 | self.dq = DeferredQueue() 14 | 15 | @inlineCallbacks 16 | def poll(self): 17 | for project, queue in self.queues.items(): 18 | while (yield maybeDeferred(queue.count)): 19 | # If the "waiting" backlog is empty (that is, if the maximum number of Scrapy processes are running): 20 | if not self.dq.waiting: 21 | return 22 | message = (yield maybeDeferred(queue.pop)).copy() 23 | # The message can be None if, for example, two Scrapyd instances share a spider queue database. 24 | if message is not None: 25 | message["_project"] = project 26 | message["_spider"] = message.pop("name") 27 | # Pop a dummy item from the "waiting" backlog. and fire the message's callbacks. 28 | self.dq.put(message) 29 | 30 | def next(self): 31 | """ 32 | Add a dummy item to the "waiting" backlog (based on Twisted's implementation of DeferredQueue). 33 | """ 34 | return self.dq.get() 35 | 36 | def update_projects(self): 37 | self.queues = get_spider_queues(self.config) 38 | -------------------------------------------------------------------------------- /scrapyd/runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | from contextlib import contextmanager 5 | 6 | import pkg_resources 7 | 8 | from scrapyd import Config 9 | from scrapyd.exceptions import BadEggError 10 | from scrapyd.utils import initialize_component 11 | 12 | 13 | def activate_egg(eggpath): 14 | """Activate a Scrapy egg file. This is meant to be used from egg runners 15 | to activate a Scrapy egg file. Don't use it from other code as it may 16 | leave unwanted side effects. 
17 | """ 18 | distributions = pkg_resources.find_distributions(eggpath) 19 | if isinstance(distributions, tuple): 20 | raise BadEggError 21 | 22 | try: 23 | distribution = next(distributions) 24 | except StopIteration: 25 | raise BadEggError from None 26 | 27 | distribution.activate() 28 | 29 | # setdefault() was added in https://github.com/scrapy/scrapyd/commit/0641a57. It's not clear why, since the egg 30 | # should control its settings module. That said, it is unlikely to already be set. 31 | os.environ.setdefault("SCRAPY_SETTINGS_MODULE", distribution.get_entry_info("scrapy", "settings").module_name) 32 | 33 | 34 | @contextmanager 35 | def project_environment(project): 36 | config = Config() 37 | eggstorage = initialize_component(config, "eggstorage", "scrapyd.eggstorage.FilesystemEggStorage") 38 | 39 | eggversion = os.environ.get("SCRAPYD_EGG_VERSION", None) 40 | sanitized_version, egg = eggstorage.get(project, eggversion) 41 | 42 | tmp = None 43 | # egg can be None if the project is not in egg storage: for example, if Scrapyd is invoked within a Scrapy project. 44 | if egg: 45 | try: 46 | if hasattr(egg, "name"): # for example, FileIO 47 | activate_egg(egg.name) 48 | else: # for example, BytesIO 49 | prefix = f"{project}-{sanitized_version}-" 50 | tmp = tempfile.NamedTemporaryFile(suffix=".egg", prefix=prefix, delete=False) 51 | shutil.copyfileobj(egg, tmp) 52 | tmp.close() 53 | activate_egg(tmp.name) 54 | finally: 55 | egg.close() 56 | 57 | try: 58 | yield 59 | finally: 60 | if tmp: 61 | os.remove(tmp.name) 62 | 63 | 64 | def main(): 65 | project = os.environ["SCRAPY_PROJECT"] 66 | with project_environment(project): 67 | from scrapy.cmdline import execute 68 | 69 | # This calls scrapy.utils.project.get_project_settings(). It uses SCRAPY_SETTINGS_MODULE if set. Otherwise, it 70 | # calls scrapy.utils.conf.init_env(), which reads Scrapy's configuration sources, looks for a project matching 71 | # SCRAPY_PROJECT in the [settings] section, and uses its value for SCRAPY_SETTINGS_MODULE. 
72 | # https://docs.scrapy.org/en/latest/topics/commands.html#configuration-settings 73 | execute() 74 | 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /scrapyd/scheduler.py: -------------------------------------------------------------------------------- 1 | from zope.interface import implementer 2 | 3 | from scrapyd.interfaces import ISpiderScheduler 4 | from scrapyd.utils import get_spider_queues 5 | 6 | 7 | @implementer(ISpiderScheduler) 8 | class SpiderScheduler: 9 | def __init__(self, config): 10 | self.config = config 11 | self.update_projects() 12 | 13 | def schedule(self, project, spider_name, priority=0.0, **spider_args): 14 | self.queues[project].add(spider_name, priority=priority, **spider_args) 15 | 16 | def list_projects(self): 17 | return list(self.queues) 18 | 19 | def update_projects(self): 20 | self.queues = get_spider_queues(self.config) 21 | -------------------------------------------------------------------------------- /scrapyd/spiderqueue.py: -------------------------------------------------------------------------------- 1 | from zope.interface import implementer 2 | 3 | from scrapyd import sqlite 4 | from scrapyd.interfaces import ISpiderQueue 5 | 6 | 7 | @implementer(ISpiderQueue) 8 | class SqliteSpiderQueue: 9 | def __init__(self, config, project, table="spider_queue"): 10 | self.q = sqlite.initialize(sqlite.JsonSqlitePriorityQueue, config, project, table) 11 | 12 | def add(self, name, priority=0.0, **spider_args): 13 | message = spider_args.copy() 14 | message["name"] = name 15 | self.q.put(message, priority=priority) 16 | 17 | def pop(self): 18 | return self.q.pop() 19 | 20 | def count(self): 21 | return len(self.q) 22 | 23 | def list(self): 24 | return [message for message, _ in self.q] 25 | 26 | def remove(self, func): 27 | return self.q.remove(func) 28 | 29 | def clear(self): 30 | self.q.clear() 31 | -------------------------------------------------------------------------------- /scrapyd/sqlite.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import os 4 | import sqlite3 5 | 6 | 7 | # The database argument is "jobs" (in SqliteJobStorage), or a project (in SqliteSpiderQueue) from get_spider_queues(), 8 | # which gets projects from get_project_list(), which gets projects from egg storage. We check for directory traversal 9 | # in egg storage, instead. 
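# For example, with the default dbs_dir of "dbs", initialize(JsonSqlitePriorityQueue, config, "myproject", "spider_queue") returns a queue backed by dbs/myproject.db ("myproject" is an illustrative project name).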
10 | def initialize(cls, config, database, table): 11 | dbs_dir = config.get("dbs_dir", "dbs") 12 | if dbs_dir == ":memory:": 13 | connection_string = dbs_dir 14 | else: 15 | if not os.path.exists(dbs_dir): 16 | os.makedirs(dbs_dir) 17 | connection_string = os.path.join(dbs_dir, f"{database}.db") 18 | 19 | return cls(connection_string, table) 20 | 21 | 22 | # https://docs.python.org/3/library/sqlite3.html#sqlite3-adapter-converter-recipes 23 | def adapt_datetime(val): 24 | return val.strftime("%Y-%m-%d %H:%M:%S.%f") 25 | 26 | 27 | def convert_datetime(val): 28 | return datetime.datetime.strptime(val.decode(), "%Y-%m-%d %H:%M:%S.%f") 29 | 30 | 31 | sqlite3.register_adapter(datetime.datetime, adapt_datetime) 32 | sqlite3.register_converter("datetime", convert_datetime) 33 | 34 | 35 | class SqliteMixin: 36 | def __init__(self, database, table): 37 | self.database = database or ":memory:" 38 | self.table = table 39 | # Regarding check_same_thread, see http://twistedmatrix.com/trac/ticket/4040 40 | self.conn = sqlite3.connect(self.database, check_same_thread=False) 41 | 42 | def __len__(self): 43 | return self.conn.execute(f"SELECT COUNT(*) FROM {self.table}").fetchone()[0] 44 | 45 | # SQLite JSON is enabled by default since 3.38.0 (2022-02-22), and JSONB is available since 3.45.0 (2024-01-15). 46 | # https://sqlite.org/json1.html 47 | def encode(self, obj): 48 | return sqlite3.Binary(json.dumps(obj).encode("ascii")) 49 | 50 | def decode(self, obj): 51 | return json.loads(bytes(obj).decode("ascii")) 52 | 53 | 54 | class JsonSqlitePriorityQueue(SqliteMixin): 55 | """ 56 | SQLite priority queue. It relies on SQLite concurrency support for providing atomic inter-process operations. 57 | 58 | .. versionadded:: 1.0.0 59 | """ 60 | 61 | def __init__(self, database=None, table="queue"): 62 | super().__init__(database, table) 63 | 64 | self.conn.execute( 65 | f"CREATE TABLE IF NOT EXISTS {table} (id integer PRIMARY KEY, priority real key, message blob)" 66 | ) 67 | 68 | def put(self, message, priority=0.0): 69 | self.conn.execute( 70 | f"INSERT INTO {self.table} (priority, message) VALUES (?, ?)", 71 | (priority, self.encode(message)), 72 | ) 73 | self.conn.commit() 74 | 75 | def pop(self): 76 | row = self.conn.execute(f"SELECT id, message FROM {self.table} ORDER BY priority DESC LIMIT 1").fetchone() 77 | if row is None: 78 | return None 79 | _id, message = row 80 | 81 | # If a row vanished, try again. 82 | if not self.conn.execute(f"DELETE FROM {self.table} WHERE id = ?", (_id,)).rowcount: 83 | self.conn.rollback() 84 | return self.pop() 85 | 86 | self.conn.commit() 87 | return self.decode(message) 88 | 89 | def remove(self, func): 90 | deleted = 0 91 | for _id, message in self.conn.execute(f"SELECT id, message FROM {self.table}"): 92 | if func(self.decode(message)): 93 | # If a row vanished, try again. 94 | if not self.conn.execute(f"DELETE FROM {self.table} WHERE id = ?", (_id,)).rowcount: 95 | self.conn.rollback() 96 | return self.remove(func) 97 | deleted += 1 98 | 99 | self.conn.commit() 100 | return deleted 101 | 102 | def clear(self): 103 | self.conn.execute(f"DELETE FROM {self.table}") 104 | self.conn.commit() 105 | 106 | def __iter__(self): 107 | return ( 108 | (self.decode(message), priority) 109 | for message, priority in self.conn.execute( 110 | f"SELECT message, priority FROM {self.table} ORDER BY priority DESC" 111 | ) 112 | ) 113 | 114 | 115 | class SqliteFinishedJobs(SqliteMixin): 116 | """ 117 | SQLite finished jobs. 118 | 119 | .. 
versionadded:: 1.3.0 120 | Job storage was previously in-memory only. 121 | """ 122 | 123 | def __init__(self, database=None, table="finished_jobs"): 124 | super().__init__(database, table) 125 | 126 | self.conn.execute( 127 | f"CREATE TABLE IF NOT EXISTS {table} " 128 | "(id integer PRIMARY KEY, project text, spider text, job text, start_time datetime, end_time datetime)" 129 | ) 130 | 131 | def add(self, job): 132 | self.conn.execute( 133 | f"INSERT INTO {self.table} (project, spider, job, start_time, end_time) VALUES (?, ?, ?, ?, ?)", 134 | (job.project, job.spider, job.job, job.start_time, job.end_time), 135 | ) 136 | self.conn.commit() 137 | 138 | def clear(self, finished_to_keep=None): 139 | where = "" 140 | if finished_to_keep: 141 | limit = len(self) - finished_to_keep 142 | if limit <= 0: 143 | return # nothing to delete 144 | where = f"WHERE id <= (SELECT max(id) FROM (SELECT id FROM {self.table} ORDER BY end_time LIMIT {limit}))" 145 | 146 | self.conn.execute(f"DELETE FROM {self.table} {where}") 147 | self.conn.commit() 148 | 149 | def __iter__(self): 150 | return ( 151 | ( 152 | project, 153 | spider, 154 | job, 155 | datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S.%f"), 156 | datetime.datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S.%f"), 157 | ) 158 | for project, spider, job, start_time, end_time in self.conn.execute( 159 | f"SELECT project, spider, job, start_time, end_time FROM {self.table} ORDER BY end_time DESC" 160 | ) 161 | ) 162 | -------------------------------------------------------------------------------- /scrapyd/txapp.py: -------------------------------------------------------------------------------- 1 | # this file is used to start scrapyd with twistd -y 2 | from scrapyd import get_application 3 | 4 | application = get_application() 5 | -------------------------------------------------------------------------------- /scrapyd/utils.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | from scrapy.utils.misc import load_object 4 | from twisted.python import filepath 5 | 6 | from scrapyd.exceptions import DirectoryTraversalError 7 | 8 | 9 | def initialize_component(config, setting, default, *args): 10 | path = config.get(setting, default) 11 | cls = load_object(path) 12 | return cls(config, *args) 13 | 14 | 15 | def local_items(items_dir, parsed): 16 | return items_dir and parsed.scheme.lower() in ("", "file", os.path.splitdrive(items_dir)[0].rstrip(":").lower()) 17 | 18 | 19 | def get_file_path(directory, project, spider, job, extension): 20 | # https://docs.twisted.org/en/stable/api/twisted.python.filepath.FilePath.html 21 | try: 22 | return filepath.FilePath(directory).child(project).child(spider).child(f"{job}.{extension}") 23 | except filepath.InsecurePath as e: 24 | raise DirectoryTraversalError(os.path.join(project, spider, f"{job}.{extension}")) from e 25 | 26 | 27 | def get_spider_queues(config): 28 | """Return a dict of Spider Queues keyed by project name""" 29 | spiderqueue_cls = load_object(config.get("spiderqueue", "scrapyd.spiderqueue.SqliteSpiderQueue")) 30 | return {project: spiderqueue_cls(config, project) for project in get_project_list(config)} 31 | 32 | 33 | def get_project_list(config): 34 | """Get list of projects by inspecting the eggs storage and the ones defined in 35 | the scrapy.cfg [settings] section 36 | """ 37 | 38 | # The poller and scheduler use this function (via get_spider_queues), and they aren't initialized with the 39 | # application. 
So, we need to re-initialize this component here. 40 | eggstorage = initialize_component(config, "eggstorage", "scrapyd.eggstorage.FilesystemEggStorage") 41 | return eggstorage.list_projects() + [project for project, _ in config.items("settings", default=[])] 42 | -------------------------------------------------------------------------------- /scrapyd/webservice.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | import json 5 | import os 6 | import sys 7 | import traceback 8 | import uuid 9 | import zipfile 10 | from collections import defaultdict 11 | from io import BytesIO 12 | from subprocess import PIPE, Popen 13 | from typing import ClassVar 14 | 15 | from twisted.logger import Logger 16 | from twisted.web import error, http, resource 17 | 18 | from scrapyd.exceptions import EggNotFoundError, ProjectNotFoundError, RunnerError 19 | 20 | log = Logger() 21 | 22 | 23 | def param( 24 | decoded: str, 25 | *, 26 | dest: str | None = None, 27 | required: bool = True, 28 | default=None, 29 | multiple: bool = False, 30 | type=str, # noqa: A002 like Click 31 | ): 32 | encoded = decoded.encode() 33 | if dest is None: 34 | dest = decoded 35 | 36 | def decorator(func): 37 | @functools.wraps(func) 38 | def wrapper(self, txrequest, *args, **kwargs): 39 | default_value = default() if callable(default) else default 40 | 41 | if encoded not in txrequest.args: 42 | if required: 43 | raise error.Error(code=http.OK, message=b"'%b' parameter is required" % encoded) 44 | 45 | value = default_value 46 | else: 47 | values = (value.decode() if type is str else type(value) for value in txrequest.args.pop(encoded)) 48 | try: 49 | value = list(values) if multiple else next(values) 50 | except (UnicodeDecodeError, ValueError) as e: 51 | raise error.Error(code=http.OK, message=b"%b is invalid: %b" % (encoded, str(e).encode())) from e 52 | 53 | kwargs[dest] = value 54 | 55 | return func(self, txrequest, *args, **kwargs) 56 | 57 | return wrapper 58 | 59 | return decorator 60 | 61 | 62 | class SpiderList: 63 | cache: ClassVar = defaultdict(dict) 64 | 65 | def get(self, project, version, *, runner): 66 | """Return the ``scrapy list`` output for the project and version, using a cache if possible.""" 67 | try: 68 | return self.cache[project][version] 69 | except KeyError: 70 | return self.set(project, version, runner=runner) 71 | 72 | def set(self, project, version, *, runner): 73 | """Calculate, cache and return the ``scrapy list`` output for the project and version, bypassing the cache.""" 74 | 75 | env = os.environ.copy() 76 | env["PYTHONIOENCODING"] = "UTF-8" 77 | env["SCRAPY_PROJECT"] = project 78 | # If the version is not provided, then the runner uses the default version, determined by egg storage. 79 | if version: 80 | env["SCRAPYD_EGG_VERSION"] = version 81 | 82 | args = [sys.executable, "-m", runner, "list", "-s", "LOG_STDOUT=0"] 83 | process = Popen(args, stdout=PIPE, stderr=PIPE, env=env) 84 | stdout, stderr = process.communicate() 85 | if process.returncode: 86 | raise RunnerError((stderr or stdout or b"").decode()) 87 | 88 | spiders = stdout.decode().splitlines() 89 | 90 | # Note: If the cache is empty, that doesn't mean that this is the project's only version; it simply means that 91 | # this is the first version called in this Scrapyd process. 
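# For example (illustrative names), after set("myproject", "r2", runner=...) the cache maps
# project -> version -> spider names: {"myproject": {"r2": ["spider1", "spider2"]}}.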
92 | 93 | # Evict the return value of version=None calls, since we can't determine whether this version is the default 94 | # version (in which case we would overwrite it) or not (in which case we would keep it). 95 | self.cache[project].pop(None, None) 96 | self.cache[project][version] = spiders 97 | return spiders 98 | 99 | def delete(self, project, version=None): 100 | if version is None: 101 | self.cache.pop(project, None) 102 | else: 103 | # Evict the return value of version=None calls, since we can't determine whether this version is the 104 | # default version (in which case we would pop it) or not (in which case we would keep it). 105 | self.cache[project].pop(None, None) 106 | self.cache[project].pop(version, None) 107 | 108 | 109 | spider_list = SpiderList() 110 | 111 | 112 | # WebserviceResource 113 | class WsResource(resource.Resource): 114 | """ 115 | .. versionchanged:: 1.1.0 116 | Add ``node_name`` to the response in all subclasses. 117 | """ 118 | 119 | json_encoder = json.JSONEncoder() 120 | 121 | def __init__(self, root): 122 | super().__init__() 123 | self.root = root 124 | 125 | def render(self, txrequest): 126 | try: 127 | data = super().render(txrequest) 128 | except Exception as e: # noqa: BLE001 129 | log.failure("") 130 | 131 | if isinstance(e, error.Error): 132 | txrequest.setResponseCode(int(e.status)) 133 | 134 | if self.root.debug: 135 | return traceback.format_exc().encode() 136 | 137 | message = e.message.decode() if isinstance(e, error.Error) else f"{type(e).__name__}: {e}" 138 | data = {"status": "error", "message": message} 139 | else: 140 | if data is not None: 141 | data["status"] = "ok" 142 | 143 | if data is None: # render_OPTIONS 144 | content = b"" 145 | else: 146 | data["node_name"] = self.root.node_name 147 | content = self.json_encoder.encode(data).encode() + b"\n" 148 | txrequest.setHeader("Content-Type", "application/json") 149 | 150 | # https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS#preflighted_requests 151 | txrequest.setHeader("Access-Control-Allow-Origin", "*") 152 | txrequest.setHeader("Access-Control-Allow-Methods", self.methods) 153 | txrequest.setHeader("Access-Control-Allow-Headers", "X-Requested-With") 154 | txrequest.setHeader("Content-Length", str(len(content))) 155 | return content 156 | 157 | def render_OPTIONS(self, txrequest): 158 | txrequest.setHeader("Allow", self.methods) 159 | txrequest.setResponseCode(http.NO_CONTENT) 160 | 161 | @functools.cached_property 162 | def methods(self): 163 | methods = ["OPTIONS", "HEAD"] 164 | if hasattr(self, "render_GET"): 165 | methods.append("GET") 166 | if hasattr(self, "render_POST"): 167 | methods.append("POST") 168 | return ", ".join(methods) 169 | 170 | 171 | class DaemonStatus(WsResource): 172 | """ 173 | .. versionadded:: 1.2.0 174 | """ 175 | 176 | def render_GET(self, txrequest): 177 | return { 178 | "pending": sum(queue.count() for queue in self.root.poller.queues.values()), 179 | "running": len(self.root.launcher.processes), 180 | "finished": len(self.root.launcher.finished), 181 | } 182 | 183 | 184 | class Schedule(WsResource): 185 | """ 186 | .. versionchanged:: 1.2.0 187 | Add ``_version`` and ``jobid`` parameters. 188 | .. versionchanged:: 1.3.0 189 | Add ``priority`` parameter. 
190 | """ 191 | 192 | @param("project") 193 | @param("spider") 194 | @param("_version", dest="version", required=False, default=None) 195 | # See https://github.com/scrapy/scrapyd/pull/215 196 | @param("jobid", required=False, default=lambda: uuid.uuid1().hex) 197 | @param("priority", required=False, default=0, type=float) 198 | @param("setting", required=False, default=list, multiple=True) 199 | def render_POST(self, txrequest, project, spider, version, jobid, priority, setting): 200 | if project not in self.root.poller.queues: 201 | raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) 202 | 203 | if version and self.root.eggstorage.get(project, version) == (None, None): 204 | raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode()) 205 | 206 | spiders = spider_list.get(project, version, runner=self.root.runner) 207 | if spider not in spiders: 208 | raise error.Error(code=http.OK, message=b"spider '%b' not found" % spider.encode()) 209 | 210 | args = {key.decode(): values[0].decode() for key, values in txrequest.args.items()} 211 | if version is not None: 212 | args["_version"] = version 213 | 214 | self.root.scheduler.schedule( 215 | project, 216 | spider, 217 | priority=priority, 218 | settings=dict(s.split("=", 1) for s in setting), 219 | _job=jobid, 220 | **args, 221 | ) 222 | return {"jobid": jobid} 223 | 224 | 225 | class Cancel(WsResource): 226 | @param("project") 227 | @param("job") 228 | # Instead of os.name, use sys.platform, which disambiguates Cygwin, which implements SIGINT not SIGBREAK. 229 | # https://cygwin.com/cygwin-ug-net/kill.html 230 | # https://github.com/scrapy/scrapy/blob/06f9c28/tests/test_crawler.py#L886 231 | @param("signal", required=False, default="INT" if sys.platform != "win32" else "BREAK") 232 | def render_POST(self, txrequest, project, job, signal): 233 | if project not in self.root.poller.queues: 234 | raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) 235 | 236 | prevstate = None 237 | 238 | if self.root.poller.queues[project].remove(lambda message: message["_job"] == job): 239 | prevstate = "pending" 240 | 241 | for process in self.root.launcher.processes.values(): 242 | if process.project == project and process.job == job: 243 | process.transport.signalProcess(signal) 244 | prevstate = "running" 245 | 246 | return {"prevstate": prevstate} 247 | 248 | 249 | class AddVersion(WsResource): 250 | @param("project") 251 | @param("version") 252 | @param("egg", type=bytes) 253 | def render_POST(self, txrequest, project, version, egg): 254 | if not zipfile.is_zipfile(BytesIO(egg)): 255 | raise error.Error( 256 | code=http.OK, message=b"egg is not a ZIP file (if using curl, use egg=@path not egg=path)" 257 | ) 258 | 259 | self.root.eggstorage.put(BytesIO(egg), project, version) 260 | self.root.update_projects() 261 | 262 | spiders = spider_list.set(project, version, runner=self.root.runner) 263 | return {"project": project, "version": version, "spiders": len(spiders)} 264 | 265 | 266 | class ListProjects(WsResource): 267 | def render_GET(self, txrequest): 268 | return {"projects": self.root.scheduler.list_projects()} 269 | 270 | 271 | class ListVersions(WsResource): 272 | @param("project") 273 | def render_GET(self, txrequest, project): 274 | return {"versions": self.root.eggstorage.list(project)} 275 | 276 | 277 | class ListSpiders(WsResource): 278 | """ 279 | .. versionchanged:: 1.2.0 280 | Add ``_version`` parameter. 
281 | """ 282 | 283 | @param("project") 284 | @param("_version", dest="version", required=False, default=None) 285 | def render_GET(self, txrequest, project, version): 286 | if project not in self.root.poller.queues: 287 | raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) 288 | 289 | if version and self.root.eggstorage.get(project, version) == (None, None): 290 | raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode()) 291 | 292 | return {"spiders": spider_list.get(project, version, runner=self.root.runner)} 293 | 294 | 295 | class Status(WsResource): 296 | """ 297 | .. versionadded:: 1.5.0 298 | """ 299 | 300 | @param("job") 301 | @param("project", required=False) 302 | def render_GET(self, txrequest, job, project): 303 | queues = self.root.poller.queues 304 | if project is not None and project not in queues: 305 | raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) 306 | 307 | result = {"currstate": None} 308 | 309 | for finished in self.root.launcher.finished: 310 | if (project is None or finished.project == project) and finished.job == job: 311 | result["currstate"] = "finished" 312 | return result 313 | 314 | for process in self.root.launcher.processes.values(): 315 | if (project is None or process.project == project) and process.job == job: 316 | result["currstate"] = "running" 317 | return result 318 | 319 | for queue_name in queues if project is None else [project]: 320 | for message in queues[queue_name].list(): 321 | if message["_job"] == job: 322 | result["currstate"] = "pending" 323 | return result 324 | 325 | return result 326 | 327 | 328 | class ListJobs(WsResource): 329 | """ 330 | .. versionchanged:: 1.1.0 331 | Add ``start_time`` to running jobs in the response. 332 | .. versionchanged:: 1.2.0 333 | Add ``pid`` to running jobs in the response. 334 | .. versionchanged:: 1.3.0 335 | The ``project`` parameter is optional. Add ``project`` to all jobs in the response. 336 | .. versionchanged:: 1.4.0 337 | Add ``log_url`` and ``items_url`` to finished jobs in the response. 338 | .. versionchanged:: 1.5.0 339 | Add ``version``, ``settings`` and ``args`` to pending jobs in the response. 
340 | """ 341 | 342 | @param("project", required=False) 343 | def render_GET(self, txrequest, project): 344 | queues = self.root.poller.queues 345 | if project is not None and project not in queues: 346 | raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) 347 | 348 | return { 349 | "pending": [ 350 | { 351 | "id": message["_job"], 352 | "project": queue_name, 353 | "spider": message["name"], 354 | "version": message.get("_version"), 355 | "settings": message.get("settings", {}), 356 | "args": {k: v for k, v in message.items() if k not in ("name", "_job", "_version", "settings")}, 357 | } 358 | for queue_name in (queues if project is None else [project]) 359 | for message in queues[queue_name].list() 360 | ], 361 | "running": [ 362 | { 363 | "id": process.job, 364 | "project": process.project, 365 | "spider": process.spider, 366 | "pid": process.pid, 367 | "start_time": str(process.start_time), 368 | "log_url": self.root.get_log_url(process), 369 | "items_url": self.root.get_item_url(process), 370 | } 371 | for process in self.root.launcher.processes.values() 372 | if project is None or process.project == project 373 | ], 374 | "finished": [ 375 | { 376 | "id": finished.job, 377 | "project": finished.project, 378 | "spider": finished.spider, 379 | "start_time": str(finished.start_time), 380 | "end_time": str(finished.end_time), 381 | "log_url": self.root.get_log_url(finished), 382 | "items_url": self.root.get_item_url(finished), 383 | } 384 | for finished in self.root.launcher.finished 385 | if project is None or finished.project == project 386 | ], 387 | } 388 | 389 | 390 | class DeleteProject(WsResource): 391 | @param("project") 392 | def render_POST(self, txrequest, project): 393 | self._delete_version(project) 394 | spider_list.delete(project) 395 | return {} 396 | 397 | def _delete_version(self, project, version=None): 398 | try: 399 | self.root.eggstorage.delete(project, version) 400 | except ProjectNotFoundError as e: 401 | raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) from e 402 | except EggNotFoundError as e: 403 | raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode()) from e 404 | else: 405 | self.root.update_projects() 406 | 407 | 408 | class DeleteVersion(DeleteProject): 409 | @param("project") 410 | @param("version") 411 | def render_POST(self, txrequest, project, version): 412 | self._delete_version(project, version) 413 | spider_list.delete(project, version) 414 | return {} 415 | -------------------------------------------------------------------------------- /scrapyd/website.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import socket 3 | from datetime import datetime, timedelta 4 | from html import escape 5 | from textwrap import dedent, indent 6 | from urllib.parse import quote, urlsplit 7 | 8 | from scrapy.utils.misc import load_object 9 | from twisted.application.service import IServiceCollection 10 | from twisted.python import filepath 11 | from twisted.web import resource, static 12 | 13 | from scrapyd.interfaces import IEggStorage, IPoller, ISpiderScheduler 14 | from scrapyd.utils import local_items 15 | 16 | 17 | # Use local DirectoryLister class. 18 | class File(static.File): 19 | def directoryListing(self): 20 | path = self.path 21 | names = self.listNames() 22 | return DirectoryLister(path, names, self.contentTypes, self.contentEncodings, self.defaultType) 23 | 24 | 25 | # Add "Last modified" column. 
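# DirectoryLister below overrides the template and linePattern of twisted.web.static.DirectoryLister so that
# each row gains a "Last modified" cell, filled from the modification time in _getFilesAndDirectories().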
26 | class DirectoryLister(static.DirectoryLister): 27 | template = """<html> 28 | <head> 29 | <title>%(header)s</title> 30 | <style> 31 | .even-dir { background-color: #efe0ef } 32 | .even { background-color: #eee } 33 | .odd-dir {background-color: #f0d0ef } 34 | .odd { background-color: #dedede } 35 | .icon { text-align: center } 36 | .listing { 37 | margin-left: auto; 38 | margin-right: auto; 39 | width: 50%%; 40 | padding: 0.1em; 41 | } 42 | 
43 | body { border: 0; padding: 0; margin: 0; background-color: #efefef; } 44 | h1 {padding: 0.1em; background-color: #777; color: white; border-bottom: thin white dashed;} 45 | 46 | </style> 47 | </head> 48 | 49 | <body> 50 | <h1>%(header)s</h1> 51 | 
52 | <table> 53 | <thead> 54 | <tr> 55 | <th>Filename</th> 56 | <th>Size</th> 57 | <th>Last modified</th> 58 | <th>Content type</th> 59 | <th>Content encoding</th> 60 | </tr> 61 | </thead> 62 | <tbody> 63 | %(tableContent)s 64 | </tbody> 65 | </table>
66 | 67 | 68 | 69 | """ 70 | 71 | linePattern = """ 72 | %(text)s 73 | %(size)s 74 | %(modified)s 75 | %(type)s 76 | %(encoding)s 77 | 78 | """ 79 | 80 | def _getFilesAndDirectories(self, directory): 81 | files = [] 82 | dirs = [] 83 | 84 | for path in directory: 85 | if isinstance(path, bytes): 86 | path = path.decode() # noqa: PLW2901 from Twisted 87 | 88 | url = quote(path, "/") 89 | escaped_path = escape(path) 90 | child_path = filepath.FilePath(self.path).child(path) 91 | modified = datetime.fromtimestamp(child_path.getModificationTime()).strftime("%Y-%m-%d %H:%M") # NEW 92 | 93 | if child_path.isdir(): 94 | dirs.append( 95 | { 96 | "text": escaped_path + "/", 97 | "href": url + "/", 98 | "size": "", 99 | "type": "[Directory]", 100 | "encoding": "", 101 | "modified": modified, # NEW 102 | } 103 | ) 104 | else: 105 | mimetype, encoding = static.getTypeAndEncoding( 106 | path, self.contentTypes, self.contentEncodings, self.defaultType 107 | ) 108 | try: 109 | size = child_path.getsize() 110 | except OSError: 111 | continue 112 | files.append( 113 | { 114 | "text": escaped_path, 115 | "href": url, 116 | "type": f"[{mimetype}]", 117 | "encoding": (encoding and f"[{encoding}]" or ""), 118 | "size": static.formatFileSize(size), 119 | "modified": modified, # NEW 120 | } 121 | ) 122 | return dirs, files 123 | 124 | 125 | def _get_file_url(base, directory, job, extension): 126 | if os.path.exists(os.path.join(directory, job.project, job.spider, f"{job.job}.{extension}")): 127 | return f"/{base}/{job.project}/{job.spider}/{job.job}.{extension}" 128 | return None 129 | 130 | 131 | class Root(resource.Resource): 132 | def __init__(self, config, app): 133 | super().__init__() 134 | 135 | self.app = app 136 | self.logs_dir = config.get("logs_dir", "logs") 137 | self.items_dir = config.get("items_dir", "") 138 | self.debug = config.getboolean("debug", False) 139 | self.runner = config.get("runner", "scrapyd.runner") 140 | self.prefix_header = config.get("prefix_header", "x-forwarded-prefix") 141 | self.local_items = local_items(self.items_dir, urlsplit(self.items_dir)) 142 | self.node_name = config.get("node_name", socket.gethostname()) 143 | 144 | if self.logs_dir: 145 | self.putChild(b"logs", File(self.logs_dir, "text/plain")) 146 | if self.local_items: 147 | self.putChild(b"items", File(self.items_dir, "text/plain")) 148 | 149 | for service_name, service_path in config.items("services", default=[]): 150 | if service_path: 151 | service_cls = load_object(service_path) 152 | self.putChild(service_name.encode(), service_cls(self)) 153 | 154 | # Add web UI last, since its behavior can depend on others' presence. 
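# For example, the Jobs page adds a Cancel column only if a cancel.json webservice was registered above,
# and an Items column only if items_dir is a local path.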
155 | self.putChild(b"", Home(self)) 156 | self.putChild(b"jobs", Jobs(self)) 157 | 158 | def update_projects(self): 159 | self.poller.update_projects() 160 | self.scheduler.update_projects() 161 | 162 | def get_log_url(self, job): 163 | return _get_file_url("logs", self.logs_dir, job, "log") 164 | 165 | def get_item_url(self, job): 166 | if self.local_items: 167 | return _get_file_url("items", self.items_dir, job, "jl") 168 | return None 169 | 170 | @property 171 | def launcher(self): 172 | return IServiceCollection(self.app, self.app).getServiceNamed("launcher") 173 | 174 | @property 175 | def scheduler(self): 176 | return self.app.getComponent(ISpiderScheduler) 177 | 178 | @property 179 | def eggstorage(self): 180 | return self.app.getComponent(IEggStorage) 181 | 182 | @property 183 | def poller(self): 184 | return self.app.getComponent(IPoller) 185 | 186 | 187 | class PrefixHeaderMixin: 188 | def get_base_path(self, txrequest): 189 | return txrequest.getHeader(self.root.prefix_header) or "" 190 | 191 | 192 | class Home(PrefixHeaderMixin, resource.Resource): 193 | def __init__(self, root): 194 | super().__init__() 195 | self.root = root 196 | 197 | def prepare_projects(self): 198 | if projects := self.root.scheduler.list_projects(): 199 | lis = "\n".join(f"
  • {escape(project_name)}
  • " for project_name in sorted(projects)) 200 | return f"

    Scrapy projects:

    \n
      \n{indent(lis, ' ')}\n
    " 201 | return "

    No Scrapy projects yet.

    " 202 | 203 | def render_GET(self, txrequest): 204 | base_path = self.get_base_path(txrequest) 205 | 206 | content = dedent( 207 | f"""\ 208 | 209 | 210 | 211 | 212 | 213 | Scrapyd 214 | 217 | 218 | 219 |

    Scrapyd

    220 | 221 | 227 | 228 | {indent(self.prepare_projects(), " ")} 229 | 230 |

    231 | This web UI is for monitoring only. To upload projects and schedule crawls, use the API. 232 | For example, using curl: 233 |

    234 | 235 |

    236 | curl http://localhost:6800/schedule.json -d project=default -d spider=somespider 237 |

    238 | 239 |

    240 | See the Scrapyd documentation for details. 241 |

    242 | 243 | 244 | """ 245 | ) 246 | content = content.encode() 247 | 248 | txrequest.setHeader("Content-Type", "text/html; charset=utf-8") 249 | txrequest.setHeader("Content-Length", str(len(content))) 250 | return content 251 | 252 | 253 | def no_microseconds(timelike): 254 | # microsecond for datetime, microseconds for timedelta. 255 | ms = timelike.microsecond if hasattr(timelike, "microsecond") else timelike.microseconds 256 | return timelike - timedelta(microseconds=ms) 257 | 258 | 259 | class Jobs(PrefixHeaderMixin, resource.Resource): 260 | def __init__(self, root): 261 | super().__init__() 262 | self.root = root 263 | 264 | self.headers = [ 265 | "Project", 266 | "Spider", 267 | "Job", 268 | "PID", 269 | "Start", 270 | "Runtime", 271 | "Finish", 272 | "Log", 273 | ] 274 | # Hide the Items column if items_dir isn't local. 275 | if self.root.local_items: 276 | self.headers.append("Items") 277 | # Hide the Cancel column if no cancel.json webservice. 278 | if b"cancel.json" in self.root.children: 279 | self.headers.append("Cancel") 280 | 281 | def cancel_button(self, project, job): 282 | return dedent( 283 | f""" 284 |
    285 | 286 | 287 | 288 |
    289 | """ 290 | ) 291 | 292 | def html_log_url(self, job): 293 | if url := self.root.get_log_url(job): 294 | return f'Log' 295 | return None 296 | 297 | def html_item_url(self, job): 298 | if url := self.root.get_item_url(job): 299 | return f'Items' 300 | return None 301 | 302 | def prepare_headers(self): 303 | ths = "\n".join(f"{header}" for header in self.headers) 304 | return f"\n{indent(ths, ' ')}\n" 305 | 306 | def prepare_row(self, row): 307 | tds = "\n".join(f"{'' if row.get(header) is None else row[header]}" for header in self.headers) 308 | return f"\n{indent(tds, ' ')}\n" 309 | 310 | def prepare_pending(self): 311 | return "\n".join( 312 | self.prepare_row( 313 | { 314 | "Project": escape(project), 315 | "Spider": escape(message["name"]), 316 | "Job": escape(message["_job"]), 317 | "Cancel": self.cancel_button(project, message["_job"]), 318 | } 319 | ) 320 | for project, queue in self.root.poller.queues.items() 321 | for message in queue.list() 322 | ) 323 | 324 | def prepare_running(self): 325 | return "\n".join( 326 | self.prepare_row( 327 | { 328 | "Project": escape(process.project), 329 | "Spider": escape(process.spider), 330 | "Job": escape(process.job), 331 | "PID": process.pid, 332 | "Start": no_microseconds(process.start_time), 333 | "Runtime": no_microseconds(datetime.now() - process.start_time), 334 | "Log": self.html_log_url(process), 335 | "Items": self.html_item_url(process), 336 | "Cancel": self.cancel_button(process.project, process.job), 337 | } 338 | ) 339 | for process in self.root.launcher.processes.values() 340 | ) 341 | 342 | def prepare_finished(self): 343 | return "\n".join( 344 | self.prepare_row( 345 | { 346 | "Project": escape(job.project), 347 | "Spider": escape(job.spider), 348 | "Job": escape(job.job), 349 | "Start": no_microseconds(job.start_time), 350 | "Runtime": no_microseconds(job.end_time - job.start_time), 351 | "Finish": no_microseconds(job.end_time), 352 | "Log": self.html_log_url(job), 353 | "Items": self.html_item_url(job), 354 | } 355 | ) 356 | for job in self.root.launcher.finished 357 | ) 358 | 359 | def render_GET(self, txrequest): 360 | self.base_path = self.get_base_path(txrequest) 361 | 362 | content = dedent( 363 | f"""\ 364 | 365 | 366 | 367 | 368 | 369 | Scrapyd 370 | 378 | 379 | 380 |

    Jobs

    381 |

    Go up

    382 | 383 | 384 | {indent(self.prepare_headers(), " ")} 385 | 386 | 387 | 388 | 389 | 390 | {indent(self.prepare_pending(), " ")} 391 | 392 | 393 | 394 | 395 | 396 | {indent(self.prepare_running(), " ")} 397 | 398 | 399 | 400 | 401 | 402 | {indent(self.prepare_finished(), " ")} 403 | 404 |
    Pending
    Running
    Finished
    405 | 406 | 407 | """ 408 | ).encode() 409 | 410 | txrequest.setHeader("Content-Type", "text/html; charset=utf-8") 411 | txrequest.setHeader("Content-Length", str(len(content))) 412 | return content 413 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import io 3 | import os.path 4 | import pkgutil 5 | 6 | from twisted.logger import eventAsText 7 | 8 | from scrapyd.launcher import ScrapyProcessProtocol 9 | 10 | 11 | def touch(path): 12 | path.parent.mkdir(parents=True) 13 | path.touch() 14 | 15 | 16 | def get_egg_data(basename): 17 | return pkgutil.get_data("tests", f"fixtures/{basename}.egg") 18 | 19 | 20 | def has_settings(): 21 | return os.path.exists("scrapy.cfg") 22 | 23 | 24 | def root_add_version(root, project, version, basename): 25 | root.eggstorage.put(io.BytesIO(get_egg_data(basename)), project, version) 26 | 27 | 28 | def get_message(captured): 29 | return eventAsText(captured[0]).split(" ", 1)[1] 30 | 31 | 32 | def get_finished_job(project="p1", spider="s1", job="j1", start_time=None, end_time=None): 33 | if start_time is None: 34 | start_time = datetime.datetime.now() 35 | if end_time is None: 36 | end_time = datetime.datetime.now() 37 | process = ScrapyProcessProtocol(project, spider, job, env={}, args=[]) 38 | process.start_time = start_time 39 | process.end_time = end_time 40 | return process 41 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import shutil 3 | 4 | import pytest 5 | from twisted.web import http 6 | from twisted.web.http import Request 7 | from twisted.web.test.requesthelper import DummyChannel 8 | 9 | from scrapyd import Config 10 | from scrapyd.app import application 11 | from scrapyd.interfaces import IEnvironment 12 | from scrapyd.webservice import spider_list 13 | from scrapyd.website import Root 14 | from tests import root_add_version 15 | 16 | BASEDIR = os.path.abspath(os.path.dirname(__file__)) 17 | 18 | 19 | @pytest.fixture(autouse=True) 20 | def _clear_spider_list_cache(): 21 | spider_list.cache.clear() 22 | 23 | 24 | @pytest.fixture() 25 | def txrequest(): 26 | http_channel = http.HTTPChannel() 27 | http_channel.makeConnection(DummyChannel.TCP()) 28 | return Request(http_channel) 29 | 30 | 31 | # Use this fixture when testing the Scrapyd web UI or API or writing configuration files. 
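# It changes the working directory to a fresh tmp_path, so scrapy.cfg and the directories Scrapyd creates
# relative to the working directory (such as dbs, eggs and logs) stay out of the repository checkout.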
32 | @pytest.fixture() 33 | def chdir(monkeypatch, tmp_path): 34 | monkeypatch.chdir(tmp_path) 35 | return tmp_path 36 | 37 | 38 | @pytest.fixture( 39 | params=[ 40 | None, 41 | (("items_dir", "items"), ("jobstorage", "scrapyd.jobstorage.SqliteJobStorage")), 42 | ], 43 | ids=["default", "custom"], 44 | ) 45 | def config(request, chdir): 46 | if request.param: 47 | shutil.copytree(os.path.join(BASEDIR, "fixtures", "filesystem"), chdir, dirs_exist_ok=True) 48 | config = Config() 49 | if request.param: 50 | for key, value in request.param: 51 | config.cp.set(Config.SECTION, key, value) 52 | return config 53 | 54 | 55 | @pytest.fixture() 56 | def app(config): 57 | return application(config) 58 | 59 | 60 | @pytest.fixture() 61 | def environ(app): 62 | return app.getComponent(IEnvironment) 63 | 64 | 65 | @pytest.fixture() 66 | def root(config, app): 67 | return Root(config, app) 68 | 69 | 70 | @pytest.fixture() 71 | def root_with_egg(root): 72 | root_add_version(root, "quotesbot", "0.1", "quotesbot") 73 | root.update_projects() 74 | return root 75 | -------------------------------------------------------------------------------- /tests/fixtures/entrypoint_missing.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/entrypoint_missing.egg -------------------------------------------------------------------------------- /tests/fixtures/filesystem/localproject/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/filesystem/localproject/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/filesystem/localproject/settings.py: -------------------------------------------------------------------------------- 1 | BOT_NAME = "localproject" 2 | SPIDER_MODULES = ["localproject.spiders"] 3 | NEWSPIDER_MODULE = "localproject.spiders" 4 | ROBOTSTXT_OBEY = True 5 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" 6 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" 7 | FEED_EXPORT_ENCODING = "utf-8" 8 | -------------------------------------------------------------------------------- /tests/fixtures/filesystem/localproject/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /tests/fixtures/filesystem/localproject/spiders/example.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class ExampleSpider(scrapy.Spider): 5 | name = "example" 6 | 7 | def start_requests(self): 8 | pass 9 | -------------------------------------------------------------------------------- /tests/fixtures/filesystem/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | localproject = localproject.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = localproject 12 | -------------------------------------------------------------------------------- /tests/fixtures/mybot.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/mybot.egg -------------------------------------------------------------------------------- /tests/fixtures/mybot2.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/mybot2.egg -------------------------------------------------------------------------------- /tests/fixtures/quotesbot.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/quotesbot.egg -------------------------------------------------------------------------------- /tests/fixtures/settings_asyncioreactor.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/settings_asyncioreactor.egg -------------------------------------------------------------------------------- /tests/fixtures/settings_log_stdout.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/settings_log_stdout.egg -------------------------------------------------------------------------------- /tests/fixtures/settings_raise.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/settings_raise.egg -------------------------------------------------------------------------------- /tests/fixtures/spiders_utf8.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/spiders_utf8.egg -------------------------------------------------------------------------------- /tests/mockapp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | from twisted.application import app 5 | from twisted.internet import reactor 6 | from twisted.python import log 7 | 8 | from scrapyd import Config 9 | from scrapyd.app import application 10 | 11 | if __name__ == 
"__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("http_port") 14 | parser.add_argument("--username") 15 | parser.add_argument("--password") 16 | args = parser.parse_args() 17 | 18 | config = Config() 19 | config.cp.set(Config.SECTION, "http_port", args.http_port) 20 | if args.username and args.password: 21 | config.cp.set(Config.SECTION, "username", args.username) 22 | config.cp.set(Config.SECTION, "password", args.password) 23 | 24 | log.startLogging(sys.stdout) 25 | 26 | app.startApplication(application(config=config), save=False) 27 | 28 | reactor.run() 29 | -------------------------------------------------------------------------------- /tests/mockserver.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import re 3 | import socket 4 | import sys 5 | from subprocess import PIPE, Popen 6 | from urllib.parse import urljoin 7 | 8 | BASEDIR = os.path.abspath(os.path.dirname(__file__)) 9 | 10 | 11 | def get_ephemeral_port(): 12 | # Somehow getting random high port doesn't work on pypy 13 | if re.search("PyPy", sys.version): 14 | return str(9112) 15 | s = socket.socket() 16 | s.bind(("", 0)) 17 | return str(s.getsockname()[1]) 18 | 19 | 20 | class MockScrapydServer: 21 | def __init__(self, username=None, password=None): 22 | self.username = username 23 | self.password = password 24 | 25 | def __enter__(self): 26 | self.http_port = get_ephemeral_port() 27 | command = [sys.executable, os.path.join(BASEDIR, "mockapp.py"), self.http_port] 28 | if self.username and self.password: 29 | command.extend([f"--username={self.username}", f"--password={self.password}"]) 30 | 31 | self.process = Popen(command, stdout=PIPE) 32 | 33 | # The loop is expected to run 3 times. 34 | # 2001-02-03 04:05:06-0000 [-] Log opened. 
35 | # 2001-02-03 04:05:06-0000 [-] Basic authentication disabled as either `username` or `password` is unset 36 | # 2001-02-03 04:05:06-0000 [-] Scrapyd web console available at http://127.0.0.1:53532/ 37 | self.head = [] 38 | for _ in range(10): 39 | line = self.process.stdout.readline() 40 | self.head.append(line) 41 | if address := re.search("available at (.+/)", line.decode()): 42 | self.url = address.group(1) 43 | break 44 | 45 | return self 46 | 47 | def __exit__(self, exc_type, exc_value, traceback): 48 | self.process.terminate() 49 | self.stdout, _ = self.process.communicate() 50 | self.stdout = b"".join(self.head) + self.stdout 51 | 52 | def urljoin(self, path): 53 | return urljoin(self.url, path) 54 | 55 | 56 | if __name__ == "__main__": 57 | with MockScrapydServer() as server: 58 | while True: 59 | pass 60 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | from configparser import NoOptionError, NoSectionError 2 | 3 | import pytest 4 | 5 | from scrapyd import get_application 6 | from scrapyd.app import application 7 | from scrapyd.config import Config 8 | from scrapyd.exceptions import InvalidUsernameError 9 | 10 | 11 | def test_items_no_section(): 12 | with pytest.raises(NoSectionError): 13 | Config().items("nonexistent") 14 | 15 | 16 | def test_get_no_section(): 17 | with pytest.raises(NoOptionError): 18 | Config().get("nonexistent") 19 | 20 | 21 | def test_get_no_option(): 22 | config = Config() 23 | config.cp.set("scrapyd", "http_port", "8000") 24 | 25 | with pytest.raises(NoOptionError): 26 | config.get("nonexistent") 27 | 28 | 29 | def test_closest_scrapy_cfg(monkeypatch, tmp_path): 30 | monkeypatch.chdir(tmp_path) 31 | (tmp_path / "scrapy.cfg").write_text("[scrapyd]\nhttp_port = 1234") 32 | 33 | assert Config().getint("http_port") == 1234 34 | 35 | 36 | def test_invalid_username(): 37 | config = Config() 38 | config.cp.set("scrapyd", "username", "invalid:") 39 | 40 | with pytest.raises(InvalidUsernameError) as exc: 41 | application(config) 42 | 43 | assert ( 44 | str(exc.value) 45 | == "The `username` option contains illegal character ':'. Check and update the Scrapyd configuration file." 46 | ) 47 | 48 | 49 | def test_invalid_username_sys(): 50 | config = Config() 51 | config.cp.set("scrapyd", "username", "invalid:") 52 | 53 | with pytest.raises(SystemExit) as exc: 54 | get_application(config) 55 | 56 | assert ( 57 | str(exc.value) 58 | == "The `username` option contains illegal character ':'. Check and update the Scrapyd configuration file." 
59 | ) 60 | -------------------------------------------------------------------------------- /tests/test_eggstorage.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os.path 3 | from contextlib import closing 4 | 5 | import pytest 6 | from zope.interface import implementer 7 | from zope.interface.verify import verifyObject 8 | 9 | from scrapyd.app import application 10 | from scrapyd.config import Config 11 | from scrapyd.eggstorage import FilesystemEggStorage, sorted_versions 12 | from scrapyd.exceptions import DirectoryTraversalError, EggNotFoundError, ProjectNotFoundError 13 | from scrapyd.interfaces import IEggStorage 14 | 15 | 16 | @implementer(IEggStorage) 17 | class MockEggStorage: 18 | def __init__(self, config): 19 | self.config = config 20 | 21 | def put(self, eggfile, project, version): 22 | pass 23 | 24 | def get(self, project, version=None): 25 | pass 26 | 27 | def list(self, project): 28 | pass 29 | 30 | def list_projects(self): 31 | return ["hello_world"] 32 | 33 | def delete(self, project, version=None): 34 | pass 35 | 36 | 37 | @pytest.fixture() 38 | def eggstorage(tmpdir): 39 | return FilesystemEggStorage(Config(values={"eggs_dir": tmpdir})) 40 | 41 | 42 | @pytest.mark.parametrize( 43 | ("versions", "expected"), 44 | [ 45 | # letter 46 | (["zzz", "b", "ddd", "a", "x"], ["a", "b", "ddd", "x", "zzz"]), 47 | # number 48 | (["10", "1", "9"], ["1", "9", "10"]), 49 | # "r" number 50 | (["r10", "r1", "r9"], ["r1", "r10", "r9"]), 51 | # version 52 | (["2.11", "2.01", "2.9"], ["2.01", "2.9", "2.11"]), 53 | # number and letter 54 | (["123456789", "b3b8fd2"], ["123456789", "b3b8fd2"]), 55 | ], 56 | ) 57 | def test_sorted_versions(versions, expected): 58 | assert sorted_versions(versions) == expected 59 | 60 | 61 | def test_config(chdir): 62 | config = Config() 63 | config.cp.set("scrapyd", "eggstorage", "tests.test_eggstorage.MockEggStorage") 64 | 65 | app = application(config) 66 | eggstorage = app.getComponent(IEggStorage) 67 | 68 | assert isinstance(eggstorage, MockEggStorage) 69 | assert eggstorage.list_projects() == ["hello_world"] 70 | 71 | 72 | def test_interface(eggstorage): 73 | verifyObject(IEggStorage, eggstorage) 74 | 75 | 76 | def test_put_secure(eggstorage): 77 | with pytest.raises(DirectoryTraversalError) as exc: 78 | eggstorage.put(io.BytesIO(b"data"), "../p", "v") # version is sanitized 79 | 80 | assert str(exc.value) == "../p" 81 | 82 | 83 | def test_get_secure(eggstorage): 84 | with pytest.raises(DirectoryTraversalError) as exc: 85 | eggstorage.get("../p", "v") # version is sanitized 86 | 87 | assert str(exc.value) == "../p" 88 | 89 | 90 | def test_list_secure_join(eggstorage): 91 | with pytest.raises(DirectoryTraversalError) as exc: 92 | eggstorage.list("../p") 93 | 94 | assert str(exc.value) == "../p" 95 | 96 | 97 | def test_list_secure_glob(eggstorage): 98 | eggstorage.put(io.BytesIO(b"data"), "mybot", "01") 99 | 100 | assert eggstorage.list("*") == [] # ["01"] if * weren't escaped 101 | 102 | 103 | def test_delete_secure(eggstorage): 104 | with pytest.raises(DirectoryTraversalError) as exc: 105 | eggstorage.delete("../p", "v") # version is sanitized 106 | 107 | assert str(exc.value) == "../p" 108 | 109 | 110 | @pytest.mark.parametrize( 111 | ("version", "expected"), 112 | [ 113 | (None, (None, None)), 114 | ("nonexistent", (None, None)), 115 | ("01", (None, None)), 116 | ], 117 | ) 118 | def test_get_empty(eggstorage, version, expected): 119 | assert eggstorage.get("mybot", version) == expected 120 | 
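# get() with version=None is expected to return the highest version according to sorted_versions()
# ("03" among the eggs added below); an explicit version returns that exact egg, if present.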
121 | 122 | @pytest.mark.parametrize( 123 | ("version", "expected"), 124 | [ 125 | (None, ("03", b"egg03")), 126 | ("nonexistent", (None, None)), 127 | ("01", ("01", b"egg01")), 128 | ], 129 | ) 130 | def test_get_many(eggstorage, version, expected): 131 | eggstorage.put(io.BytesIO(b"egg01"), "mybot", "01") 132 | eggstorage.put(io.BytesIO(b"egg03"), "mybot", "03") 133 | eggstorage.put(io.BytesIO(b"egg02"), "mybot", "02") 134 | 135 | version, data = eggstorage.get("mybot", version) 136 | if data is not None: 137 | with closing(data): 138 | data = data.read() 139 | 140 | assert (version, data) == expected 141 | 142 | 143 | @pytest.mark.parametrize( 144 | ("versions", "expected"), 145 | [(["ddd", "abc", "bcaa"], ["abc", "bcaa", "ddd"]), (["9", "2", "200", "3", "4"], ["2", "3", "4", "9", "200"])], 146 | ) 147 | def test_list(eggstorage, versions, expected): 148 | assert eggstorage.list("mybot") == [] 149 | 150 | for version in versions: 151 | eggstorage.put(io.BytesIO(b"egg01"), "mybot", version) 152 | 153 | assert eggstorage.list("mybot") == expected 154 | 155 | 156 | def test_list_glob(eggstorage): 157 | directory = os.path.join(eggstorage.basedir, "mybot") 158 | os.makedirs(directory) 159 | with open(os.path.join(directory, "other"), "wb") as f: 160 | f.write(b"") 161 | 162 | assert eggstorage.list("mybot") == [] # "other" without "*.egg" glob 163 | 164 | 165 | def test_list_projects(eggstorage): 166 | with open(os.path.join(eggstorage.basedir, "other"), "wb") as f: 167 | f.write(b"") 168 | 169 | assert eggstorage.list_projects() == [] # "other" without isdir() filter 170 | 171 | eggstorage.put(io.BytesIO(b"egg01"), "mybot", "01") 172 | 173 | assert eggstorage.list_projects() == ["mybot"] 174 | 175 | 176 | def test_delete_project(eggstorage): 177 | eggstorage.put(io.BytesIO(b"egg01"), "mybot", "01") 178 | eggstorage.put(io.BytesIO(b"egg03"), "mybot", "03") 179 | eggstorage.put(io.BytesIO(b"egg02"), "mybot", "02") 180 | 181 | assert eggstorage.list("mybot") == ["01", "02", "03"] 182 | 183 | eggstorage.delete("mybot") 184 | 185 | assert eggstorage.list("mybot") == [] 186 | 187 | 188 | def test_delete_vesrion(eggstorage): 189 | eggstorage.put(io.BytesIO(b"egg01"), "mybot", "01") 190 | eggstorage.put(io.BytesIO(b"egg03"), "mybot", "03") 191 | eggstorage.put(io.BytesIO(b"egg02"), "mybot", "02") 192 | 193 | assert eggstorage.list("mybot") == ["01", "02", "03"] 194 | 195 | eggstorage.delete("mybot", "02") 196 | 197 | assert eggstorage.list("mybot") == ["01", "03"] 198 | 199 | eggstorage.delete("mybot", "03") 200 | 201 | assert eggstorage.list("mybot") == ["01"] 202 | 203 | eggstorage.delete("mybot", "01") 204 | 205 | assert eggstorage.list("mybot") == [] 206 | assert not os.path.exists(os.path.join(eggstorage.basedir, "mybot")) 207 | 208 | 209 | def test_delete_nonexistent_project(eggstorage): 210 | with pytest.raises(ProjectNotFoundError): 211 | eggstorage.delete("mybot") 212 | 213 | 214 | def test_delete_nonexistent_version(eggstorage): 215 | with pytest.raises(EggNotFoundError): 216 | eggstorage.delete("mybot", "01") 217 | 218 | eggstorage.put(io.BytesIO(b"egg01"), "mybot", "01") 219 | 220 | with pytest.raises(EggNotFoundError): 221 | eggstorage.delete("mybot", "02") 222 | -------------------------------------------------------------------------------- /tests/test_environ.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | from unittest.mock import patch 5 | 6 | import pytest 7 | from zope.interface.verify import 
verifyObject 8 | 9 | from scrapyd.config import Config 10 | from scrapyd.environ import Environment 11 | from scrapyd.exceptions import DirectoryTraversalError 12 | from scrapyd.interfaces import IEnvironment 13 | from tests import has_settings 14 | 15 | 16 | def test_interface(environ): 17 | verifyObject(IEnvironment, environ) 18 | 19 | 20 | def test_get_settings(environ): 21 | settings = environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"}) 22 | 23 | assert re.search(r"^\S+j1\.log$", settings["LOG_FILE"]) 24 | 25 | if environ.items_dir: 26 | feeds = json.loads(settings.pop("FEEDS")) 27 | path, value = feeds.popitem() 28 | 29 | assert list(settings) == ["LOG_FILE"] 30 | assert feeds == {} 31 | assert re.search(r"^file:///\S+j1\.jl$", path) 32 | assert value == {"format": "jsonlines"} 33 | 34 | 35 | @pytest.mark.parametrize( 36 | ("items_dir", "pattern"), 37 | [ 38 | ( 39 | "https://host.example/path?query=value#fragment", 40 | r"https://host\.example/path/p1/s1/j1\.jl\?query=value#fragment", 41 | ), 42 | ( 43 | "https://host.example/path/", 44 | "https://host.example/path/p1/s1/j1.jl", # no double slashes 45 | ), 46 | ( 47 | "file:/root.dir/path?ignored#ignored", 48 | r"file:///([A-Z]:/)?root\.dir/path/p1/s1/j1\.jl", 49 | ), 50 | ( 51 | "file://hostname/root.dir/path?ignored#ignored", 52 | r"file:///([A-Z]:/)?root.dir/path/p1/s1/j1.jl", 53 | ), 54 | ( 55 | "file:///root.dir/path?ignored#ignored", 56 | r"file:///([A-Z]:/)?root.dir/path/p1/s1/j1.jl", 57 | ), 58 | ], 59 | ) 60 | @patch("os.listdir", lambda _: []) 61 | @patch("os.makedirs", lambda _: _) 62 | def test_get_settings_url(items_dir, pattern): 63 | config = Config(values={"logs_dir": "", "items_dir": items_dir}) 64 | environ = Environment(config, initenv={}) 65 | 66 | settings = environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"}) 67 | feeds = json.loads(settings.pop("FEEDS")) 68 | path, value = feeds.popitem() 69 | 70 | assert settings == {} 71 | assert feeds == {} 72 | assert re.search(pattern, path) 73 | assert value == {"format": "jsonlines"} 74 | 75 | 76 | @pytest.mark.parametrize("values", [{"items_dir": "../items"}, {"logs_dir": "../logs"}]) 77 | @pytest.mark.parametrize(("key", "value"), [("_project", "../p"), ("_spider", "../s"), ("_job", "../j")]) 78 | def test_get_settings_secure(values, key, value): 79 | config = Config(values=values) 80 | environ = Environment(config, initenv={}) 81 | 82 | with pytest.raises(DirectoryTraversalError) as exc: 83 | environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1", key: value}) 84 | 85 | assert str(exc.value) == ( 86 | f"{value if key == '_project' else 'p1'}{os.sep}" 87 | f"{value if key == '_spider' else 's1'}{os.sep}" 88 | f"{value if key == '_job' else 'j1'}.log" 89 | ) 90 | 91 | 92 | def test_jobs_to_keep(chdir): 93 | config = Config(values={"jobs_to_keep": "2"}) 94 | environ = Environment(config, initenv={}) 95 | directory = chdir / "logs" / "p1" / "s1" 96 | 97 | assert not directory.exists() 98 | 99 | environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"}) 100 | 101 | assert directory.exists() 102 | 103 | (directory / "j1.a").touch() 104 | (directory / "j2.b").touch() 105 | os.utime(directory / "j1.a", (1000000000, 1000000000)) 106 | os.utime(directory / "j2.b", (1000000000, 1000000000)) 107 | (directory / "j3.c").touch() 108 | (directory / "j4.d").touch() 109 | 110 | environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"}) 111 | 112 | assert not (directory / "j1.a").exists() 113 | assert not 
(directory / "j2.b").exists() 114 | 115 | 116 | @pytest.mark.parametrize( 117 | ("message", "run_only_if_has_settings"), 118 | [ 119 | ({"_project": "mybot"}, False), 120 | ({"_project": "mybot", "_version": "v1"}, False), 121 | ({"_project": "localproject"}, True), 122 | ], 123 | ) 124 | def test_get_environment(monkeypatch, environ, message, run_only_if_has_settings): 125 | if run_only_if_has_settings and not has_settings(): 126 | pytest.skip("[settings] section is not set") 127 | 128 | monkeypatch.setenv("CUSTOM", "value") 129 | env = environ.get_environment(message, 3) 130 | 131 | assert env["CUSTOM"] == "value" 132 | assert env["SCRAPY_PROJECT"] == message["_project"] 133 | 134 | if "_version" in message: 135 | assert env["SCRAPYD_EGG_VERSION"] == "v1" 136 | else: 137 | assert "SCRAPYD_EGG_VERSION" not in env 138 | 139 | if run_only_if_has_settings: 140 | assert env["SCRAPY_SETTINGS_MODULE"] == "localproject.settings" 141 | else: 142 | assert "SCRAPY_SETTINGS_MODULE" not in env 143 | -------------------------------------------------------------------------------- /tests/test_interfaces.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from twisted.cred.checkers import ICredentialsChecker 3 | from twisted.cred.portal import IRealm 4 | from zope.interface.verify import verifyClass 5 | 6 | from scrapyd.basicauth import PublicHTMLRealm, StringCredentialsChecker 7 | from scrapyd.eggstorage import FilesystemEggStorage 8 | from scrapyd.environ import Environment 9 | from scrapyd.interfaces import IEggStorage, IEnvironment, IJobStorage, IPoller, ISpiderQueue, ISpiderScheduler 10 | from scrapyd.jobstorage import MemoryJobStorage, SqliteJobStorage 11 | from scrapyd.poller import QueuePoller 12 | from scrapyd.scheduler import SpiderScheduler 13 | from scrapyd.spiderqueue import SqliteSpiderQueue 14 | 15 | 16 | @pytest.mark.parametrize( 17 | ("cls", "interface"), 18 | [ 19 | (PublicHTMLRealm, IRealm), 20 | (StringCredentialsChecker, ICredentialsChecker), 21 | (FilesystemEggStorage, IEggStorage), 22 | (Environment, IEnvironment), 23 | (MemoryJobStorage, IJobStorage), 24 | (SqliteJobStorage, IJobStorage), 25 | (QueuePoller, IPoller), 26 | (SpiderScheduler, ISpiderScheduler), 27 | (SqliteSpiderQueue, ISpiderQueue), 28 | ], 29 | ) 30 | def test_interface(cls, interface): 31 | verifyClass(interface, cls) 32 | -------------------------------------------------------------------------------- /tests/test_jobstorage.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from zope.interface.verify import verifyObject 4 | 5 | from scrapyd.config import Config 6 | from scrapyd.interfaces import IJobStorage 7 | from scrapyd.jobstorage import MemoryJobStorage, SqliteJobStorage 8 | from tests import get_finished_job 9 | 10 | job1 = get_finished_job("p1", "s1", "j1", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 7)) 11 | job2 = get_finished_job("p2", "s2", "j2", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 8)) 12 | job3 = get_finished_job("p3", "s3", "j3", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 9)) 13 | 14 | 15 | def pytest_generate_tests(metafunc): 16 | idlist = [] 17 | argvalues = [] 18 | for scenario, cls in metafunc.cls.scenarios: 19 | idlist.append(scenario) 20 | argnames = ["cls"] 21 | argvalues.append([cls]) 22 | metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class") 23 | 24 | 25 | def config(tmpdir): 26 | return Config(values={"dbs_dir": tmpdir, "finished_to_keep": 
"2"}) 27 | 28 | 29 | class TestJobStorage: 30 | scenarios = (("sqlite", SqliteJobStorage), ("memory", MemoryJobStorage)) 31 | 32 | def test_interface(self, cls, tmpdir): 33 | verifyObject(IJobStorage, cls(config(tmpdir))) 34 | 35 | def test_add(self, cls, tmpdir): 36 | jobstorage = cls(config(tmpdir)) 37 | 38 | assert len(jobstorage) == 0 39 | 40 | jobstorage.add(job1) 41 | jobstorage.add(job2) 42 | jobstorage.add(job3) 43 | actual = jobstorage.list() 44 | 45 | assert len(jobstorage) == 2 46 | assert actual == list(jobstorage) 47 | assert actual == [job3, job2] 48 | 49 | def test_iter(self, cls, tmpdir): 50 | jobstorage = cls(config(tmpdir)) 51 | 52 | assert len(jobstorage) == 0 53 | 54 | jobstorage.add(job1) 55 | jobstorage.add(job2) 56 | jobstorage.add(job3) 57 | actual = jobstorage.list() 58 | 59 | assert len(jobstorage) == 2 60 | assert actual == list(jobstorage) 61 | assert actual == [job3, job2] 62 | -------------------------------------------------------------------------------- /tests/test_launcher.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import re 3 | 4 | import pytest 5 | from twisted.internet import defer, error 6 | from twisted.logger import LogLevel, capturedLogs 7 | from twisted.python import failure 8 | 9 | from scrapyd import __version__ 10 | from scrapyd.config import Config 11 | from scrapyd.launcher import Launcher, get_crawl_args 12 | from tests import get_message, has_settings 13 | 14 | 15 | def remove_debug_messages(captured): 16 | return [message for message in captured if message["log_level"] != LogLevel.debug] 17 | 18 | 19 | @pytest.fixture() 20 | def launcher(app): 21 | return Launcher(Config(), app) 22 | 23 | 24 | @pytest.fixture() 25 | def process(launcher): 26 | launcher._spawn_process({"_project": "p1", "_spider": "s1", "_job": "j1"}, 0) # noqa: SLF001 27 | return launcher.processes[0] 28 | 29 | 30 | @pytest.mark.parametrize( 31 | ("message", "expected"), 32 | [ 33 | ({"_project": "p1", "_spider": "s1"}, ["s1"]), 34 | ({"_project": "p1", "_spider": "s1", "settings": {"ONE": "two"}}, ["s1", "-s", "ONE=two"]), 35 | ({"_project": "p1", "_spider": "s1", "arg1": "val1"}, ["s1", "-a", "arg1=val1"]), 36 | ( 37 | {"_project": "p1", "_spider": "s1", "arg1": "val1", "settings": {"ONE": "two"}}, 38 | ["s1", "-s", "ONE=two", "-a", "arg1=val1"], 39 | ), 40 | ], 41 | ) 42 | def test_get_crawl_args(message, expected): 43 | assert get_crawl_args(message) == expected 44 | 45 | 46 | def test_start_service(launcher): 47 | with capturedLogs() as captured: 48 | launcher.startService() 49 | captured = remove_debug_messages(captured) 50 | 51 | assert len(captured) == 1 52 | assert captured[0]["log_level"] == LogLevel.info 53 | assert re.search( 54 | f"\\[Launcher\\] Scrapyd {__version__} started: max_proc=\\d+, runner='scrapyd.runner'", get_message(captured) 55 | ) 56 | 57 | 58 | def test_start_service_max_proc(app): 59 | config = Config() 60 | config.cp.set(Config.SECTION, "max_proc", "8") 61 | launcher = Launcher(config, app) 62 | 63 | with capturedLogs() as captured: 64 | launcher.startService() 65 | captured = remove_debug_messages(captured) 66 | 67 | assert len(captured) == 1 68 | assert captured[0]["log_level"] == LogLevel.info 69 | assert re.search( 70 | f"\\[Launcher\\] Scrapyd {__version__} started: max_proc=8, runner='scrapyd.runner'", get_message(captured) 71 | ) 72 | 73 | 74 | @pytest.mark.parametrize( 75 | ("message", "expected"), 76 | [ 77 | ({}, {}), 78 | ({"_version": "v1"}, {"SCRAPYD_EGG_VERSION": 
"v1"}), 79 | ], 80 | ) 81 | def test_spawn_process(launcher, message, expected): 82 | launcher._spawn_process({"_project": "localproject", "_spider": "s1", "_job": "j1", **message}, 1) # noqa: SLF001 83 | 84 | process = launcher.processes[1] 85 | 86 | assert isinstance(process.pid, int) 87 | assert process.project == "localproject" 88 | assert process.spider == "s1" 89 | assert process.job == "j1" 90 | assert isinstance(process.start_time, datetime.datetime) 91 | assert process.end_time is None 92 | assert isinstance(process.args, list) # see tests below 93 | assert isinstance(process.deferred, defer.Deferred) 94 | 95 | # scrapyd.environ.Environ.get_environment 96 | assert process.env["SCRAPY_PROJECT"] == "localproject" 97 | for key, value in expected.items(): 98 | assert process.env[key] == value 99 | if "SCRAPYD_EGG_VERSION" not in expected: 100 | assert "SCRAPYD_EGG_VERSION" not in process.env 101 | if has_settings(): 102 | assert process.env["SCRAPY_SETTINGS_MODULE"] == "localproject.settings" 103 | else: 104 | assert "SCRAPY_SETTINGS_MODULE" not in process.env 105 | 106 | 107 | def test_out_received(process): 108 | with capturedLogs() as captured: 109 | process.outReceived(b"out\n") 110 | 111 | assert len(captured) == 1 112 | assert captured[0]["log_level"] == LogLevel.info 113 | assert get_message(captured) == f"[Launcher,{process.pid}/stdout] out" 114 | 115 | 116 | def test_err_received(process): 117 | with capturedLogs() as captured: 118 | process.errReceived(b"err\n") 119 | 120 | assert len(captured) == 1 121 | assert captured[0]["log_level"] == LogLevel.error 122 | assert get_message(captured) == f"[Launcher,{process.pid}/stderr] err" 123 | 124 | 125 | def test_connection_made(environ, process): 126 | pid = process.pid 127 | with capturedLogs() as captured: 128 | process.connectionMade() 129 | 130 | assert len(captured) == 1 131 | assert captured[0]["log_level"] == LogLevel.info 132 | if environ.items_dir: 133 | assert re.match( 134 | f"\\[scrapyd\\.launcher#info\\] Process started: project='p1' spider='s1' job='j1' pid={pid} " 135 | "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-s', " 136 | """'FEEDS={"file:///\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""", 137 | get_message(captured), 138 | ) 139 | else: 140 | assert re.match( 141 | f"\\[scrapyd\\.launcher#info\\] Process started: project='p1' spider='s1' job='j1' pid={pid} " 142 | "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-a', '_job=j1'\\]", 143 | get_message(captured), 144 | ) 145 | 146 | 147 | def test_process_ended_done(environ, process): 148 | pid = process.pid 149 | with capturedLogs() as captured: 150 | process.processEnded(failure.Failure(error.ProcessDone(0))) 151 | captured = remove_debug_messages(captured) 152 | 153 | assert len(captured) == 1 154 | assert captured[0]["log_level"] == LogLevel.info 155 | if environ.items_dir: 156 | assert re.match( 157 | f"\\[scrapyd\\.launcher#info\\] Process finished: project='p1' spider='s1' job='j1' pid={pid} " 158 | "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-s', " 159 | """'FEEDS={"file:///\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""", 160 | get_message(captured), 161 | ) 162 | else: 163 | assert re.match( 164 | f"\\[scrapyd\\.launcher#info\\] Process finished: project='p1' spider='s1' job='j1' pid={pid} " 165 | "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-a', 
'_job=j1'\\]", 166 | get_message(captured), 167 | ) 168 | 169 | 170 | def test_process_ended_terminated(environ, process): 171 | pid = process.pid 172 | with capturedLogs() as captured: 173 | process.processEnded(failure.Failure(error.ProcessTerminated(1))) 174 | captured = remove_debug_messages(captured) 175 | 176 | assert len(captured) == 1 177 | assert captured[0]["log_level"] == LogLevel.error 178 | if environ.items_dir: 179 | assert re.match( 180 | f"\\[scrapyd\\.launcher#error\\] Process died: exitstatus=1 project='p1' spider='s1' job='j1' pid={pid} " 181 | "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-s', " 182 | """'FEEDS={"file:///\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""", 183 | get_message(captured), 184 | ) 185 | else: 186 | assert re.match( 187 | f"\\[scrapyd\\.launcher#error\\] Process died: exitstatus=1 project='p1' spider='s1' job='j1' pid={pid} " 188 | "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+', '-a', '_job=j1'\\]", 189 | get_message(captured), 190 | ) 191 | 192 | 193 | def test_repr(process): 194 | assert repr(process).startswith(f"ScrapyProcessProtocol(project=p1 spider=s1 job=j1 pid={process.pid} start_time=") 195 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pytest 4 | 5 | from scrapyd import __version__ 6 | from scrapyd.__main__ import main 7 | 8 | 9 | def test_version(capsys, monkeypatch): 10 | monkeypatch.setattr(sys, "argv", ["scrapyd", "junk", "--version", "junk"]) 11 | main() 12 | 13 | assert capsys.readouterr().out == f"Scrapyd {__version__}\n" 14 | 15 | 16 | def test_v(capsys, monkeypatch): 17 | monkeypatch.setattr(sys, "argv", ["scrapyd", "junk", "-v", "junk"]) 18 | main() 19 | 20 | assert capsys.readouterr().out == f"Scrapyd {__version__}\n" 21 | 22 | 23 | def test_help(capsys, monkeypatch): 24 | monkeypatch.setattr(sys, "argv", ["scrapyd", "--help"]) 25 | 26 | with pytest.raises(SystemExit) as exc: 27 | main() 28 | 29 | captured = capsys.readouterr() 30 | 31 | assert exc.value.code == 0 32 | assert captured.out.startswith("Usage: scrapyd [options]\n") 33 | assert "--nodaemon" in captured.out 34 | assert "python" not in captured.out 35 | assert "rundir" not in captured.out 36 | assert "ftp" not in captured.out 37 | assert "Commands:" not in captured.out 38 | assert captured.err == "" 39 | -------------------------------------------------------------------------------- /tests/test_poller.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from twisted.internet.defer import Deferred 5 | from zope.interface.verify import verifyObject 6 | 7 | from scrapyd.config import Config 8 | from scrapyd.interfaces import IPoller 9 | from scrapyd.poller import QueuePoller 10 | from scrapyd.utils import get_spider_queues 11 | 12 | 13 | @pytest.fixture() 14 | def poller(tmpdir): 15 | eggs_dir = os.path.join(tmpdir, "eggs") 16 | dbs_dir = os.path.join(tmpdir, "dbs") 17 | config = Config(values={"eggs_dir": eggs_dir, "dbs_dir": dbs_dir}) 18 | os.makedirs(os.path.join(eggs_dir, "mybot1")) 19 | os.makedirs(os.path.join(eggs_dir, "mybot2")) 20 | return QueuePoller(config) 21 | 22 | 23 | def test_interface(poller): 24 | verifyObject(IPoller, poller) 25 | 26 | 27 | # Need sorted(), because os.listdir() in FilesystemEggStorage.list_projects() uses 
an arbitrary order. 28 | def test_list_projects_update_projects(poller): 29 | assert sorted(poller.queues) == ["mybot1", "mybot2"] 30 | 31 | os.makedirs(os.path.join(poller.config.get("eggs_dir"), "settings_raise")) 32 | 33 | assert sorted(poller.queues) == ["mybot1", "mybot2"] 34 | 35 | poller.update_projects() 36 | 37 | assert sorted(poller.queues) == ["mybot1", "mybot2", "settings_raise"] 38 | 39 | 40 | def test_poll_next(poller): 41 | queues = get_spider_queues(poller.config) 42 | 43 | scenario = {"mybot1": "spider1", "mybot2": "spider2"} 44 | for project, spider in scenario.items(): 45 | queues[project].add(spider) 46 | 47 | deferred1 = poller.next() 48 | deferred2 = poller.next() 49 | 50 | assert isinstance(deferred1, Deferred) 51 | assert not hasattr(deferred1, "result") 52 | assert isinstance(deferred2, Deferred) 53 | assert not hasattr(deferred2, "result") 54 | 55 | value = poller.poll() 56 | 57 | assert isinstance(value, Deferred) 58 | assert hasattr(value, "result") 59 | assert getattr(value, "called", False) 60 | assert value.result is None 61 | 62 | assert hasattr(deferred1, "result") 63 | assert getattr(deferred1, "called", False) 64 | assert hasattr(deferred2, "result") 65 | assert getattr(deferred2, "called", False) 66 | 67 | # os.listdir() in FilesystemEggStorage.list_projects() uses an arbitrary order. 68 | project_a = deferred1.result["_project"] 69 | spider_a = scenario.pop(project_a) 70 | project_b, spider_b = scenario.popitem() 71 | 72 | assert deferred1.result["_spider"] == spider_a 73 | assert deferred2.result == {"_project": project_b, "_spider": spider_b} 74 | 75 | 76 | def test_poll_empty(poller): 77 | value = poller.poll() 78 | 79 | assert isinstance(value, Deferred) 80 | assert hasattr(value, "result") 81 | assert getattr(value, "called", False) 82 | assert value.result is None 83 | -------------------------------------------------------------------------------- /tests/test_runner.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os.path 3 | import sys 4 | from unittest.mock import patch 5 | 6 | import pytest 7 | from zope.interface import implementer 8 | 9 | from scrapyd.exceptions import BadEggError 10 | from scrapyd.interfaces import IEggStorage 11 | from scrapyd.runner import main 12 | 13 | BASEDIR = os.path.abspath(os.path.dirname(__file__)) 14 | 15 | 16 | @implementer(IEggStorage) 17 | class MockEggStorage: 18 | def __init__(self, config): 19 | self.config = config 20 | 21 | def put(self, eggfile, project, version): 22 | pass 23 | 24 | def get(self, project, version=None): 25 | if project == "bytesio": 26 | with open(os.path.join(BASEDIR, "fixtures", "quotesbot.egg"), "rb") as f: 27 | return version, io.BytesIO(f.read()) 28 | if project == "noentrypoint": 29 | # Identical to quotesbot.egg, except EGG-INFO/entry_points.txt doesn't set `settings` under [scrapy]. 
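# (For context: eggs built with scrapyd-client's scrapyd-deploy normally declare the project's
# settings module in EGG-INFO/entry_points.txt, roughly:
#     [scrapy]
#     settings = quotesbot.settings
# entrypoint_missing.egg omits that `settings` key, which is the case this branch exercises.)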
30 | with open(os.path.join(BASEDIR, "fixtures", "entrypoint_missing.egg"), "rb") as f: 31 | return version, io.BytesIO(f.read()) 32 | if project == "badegg": 33 | return version, io.BytesIO(b"badegg") 34 | return None, None 35 | 36 | def list(self, project): 37 | pass 38 | 39 | def list_projects(self): 40 | return [] 41 | 42 | def delete(self, project, version=None): 43 | pass 44 | 45 | 46 | @pytest.mark.parametrize( 47 | "module", 48 | [ 49 | "scrapy.utils.project", 50 | "scrapy.utils.conf", 51 | "scrapyd.interfaces", 52 | "scrapyd.runner", 53 | ], 54 | ) 55 | def test_no_load_scrapy_conf(module): 56 | __import__(module) 57 | 58 | assert "scrapy.conf" not in sys.modules, f"module {module!r} must not cause the scrapy.conf module to be loaded" 59 | 60 | 61 | @pytest.mark.skipif(sys.platform == "win32", reason="The temporary file encounters a PermissionError") 62 | def test_bytesio(monkeypatch, capsys, chdir): 63 | (chdir / "scrapyd.conf").write_text("[scrapyd]\neggstorage = tests.test_runner.MockEggStorage") 64 | monkeypatch.setenv("SCRAPY_PROJECT", "bytesio") 65 | 66 | with patch.object(sys, "argv", ["scrapy", "list"]), pytest.raises(SystemExit) as exc: 67 | main() 68 | 69 | # main() sets SCRAPY_SETTINGS_MODULE, which interferes with other tests. 70 | del os.environ["SCRAPY_SETTINGS_MODULE"] 71 | 72 | captured = capsys.readouterr() 73 | 74 | assert exc.value.code == 0 75 | assert captured.out == "toscrape-css\ntoscrape-xpath\n" 76 | assert captured.err == "" 77 | 78 | 79 | def test_badegg(monkeypatch, capsys, chdir): 80 | (chdir / "scrapyd.conf").write_text("[scrapyd]\neggstorage = tests.test_runner.MockEggStorage") 81 | monkeypatch.setenv("SCRAPY_PROJECT", "badegg") 82 | 83 | with patch.object(sys, "argv", ["scrapy", "list"]), pytest.raises(BadEggError) as exc: 84 | main() 85 | 86 | # main() sets SCRAPY_SETTINGS_MODULE, which interferes with other tests. 87 | os.environ.pop("SCRAPY_SETTINGS_MODULE", None) 88 | 89 | captured = capsys.readouterr() 90 | 91 | assert str(exc.value) == "" 92 | assert captured.out == "" 93 | assert captured.err == "" 94 | 95 | 96 | # This confirms that entry_points are required, as documented. 97 | @pytest.mark.filterwarnings("ignore:Module quotesbot was already imported from:UserWarning") # fixture reuses module 98 | def test_noentrypoint(monkeypatch, capsys, chdir): 99 | (chdir / "scrapyd.conf").write_text("[scrapyd]\neggstorage = tests.test_runner.MockEggStorage") 100 | monkeypatch.setenv("SCRAPY_PROJECT", "noentrypoint") 101 | 102 | with patch.object(sys, "argv", ["scrapy", "list"]), pytest.raises(AttributeError) as exc: 103 | main() 104 | 105 | # main() sets SCRAPY_SETTINGS_MODULE, which interferes with other tests. 
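# (pop() with a default is used here, unlike the del in test_bytesio, since main() may fail
# before exporting the variable when the egg lacks its entry point.)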
106 | os.environ.pop("SCRAPY_SETTINGS_MODULE", None) 107 | 108 | captured = capsys.readouterr() 109 | 110 | assert str(exc.value) 111 | assert captured.out == "" 112 | assert captured.err == "" 113 | -------------------------------------------------------------------------------- /tests/test_scheduler.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from zope.interface.verify import verifyObject 5 | 6 | from scrapyd.config import Config 7 | from scrapyd.interfaces import ISpiderScheduler 8 | from scrapyd.scheduler import SpiderScheduler 9 | from scrapyd.utils import get_spider_queues 10 | 11 | 12 | @pytest.fixture() 13 | def scheduler(tmpdir): 14 | eggs_dir = os.path.join(tmpdir, "eggs") 15 | dbs_dir = os.path.join(tmpdir, "dbs") 16 | config = Config(values={"eggs_dir": eggs_dir, "dbs_dir": dbs_dir}) 17 | os.makedirs(os.path.join(eggs_dir, "mybot1")) 18 | os.makedirs(os.path.join(eggs_dir, "mybot2")) 19 | return SpiderScheduler(config) 20 | 21 | 22 | def test_interface(scheduler): 23 | verifyObject(ISpiderScheduler, scheduler) 24 | 25 | 26 | # Need sorted(), because os.listdir() in FilesystemEggStorage.list_projects() uses an arbitrary order. 27 | def test_list_projects_update_projects(scheduler): 28 | assert sorted(scheduler.list_projects()) == ["mybot1", "mybot2"] 29 | 30 | os.makedirs(os.path.join(scheduler.config.get("eggs_dir"), "settings_raise")) 31 | 32 | assert sorted(scheduler.list_projects()) == ["mybot1", "mybot2"] 33 | 34 | scheduler.update_projects() 35 | 36 | assert sorted(scheduler.list_projects()) == ["mybot1", "mybot2", "settings_raise"] 37 | 38 | 39 | def test_schedule(scheduler): 40 | queues = get_spider_queues(scheduler.config) 41 | mybot1_queue = queues["mybot1"] 42 | mybot2_queue = queues["mybot2"] 43 | 44 | assert not mybot1_queue.count() 45 | assert not mybot2_queue.count() 46 | 47 | scheduler.schedule("mybot1", "myspider1", 2, a="b") 48 | scheduler.schedule("mybot2", "myspider2", 1, c="d") 49 | scheduler.schedule("mybot2", "myspider3", 10, e="f") 50 | 51 | assert mybot1_queue.pop() == {"name": "myspider1", "a": "b"} 52 | assert mybot2_queue.pop() == {"name": "myspider3", "e": "f"} 53 | assert mybot2_queue.pop() == {"name": "myspider2", "c": "d"} 54 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import re 4 | 5 | import pytest 6 | import requests 7 | 8 | from tests import get_egg_data 9 | from tests.mockserver import MockScrapydServer 10 | 11 | 12 | @pytest.fixture() 13 | def mock_scrapyd(chdir): 14 | with MockScrapydServer() as server: 15 | yield server 16 | 17 | 18 | def test_urljoin(mock_scrapyd): 19 | assert mock_scrapyd.urljoin("foo") == f"{mock_scrapyd.url}foo" 20 | 21 | 22 | def test_auth(): 23 | with MockScrapydServer(username="bob", password="hunter2") as server: 24 | assert requests.get(server.url).status_code == 401 25 | 26 | res = requests.get(server.url, auth=("bob", "hunter2")) 27 | 28 | assert res.status_code == 200 29 | assert re.search("use the API", res.text) 30 | 31 | res = requests.get(server.url, auth=("bob", "invalid")) 32 | 33 | assert res.status_code == 401 34 | 35 | stdout = server.stdout.decode() 36 | 37 | # scrapyd.basicauth 38 | assert f" [-] Basic authentication enabled{os.linesep}" in stdout 39 | # scrapyd.app 40 | assert f" [-] Scrapyd web console available at 
http://127.0.0.1:{server.http_port}/" in stdout 41 | 42 | 43 | def test_noauth(): 44 | with MockScrapydServer() as server: 45 | pass 46 | 47 | # scrapyd.basicauth 48 | assert ( 49 | f" [-] Basic authentication disabled as either `username` or `password` is unset{os.linesep}" 50 | in server.stdout.decode() 51 | ) 52 | 53 | 54 | def test_error(): 55 | with MockScrapydServer() as server: 56 | requests.get(server.urljoin("listversions.json"), params={"project": [b"\xc3\x28"]}) 57 | 58 | stdout = server.stdout.decode() 59 | 60 | # scrapyd.webservice 61 | assert f" [-] Unhandled Error{os.linesep}" in stdout 62 | assert f"\tTraceback (most recent call last):{os.linesep}" in stdout 63 | assert "\ttwisted.web.error.Error: 200 project is invalid: " in stdout 64 | 65 | 66 | @pytest.mark.parametrize( 67 | ("method", "basename"), 68 | [ 69 | ("GET", "daemonstatus"), 70 | ("POST", "addversion"), 71 | ("POST", "schedule"), 72 | ("POST", "cancel"), 73 | ("GET", "status"), 74 | ("GET", "listprojects"), 75 | ("GET", "listversions"), 76 | ("GET", "listspiders"), 77 | ("GET", "listjobs"), 78 | ("POST", "delversion"), 79 | ("POST", "delproject"), 80 | ], 81 | ) 82 | def test_options(mock_scrapyd, method, basename): 83 | response = requests.options(mock_scrapyd.urljoin(f"{basename}.json")) 84 | 85 | assert response.status_code == 204, f"204 != {response.status_code}" 86 | assert response.content == b"" 87 | assert response.headers["Allow"] == f"OPTIONS, HEAD, {method}" 88 | 89 | 90 | # https://github.com/scrapy/scrapyd/issues/377 91 | def test_other_reactors(mock_scrapyd): 92 | response = requests.post( 93 | mock_scrapyd.urljoin("addversion.json"), 94 | data={b"project": b"quotesbot", b"version": b"0.01"}, 95 | # Identical to quotesbot.egg, except quotesbot/settings.py sets 96 | # `TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"`. 
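# (Scrapyd keeps its own default Twisted reactor; each crawl runs in a separate
# `python -m scrapyd.runner` subprocess, which is what lets a project select a different
# reactor, such as AsyncioSelectorReactor, without conflicting with the daemon.)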
97 | files={b"egg": io.BytesIO(get_egg_data("settings_asyncioreactor"))}, 98 | ) 99 | 100 | assert response.status_code == 200 101 | assert response.json()["status"] == "ok" 102 | -------------------------------------------------------------------------------- /tests/test_spiderqueue.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from twisted.internet.defer import inlineCallbacks, maybeDeferred 3 | from zope.interface.verify import verifyObject 4 | 5 | from scrapyd.config import Config 6 | from scrapyd.interfaces import ISpiderQueue 7 | from scrapyd.spiderqueue import SqliteSpiderQueue 8 | 9 | spider_args = { 10 | "arg1": "val1", 11 | "arg2": 2, 12 | "arg3": "\N{SNOWMAN}", 13 | } 14 | expected = spider_args.copy() 15 | expected["name"] = "spider1" 16 | 17 | 18 | @pytest.fixture() 19 | def spiderqueue(): 20 | return SqliteSpiderQueue(Config(values={"dbs_dir": ":memory:"}), "quotesbot") 21 | 22 | 23 | def test_interface(spiderqueue): 24 | verifyObject(ISpiderQueue, spiderqueue) 25 | 26 | 27 | @inlineCallbacks 28 | def test_pop(spiderqueue): 29 | yield maybeDeferred(spiderqueue.add, "spider0", 5) 30 | yield maybeDeferred(spiderqueue.add, "spider1", 10, **spider_args) 31 | yield maybeDeferred(spiderqueue.add, "spider1", 0) 32 | 33 | assert (yield maybeDeferred(spiderqueue.count)) == 3 34 | 35 | assert (yield maybeDeferred(spiderqueue.pop)) == expected 36 | 37 | assert (yield maybeDeferred(spiderqueue.count)) == 2 38 | 39 | 40 | @inlineCallbacks 41 | def test_list(spiderqueue): 42 | assert (yield maybeDeferred(spiderqueue.list)) == [] 43 | 44 | yield maybeDeferred(spiderqueue.add, "spider1", 10, **spider_args) 45 | yield maybeDeferred(spiderqueue.add, "spider1", 10, **spider_args) 46 | 47 | assert (yield maybeDeferred(spiderqueue.list)) == [expected, expected] 48 | 49 | 50 | @inlineCallbacks 51 | def test_remove(spiderqueue): 52 | yield maybeDeferred(spiderqueue.add, "spider0", 5) 53 | yield maybeDeferred(spiderqueue.add, "spider1", 10, **spider_args) 54 | yield maybeDeferred(spiderqueue.add, "spider1", 0) 55 | 56 | assert (yield maybeDeferred(spiderqueue.count)) == 3 57 | 58 | assert (yield maybeDeferred(spiderqueue.remove, lambda message: message["name"] == "spider1")) == 2 59 | 60 | assert (yield maybeDeferred(spiderqueue.count)) == 1 61 | 62 | 63 | @inlineCallbacks 64 | def test_clear(spiderqueue): 65 | assert (yield maybeDeferred(spiderqueue.count)) == 0 66 | 67 | yield maybeDeferred(spiderqueue.add, "spider1", 10, **spider_args) 68 | yield maybeDeferred(spiderqueue.add, "spider1", 10, **spider_args) 69 | 70 | assert (yield maybeDeferred(spiderqueue.count)) == 2 71 | 72 | yield maybeDeferred(spiderqueue.clear) 73 | 74 | assert (yield maybeDeferred(spiderqueue.count)) == 0 75 | -------------------------------------------------------------------------------- /tests/test_sqlite.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pytest 4 | 5 | from scrapyd.sqlite import JsonSqlitePriorityQueue, SqliteFinishedJobs 6 | from tests import get_finished_job 7 | 8 | 9 | @pytest.fixture() 10 | def jsonsqlitepriorityqueue(): 11 | return JsonSqlitePriorityQueue() 12 | 13 | 14 | @pytest.fixture() 15 | def sqlitefinishedjobs(): 16 | q = SqliteFinishedJobs(":memory:") 17 | q.add(get_finished_job("p1", "s1", "j1", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 7))) 18 | q.add(get_finished_job("p2", "s2", "j2", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 8))) 19 | 
q.add(get_finished_job("p3", "s3", "j3", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 9))) 20 | return q 21 | 22 | 23 | def test_jsonsqlitepriorityqueue_empty(jsonsqlitepriorityqueue): 24 | assert jsonsqlitepriorityqueue.pop() is None 25 | 26 | 27 | def test_jsonsqlitepriorityqueue_one(jsonsqlitepriorityqueue): 28 | msg = "a message" 29 | jsonsqlitepriorityqueue.put(msg) 30 | 31 | assert "_id" not in msg 32 | assert jsonsqlitepriorityqueue.pop() == msg 33 | assert jsonsqlitepriorityqueue.pop() is None 34 | 35 | 36 | def test_jsonsqlitepriorityqueue_multiple(jsonsqlitepriorityqueue): 37 | msg1 = "first message" 38 | msg2 = "second message" 39 | jsonsqlitepriorityqueue.put(msg1) 40 | jsonsqlitepriorityqueue.put(msg2) 41 | out = [] 42 | out.append(jsonsqlitepriorityqueue.pop()) 43 | out.append(jsonsqlitepriorityqueue.pop()) 44 | 45 | assert msg1 in out 46 | assert msg2 in out 47 | assert jsonsqlitepriorityqueue.pop() is None 48 | 49 | 50 | def test_jsonsqlitepriorityqueue_priority(jsonsqlitepriorityqueue): 51 | msg1 = "message 1" 52 | msg2 = "message 2" 53 | msg3 = "message 3" 54 | msg4 = "message 4" 55 | jsonsqlitepriorityqueue.put(msg1, priority=1.0) 56 | jsonsqlitepriorityqueue.put(msg2, priority=5.0) 57 | jsonsqlitepriorityqueue.put(msg3, priority=3.0) 58 | jsonsqlitepriorityqueue.put(msg4, priority=2.0) 59 | 60 | assert jsonsqlitepriorityqueue.pop() == msg2 61 | assert jsonsqlitepriorityqueue.pop() == msg3 62 | assert jsonsqlitepriorityqueue.pop() == msg4 63 | assert jsonsqlitepriorityqueue.pop() == msg1 64 | 65 | 66 | def test_jsonsqlitepriorityqueue_iter_len_clear(jsonsqlitepriorityqueue): 67 | assert len(jsonsqlitepriorityqueue) == 0 68 | assert list(jsonsqlitepriorityqueue) == [] 69 | 70 | msg1 = "message 1" 71 | msg2 = "message 2" 72 | msg3 = "message 3" 73 | msg4 = "message 4" 74 | jsonsqlitepriorityqueue.put(msg1, priority=1.0) 75 | jsonsqlitepriorityqueue.put(msg2, priority=5.0) 76 | jsonsqlitepriorityqueue.put(msg3, priority=3.0) 77 | jsonsqlitepriorityqueue.put(msg4, priority=2.0) 78 | 79 | assert len(jsonsqlitepriorityqueue) == 4 80 | assert list(jsonsqlitepriorityqueue) == [(msg2, 5.0), (msg3, 3.0), (msg4, 2.0), (msg1, 1.0)] 81 | 82 | jsonsqlitepriorityqueue.clear() 83 | 84 | assert len(jsonsqlitepriorityqueue) == 0 85 | assert list(jsonsqlitepriorityqueue) == [] 86 | 87 | 88 | def test_jsonsqlitepriorityqueue_remove(jsonsqlitepriorityqueue): 89 | assert len(jsonsqlitepriorityqueue) == 0 90 | assert list(jsonsqlitepriorityqueue) == [] 91 | 92 | msg1 = "good message 1" 93 | msg2 = "bad message 2" 94 | msg3 = "good message 3" 95 | msg4 = "bad message 4" 96 | jsonsqlitepriorityqueue.put(msg1) 97 | jsonsqlitepriorityqueue.put(msg2) 98 | jsonsqlitepriorityqueue.put(msg3) 99 | jsonsqlitepriorityqueue.put(msg4) 100 | jsonsqlitepriorityqueue.remove(lambda x: x.startswith("bad")) 101 | 102 | assert list(jsonsqlitepriorityqueue) == [(msg1, 0.0), (msg3, 0.0)] 103 | 104 | 105 | @pytest.mark.parametrize( 106 | "value", 107 | [ 108 | "native ascii str", 109 | "\xa3", 110 | 123, 111 | 1.2, 112 | True, 113 | ["a", "list", 1], 114 | {"a": "dict"}, 115 | ], 116 | ) 117 | def test_jsonsqlitepriorityqueue_types(jsonsqlitepriorityqueue, value): 118 | jsonsqlitepriorityqueue.put(value) 119 | 120 | assert jsonsqlitepriorityqueue.pop() == value 121 | 122 | 123 | def test_sqlitefinishedjobs_add(sqlitefinishedjobs): 124 | assert len(sqlitefinishedjobs) == 3 125 | 126 | 127 | def test_sqlitefinishedjobs_clear_all(sqlitefinishedjobs): 128 | sqlitefinishedjobs.clear() 129 | 130 | assert 
len(sqlitefinishedjobs) == 0 131 | 132 | 133 | def test_sqlitefinishedjobs_clear_keep_0(sqlitefinishedjobs): 134 | sqlitefinishedjobs.clear(finished_to_keep=0) 135 | 136 | assert len(sqlitefinishedjobs) == 0 137 | 138 | 139 | def test_sqlitefinishedjobs_clear_keep_2(sqlitefinishedjobs): 140 | sqlitefinishedjobs.clear(finished_to_keep=2) 141 | 142 | assert len(sqlitefinishedjobs) == 2 143 | 144 | 145 | def test_sqlitefinishedjobs__iter__(sqlitefinishedjobs): 146 | actual = list(sqlitefinishedjobs) 147 | 148 | assert (actual[0][0], actual[0][1]) == ("p3", "s3") 149 | assert (actual[1][0], actual[1][1]) == ("p2", "s2") 150 | assert (actual[2][0], actual[2][1]) == ("p1", "s1") 151 | -------------------------------------------------------------------------------- /tests/test_website.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from html_checker.validator import ValidatorInterface 5 | from twisted.web import http_headers, resource 6 | from twisted.web.test._util import _render 7 | from twisted.web.test.requesthelper import DummyRequest 8 | 9 | from scrapyd.app import application 10 | from scrapyd.launcher import ScrapyProcessProtocol 11 | from scrapyd.website import Root 12 | from tests import get_finished_job, has_settings, root_add_version, touch 13 | 14 | 15 | def assert_headers(txrequest): 16 | headers = dict(txrequest.responseHeaders.getAllRawHeaders()) 17 | content_length = headers.pop(b"Content-Length") 18 | 19 | assert len(content_length) == 1 20 | assert isinstance(content_length[0], bytes) 21 | assert int(content_length[0]) 22 | assert headers == {b"Content-Type": [b"text/html; charset=utf-8"]} 23 | 24 | 25 | def assert_hrefs(urls, text, header): 26 | for href, name in urls: 27 | if header: 28 | assert f'{name}' in text 29 | else: 30 | assert f'{name}' in text 31 | 32 | 33 | # Derived from test_emptyChildUnicodeParent. 34 | # https://github.com/twisted/twisted/blob/trunk/src/twisted/web/test/test_static.py 35 | def test_logs_dir(txrequest, root): 36 | os.makedirs(os.path.join("logs", "quotesbot")) 37 | 38 | file = root.children[b"logs"] 39 | request = DummyRequest([b""]) 40 | child = resource.getChildForRequest(file, request) 41 | 42 | content = child.render(request) 43 | 44 | assert list(request.responseHeaders.getAllRawHeaders()) == [(b"Content-Type", [b"text/html; charset=utf-8"])] 45 | assert b"Last modified" in content 46 | assert b'quotesbot/' in content 47 | 48 | 49 | # Derived from test_indexNames. 
50 | # https://github.com/twisted/twisted/blob/trunk/src/twisted/web/test/test_static.py 51 | def test_logs_file(txrequest, root): 52 | os.makedirs(os.path.join("logs", "quotesbot")) 53 | with open(os.path.join("logs", "foo.txt"), "wb") as f: 54 | f.write(b"baz") 55 | 56 | file = root.children[b"logs"] 57 | request = DummyRequest([b"foo.txt"]) 58 | child = resource.getChildForRequest(file, request) 59 | 60 | d = _render(child, request) 61 | 62 | def cbRendered(ignored): 63 | assert list(request.responseHeaders.getAllRawHeaders()) == [ 64 | (b"Accept-Ranges", [b"bytes"]), 65 | (b"Content-Length", [b"3"]), 66 | (b"Content-Type", [b"text/plain"]), 67 | ] 68 | assert b"".join(request.written) == b"baz" 69 | 70 | d.addCallback(cbRendered) 71 | return d 72 | 73 | 74 | @pytest.mark.parametrize("cancel", [True, False], ids=["cancel", "no_cancel"]) 75 | @pytest.mark.parametrize("header", [True, False], ids=["header", "no_header"]) 76 | @pytest.mark.parametrize("exists", [True, False], ids=["exists", "no_exists"]) 77 | def test_jobs(txrequest, config, cancel, header, exists, chdir): 78 | if not cancel: 79 | config.cp.remove_option("services", "cancel.json") 80 | 81 | root = Root(config, application(config)) 82 | root_add_version(root, "quotesbot", "0.1", "quotesbot") 83 | root.update_projects() 84 | 85 | urls = [ 86 | ("logs/p1/s1/j1-finished.log", "Log"), 87 | ("logs/p2/s2/j2-running.log", "Log"), 88 | ("logs/p3/s3/j3-pending.log", "Log"), 89 | ] 90 | if root.local_items: 91 | urls.extend( 92 | [ 93 | ("items/p1/s1/j1-finished.jl", "Items"), 94 | ("items/p2/s2/j2-running.jl", "Items"), 95 | ("items/p3/s3/j3-pending.jl", "Items"), 96 | ] 97 | ) 98 | if exists: 99 | touch(chdir / "logs" / "p1" / "s1" / "j1-finished.log") 100 | touch(chdir / "logs" / "p2" / "s2" / "j2-running.log") 101 | exist = urls[0:2] 102 | no_exist = urls[2:3] 103 | 104 | if root.local_items: 105 | touch(chdir / "items" / "p1" / "s1" / "j1-finished.jl") 106 | touch(chdir / "items" / "p2" / "s2" / "j2-running.jl") 107 | exist += urls[3:5] 108 | no_exist += urls[5:6] 109 | else: 110 | exist = [] 111 | no_exist = urls 112 | 113 | root.launcher.finished.add(get_finished_job("p1", "s1", "j1-finished")) 114 | root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j2-running", env={}, args=[]) 115 | root.poller.queues["quotesbot"].add("quotesbot", _job="j3-pending") 116 | 117 | if header: 118 | txrequest.requestHeaders = http_headers.Headers({b"X-Forwarded-Prefix": [b"/path/to"]}) 119 | txrequest.method = "GET" 120 | content = root.children[b"jobs"].render(txrequest) 121 | text = content.decode() 122 | 123 | assert_headers(txrequest) 124 | assert_hrefs(exist, text, header) 125 | for url, _ in no_exist: 126 | assert url not in text 127 | 128 | if root.local_items: 129 | assert b"Items" in content 130 | else: 131 | assert b"Items" not in content 132 | 133 | if cancel: 134 | assert b"Cancel" in content 135 | if header: 136 | assert b' action="/path/to/cancel.json">' in content 137 | else: 138 | assert b' action="/cancel.json">' in content 139 | for job in ("j2-running", "j3-pending"): 140 | assert f' value="{job}">' in text 141 | else: 142 | assert b"Cancel" not in content 143 | assert b'/cancel.json">' not in content 144 | assert b' value="j1-finished">' not in content 145 | 146 | 147 | @pytest.mark.parametrize("with_egg", [True, False]) 148 | @pytest.mark.parametrize("header", [True, False]) 149 | def test_home(txrequest, root, with_egg, header): 150 | if with_egg: 151 | root_add_version(root, "quotesbot", "0.1", "quotesbot") 
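# (root_add_version is a helper imported from the tests package above; registering an egg
# version and then calling update_projects() below is what makes "quotesbot" appear in the
# rendered project list that the assertions at the end of this test check.)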
152 | root.update_projects() 153 | 154 | if header: 155 | txrequest.requestHeaders = http_headers.Headers({b"X-Forwarded-Prefix": [b"/path/to"]}) 156 | txrequest.method = "GET" 157 | content = root.children[b""].render(txrequest) 158 | text = content.decode() 159 | 160 | urls = [("jobs", "Jobs"), ("logs/", "Logs")] 161 | if root.local_items: 162 | urls.append(("items/", "Items")) 163 | 164 | assert_headers(txrequest) 165 | assert_hrefs(urls, text, header) 166 | 167 | if root.local_items: 168 | assert b'/items/">Items' in content 169 | else: 170 | assert b'/items/">Items' not in content 171 | 172 | projects = [] 173 | if with_egg: 174 | projects.append("quotesbot") 175 | if has_settings(): 176 | projects.append("localproject") 177 | 178 | if projects: 179 | assert b"
<p>Scrapy projects:</p>" in content 180 | for project in projects: 181 | assert f"<li>{project}</li>" in text 182 | else: 183 | assert b"<p>No Scrapy projects yet.</p>" in content 184 | for project in projects: 185 | assert f"<li>{project}</li>" not in text 186 | 187 | @pytest.mark.parametrize("basename", ["", "jobs"]) 188 | def test_validate(tmp_path, txrequest, root, basename, caplog): 189 | txrequest.method = "GET" 190 | content = root.children[basename.encode()].render(txrequest) 191 | path = tmp_path / "page.html" 192 | path.write_bytes(content) 193 | report = ValidatorInterface().validate([str(path)]).registry[str(path)] 194 | 195 | assert report is None, repr(report) 196 | 197 | --------------------------------------------------------------------------------