├── .git-blame-ignore-revs ├── .github ├── dependabot.yml └── workflows │ ├── lint.yml │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── api.rst ├── cli.rst ├── conf.py ├── config.rst ├── contributing │ ├── api.rst │ └── index.rst ├── deploy.rst ├── index.rst ├── news.rst ├── overview.rst └── requirements.txt ├── integration_tests ├── __init__.py ├── test_webservice.py └── test_website.py ├── pyproject.toml ├── scrapyd ├── __init__.py ├── __main__.py ├── app.py ├── basicauth.py ├── config.py ├── default_scrapyd.conf ├── eggstorage.py ├── environ.py ├── exceptions.py ├── interfaces.py ├── jobstorage.py ├── launcher.py ├── poller.py ├── runner.py ├── scheduler.py ├── spiderqueue.py ├── sqlite.py ├── txapp.py ├── utils.py ├── webservice.py └── website.py └── tests ├── __init__.py ├── conftest.py ├── fixtures ├── entrypoint_missing.egg ├── filesystem │ ├── localproject │ │ ├── __init__.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── example.py │ └── scrapy.cfg ├── mybot.egg ├── mybot2.egg ├── quotesbot.egg ├── settings_asyncioreactor.egg ├── settings_log_stdout.egg ├── settings_raise.egg └── spiders_utf8.egg ├── mockapp.py ├── mockserver.py ├── test_config.py ├── test_eggstorage.py ├── test_environ.py ├── test_interfaces.py ├── test_jobstorage.py ├── test_launcher.py ├── test_main.py ├── test_poller.py ├── test_runner.py ├── test_scheduler.py ├── test_server.py ├── test_spiderqueue.py ├── test_sqlite.py ├── test_webservice.py └── test_website.py /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Example: git blame --ignore-revs-file .git-blame-ignore-revs file 2 | 3 | # Migrate code style to Black 4 | 51521eed7216eb7545028e2be0de5a2c3e5049f6 5 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v4 9 | - uses: actions/setup-python@v5 10 | with: 11 | python-version: 3.9 12 | cache: pip 13 | - run: pip install --upgrade pre-commit 14 | - run: pre-commit run --all-files 15 | - run: pip install --upgrade check-manifest setuptools 16 | - run: check-manifest 17 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | on: push 3 | jobs: 4 | publish: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - uses: actions/setup-python@v5 9 | with: 10 | python-version: 3.9 11 | - run: pip install --upgrade build 12 | - run: python -m build --sdist --wheel 13 | - name: Publish to TestPyPI 14 | uses: pypa/gh-action-pypi-publish@release/v1 15 | with: 16 | password: ${{ secrets.TEST_PYPI_TOKEN }} 17 | repository-url: 
https://test.pypi.org/legacy/ 18 | skip-existing: true 19 | - name: Publish to PyPI 20 | if: startsWith(github.ref, 'refs/tags') 21 | uses: pypa/gh-action-pypi-publish@release/v1 22 | with: 23 | password: ${{ secrets.PYPI_TOKEN }} 24 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: [push, pull_request] 3 | jobs: 4 | tests: 5 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 6 | runs-on: ${{ matrix.os }} 7 | strategy: 8 | matrix: 9 | os: [macos-latest, windows-latest, ubuntu-latest] 10 | python-version: [3.9, "3.10", "3.11", "3.12", "3.13"] 11 | steps: 12 | - uses: actions/checkout@v4 13 | - uses: actions/setup-python@v5 14 | with: 15 | python-version: ${{ matrix.python-version }} 16 | cache: pip 17 | - run: pip install -e .[test] 18 | # Python 3.12 deprecates pkg_resources (also used by py-html-checker). 19 | # https://github.com/pytest-dev/pytest-twisted/issues/183 20 | # https://github.com/sveetch/py-html-checker/issues/26 21 | - run: | 22 | pytest -W error -W ignore::ResourceWarning -W ignore::DeprecationWarning:scrapyd.runner -W ignore::DeprecationWarning:pytest_twisted -W ignore::DeprecationWarning:html_checker -W ignore::DeprecationWarning:pkg_resources tests --cov scrapyd 23 | # Occasional "ConnectionRefusedError: [Errno 111] Connection refused". 24 | - name: Run integration tests 25 | run: | 26 | printf "[scrapyd]\nusername = hello12345\npassword = 67890world\n" > scrapyd.conf 27 | mkdir logs 28 | scrapyd > scrapyd.log 2>&1 & 29 | sleep 1 30 | pytest -W error -W ignore::ResourceWarning -W ignore::DeprecationWarning:pytest_twisted integration_tests 31 | cat scrapyd.log 32 | - env: 33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | run: coveralls --service=github 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Build 2 | /*.egg-info 3 | /dist 4 | *.pyc 5 | 6 | # Development 7 | venv 8 | .vscode 9 | .idea 10 | /myproject 11 | 12 | # Docs 13 | /docs/_build 14 | 15 | # Tests 16 | /.coverage* 17 | /htmlcov 18 | /_trial_temp 19 | /tests.test_* 20 | 21 | # CLI 22 | /scrapyd.conf 23 | /twistd.pid 24 | /dbs 25 | /eggs 26 | /items 27 | /logs 28 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autoupdate_schedule: quarterly 3 | repos: 4 | - repo: https://github.com/astral-sh/ruff-pre-commit 5 | rev: v0.5.0 6 | hooks: 7 | - id: ruff 8 | - id: ruff-format 9 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | os: ubuntu-20.04 4 | tools: 5 | python: "3.9" 6 | python: 7 | install: 8 | - path: . 9 | - requirements: docs/requirements.txt 10 | sphinx: 11 | configuration: docs/conf.py 12 | fail_on_warning: true 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Scrapy developers. 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of Scrapy nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include scrapyd/default_scrapyd.conf 3 | recursive-include docs *.py 4 | recursive-include docs *.rst 5 | recursive-include docs *.txt 6 | recursive-include docs Makefile 7 | recursive-include scrapyd *.py 8 | recursive-include tests *.cfg 9 | recursive-include tests *.egg 10 | recursive-include tests *.py 11 | recursive-include integration_tests *.py 12 | exclude .git-blame-ignore-revs 13 | exclude .pre-commit-config.yaml 14 | exclude .readthedocs.yaml 15 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |PyPI Version| |Build Status| |Coverage Status| |Python Version| |Pypi Downloads| 2 | 3 | Scrapyd is a service for deploying and running `Scrapy `__ spiders. 4 | 5 | It allows you to upload Scrapy projects and control their spiders using a JSON API. 6 | 7 | (If you are viewing this on GitHub, open the `full documentation `__ for additional details.) 8 | 9 | .. |PyPI Version| image:: https://img.shields.io/pypi/v/scrapyd.svg 10 | :target: https://pypi.org/project/scrapyd/ 11 | .. |Build Status| image:: https://github.com/scrapy/scrapyd/workflows/Tests/badge.svg 12 | .. |Coverage Status| image:: https://coveralls.io/repos/github/scrapy/scrapyd/badge.svg?branch=master 13 | :target: https://coveralls.io/github/scrapy/scrapyd?branch=master 14 | .. |Python Version| image:: https://img.shields.io/pypi/pyversions/scrapyd.svg 15 | :target: https://pypi.org/project/scrapyd/ 16 | .. 
|Pypi Downloads| image:: https://img.shields.io/pypi/dm/scrapyd.svg 17 | :target: https://pypi.python.org/pypi/scrapyd/ 18 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | PYTHON = python 6 | SPHINXOPTS = 7 | SPHINXBUILD = sphinx-build 8 | PAPER = 9 | BUILDDIR = _build 10 | 11 | # Internal variables. 12 | PAPEROPT_a4 = -D latex_paper_size=a4 13 | PAPEROPT_letter = -D latex_paper_size=letter 14 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 15 | # the i18n builder cannot share the environment and doctrees with the others 16 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 17 | 18 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 19 | 20 | help: 21 | @echo "Please use \`make ' where is one of" 22 | @echo " html to make standalone HTML files" 23 | @echo " dirhtml to make HTML files named index.html in directories" 24 | @echo " singlehtml to make a single large HTML file" 25 | @echo " pickle to make pickle files" 26 | @echo " json to make JSON files" 27 | @echo " htmlhelp to make HTML files and a HTML help project" 28 | @echo " qthelp to make HTML files and a qthelp project" 29 | @echo " devhelp to make HTML files and a Devhelp project" 30 | @echo " epub to make an epub" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " text to make text files" 34 | @echo " man to make manual pages" 35 | @echo " texinfo to make Texinfo files" 36 | @echo " info to make Texinfo files and run them through makeinfo" 37 | @echo " gettext to make PO message catalogs" 38 | @echo " changes to make an overview of all changed/added/deprecated items" 39 | @echo " linkcheck to check all external links for integrity" 40 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 41 | 42 | clean: 43 | -rm -rf $(BUILDDIR)/* 44 | 45 | html: 46 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 47 | @echo 48 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 49 | 50 | dirhtml: 51 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 52 | @echo 53 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 54 | 55 | singlehtml: 56 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 57 | @echo 58 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 59 | 60 | pickle: 61 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 62 | @echo 63 | @echo "Build finished; now you can process the pickle files." 64 | 65 | json: 66 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 67 | @echo 68 | @echo "Build finished; now you can process the JSON files." 69 | 70 | htmlhelp: 71 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 72 | @echo 73 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 74 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
75 | 76 | qthelp: 77 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 78 | @echo 79 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 80 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 81 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Scrapyd.qhcp" 82 | @echo "To view the help file:" 83 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Scrapyd.qhc" 84 | 85 | devhelp: 86 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 87 | @echo 88 | @echo "Build finished." 89 | @echo "To view the help file:" 90 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Scrapyd" 91 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Scrapyd" 92 | @echo "# devhelp" 93 | 94 | epub: 95 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 96 | @echo 97 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 98 | 99 | latex: 100 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 101 | @echo 102 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 103 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 104 | "(use \`make latexpdf' here to do that automatically)." 105 | 106 | latexpdf: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo "Running LaTeX files through pdflatex..." 109 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 110 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 111 | 112 | text: 113 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 114 | @echo 115 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 116 | 117 | man: 118 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 119 | @echo 120 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 121 | 122 | texinfo: 123 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 124 | @echo 125 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 126 | @echo "Run \`make' in that directory to run these through makeinfo" \ 127 | "(use \`make info' here to do that automatically)." 128 | 129 | info: 130 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 131 | @echo "Running Texinfo files through makeinfo..." 132 | make -C $(BUILDDIR)/texinfo info 133 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 134 | 135 | gettext: 136 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 137 | @echo 138 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 139 | 140 | changes: 141 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 142 | @echo 143 | @echo "The overview file is in $(BUILDDIR)/changes." 144 | 145 | linkcheck: 146 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 147 | @echo 148 | @echo "Link check complete; look for any errors in the above output " \ 149 | "or in $(BUILDDIR)/linkcheck/output.txt." 150 | 151 | doctest: 152 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 153 | @echo "Testing of doctests in the sources finished, look at the " \ 154 | "results in $(BUILDDIR)/doctest/output.txt." 
155 | 156 | htmlview: html 157 | $(PYTHON) -c "import webbrowser; webbrowser.open('_build/html/index.html')" 158 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | If :ref:`basic authentication` is enabled, you can use ``curl``'s ``-u`` option in the examples below, for example: 5 | 6 | .. code-block:: shell 7 | 8 | curl -u yourusername:yourpassword http://localhost:6800/daemonstatus.json 9 | 10 | .. _daemonstatus.json: 11 | 12 | daemonstatus.json 13 | ----------------- 14 | 15 | .. versionadded:: 1.2.0 16 | 17 | To check the load status of a service. 18 | 19 | Supported request methods 20 | ``GET`` 21 | 22 | Example: 23 | 24 | .. code-block:: shell-session 25 | 26 | $ curl http://localhost:6800/daemonstatus.json 27 | {"node_name": "mynodename", "status": "ok", "pending": 0, "running": 0, "finished": 0} 28 | 29 | .. _addversion.json: 30 | 31 | addversion.json 32 | --------------- 33 | 34 | Add a version to a project in :ref:`eggstorage`, creating the project if needed. 35 | 36 | Supported request methods 37 | ``POST`` 38 | Parameters 39 | ``project`` (required) 40 | the project name 41 | ``version`` (required) 42 | the project version 43 | 44 | Scrapyd uses the packaging `Version `__ to interpret the version numbers you provide. 45 | ``egg`` (required) 46 | a Python egg containing the project's code 47 | 48 | The egg must set an entry point to its Scrapy settings. For example, with a ``setup.py`` file: 49 | 50 | .. code-block:: python 51 | :emphasize-lines: 5 52 | 53 | setup( 54 | name = 'project', 55 | version = '1.0', 56 | packages = find_packages(), 57 | entry_points = {'scrapy': ['settings = projectname.settings']}, 58 | ) 59 | 60 | Do this easily with the ``scrapyd-deploy`` command from the `scrapyd-client `__ package. 61 | 62 | Example: 63 | 64 | .. code-block:: shell-session 65 | 66 | $ curl http://localhost:6800/addversion.json -F project=myproject -F version=r23 -F egg=@myproject.egg 67 | {"node_name": "mynodename", "status": "ok", "spiders": 3} 68 | 69 | .. _schedule.json: 70 | 71 | schedule.json 72 | ------------- 73 | 74 | Schedule a job. (A job is a `Scrapy crawl `__.) 75 | 76 | If the :ref:`logs_dir` setting is set, log files are written to ``{logs_dir}/{project}/{spider}/{jobid}.log``. Set the ``jobid`` parameter to configure the basename of the log file. 77 | 78 | .. important:: Like Scrapy's ``scrapy.Spider`` class, spiders should allow an arbitrary number of keyword arguments in their ``__init__`` method, because Scrapyd sets internally-generated spider arguments when starting crawls. 79 | 80 | Supported request methods 81 | ``POST`` 82 | Parameters 83 | ``project`` (required) 84 | the project name 85 | ``spider`` (required) 86 | the spider name 87 | ``_version`` 88 | the project version (the latest project version by default) 89 | ``jobid`` 90 | the job's ID (a hexadecimal UUID v1 by default) 91 | ``priority`` 92 | the job's priority in the project's spider queue (0 by default, higher number, higher priority) 93 | ``setting`` 94 | a Scrapy setting 95 | 96 | For example, using `DOWNLOAD_DELAY `__: 97 | 98 | .. code-block:: shell 99 | 100 | curl http://localhost:6800/schedule.json -d setting=DOWNLOAD_DELAY=2 -d project=myproject -d spider=somespider 101 | Any other parameter 102 | a spider argument 103 | 104 | For example, using ``arg1``: 105 | 106 | .. 
code-block:: shell 107 | 108 | curl http://localhost:6800/schedule.json -d arg1=val1 -d project=myproject -d spider=somespider 109 | 110 | .. warning:: 111 | 112 | When such parameters are set multiple times, only the first value is sent to the spider. 113 | 114 | To change this behavior, please `open an issue `__. 115 | 116 | Example: 117 | 118 | .. code-block:: shell-session 119 | 120 | $ curl http://localhost:6800/schedule.json -d project=myproject -d spider=somespider 121 | {"node_name": "mynodename", "status": "ok", "jobid": "6487ec79947edab326d6db28a2d86511e8247444"} 122 | 123 | .. _status.json: 124 | 125 | status.json 126 | ----------- 127 | 128 | .. versionadded:: 1.5.0 129 | 130 | Get the status of a job. 131 | 132 | Supported request methods 133 | ``GET`` 134 | Parameters 135 | ``job`` (required) 136 | the job ID 137 | ``project`` 138 | the project name 139 | 140 | Example: 141 | 142 | .. code-block:: shell-session 143 | 144 | $ curl http://localhost:6800/status.json?job=6487ec79947edab326d6db28a2d86511e8247444 145 | {"node_name": "mynodename", "status": "ok", "currstate": "running"} 146 | 147 | .. _cancel.json: 148 | 149 | cancel.json 150 | ----------- 151 | 152 | Cancel a job. 153 | 154 | - If the job is pending, it is removed from the project's spider queue. 155 | - If the job is running, the process is sent a signal to terminate. 156 | 157 | Supported request methods 158 | ``POST`` 159 | Parameters 160 | ``project`` (required) 161 | the project name 162 | ``job`` (required) 163 | the job ID 164 | ``signal`` 165 | the `signal `__ to send to the Scrapy process (``BREAK`` by default on Windows and ``INT`` by default, otherwise) 166 | 167 | Example: 168 | 169 | .. code-block:: shell-session 170 | 171 | $ curl http://localhost:6800/cancel.json -d project=myproject -d job=6487ec79947edab326d6db28a2d86511e8247444 172 | {"node_name": "mynodename", "status": "ok", "prevstate": "running"} 173 | 174 | .. _listprojects.json: 175 | 176 | listprojects.json 177 | ----------------- 178 | 179 | Get the projects. 180 | 181 | Supported request methods 182 | ``GET`` 183 | 184 | Example: 185 | 186 | .. code-block:: shell-session 187 | 188 | $ curl http://localhost:6800/listprojects.json 189 | {"node_name": "mynodename", "status": "ok", "projects": ["myproject", "otherproject"]} 190 | 191 | .. _listversions.json: 192 | 193 | listversions.json 194 | ----------------- 195 | 196 | Get the versions of a project in :ref:`eggstorage`, in :ref:`order`, with the latest version last. 197 | 198 | Supported request methods 199 | ``GET`` 200 | Parameters 201 | ``project`` (required) 202 | the project name 203 | 204 | Example: 205 | 206 | .. code-block:: shell-session 207 | 208 | $ curl http://localhost:6800/listversions.json?project=myproject 209 | {"node_name": "mynodename", "status": "ok", "versions": ["r99", "r156"]} 210 | 211 | .. _listspiders.json: 212 | 213 | listspiders.json 214 | ---------------- 215 | 216 | Get the spiders in a version of a project. 217 | 218 | .. note:: If the project is configured via a :ref:`scrapy.cfg` file rather than uploaded via the :ref:`addversion.json` webservice, don't set the ``version`` parameter. 219 | 220 | Supported request methods 221 | ``GET`` 222 | Parameters 223 | ``project`` (required) 224 | the project name 225 | ``_version`` 226 | the project version (the latest project version by default) 227 | 228 | Example: 229 | 230 | .. 
code-block:: shell-session 231 | 232 | $ curl http://localhost:6800/listspiders.json?project=myproject 233 | {"node_name": "mynodename", "status": "ok", "spiders": ["spider1", "spider2", "spider3"]} 234 | 235 | .. _listjobs.json: 236 | 237 | listjobs.json 238 | ------------- 239 | 240 | Get the pending, running and finished jobs of a project. 241 | 242 | - Pending jobs are in :ref:`spider queues`. 243 | - Running jobs have Scrapy processes. 244 | - Finished jobs are in :ref:job storage`. 245 | 246 | .. note:: 247 | 248 | - The default :ref:`jobstorage` setting stores jobs in memory, such that jobs are lost when the Scrapyd process ends. 249 | - ``log_url`` is ``null`` in the response if :ref:`logs_dir` is disabled or the file doesn't exist. 250 | - ``items_url`` is ``null`` in the response if :ref:`items_dir` is disabled or the file doesn't exist. 251 | 252 | Supported request methods 253 | ``GET`` 254 | Parameters 255 | ``project`` 256 | filter results by project name 257 | 258 | Example: 259 | 260 | .. code-block:: shell-session 261 | 262 | $ curl http://localhost:6800/listjobs.json?project=myproject | python -m json.tool 263 | { 264 | "node_name": "mynodename", 265 | "status": "ok", 266 | "pending": [ 267 | { 268 | "id": "78391cc0fcaf11e1b0090800272a6d06", 269 | "project": "myproject", 270 | "spider": "spider1", 271 | "version": "0.1", 272 | "settings": {"DOWNLOAD_DELAY=2"}, 273 | "args": {"arg1": "val1"}, 274 | } 275 | ], 276 | "running": [ 277 | { 278 | "id": "422e608f9f28cef127b3d5ef93fe9399", 279 | "project": "myproject", 280 | "spider": "spider2", 281 | "pid": 93956, 282 | "start_time": "2012-09-12 10:14:03.594664", 283 | "log_url": "/logs/myproject/spider3/2f16646cfcaf11e1b0090800272a6d06.log", 284 | "items_url": "/items/myproject/spider3/2f16646cfcaf11e1b0090800272a6d06.jl" 285 | } 286 | ], 287 | "finished": [ 288 | { 289 | "id": "2f16646cfcaf11e1b0090800272a6d06", 290 | "project": "myproject", 291 | "spider": "spider3", 292 | "start_time": "2012-09-12 10:14:03.594664", 293 | "end_time": "2012-09-12 10:24:03.594664", 294 | "log_url": "/logs/myproject/spider3/2f16646cfcaf11e1b0090800272a6d06.log", 295 | "items_url": "/items/myproject/spider3/2f16646cfcaf11e1b0090800272a6d06.jl" 296 | } 297 | ] 298 | } 299 | 300 | .. _delversion.json: 301 | 302 | delversion.json 303 | --------------- 304 | 305 | Delete a version of a project from :ref:`eggstorage`. If no versions of the project remain, delete the project, too. 306 | 307 | Supported request methods 308 | ``POST`` 309 | Parameters 310 | ``project`` (required) 311 | the project name 312 | ``version`` (required) 313 | the project version 314 | 315 | Example: 316 | 317 | .. code-block:: shell-session 318 | 319 | $ curl http://localhost:6800/delversion.json -d project=myproject -d version=r99 320 | {"node_name": "mynodename", "status": "ok"} 321 | 322 | .. _delproject.json: 323 | 324 | delproject.json 325 | --------------- 326 | 327 | Delete a project and its versions from :ref:`eggstorage`. 328 | 329 | Supported request methods 330 | ``POST`` 331 | Parameters 332 | ``project`` (required) 333 | the project name 334 | 335 | Example: 336 | 337 | .. 
code-block:: shell-session 338 | 339 | $ curl http://localhost:6800/delproject.json -d project=myproject 340 | {"node_name": "mynodename", "status": "ok"} 341 | -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | Command-line interface 2 | ====================== 3 | 4 | The CLI is simply a wrapper around `twistd `__. 5 | 6 | The most relevant option is ``--logfile`` (``-l``). The ``--nodaemon`` option is always enabled by Scrapyd. 7 | 8 | .. code-block:: none 9 | 10 | Usage: scrapyd [options] 11 | Options: 12 | -b, --debug Run the application in the Python Debugger (implies 13 | nodaemon), sending SIGUSR2 will drop into 14 | debugger 15 | --chroot= Chroot to a supplied directory before running 16 | -e, --encrypted The specified tap/aos file is encrypted. 17 | --euid Set only effective user-id rather than real user-id. 18 | (This option has no effect unless the server is running 19 | as root, in which case it means not to shed all 20 | privileges after binding ports, retaining the option to 21 | regain privileges in cases such as spawning processes. 22 | Use with caution.) 23 | -f, --file= read the given .tap file [default: twistd.tap] 24 | -g, --gid= The gid to run as. If not specified, the default gid 25 | associated with the specified --uid is used. 26 | --help Display this help and exit. 27 | --help-reactors Display a list of possibly available reactor names. 28 | -l, --logfile= log to a specified file, - for stdout 29 | --logger= A fully-qualified name to a log observer factory to use 30 | for the initial log observer. Takes precedence over 31 | --logfile and --syslog (when available). 32 | -n, --nodaemon don't daemonize, don't use default umask of 0077 33 | -o, --no_save do not save state on shutdown 34 | --originalname Don't try to change the process name 35 | -p, --profile= Run in profile mode, dumping results to specified file. 36 | --pidfile= Name of the pidfile [default: twistd.pid] 37 | --prefix= use the given prefix when syslogging [default: twisted] 38 | --profiler= Name of the profiler to use (profile, cprofile). 39 | [default: cprofile] 40 | -r, --reactor= Which reactor to use (see --help-reactors for a list of 41 | possibilities) 42 | -s, --source= Read an application from a .tas file (AOT format). 43 | --savestats save the Stats object rather than the text output of the 44 | profiler. 45 | --spew Print an insanely verbose log of everything that happens. 46 | Useful when debugging freezes or locks in complex code. 47 | --syslog Log to syslog, not to file 48 | -u, --uid= The uid to run as. 49 | --umask= The (octal) file creation mask to apply. 50 | --version Print version information and exit. 51 | 52 | Scrapyd is an application for deploying and running Scrapy spiders. 53 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. 
If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | import os.path 13 | import sys 14 | 15 | sys.path.insert(0, os.path.abspath("..")) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = "Scrapyd" 21 | copyright = "2013-2023, Scrapy group" 22 | author = "Scrapy group" 23 | 24 | # The short X.Y version 25 | version = "1.5.0" 26 | # The full version, including alpha/beta/rc tags 27 | release = version 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | "sphinx.ext.autodoc", 37 | "sphinx.ext.extlinks", 38 | "sphinx.ext.viewcode", 39 | "sphinxcontrib.zopeext.autointerface", 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ["_templates"] 44 | 45 | # List of patterns, relative to source directory, that match files and 46 | # directories to ignore when looking for source files. 47 | # This pattern also affects html_static_path and html_extra_path. 48 | exclude_patterns = ["_build"] 49 | 50 | 51 | # -- Options for HTML output ------------------------------------------------- 52 | 53 | # The theme to use for HTML and HTML Help pages. See the documentation for 54 | # a list of builtin themes. 55 | # 56 | html_theme = "furo" 57 | 58 | # Add any paths that contain custom static files (such as style sheets) here, 59 | # relative to this directory. They are copied after the builtin static files, 60 | # so a file named "default.css" will overwrite the builtin "default.css". 61 | html_static_path = [] 62 | 63 | 64 | # -- Extension configuration ------------------------------------------------- 65 | 66 | autodoc_default_options = { 67 | "members": None, 68 | "member-order": "bysource", 69 | } 70 | autodoc_typehints = "description" 71 | autodoc_type_aliases = {} 72 | 73 | extlinks = { 74 | "issue": ("https://github.com/open-contracting/pelican-frontend/issues/%s", "#%s"), 75 | "commit": ("https://github.com/open-contracting/pelican-frontend/commit/%s", "%s"), 76 | } 77 | -------------------------------------------------------------------------------- /docs/config.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Configuration 3 | ============= 4 | 5 | .. _config-default: 6 | 7 | Default configuration 8 | ===================== 9 | 10 | Scrapyd always loads this configuration file, which can be overridden by :ref:`config-sources`: 11 | 12 | .. literalinclude:: ../scrapyd/default_scrapyd.conf 13 | 14 | .. _config-sources: 15 | 16 | Configuration sources 17 | ===================== 18 | 19 | Scrapyd reads these configuration files in this order. Values in later files take priority. 20 | 21 | #. ``c:\scrapyd\scrapyd.conf`` 22 | #. ``/etc/scrapyd/scrapyd.conf`` 23 | #. ``/etc/scrapyd/conf.d/*`` in alphabetical order 24 | #. ``scrapyd.conf`` in the current directory 25 | #. ``~/.scrapyd.conf`` in the home directory of the user that invoked the ``scrapyd`` command 26 | #. the closest ``scrapy.cfg`` file, starting in the current directory and traversing upward 27 | 28 | .. _config-envvars: 29 | 30 | Environment variables 31 | ===================== 32 | 33 | .. 
versionadded:: 1.5.0 34 | 35 | These environment variables override corresponding options: 36 | 37 | * ``SCRAPYD_BIND_ADDRESS`` (:ref:`bind_address`) 38 | * ``SCRAPYD_HTTP_PORT`` (:ref:`http_port`) 39 | * ``SCRAPYD_USERNAME`` (:ref:`username`) 40 | * ``SCRAPYD_PASSWORD`` (:ref:`password`) 41 | * ``SCRAPYD_UNIX_SOCKET_PATH`` (:ref:`unix_socket_path`) 42 | 43 | scrapyd section 44 | =============== 45 | 46 | Application options 47 | ------------------- 48 | 49 | .. _application: 50 | 51 | application 52 | ~~~~~~~~~~~ 53 | 54 | The function that returns the Twisted Application to use. 55 | 56 | If necessary, override this to fully control how Scrapyd works. 57 | 58 | Default 59 | ``scrapyd.app.application`` 60 | Options 61 | Any Twisted `Application `__ 62 | 63 | .. _bind_address: 64 | 65 | bind_address 66 | ~~~~~~~~~~~~ 67 | 68 | The IP address on which the :ref:`webui` and :doc:`api` listen for connections. 69 | 70 | Default 71 | ``127.0.0.1`` 72 | Options 73 | Any IP address, including: 74 | 75 | - ``127.0.0.1`` to listen for local IPv4 connections only 76 | - ``0.0.0.0`` to listen for all IPv4 connections 77 | - ``::0`` to listen for all IPv4 and IPv6 connections 78 | 79 | .. note:: If ``sysctl`` sets ``net.ipv6.bindv6only`` to true (default false), then ``::0`` listens for IPv6 connections only. 80 | 81 | .. _http_port: 82 | 83 | http_port 84 | ~~~~~~~~~ 85 | 86 | The TCP port on which the :ref:`webui` and :doc:`api` listen for connections. 87 | 88 | Default 89 | ``6800`` 90 | Options 91 | Any integer 92 | 93 | .. _unix_socket_path: 94 | 95 | unix_socket_path 96 | ---------------- 97 | 98 | .. versionadded:: 1.5.0 99 | 100 | The filesystem path of the Unix socket on which the :ref:`webui` and :doc:`api` listen for connections. 101 | 102 | For example: 103 | 104 | .. code-block:: ini 105 | 106 | unix_socket_path = /var/run/scrapyd/web.socket 107 | 108 | The file's mode is set to 660 (owner and group, read and write) to control access to Scrapyd. 109 | 110 | .. attention:: 111 | 112 | If :ref:`bind_address` and :ref:`http_port` are set, a TCP server will start, in addition to the Unix server. To disable the TCP server, set ``bind_address`` to empty: 113 | 114 | .. code-block:: ini 115 | 116 | bind_address = 117 | 118 | .. _username: 119 | 120 | username 121 | ~~~~~~~~ 122 | 123 | .. versionadded:: 1.3.0 124 | 125 | Enable basic authentication by setting this and :ref:`password` to non-empty values. 126 | 127 | Default 128 | ``""`` (empty) 129 | 130 | .. _password: 131 | 132 | password 133 | ~~~~~~~~ 134 | 135 | .. versionadded:: 1.3.0 136 | 137 | Enable basic authentication by setting this and :ref:`username` to non-empty values. 138 | 139 | Default 140 | ``""`` (empty) 141 | 142 | .. _spiderqueue: 143 | 144 | spiderqueue 145 | ~~~~~~~~~~~ 146 | 147 | .. versionadded:: 1.4.2 148 | 149 | The class that stores pending jobs. 
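If the built-in SQLite queue does not fit your deployment, this option can point to a queue class of your own. The following is a minimal, illustrative sketch of an in-memory queue; the class name is hypothetical, the constructor signature is an assumption (check ``scrapyd/spiderqueue.py`` for the arguments your Scrapyd version actually passes), and the methods are intended to match the :py:interface:`~scrapyd.interfaces.ISpiderQueue` interface listed under Options below:

.. code-block:: python

    from zope.interface import implementer

    from scrapyd.interfaces import ISpiderQueue


    @implementer(ISpiderQueue)
    class MemorySpiderQueue:
        """Hypothetical queue that keeps pending jobs in memory (lost on restart)."""

        def __init__(self, config, project):  # constructor arguments are an assumption
            self.project = project
            self.messages = []

        def add(self, name, priority=0.0, **spider_args):
            # Store the message and keep the highest-priority job first.
            self.messages.append({"name": name, "priority": priority, **spider_args})
            self.messages.sort(key=lambda message: message["priority"], reverse=True)

        def pop(self):
            return self.messages.pop(0) if self.messages else None

        def count(self):
            return len(self.messages)

        def list(self):
            return list(self.messages)

        def remove(self, func):
            # Remove messages for which func(message) is true; return how many were removed.
            before = len(self.messages)
            self.messages = [message for message in self.messages if not func(message)]
            return before - len(self.messages)

        def clear(self):
            self.messages.clear()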
150 | 151 | Default 152 | ``scrapyd.spiderqueue.SqliteSpiderQueue`` 153 | Options 154 | - ``scrapyd.spiderqueue.SqliteSpiderQueue`` stores spider queues in SQLite databases named after each project, in the :ref:`dbs_dir` directory 155 | - Implement your own, using the :py:interface:`~scrapyd.interfaces.ISpiderQueue` interface 156 | Also used by 157 | - :ref:`addversion.json` webservice, to create a queue if the project is new 158 | - :ref:`schedule.json` webservice, to add a pending job 159 | - :ref:`cancel.json` webservice, to remove a pending job 160 | - :ref:`listjobs.json` webservice, to list the pending jobs 161 | - :ref:`daemonstatus.json` webservice, to count the pending jobs 162 | - :ref:`webui`, to list the pending jobs and, if queues are transient, to create the queues per project at startup 163 | 164 | .. Community PostgreSQL and RabbitMQ queues: https://github.com/scrapy/scrapyd/pull/140/files#diff-c479470812a00776da54c3cefc15bb5bb244b4056996ae972f4daba7f6ec5bd5 165 | 166 | Poller options 167 | -------------- 168 | 169 | .. _poller: 170 | 171 | poller 172 | ~~~~~~ 173 | 174 | .. versionadded:: 1.5.0 175 | 176 | The class that tracks capacity for new jobs, and starts jobs when ready. 177 | 178 | Default 179 | ``scrapyd.poller.QueuePoller`` 180 | Options 181 | - ``scrapyd.poller.QueuePoller``. When using the default :ref:`application` and :ref:`launcher` values: 182 | 183 | - The launcher adds :ref:`max_proc` capacity at startup, and one capacity each time a Scrapy process ends. 184 | - The :ref:`application` starts a timer so that, every :ref:`poll_interval` seconds, jobs start if there's capacity: that is, if the number of Scrapy processes that are running is less than the :ref:`max_proc` value. 185 | 186 | - Implement your own, using the :py:interface:`~scrapyd.interfaces.IPoller` interface 187 | 188 | .. _poll_interval: 189 | 190 | poll_interval 191 | ~~~~~~~~~~~~~ 192 | 193 | The number of seconds between capacity checks. 194 | 195 | Default 196 | ``5.0`` 197 | Options 198 | Any floating-point number 199 | 200 | .. _config-launcher: 201 | 202 | Launcher options 203 | ---------------- 204 | 205 | .. _launcher: 206 | 207 | launcher 208 | ~~~~~~~~ 209 | 210 | The class that starts Scrapy processes. 211 | 212 | Default 213 | ``scrapyd.launcher.Launcher`` 214 | Options 215 | Any Twisted `Service `__ 216 | 217 | .. _max_proc: 218 | 219 | max_proc 220 | ~~~~~~~~ 221 | 222 | The maximum number of Scrapy processes to run concurrently. 223 | 224 | Default 225 | ``0`` 226 | Options 227 | Any non-negative integer, including: 228 | 229 | - ``0`` to use :ref:`max_proc_per_cpu` multiplied by the number of CPUs 230 | 231 | .. _max_proc_per_cpu: 232 | 233 | max_proc_per_cpu 234 | ~~~~~~~~~~~~~~~~ 235 | 236 | See :ref:`max_proc`. 237 | 238 | Default 239 | ``4`` 240 | 241 | .. _logs_dir: 242 | 243 | logs_dir 244 | ~~~~~~~~ 245 | 246 | The directory in which to write Scrapy logs. 247 | 248 | A log file is written to ``{logs_dir}/{project}/{spider}/{job}.log``. 249 | 250 | To disable log storage, set this option to empty: 251 | 252 | .. code-block:: ini 253 | 254 | logs_dir = 255 | 256 | To log messages to a remote service, you can, for example, reconfigure Scrapy's logger from your Scrapy project: 257 | 258 | .. 
code-block:: python 259 | 260 | import logging 261 | import logstash 262 | 263 | logger = logging.getLogger("scrapy") 264 | logger.handlers.clear() 265 | logger.addHandler(logstash.LogstashHandler("https://user:pass@id.us-east-1.aws.found.io", 5959, version=1)) 266 | 267 | Default 268 | ``logs`` 269 | Also used by 270 | :ref:`webui`, to link to log files 271 | 272 | .. attention:: Each ``*_dir`` setting must point to a different directory. 273 | 274 | .. _items_dir: 275 | 276 | items_dir 277 | ~~~~~~~~~ 278 | 279 | The directory in which to write Scrapy items. 280 | 281 | An item feed is written to ``{items_dir}/{project}/{spider}/{job}.jl``. 282 | 283 | If this option is non-empty, the `FEEDS `__ Scrapy setting is set as follows, resulting in items being written to the above path as JSON lines: 284 | 285 | .. code-block:: json 286 | 287 | {"file:///path/to/items_dir/project/spider/job.jl": {"format": "jsonlines"}} 288 | 289 | Default 290 | ``""`` (empty), because it is recommended to instead use either: 291 | 292 | - `Feed exports `__, by setting the ``FEEDS`` Scrapy setting in your Scrapy project. See the full list of `storage backends `__. 293 | - `Item pipeline `__, to store the scraped items in a database. See the `MongoDB example `__, which can be adapted to another database. 294 | Also used by 295 | :ref:`webui`, to link to item feeds 296 | 297 | .. attention:: Each ``*_dir`` setting must point to a different directory. 298 | 299 | .. _jobs_to_keep: 300 | 301 | jobs_to_keep 302 | ~~~~~~~~~~~~ 303 | 304 | The number of finished jobs per spider, for which to keep the most recent log files in the :ref:`logs_dir` directory and item feeds in the :ref:`items_dir` directory. 305 | 306 | To "disable" this feature, set this to an arbitrarily large value. For example, on a 64-bit system: 307 | 308 | .. code-block:: ini 309 | 310 | jobs_to_keep = 9223372036854775807 311 | 312 | .. warning:: 313 | 314 | Scrapyd deletes old files in these directories, regardless of origin. 315 | 316 | Default 317 | ``5`` 318 | 319 | .. _runner: 320 | 321 | runner 322 | ~~~~~~ 323 | 324 | The Python script to run Scrapy's `CLI `__. 325 | 326 | If necessary, override this to fully control how the Scrapy CLI is called. 327 | 328 | Default 329 | ``scrapyd.runner`` 330 | Options 331 | Any Python `script `__ 332 | Also used by 333 | :ref:`listspiders.json` webservice, to run Scrapy's `list `__ command 334 | 335 | Web UI and API options 336 | ---------------------- 337 | 338 | .. _webroot: 339 | 340 | webroot 341 | ~~~~~~~ 342 | 343 | .. versionadded:: 1.2.0 344 | 345 | The class that defines the :ref:`webui` and :doc:`api`, as a Twisted Resource. 346 | 347 | If necessary, override this to fully control how the web UI and API work. 348 | 349 | Default 350 | ``scrapyd.website.Root`` 351 | Options 352 | Any Twisted `Resource `__ 353 | 354 | .. _prefix_header: 355 | 356 | prefix_header 357 | ~~~~~~~~~~~~~ 358 | 359 | .. versionadded:: 1.4.2 360 | 361 | The header for the base path of the original request. 362 | 363 | The header is relevant only if Scrapyd is running behind a reverse proxy, and if the public URL contains a base path, before the Scrapyd API path components. 364 | A base path must have a leading slash and no trailing slash, e.g. ``/base/path``. 365 | 366 | Default 367 | ``x-forwarded-prefix`` 368 | 369 | .. _node_name: 370 | 371 | node_name 372 | ~~~~~~~~~ 373 | 374 | .. versionadded:: 1.1.0 375 | 376 | The node name, which appears in :doc:`api` responses. 
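For example, to give each node a stable, human-readable name instead of its hostname (the value shown is only an example):

.. code-block:: ini

    node_name = scrapyd-node-1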
377 | 378 | Default 379 | ``socket.gethostname()`` 380 | 381 | .. _debug: 382 | 383 | debug 384 | ~~~~~ 385 | 386 | Whether debug mode is enabled. 387 | 388 | If enabled, a Python traceback is returned (as a plain-text response) when the :doc:`api` errors. 389 | 390 | Default 391 | ``off`` 392 | 393 | Egg storage options 394 | ------------------- 395 | 396 | .. _eggstorage: 397 | 398 | eggstorage 399 | ~~~~~~~~~~ 400 | 401 | .. versionadded:: 1.3.0 402 | 403 | The class that stores project eggs. 404 | 405 | Default 406 | ``scrapyd.eggstorage.FilesystemEggStorage`` 407 | Options 408 | - ``scrapyd.eggstorage.FilesystemEggStorage`` writes eggs in the :ref:`eggs_dir` directory 409 | 410 | .. note:: Eggs are named after the ``version``, replacing characters other than ``A-Za-z0-9_-`` with underscores. Therefore, if you frequently use non-word, non-hyphen characters, the eggs for different versions can collide. 411 | - Implement your own, using the :py:interface:`~scrapyd.interfaces.IEggStorage` interface: for example, to store eggs remotely 412 | 413 | .. _eggs_dir: 414 | 415 | eggs_dir 416 | ~~~~~~~~ 417 | 418 | The directory in which to write project eggs. 419 | 420 | Default 421 | ``eggs`` 422 | 423 | .. attention:: Each ``*_dir`` setting must point to a different directory. 424 | 425 | Job storage options 426 | ------------------- 427 | 428 | .. _jobstorage: 429 | 430 | jobstorage 431 | ~~~~~~~~~~ 432 | 433 | .. versionadded:: 1.3.0 434 | 435 | The class that stores finished jobs. 436 | 437 | Default 438 | ``scrapyd.jobstorage.MemoryJobStorage`` 439 | Options 440 | - ``scrapyd.jobstorage.MemoryJobStorage`` stores jobs in memory, such that jobs are lost when the Scrapyd process ends 441 | - ``scrapyd.jobstorage.SqliteJobStorage`` stores jobs in a SQLite database named ``jobs.db``, in the :ref:`dbs_dir` directory 442 | - Implement your own, using the :py:interface:`~scrapyd.interfaces.IJobStorage` interface 443 | 444 | .. _finished_to_keep: 445 | 446 | finished_to_keep 447 | ~~~~~~~~~~~~~~~~ 448 | 449 | The number of finished jobs, for which to keep metadata in the :ref:`jobstorage` backend. 450 | 451 | Finished jobs are accessed via the :ref:`webui` and :ref:`listjobs.json` webservice. 452 | 453 | Default 454 | ``100`` 455 | Options 456 | Any non-negative integer 457 | 458 | Directory options 459 | ----------------- 460 | 461 | .. _dbs_dir: 462 | 463 | dbs_dir 464 | ~~~~~~~ 465 | 466 | The directory in which to write SQLite databases. 467 | 468 | Default 469 | ``dbs`` 470 | Options 471 | Any relative or absolute path, or `:memory: `__ 472 | Used by 473 | - :ref:`spiderqueue` (``scrapyd.spiderqueue.SqliteSpiderQueue``) 474 | - :ref:`jobstorage` (``scrapyd.jobstorage.SqliteJobStorage``) 475 | 476 | .. attention:: Each ``*_dir`` setting must point to a different directory. 477 | 478 | .. _config-services: 479 | 480 | services section 481 | ================ 482 | 483 | If you want to add a webservice (endpoint), add, for example: 484 | 485 | .. code-block:: ini 486 | 487 | [services] 488 | mywebservice.json = amodule.anothermodule.MyWebService 489 | 490 | You can use code for webservices in `webservice.py `__ as inspiration. 491 | 492 | To remove a :ref:`default webservice`, set it to empty: 493 | 494 | .. code-block:: ini 495 | 496 | [services] 497 | daemonstatus.json = 498 | 499 | .. 
_config-settings: 500 | 501 | settings section (scrapy.cfg) 502 | ============================= 503 | 504 | Project code is usually stored in a `Python egg `__ and uploaded to Scrapyd via the :ref:`addversion.json` webservice. 505 | 506 | Alternatively, you can invoke Scrapyd within a Scrapy project: that is, you can run the ``scrapyd`` command from a directory containing a ``scrapy.cfg`` file (or from a directory with any parent directory containing a ``scrapy.cfg`` file). 507 | 508 | As described in `Scrapy's documentation `__, the ``scrapy.cfg`` file contains a ``[settings]`` section, which can describe many Scrapy projects. By default, it is: 509 | 510 | .. code-block:: ini 511 | 512 | [settings] 513 | default = projectname.settings 514 | -------------------------------------------------------------------------------- /docs/contributing/api.rst: -------------------------------------------------------------------------------- 1 | Developer API reference 2 | ======================= 3 | 4 | Interfaces 5 | ---------- 6 | 7 | .. automodule:: scrapyd.interfaces 8 | :members: 9 | :undoc-members: 10 | :special-members: 11 | 12 | Config 13 | ------ 14 | 15 | .. automodule:: scrapyd.config 16 | :members: 17 | :undoc-members: 18 | 19 | Exceptions 20 | ---------- 21 | 22 | .. automodule:: scrapyd.exceptions 23 | :members: 24 | :undoc-members: 25 | -------------------------------------------------------------------------------- /docs/contributing/index.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | .. important:: Read through the `Scrapy Contribution Docs `__ for tips relating to writing patches, reporting bugs, and coding style. 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | :caption: Contents 9 | 10 | api 11 | 12 | Issues and bugs 13 | --------------- 14 | 15 | Report on `GitHub `__. 16 | 17 | Tests 18 | ----- 19 | 20 | Include tests in your pull requests. 21 | 22 | To run unit tests: 23 | 24 | .. code-block:: shell 25 | 26 | pytest tests 27 | 28 | To run integration tests: 29 | 30 | .. code-block:: shell 31 | 32 | printf "[scrapyd]\nusername = hello12345\npassword = 67890world\n" > scrapyd.conf 33 | mkdir logs 34 | scrapyd & 35 | pytest integration_tests 36 | 37 | Installation 38 | ------------ 39 | 40 | To install an editable version for development, clone the repository, change to its directory, and run: 41 | 42 | .. code-block:: shell 43 | 44 | pip install -e .[test,docs] 45 | 46 | Developer documentation 47 | ----------------------- 48 | 49 | Configuration 50 | ~~~~~~~~~~~~~ 51 | 52 | Pass the ``config`` object to a class' ``__init__`` method, but don't store it on the instance (:issue:`526`). 53 | 54 | Processes 55 | ~~~~~~~~~ 56 | 57 | Scrapyd starts Scrapy processes. It runs ``scrapy crawl`` in the :ref:`launcher`, and ``scrapy list`` in the :ref:`schedule.json` (to check the spider exists), :ref:`addversion.json` (to return the number of spiders) and :ref:`listspiders.json` (to return the names of spiders) webservices. 58 | 59 | Environment variables 60 | ~~~~~~~~~~~~~~~~~~~~~ 61 | 62 | Scrapyd uses environment variables to communicate between the Scrapyd process and the Scrapy processes that it starts. 63 | 64 | SCRAPY_PROJECT 65 | The project to use. See ``scrapyd/runner.py``. 66 | SCRAPYD_EGG_VERSION 67 | The version of the project, to be retrieved as an egg from :ref:`eggstorage` and activated. 68 | SCRAPY_SETTINGS_MODULE 69 | The Python path to the `settings `__ module of the project. 
70 | 71 | This is usually the module from the `entry points `__ of the egg, but can be the module from the ``[settings]`` section of a :ref:`scrapy.cfg` file. See ``scrapyd/environ.py``. 72 | 73 | Jobs 74 | ~~~~ 75 | 76 | A **pending job** is a ``dict`` object (referred to as a "message"), accessible via an :py:interface:`~scrapyd.interfaces.ISpiderQueue`'s :meth:`~scrapyd.interfaces.ISpiderQueue.pop` or :meth:`~scrapyd.interfaces.ISpiderQueue.list` methods. 77 | 78 | .. note:: The short-lived message returned by :py:interface:`~scrapyd.interfaces.IPoller`'s :meth:`~scrapyd.interfaces.IPoller.poll` method is also referred to as a "message". 79 | 80 | - The :ref:`schedule.json` webservice calls :py:interface:`~scrapyd.interfaces.ISpiderScheduler`'s :meth:`~scrapyd.interfaces.ISpiderScheduler.schedule` method. The ``SpiderScheduler`` implementation of :meth:`~scrapyd.interfaces.ISpiderScheduler.schedule` adds the message to the project's :py:interface:`~scrapyd.interfaces.ISpiderQueue`. 81 | - The default :ref:`application` sets a `TimerService `__ to call :py:interface:`~scrapyd.interfaces.IPoller`'s :meth:`~scrapyd.interfaces.IPoller.poll` method, at :ref:`poll_interval`. 82 | - :py:interface:`~scrapyd.interfaces.IPoller` has a :attr:`~scrapyd.interfaces.IPoller.queues` attribute, that implements a ``__getitem__`` method to get a project's :py:interface:`~scrapyd.interfaces.ISpiderQueue` by project name. 83 | - The ``QueuePoller`` implementation of :meth:`~scrapyd.interfaces.IPoller.poll` calls a project's :py:interface:`~scrapyd.interfaces.ISpiderQueue`'s :meth:`~scrapyd.interfaces.ISpiderQueue.pop` method, adds a ``_project`` key to the message and renames the ``name`` key to ``_spider``, and fires a callback. 84 | - The ``Launcher`` service had added the callback to the `Deferred `__, which had been returned by :py:interface:`~scrapyd.interfaces.IPoller`'s :meth:`~scrapyd.interfaces.IPoller.next` method. 85 | - The ``Launcher`` service adapts the message to instantiate a ``ScrapyProcessProtocol`` (`ProcessProtocol `__) object, adds a callback, and `spawns a process `__. 86 | 87 | A **running job** is a ``ScrapyProcessProtocol`` object, accessible via ``Launcher.processes`` (a ``dict``), in which each key is a slot's number (an ``int``). 88 | 89 | - ``Launcher`` has a ``finished`` attribute, which is an :py:interface:`~scrapyd.interfaces.IJobStorage`. 90 | - When the process ends, the callback fires. The ``Launcher`` service calls :py:interface:`~scrapyd.interfaces.IJobStorage`'s :meth:`~scrapyd.interfaces.IJobStorage.add` method, passing the ``ScrapyProcessProtocol`` as input. 91 | 92 | A **finished job** is an object with the attributes ``project``, ``spider``, ``job``, ``start_time`` and ``end_time``, accessible via an :py:interface:`~scrapyd.interfaces.IJobStorage`'s :meth:`~scrapyd.interfaces.IJobStorage.list` or :meth:`~scrapyd.interfaces.IJobStorage.__iter__` methods. 93 | 94 | .. 
list-table:: 95 | :header-rows: 1 96 | :stub-columns: 1 97 | 98 | * - Concept 99 | - ISpiderQueue 100 | - IPoller 101 | - ScrapyProcessProtocol 102 | - IJobStorage 103 | * - Project 104 | - *not specified* 105 | - _project 106 | - project 107 | - project 108 | * - Spider 109 | - name 110 | - _spider 111 | - spider 112 | - spider 113 | * - Job ID 114 | - _job 115 | - _job 116 | - job 117 | - job 118 | * - Egg version 119 | - _version 120 | - _version 121 | - ✗ 122 | - ✗ 123 | * - Scrapy settings 124 | - settings 125 | - settings 126 | - args (``-s k=v``) 127 | - ✗ 128 | * - Spider arguments 129 | - *remaining keys* 130 | - *remaining keys* 131 | - args (``-a k=v``) 132 | - ✗ 133 | * - Environment variables 134 | - ✗ 135 | - ✗ 136 | - env 137 | - ✗ 138 | * - Process ID 139 | - ✗ 140 | - ✗ 141 | - pid 142 | - ✗ 143 | * - Start time 144 | - ✗ 145 | - ✗ 146 | - start_time 147 | - start_time 148 | * - End time 149 | - ✗ 150 | - ✗ 151 | - end_time 152 | - end_time 153 | -------------------------------------------------------------------------------- /docs/deploy.rst: -------------------------------------------------------------------------------- 1 | Deployment 2 | ========== 3 | 4 | .. _docker: 5 | 6 | Creating a Docker image 7 | ----------------------- 8 | 9 | If you prefer to create a Docker image for the Scrapyd service and your Scrapy projects, you can copy this ``Dockerfile`` template into your Scrapy project, and adapt it. 10 | 11 | .. code-block:: dockerfile 12 | 13 | # Build an egg of your project. 14 | 15 | FROM python as build-stage 16 | 17 | RUN pip install --no-cache-dir scrapyd-client 18 | 19 | WORKDIR /workdir 20 | 21 | COPY . . 22 | 23 | RUN scrapyd-deploy --build-egg=myproject.egg 24 | 25 | # Build the image. 26 | 27 | FROM python:alpine 28 | 29 | # Install Scrapy dependencies - and any others for your project. 30 | 31 | RUN apk --no-cache add --virtual build-dependencies \ 32 | gcc \ 33 | musl-dev \ 34 | libffi-dev \ 35 | libressl-dev \ 36 | libxml2-dev \ 37 | libxslt-dev \ 38 | && pip install --no-cache-dir \ 39 | scrapyd \ 40 | && apk del build-dependencies \ 41 | && apk add \ 42 | libressl \ 43 | libxml2 \ 44 | libxslt 45 | 46 | # Mount two volumes for configuration and runtime. 47 | 48 | VOLUME /etc/scrapyd/ /var/lib/scrapyd/ 49 | 50 | COPY ./scrapyd.conf /etc/scrapyd/ 51 | 52 | RUN mkdir -p /src/eggs/myproject 53 | 54 | COPY --from=build-stage /workdir/myproject.egg /src/eggs/myproject/1.egg 55 | 56 | EXPOSE 6800 57 | 58 | ENTRYPOINT ["scrapyd", "--pidfile="] 59 | 60 | Where your ``scrapy.cfg`` file, used by ``scrapyd-deploy``, might be: 61 | 62 | .. code-block:: ini 63 | 64 | [settings] 65 | default = myproject.settings 66 | 67 | [deploy] 68 | url = http://localhost:6800 69 | project = myproject 70 | 71 | And your ``scrapyd.conf`` file might be: 72 | 73 | .. code-block:: ini 74 | 75 | [scrapyd] 76 | bind_address = 0.0.0.0 77 | logs_dir = /var/lib/scrapyd/logs 78 | items_dir = /var/lib/scrapyd/items 79 | dbs_dir = /var/lib/scrapyd/dbs 80 | eggs_dir = /src/eggs 81 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Scrapyd |release| 3 | ================= 4 | 5 | .. include:: ../README.rst 6 | 7 | Quickstart 8 | ========== 9 | 10 | Install Scrapyd 11 | --------------- 12 | 13 | .. code-block:: shell 14 | 15 | pip install scrapyd 16 | 17 | Start Scrapyd 18 | ------------- 19 | 20 | .. 
code-block:: shell 21 | 22 | scrapyd 23 | 24 | See :doc:`overview` and :doc:`config` for more details. 25 | 26 | Upload a project 27 | ---------------- 28 | 29 | This involves building a `Python egg `__ and uploading it to Scrapyd via the `addversion.json `_ webservice. 30 | 31 | Do this easily with the ``scrapyd-deploy`` command from the `scrapyd-client `__ package. Once configured: 32 | 33 | .. code-block:: shell 34 | 35 | scrapyd-deploy 36 | 37 | Schedule a crawl 38 | ---------------- 39 | 40 | .. code-block:: shell-session 41 | 42 | $ curl http://localhost:6800/schedule.json -d project=myproject -d spider=spider2 43 | {"status": "ok", "jobid": "26d1b1a6d6f111e0be5c001e648c57f8"} 44 | 45 | See :doc:`api` for more details. 46 | 47 | .. toctree:: 48 | :maxdepth: 2 49 | :caption: Contents 50 | 51 | overview 52 | config 53 | api 54 | cli 55 | deploy 56 | contributing/index 57 | news 58 | -------------------------------------------------------------------------------- /docs/overview.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Overview 3 | ======== 4 | 5 | Projects and versions 6 | ===================== 7 | 8 | Scrapyd can manage multiple Scrapy projects. Each project can have multiple versions. The latest version is used by default for starting spiders. 9 | 10 | .. _overview-order: 11 | 12 | Version order 13 | ------------- 14 | 15 | The latest version is the alphabetically greatest, unless all version names are `version specifiers `__ like ``1.0`` or ``1.0rc1``, in which case they are sorted as such. 16 | 17 | How Scrapyd works 18 | ================= 19 | 20 | Scrapyd is a server (typically run as a daemon) that listens for :doc:`api` and :ref:`webui` requests. 21 | 22 | The API is especially used to upload projects and schedule crawls. To start a crawl, Scrapyd spawns a process that essentially runs: 23 | 24 | .. code-block:: shell 25 | 26 | scrapy crawl myspider 27 | 28 | Scrapyd runs multiple processes in parallel, and manages the number of concurrent processes. See :ref:`config-launcher` for details. 29 | 30 | If you are familiar with the `Twisted Application Framework `__, you can essentially reconfigure every part of Scrapyd. See :doc:`config` for details. 31 | 32 | .. _webui: 33 | 34 | Web interface 35 | ============= 36 | 37 | Scrapyd has a minimal web interface for monitoring running processes and accessing log files and item fees. 
By default, it is available at http://localhost:6800/. Other options to manage Scrapyd include: 38 | 39 | - `ScrapydWeb `__ 40 | - `spider-admin-pro `__ 41 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | furo 2 | sphinxcontrib-zopeext 3 | -------------------------------------------------------------------------------- /integration_tests/__init__.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urljoin 2 | 3 | import requests 4 | 5 | 6 | def req(method, path, auth=None, status=200, **kwargs): 7 | url = urljoin("http://127.0.0.1:6800", path) 8 | 9 | for badauth in (None, ("baduser", "badpass")): 10 | response = getattr(requests, method)(url, auth=badauth, **kwargs) 11 | 12 | assert response.status_code == 401, f"401 != {response.status_code}" 13 | assert response.text == "Unauthorized" 14 | 15 | response = getattr(requests, method)(url, auth=("hello12345", "67890world"), **kwargs) 16 | 17 | assert response.status_code == status, f"{status} != {response.status_code}" 18 | 19 | return response 20 | -------------------------------------------------------------------------------- /integration_tests/test_webservice.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | from pathlib import Path 3 | 4 | import pytest 5 | import requests 6 | 7 | from integration_tests import req 8 | 9 | BASEDIR = os.path.realpath(".").replace("\\", "\\\\") 10 | with (Path(__file__).absolute().parent.parent / "tests" / "fixtures" / "quotesbot.egg").open("rb") as f: 11 | EGG = f.read() 12 | 13 | 14 | def assert_response(method, path, expected, **kwargs): 15 | response = req(method, path, **kwargs) 16 | data = response.json() 17 | data.pop("node_name") 18 | 19 | assert data == expected 20 | assert response.content.endswith(b"\n") 21 | 22 | 23 | @pytest.mark.parametrize( 24 | ("method", "basename"), 25 | [ 26 | ("GET", "daemonstatus"), 27 | ("POST", "addversion"), 28 | ("POST", "schedule"), 29 | ("POST", "cancel"), 30 | ("GET", "status"), 31 | ("GET", "listprojects"), 32 | ("GET", "listversions"), 33 | ("GET", "listspiders"), 34 | ("GET", "listjobs"), 35 | ("POST", "delversion"), 36 | ("POST", "delproject"), 37 | ], 38 | ) 39 | def test_options(method, basename): 40 | response = requests.options( 41 | f"http://127.0.0.1:6800/{basename}.json", 42 | auth=("hello12345", "67890world"), 43 | ) 44 | 45 | assert response.status_code == 204, f"204 != {response.status_code}" 46 | assert response.headers["Allow"] == f"OPTIONS, HEAD, {method}" 47 | assert response.content == b"" 48 | 49 | 50 | # ListSpiders, Schedule, Cancel, Status and ListJobs return "project '%b' not found" on directory traversal attempts. 51 | # The egg storage (in get_project_list, called by get_spider_queues, called by QueuePoller, used by these webservices) 52 | # would need to find a project like "../project" (which is impossible with the default eggstorage) to not error.
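# By contrast, the AddVersion, ListVersions, DeleteVersion and DeleteProject webservices access egg storage directly, so the tests below expect a DirectoryTraversalError message rather than "project not found".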
53 | @pytest.mark.parametrize( 54 | ("method", "basename", "params"), 55 | [ 56 | ("post", "addversion", {"version": "v", "egg": EGG}), 57 | ("get", "listversions", {}), 58 | ("post", "delversion", {"version": "v"}), 59 | ("post", "delproject", {}), 60 | ], 61 | ) 62 | def test_project_directory_traversal(method, basename, params): 63 | response = getattr(requests, method)( 64 | f"http://127.0.0.1:6800/{basename}.json", 65 | auth=("hello12345", "67890world"), 66 | **{"params" if method == "get" else "data": {"project": "../p", **params}}, 67 | ) 68 | 69 | data = response.json() 70 | data.pop("node_name") 71 | 72 | assert response.status_code == 200, f"200 != {response.status_code}" 73 | assert data == {"status": "error", "message": "DirectoryTraversalError: ../p"} 74 | 75 | 76 | def test_daemonstatus(): 77 | assert_response("get", "/daemonstatus.json", {"status": "ok", "running": 0, "pending": 0, "finished": 0}) 78 | 79 | 80 | def test_schedule_nonexistent_project(): 81 | assert_response( 82 | "post", 83 | "/schedule.json", 84 | {"status": "error", "message": "project 'nonexistent' not found"}, 85 | data={"project": "nonexistent", "spider": "nospider"}, 86 | ) 87 | 88 | 89 | def test_status_nonexistent_job(): 90 | assert_response( 91 | "get", 92 | "/status.json", 93 | {"status": "ok", "currstate": None}, 94 | params={"job": "sample"}, 95 | ) 96 | 97 | 98 | def test_status_nonexistent_project(): 99 | assert_response( 100 | "get", 101 | "/status.json", 102 | {"status": "error", "message": "project 'nonexistent' not found"}, 103 | params={"job": "sample", "project": "nonexistent"}, 104 | ) 105 | 106 | 107 | def test_cancel_nonexistent_project(): 108 | assert_response( 109 | "post", 110 | "/cancel.json", 111 | {"status": "error", "message": "project 'nonexistent' not found"}, 112 | data={"project": "nonexistent", "job": "nojob"}, 113 | ) 114 | 115 | 116 | def test_listprojects(): 117 | assert_response( 118 | "get", 119 | "/listprojects.json", 120 | {"status": "ok", "projects": []}, 121 | ) 122 | 123 | 124 | def test_listversions(): 125 | assert_response( 126 | "get", 127 | "/listversions.json", 128 | {"status": "ok", "versions": []}, 129 | params={"project": "sample"}, 130 | ) 131 | 132 | 133 | def test_listspiders_nonexistent_project(): 134 | assert_response( 135 | "get", 136 | "/listspiders.json", 137 | {"status": "error", "message": "project 'nonexistent' not found"}, 138 | params={"project": "nonexistent"}, 139 | ) 140 | 141 | 142 | def test_listjobs(): 143 | assert_response( 144 | "get", 145 | "/listjobs.json", 146 | {"status": "ok", "pending": [], "running": [], "finished": []}, 147 | ) 148 | 149 | 150 | def test_listjobs_nonexistent_project(): 151 | assert_response( 152 | "get", 153 | "/listjobs.json", 154 | {"status": "error", "message": "project 'nonexistent' not found"}, 155 | params={"project": "nonexistent"}, 156 | ) 157 | 158 | 159 | def test_delversion_nonexistent_project(): 160 | assert_response( 161 | "post", 162 | "/delversion.json", 163 | {"status": "error", "message": "version 'nonexistent' not found"}, 164 | data={"project": "sample", "version": "nonexistent"}, 165 | ) 166 | 167 | 168 | def test_delproject_nonexistent_project(): 169 | assert_response( 170 | "post", 171 | "/delproject.json", 172 | {"status": "error", "message": "project 'nonexistent' not found"}, 173 | data={"project": "nonexistent"}, 174 | ) 175 | -------------------------------------------------------------------------------- /integration_tests/test_website.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from integration_tests import req 4 | 5 | 6 | def test_root(): 7 | response = req("get", "/") 8 | 9 | assert '"/jobs"' in response.text 10 | assert '"/logs/"' in response.text 11 | 12 | 13 | @pytest.mark.parametrize(("path", "content"), [("jobs", "Cancel"), ("logs", "Last modified")]) 14 | def test_paths(path, content): 15 | response = req("get", f"/{path}") 16 | 17 | assert content in response.text 18 | 19 | 20 | def test_base_path(): 21 | response = req("get", "/", headers={"X-Forwarded-Prefix": "/path/to"}) 22 | 23 | assert '"/path/to/jobs"' in response.text 24 | assert '"/path/to/logs/"' in response.text 25 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "scrapyd" 7 | version = "1.5.0" 8 | authors = [{name = "Scrapy developers", email = "info@scrapy.org"}] 9 | description = "A service for running Scrapy spiders, with an HTTP API" 10 | readme = "README.rst" 11 | license = {text = "BSD"} 12 | urls = {Homepage = "https://github.com/scrapy/scrapyd"} 13 | classifiers = [ 14 | "License :: OSI Approved :: BSD License", 15 | "Operating System :: OS Independent", 16 | "Programming Language :: Python :: 3.9", 17 | "Programming Language :: Python :: 3.10", 18 | "Programming Language :: Python :: 3.11", 19 | "Programming Language :: Python :: 3.12", 20 | "Programming Language :: Python :: 3.13", 21 | "Programming Language :: Python :: Implementation :: CPython", 22 | "Development Status :: 5 - Production/Stable", 23 | "Environment :: Console", 24 | "Environment :: No Input/Output (Daemon)", 25 | "Topic :: Internet :: WWW/HTTP", 26 | ] 27 | dependencies = [ 28 | "packaging", 29 | "pywin32;platform_system=='Windows'", 30 | "scrapy>=2.0.0", 31 | "setuptools", 32 | "twisted>=17.9", 33 | "w3lib", 34 | "zope.interface", 35 | ] 36 | 37 | [project.optional-dependencies] 38 | test = [ 39 | "coveralls", 40 | "py-html-checker", 41 | "pytest", 42 | "pytest-cov", 43 | "pytest-twisted", 44 | "requests", 45 | "twisted>=19.7", # twisted.logger.capturedLogs 46 | ] 47 | docs = [ 48 | "furo", 49 | "sphinx", 50 | "sphinx-autobuild", 51 | "sphinxcontrib-zopeext", 52 | ] 53 | 54 | [project.scripts] 55 | scrapyd = "scrapyd.__main__:main" 56 | 57 | [tool.setuptools] 58 | packages = ["scrapyd"] 59 | zip-safe = false # The scrapyd.__main__ module requires the txapp.py file to be decompressed. 
#49 60 | 61 | [tool.ruff] 62 | line-length = 119 63 | target-version = "py38" 64 | 65 | [tool.ruff.lint] 66 | select = ["ALL"] 67 | ignore = [ 68 | "ANN", "COM", "EM", 69 | # https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules 70 | "W191", "E501", "D206", "Q000", "Q001", "Q002", "Q003", "ISC001", 71 | "D203", "D212", # ignore incompatible rules 72 | "D200", # documentation preferences 73 | "C901", "PLR0912", # complexity preferences 74 | 75 | # Project-specific 76 | "D", 77 | "PTH", # Scrapyd hasn't adopted pathlib 78 | "ARG002", # Unused method argument (txrequest argument isn't always used) 79 | "N802", # Function name should be lowercase (Twisted uses method names like render_GET) 80 | "N803", # Argument name should be lowercase (Twisted uses argument names like avatarId) 81 | "N815", # Variable in class scope should not be mixedCase (Twisted uses class attributes like requestAvatarId) 82 | "PLR0913", # Too many arguments to function call 83 | "S603", # `subprocess` call: check for execution of untrusted input (informative) 84 | 85 | # sqlite3 doesn't have functions like psycopg2.sql.Identifier and psycopg2.sql.SQL.format. 86 | "S608", # Possible SQL injection vector through string-based query construction 87 | 88 | # Scrapyd uses naive datetimes. 89 | "DTZ001", # `datetime.datetime()` called without a `tzinfo` argument" 90 | "DTZ005", # `datetime.datetime.now()` called without a `tz` argument 91 | "DTZ006", # `datetime.datetime.fromtimestamp()` called without a `tz` argument 92 | "DTZ007", # Naive datetime constructed using `datetime.datetime.strptime()` without %z 93 | ] 94 | 95 | [tool.ruff.lint.flake8-builtins] 96 | builtins-ignorelist = ["copyright"] 97 | 98 | [tool.ruff.lint.per-file-ignores] 99 | "docs/conf.py" = ["INP001"] # no __init__.py file 100 | "scrapyd/__main__.py" = ["T201"] # `print` found 101 | "scrapyd/interfaces.py" = ["N805"] # First argument of a method should be named `self` 102 | "{tests,integration_tests}/*" = [ 103 | "D", # docstring 104 | "S101", # assert 105 | "S106", # password 106 | "S113", # requests timeout 107 | "PLR2004", # magic value 108 | "ARG001", "ARG002", "ARG005", # mocks 109 | "PT009", "PT027", # Scrapyd mixes unittest with pytest 110 | ] 111 | -------------------------------------------------------------------------------- /scrapyd/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from scrapyd.config import Config 4 | from scrapyd.exceptions import ConfigError 5 | from scrapyd.utils import initialize_component 6 | 7 | __version__ = "1.5.0" 8 | version_info = tuple(__version__.split(".")[:3]) 9 | 10 | 11 | def get_application(config=None): 12 | if config is None: 13 | config = Config() 14 | try: 15 | return initialize_component(config, "application", "scrapyd.app.application") 16 | except ConfigError as e: 17 | sys.exit(str(e)) 18 | -------------------------------------------------------------------------------- /scrapyd/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from os.path import dirname, join 3 | 4 | from twisted.scripts import twistd 5 | 6 | import scrapyd 7 | 8 | 9 | class ServerOptions(twistd.ServerOptions): 10 | synopsis = "Usage: scrapyd [options]" 11 | longdesc = "Scrapyd is an application for deploying and running Scrapy spiders." 12 | 13 | def __init__(self): 14 | super().__init__() 15 | # main() always sets -n (--nodaemon) and -y (--python=). -y can be set only once. 
-n is okay to leave as a 16 | # no-op. Scrapyd's *_dir settings don't respect --rundir. 17 | self.longOpt = [opt for opt in self.longOpt if opt not in ("python=", "rundir=")] 18 | 19 | @property 20 | def subCommands(self): 21 | return [] # remove alternatives to running txapp.py 22 | 23 | def getUsage(self, width=None): 24 | return super().getUsage(width=width)[:-11] # remove "\nCommands:\n" 25 | 26 | 27 | def main(): 28 | if len(sys.argv) > 1 and "-v" in sys.argv[1:] or "--version" in sys.argv[1:]: 29 | print(f"Scrapyd {scrapyd.__version__}") 30 | else: 31 | sys.argv[1:1] = ["-n", "-y", join(dirname(scrapyd.__file__), "txapp.py")] 32 | twistd.app.run(twistd.runApp, ServerOptions) 33 | 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /scrapyd/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from twisted.application.internet import TCPServer, TimerService, UNIXServer 4 | from twisted.application.service import Application 5 | from twisted.logger import Logger 6 | from twisted.web import server 7 | 8 | from scrapyd.basicauth import wrap_resource 9 | from scrapyd.environ import Environment 10 | from scrapyd.interfaces import IEggStorage, IEnvironment, IJobStorage, IPoller, ISpiderScheduler 11 | from scrapyd.scheduler import SpiderScheduler 12 | from scrapyd.utils import initialize_component 13 | 14 | log = Logger() 15 | 16 | 17 | def application(config): 18 | app = Application("Scrapyd") 19 | bind_address = os.getenv("SCRAPYD_BIND_ADDRESS") or config.get("bind_address", "127.0.0.1") 20 | http_port = int(os.getenv("SCRAPYD_HTTP_PORT") or config.getint("http_port", "6800")) 21 | unix_socket_path = os.getenv("SCRAPYD_UNIX_SOCKET_PATH") or config.get("unix_socket_path", "") 22 | poll_interval = config.getfloat("poll_interval", 5) 23 | 24 | environment = Environment(config) 25 | scheduler = SpiderScheduler(config) 26 | poller = initialize_component(config, "poller", "scrapyd.poller.QueuePoller") 27 | jobstorage = initialize_component(config, "jobstorage", "scrapyd.jobstorage.MemoryJobStorage") 28 | eggstorage = initialize_component(config, "eggstorage", "scrapyd.eggstorage.FilesystemEggStorage") 29 | 30 | app.setComponent(IEnvironment, environment) 31 | app.setComponent(ISpiderScheduler, scheduler) 32 | app.setComponent(IPoller, poller) 33 | app.setComponent(IJobStorage, jobstorage) 34 | app.setComponent(IEggStorage, eggstorage) 35 | 36 | # launcher uses jobstorage in initializer, and uses poller and environment. 37 | launcher = initialize_component(config, "launcher", "scrapyd.launcher.Launcher", app) 38 | 39 | timer = TimerService(poll_interval, poller.poll) 40 | 41 | # webroot uses launcher, poller, scheduler and environment. 
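# Note: if both bind_address/http_port and unix_socket_path are configured, the webservice variable below is reassigned, so only the UNIX server is attached to the application.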
42 | webroot = initialize_component(config, "webroot", "scrapyd.website.Root", app) 43 | resource = server.Site(wrap_resource(webroot, config)) 44 | if bind_address and http_port: 45 | webservice = TCPServer(http_port, resource, interface=bind_address) 46 | log.info( 47 | "Scrapyd web console available at http://{bind_address}:{http_port}/", 48 | bind_address=bind_address, 49 | http_port=http_port, 50 | ) 51 | if unix_socket_path: 52 | unix_socket_path = os.path.abspath(unix_socket_path) 53 | webservice = UNIXServer(unix_socket_path, resource, mode=0o660) 54 | log.info( 55 | "Scrapyd web console available at http+unix://{unix_socket_path}", 56 | unix_socket_path=unix_socket_path, 57 | ) 58 | 59 | launcher.setServiceParent(app) 60 | timer.setServiceParent(app) 61 | webservice.setServiceParent(app) 62 | 63 | return app 64 | -------------------------------------------------------------------------------- /scrapyd/basicauth.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from twisted.cred import credentials, error 4 | from twisted.cred.checkers import ICredentialsChecker 5 | from twisted.cred.portal import IRealm, Portal 6 | from twisted.internet import defer 7 | from twisted.logger import Logger 8 | from twisted.web.guard import BasicCredentialFactory, HTTPAuthSessionWrapper 9 | from twisted.web.resource import IResource 10 | from zope.interface import implementer 11 | 12 | from scrapyd.exceptions import InvalidUsernameError 13 | 14 | log = Logger() 15 | 16 | 17 | # https://docs.twisted.org/en/stable/web/howto/web-in-60/http-auth.html 18 | @implementer(IRealm) 19 | class PublicHTMLRealm: 20 | def __init__(self, resource): 21 | self.resource = resource 22 | 23 | def requestAvatar(self, avatarId, mind, *interfaces): 24 | if IResource in interfaces: 25 | return (IResource, self.resource, lambda: None) 26 | raise NotImplementedError 27 | 28 | 29 | @implementer(ICredentialsChecker) 30 | class StringCredentialsChecker: 31 | credentialInterfaces = (credentials.IUsernamePassword,) 32 | 33 | def __init__(self, username, password): 34 | self.username = username.encode() 35 | self.password = password.encode() 36 | 37 | def requestAvatarId(self, credentials): 38 | if credentials.username == self.username and credentials.password == self.password: 39 | return defer.succeed(credentials.username) 40 | return defer.fail(error.UnauthorizedLogin()) 41 | 42 | 43 | def wrap_resource(resource, config): 44 | username = os.getenv("SCRAPYD_USERNAME") or config.get("username", "") 45 | password = os.getenv("SCRAPYD_PASSWORD") or config.get("password", "") 46 | # https://www.rfc-editor.org/rfc/rfc2617#section-2 47 | if ":" in username: 48 | raise InvalidUsernameError 49 | 50 | if username and password: 51 | log.info("Basic authentication enabled") 52 | return HTTPAuthSessionWrapper( 53 | Portal(PublicHTMLRealm(resource), [StringCredentialsChecker(username, password)]), 54 | [BasicCredentialFactory(b"Scrapyd")], 55 | ) 56 | 57 | log.info("Basic authentication disabled as either `username` or `password` is unset") 58 | return resource 59 | -------------------------------------------------------------------------------- /scrapyd/config.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os.path 3 | from configparser import ConfigParser, NoOptionError, NoSectionError 4 | from pkgutil import get_data 5 | 6 | from scrapy.utils.conf import closest_scrapy_cfg 7 | 8 | 9 | class Config: 10 | """A ConfigParser 
wrapper to support defaults when calling instance 11 | methods, and also tied to a single section""" 12 | 13 | SECTION = "scrapyd" 14 | 15 | def __init__(self, values=None, extra_sources=()): 16 | if values is None: 17 | self.cp = ConfigParser() 18 | self.cp.read_string(get_data(__package__, "default_scrapyd.conf").decode()) 19 | self.cp.read( 20 | [ 21 | "/etc/scrapyd/scrapyd.conf", 22 | "c:\\scrapyd\\scrapyd.conf", 23 | *sorted(glob.glob("/etc/scrapyd/conf.d/*")), 24 | "scrapyd.conf", 25 | os.path.expanduser("~/.scrapyd.conf"), 26 | closest_scrapy_cfg(), 27 | *extra_sources, 28 | ] 29 | ) 30 | else: 31 | self.cp = ConfigParser(values) 32 | self.cp.add_section(self.SECTION) 33 | 34 | def get(self, option, default=None): 35 | return self._get(self.cp.get, option, default) 36 | 37 | def getint(self, option, default=None): 38 | return self._get(self.cp.getint, option, default) 39 | 40 | def getfloat(self, option, default=None): 41 | return self._get(self.cp.getfloat, option, default) 42 | 43 | def getboolean(self, option, default=None): 44 | return self._get(self.cp.getboolean, option, default) 45 | 46 | def _get(self, method, option, default): 47 | try: 48 | return method(self.SECTION, option) 49 | except (NoSectionError, NoOptionError): 50 | if default is not None: 51 | return default 52 | raise 53 | 54 | def items(self, section, default=None): 55 | try: 56 | return self.cp.items(section) 57 | except NoSectionError: 58 | if default is not None: 59 | return default 60 | raise 61 | -------------------------------------------------------------------------------- /scrapyd/default_scrapyd.conf: -------------------------------------------------------------------------------- 1 | [scrapyd] 2 | # Application options 3 | application = scrapyd.app.application 4 | bind_address = 127.0.0.1 5 | http_port = 6800 6 | unix_socket_path = 7 | username = 8 | password = 9 | spiderqueue = scrapyd.spiderqueue.SqliteSpiderQueue 10 | 11 | # Poller options 12 | poller = scrapyd.poller.QueuePoller 13 | poll_interval = 5.0 14 | 15 | # Launcher options 16 | launcher = scrapyd.launcher.Launcher 17 | max_proc = 0 18 | max_proc_per_cpu = 4 19 | logs_dir = logs 20 | items_dir = 21 | jobs_to_keep = 5 22 | runner = scrapyd.runner 23 | 24 | # Web UI and API options 25 | webroot = scrapyd.website.Root 26 | prefix_header = x-forwarded-prefix 27 | debug = off 28 | 29 | # Egg storage options 30 | eggstorage = scrapyd.eggstorage.FilesystemEggStorage 31 | eggs_dir = eggs 32 | 33 | # Job storage options 34 | jobstorage = scrapyd.jobstorage.MemoryJobStorage 35 | finished_to_keep = 100 36 | 37 | # Directory options 38 | dbs_dir = dbs 39 | 40 | [services] 41 | schedule.json = scrapyd.webservice.Schedule 42 | cancel.json = scrapyd.webservice.Cancel 43 | status.json = scrapyd.webservice.Status 44 | addversion.json = scrapyd.webservice.AddVersion 45 | listprojects.json = scrapyd.webservice.ListProjects 46 | listversions.json = scrapyd.webservice.ListVersions 47 | listspiders.json = scrapyd.webservice.ListSpiders 48 | delproject.json = scrapyd.webservice.DeleteProject 49 | delversion.json = scrapyd.webservice.DeleteVersion 50 | listjobs.json = scrapyd.webservice.ListJobs 51 | daemonstatus.json = scrapyd.webservice.DaemonStatus 52 | -------------------------------------------------------------------------------- /scrapyd/eggstorage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | from glob import escape, glob 5 | 6 | from packaging.version import 
InvalidVersion, Version 7 | from twisted.python import filepath 8 | from zope.interface import implementer 9 | 10 | from scrapyd.exceptions import DirectoryTraversalError, EggNotFoundError, ProjectNotFoundError 11 | from scrapyd.interfaces import IEggStorage 12 | 13 | 14 | def sorted_versions(versions): 15 | try: 16 | return sorted(versions, key=Version) 17 | except InvalidVersion: 18 | return sorted(versions) 19 | 20 | 21 | @implementer(IEggStorage) 22 | class FilesystemEggStorage: 23 | def __init__(self, config): 24 | self.basedir = config.get("eggs_dir", "eggs") 25 | 26 | def put(self, eggfile, project, version): 27 | path = self._egg_path(project, version) 28 | 29 | directory = os.path.dirname(path) 30 | if not os.path.exists(directory): 31 | os.makedirs(directory) 32 | 33 | with open(path, "wb") as f: 34 | shutil.copyfileobj(eggfile, f) 35 | 36 | def get(self, project, version=None): 37 | if version is None: 38 | try: 39 | version = self.list(project)[-1] 40 | except IndexError: 41 | return None, None 42 | try: 43 | return version, open(self._egg_path(project, version), "rb") # noqa: SIM115 44 | except FileNotFoundError: 45 | return None, None 46 | 47 | def list(self, project): 48 | return sorted_versions( 49 | [os.path.splitext(os.path.basename(path))[0] for path in glob(self._get_path(escape(project), "*.egg"))] 50 | ) 51 | 52 | def list_projects(self): 53 | if os.path.exists(self.basedir): 54 | return [name for name in os.listdir(self.basedir) if os.path.isdir(os.path.join(self.basedir, name))] 55 | return [] 56 | 57 | def delete(self, project, version=None): 58 | if version is None: 59 | try: 60 | shutil.rmtree(self._get_path(project)) 61 | except FileNotFoundError as e: 62 | raise ProjectNotFoundError from e 63 | else: 64 | try: 65 | os.remove(self._egg_path(project, version)) 66 | if not self.list(project): # remove project if no versions left 67 | self.delete(project) 68 | except FileNotFoundError as e: 69 | raise EggNotFoundError from e 70 | 71 | def _egg_path(self, project, version): 72 | sanitized_version = re.sub(r"[^A-Za-z0-9_-]", "_", version) 73 | return self._get_path(project, f"{sanitized_version}.egg") 74 | 75 | def _get_path(self, project, *trusted): 76 | try: 77 | file = filepath.FilePath(self.basedir).child(project) 78 | except filepath.InsecurePath as e: 79 | raise DirectoryTraversalError(project) from e 80 | 81 | return os.path.join(file.path, *trusted) 82 | -------------------------------------------------------------------------------- /scrapyd/environ.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from contextlib import suppress 4 | from posixpath import join as urljoin 5 | from urllib.parse import urlsplit 6 | 7 | from w3lib.url import path_to_file_uri 8 | from zope.interface import implementer 9 | 10 | from scrapyd.interfaces import IEnvironment 11 | from scrapyd.utils import get_file_path, local_items 12 | 13 | 14 | @implementer(IEnvironment) 15 | class Environment: 16 | def __init__(self, config, initenv=os.environ): 17 | self.dbs_dir = config.get("dbs_dir", "dbs") 18 | self.logs_dir = config.get("logs_dir", "logs") 19 | self.items_dir = config.get("items_dir", "") 20 | self.jobs_to_keep = config.getint("jobs_to_keep", 5) 21 | self.settings = dict(config.items("settings", default=[])) 22 | self.initenv = initenv 23 | 24 | def get_settings(self, message): 25 | settings = {} 26 | if self.logs_dir: 27 | settings["LOG_FILE"] = self._prepare_file(message, self.logs_dir, "log") 28 | if 
self.items_dir: 29 | settings["FEEDS"] = json.dumps({self._get_feeds(message, "jl"): {"format": "jsonlines"}}) 30 | return settings 31 | 32 | def get_environment(self, message, slot): 33 | project = message["_project"] 34 | 35 | env = self.initenv.copy() 36 | env["SCRAPY_PROJECT"] = project 37 | # If the version is not provided, then the runner uses the default version, determined by egg storage. 38 | if "_version" in message: 39 | env["SCRAPYD_EGG_VERSION"] = message["_version"] 40 | # Scrapy discovers the same scrapy.cfg files as Scrapyd. So, this is only needed if users are adding [settings] 41 | # sections to Scrapyd configuration files (which Scrapy doesn't discover). This might lead to strange behavior 42 | # if an egg project and a [settings] project have the same name (unlikely). Preserved, since committed in 2010. 43 | if project in self.settings: 44 | env["SCRAPY_SETTINGS_MODULE"] = self.settings[project] 45 | 46 | return env 47 | 48 | def _get_feeds(self, message, extension): 49 | parsed = urlsplit(self.items_dir) 50 | 51 | if local_items(self.items_dir, parsed): 52 | # File URLs do not have query or fragment components. https://www.rfc-editor.org/rfc/rfc8089#section-2 53 | return path_to_file_uri(self._prepare_file(message, parsed.path, extension)) 54 | 55 | path = urljoin(parsed.path, message["_project"], message["_spider"], f"{message['_job']}.{extension}") 56 | return parsed._replace(path=path).geturl() 57 | 58 | def _prepare_file(self, message, directory, extension): 59 | file_path = get_file_path(directory, message["_project"], message["_spider"], message["_job"], extension) 60 | 61 | parent = file_path.dirname() # returns a str 62 | if not os.path.exists(parent): 63 | os.makedirs(parent) 64 | 65 | to_delete = sorted( 66 | (os.path.join(parent, name) for name in os.listdir(parent)), 67 | key=os.path.getmtime, 68 | )[: -self.jobs_to_keep] 69 | for path in to_delete: 70 | with suppress(OSError): 71 | os.remove(path) 72 | 73 | return file_path.path 74 | -------------------------------------------------------------------------------- /scrapyd/exceptions.py: -------------------------------------------------------------------------------- 1 | class ScrapydError(Exception): 2 | """Base class for exceptions from within this package""" 3 | 4 | 5 | class ConfigError(ScrapydError): 6 | """Raised if a configuration error prevents Scrapyd from starting""" 7 | 8 | 9 | class InvalidUsernameError(ConfigError): 10 | """Raised if the username contains a colon""" 11 | 12 | def __init__(self): 13 | super().__init__( 14 | "The `username` option contains illegal character ':'. Check and update the Scrapyd configuration file." 
15 | ) 16 | 17 | 18 | class BadEggError(ScrapydError): 19 | """Raised if the egg is invalid""" 20 | 21 | 22 | class DirectoryTraversalError(ScrapydError): 23 | """Raised if the resolved path is outside the expected directory""" 24 | 25 | 26 | class ProjectNotFoundError(ScrapydError): 27 | """Raised if a project isn't found in an IEggStorage implementation""" 28 | 29 | 30 | class EggNotFoundError(ScrapydError): 31 | """Raised if an egg isn't found in an IEggStorage implementation""" 32 | 33 | 34 | class RunnerError(ScrapydError): 35 | """Raised if the runner returns an error code""" 36 | -------------------------------------------------------------------------------- /scrapyd/interfaces.py: -------------------------------------------------------------------------------- 1 | from zope.interface import Attribute, Interface 2 | 3 | 4 | class IEggStorage(Interface): 5 | """ 6 | A component to store project eggs. 7 | """ 8 | 9 | def put(eggfile, project, version): 10 | """ 11 | Store the egg (a file object), which represents a ``version`` of the ``project``. 12 | """ 13 | 14 | def get(project, version=None): 15 | """ 16 | Return ``(version, file)`` for the egg matching the ``project`` and ``version``. 17 | 18 | If ``version`` is ``None``, the latest version and corresponding file are returned. 19 | 20 | If no egg is found, ``(None, None)`` is returned. 21 | 22 | .. tip:: Remember to close the ``file`` when done. 23 | """ 24 | 25 | def list(project): 26 | """ 27 | Return all versions of the ``project`` in order, with the latest version last. 28 | """ 29 | 30 | def list_projects(): 31 | """ 32 | Return all projects in storage. 33 | 34 | .. versionadded:: 1.3.0 35 | Move this logic into the interface and its implementations, to allow customization. 36 | """ 37 | 38 | def delete(project, version=None): 39 | """ 40 | Delete the egg matching the ``project`` and ``version``. Delete the ``project``, if no versions remains. 41 | """ 42 | 43 | 44 | class IPoller(Interface): 45 | """ 46 | A component that tracks capacity for new jobs, and starts jobs when ready. 47 | """ 48 | 49 | queues = Attribute( 50 | """ 51 | An object (like a ``dict``) with a ``__getitem__`` method that accepts a project's name and returns its 52 | :py:interface:`spider queue` of pending jobs. 53 | """ 54 | ) 55 | 56 | def poll(): 57 | """ 58 | Called periodically to start jobs if there's capacity. 59 | """ 60 | 61 | def next(): 62 | """ 63 | Return the next pending job. 64 | 65 | It should return a Deferred that will be fired when there's capacity, or already fired if there's capacity. 66 | 67 | The pending job is a ``dict`` containing at least the ``_project`` name, ``_spider`` name and ``_job`` ID. 68 | The job ID is unique, at least within the project. 69 | 70 | The pending job is later passed to :meth:`scrapyd.interfaces.IEnvironment.get_environment`. 71 | 72 | .. seealso:: :meth:`scrapyd.interfaces.ISpiderQueue.pop` 73 | """ 74 | 75 | def update_projects(): 76 | """ 77 | Called when projects may have changed, to refresh the available projects, including at initialization. 78 | """ 79 | 80 | 81 | class ISpiderQueue(Interface): 82 | """ 83 | A component to store pending jobs. 
84 | 85 | The ``dict`` keys used by the chosen ``ISpiderQueue`` implementation must match the chosen: 86 | 87 | - :ref:`launcher` service (which calls :meth:`scrapyd.interfaces.IPoller.next`) 88 | - :py:interface:`~scrapyd.interfaces.IEnvironment` implementation (see :meth:`scrapyd.interfaces.IPoller.next`) 89 | - :ref:`webservices` that schedule, cancel or list pending jobs 90 | """ 91 | 92 | def add(name, priority, **spider_args): 93 | """ 94 | Add a pending job, given the spider ``name``, crawl ``priority`` and keyword arguments, which might include the 95 | ``_job`` ID, egg ``_version`` and Scrapy ``settings`` depending on the implementation, with keyword arguments 96 | that are not recognized by the implementation being treated as spider arguments. 97 | 98 | .. versionchanged:: 1.3.0 99 | Add the ``priority`` parameter. 100 | """ 101 | 102 | def pop(): 103 | """ 104 | Pop the next pending job. The pending job is a ``dict`` containing the spider ``name``. Depending on the 105 | implementation, other keys might include the ``_job`` ID, egg ``_version`` and Scrapy ``settings``, with 106 | keyword arguments that are not recognized by the receiver being treated as spider arguments. 107 | """ 108 | 109 | def list(): 110 | """ 111 | Return the pending jobs. 112 | 113 | .. seealso:: :meth:`scrapyd.interfaces.ISpiderQueue.pop` 114 | """ 115 | 116 | def count(): 117 | """ 118 | Return the number of pending jobs. 119 | """ 120 | 121 | def remove(func): 122 | """ 123 | Remove pending jobs for which ``func(job)`` is true, and return the number of removed pending jobs. 124 | """ 125 | 126 | def clear(): 127 | """ 128 | Remove all pending jobs. 129 | """ 130 | 131 | 132 | class ISpiderScheduler(Interface): 133 | """ 134 | A component to schedule jobs. 135 | """ 136 | 137 | def schedule(project, spider_name, priority, **spider_args): 138 | """ 139 | Schedule a crawl. 140 | 141 | .. versionchanged:: 1.3.0 142 | Add the ``priority`` parameter. 143 | """ 144 | 145 | def list_projects(): 146 | """ 147 | Return all projects that can be scheduled. 148 | """ 149 | 150 | def update_projects(): 151 | """ 152 | Called when projects may have changed, to refresh the available projects, including at initialization. 153 | """ 154 | 155 | 156 | class IEnvironment(Interface): 157 | """ 158 | A component to generate the environment of jobs. 159 | 160 | The chosen ``IEnvironment`` implementation must match the chosen :ref:`launcher` service. 161 | """ 162 | 163 | def get_settings(message): 164 | """ 165 | Return the Scrapy settings to use for running the process. 166 | 167 | Depending on the chosen :ref:`launcher`, this would be one or more of ``LOG_FILE`` or ``FEEDS``. 168 | 169 | .. versionadded:: 1.4.2 170 | Support for overriding Scrapy settings via ``SCRAPY_`` environment variables was removed in Scrapy 2.8. 171 | 172 | :param message: the pending job received from the :meth:`scrapyd.interfaces.IPoller.next` method 173 | """ 174 | 175 | def get_environment(message, slot): 176 | """ 177 | Return the environment variables to use for running the process. 178 | 179 | Depending on the chosen :ref:`launcher`, this would be one or more of ``SCRAPY_PROJECT``, 180 | ``SCRAPYD_EGG_VERSION`` or ``SCRAPY_SETTINGS_MODULE``. 181 | 182 | :param message: the pending job received from the :meth:`scrapyd.interfaces.IPoller.next` method 183 | :param slot: the :ref:`launcher` slot for tracking the process 184 | """ 185 | 186 | 187 | class IJobStorage(Interface): 188 | """ 189 | A component to store finished jobs. 190 | 191 | ..
versionadded:: 1.3.0 192 | """ 193 | 194 | def add(job): 195 | """ 196 | Add a finished job in the storage. 197 | """ 198 | 199 | def list(): 200 | """ 201 | Return the finished jobs. 202 | 203 | .. seealso:: :meth:`scrapyd.interfaces.IJobStorage.__iter__` 204 | """ 205 | 206 | def __len__(): 207 | """ 208 | Return the number of finished jobs. 209 | """ 210 | 211 | def __iter__(): 212 | """ 213 | Iterate over the finished jobs in reverse order by ``end_time``. 214 | 215 | A job has the attributes ``project``, ``spider``, ``job``, ``start_time`` and ``end_time`` and may have the 216 | attributes ``args`` (``scrapy crawl`` CLI arguments) and ``env`` (environment variables). 217 | """ 218 | -------------------------------------------------------------------------------- /scrapyd/jobstorage.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. versionadded:: 1.3.0 3 | Job storage was previously in-memory only and managed by the launcher. 4 | """ 5 | 6 | from zope.interface import implementer 7 | 8 | from scrapyd import sqlite 9 | from scrapyd.interfaces import IJobStorage 10 | from scrapyd.launcher import ScrapyProcessProtocol 11 | 12 | 13 | @implementer(IJobStorage) 14 | class MemoryJobStorage: 15 | def __init__(self, config): 16 | self.jobs = [] 17 | self.finished_to_keep = config.getint("finished_to_keep", 100) 18 | 19 | def add(self, job): 20 | self.jobs.append(job) 21 | del self.jobs[: -self.finished_to_keep] # keep last x finished jobs 22 | 23 | def list(self): 24 | return list(self) 25 | 26 | def __len__(self): 27 | return len(self.jobs) 28 | 29 | def __iter__(self): 30 | yield from reversed(self.jobs) 31 | 32 | 33 | @implementer(IJobStorage) 34 | class SqliteJobStorage: 35 | def __init__(self, config): 36 | self.jobs = sqlite.initialize(sqlite.SqliteFinishedJobs, config, "jobs", "finished_jobs") 37 | self.finished_to_keep = config.getint("finished_to_keep", 100) 38 | 39 | def add(self, job): 40 | self.jobs.add(job) 41 | self.jobs.clear(self.finished_to_keep) 42 | 43 | def list(self): 44 | return list(self) 45 | 46 | def __len__(self): 47 | return len(self.jobs) 48 | 49 | def __iter__(self): 50 | for project, spider, jobid, start_time, end_time in self.jobs: 51 | job = ScrapyProcessProtocol(project, spider, jobid, env={}, args=[]) 52 | job.start_time = start_time 53 | job.end_time = end_time 54 | yield job 55 | -------------------------------------------------------------------------------- /scrapyd/launcher.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import multiprocessing 3 | import sys 4 | from itertools import chain 5 | 6 | from twisted.application.service import Service 7 | from twisted.internet import defer, error, protocol, reactor 8 | from twisted.logger import Logger 9 | 10 | from scrapyd import __version__ 11 | from scrapyd.interfaces import IEnvironment, IJobStorage, IPoller 12 | 13 | log = Logger() 14 | 15 | 16 | def get_crawl_args(message): 17 | """Return the command-line arguments to use for the scrapy crawl process 18 | that will be started for this message 19 | """ 20 | copied = message.copy() 21 | del copied["_project"] 22 | 23 | return [ 24 | copied.pop("_spider"), 25 | *chain.from_iterable(["-s", f"{key}={value}"] for key, value in copied.pop("settings", {}).items()), 26 | *chain.from_iterable(["-a", f"{key}={value}"] for key, value in copied.items()), # spider arguments 27 | ] 28 | 29 | 30 | class Launcher(Service): 31 | name = "launcher" 32 | 33 | def 
__init__(self, config, app): 34 | self.processes = {} 35 | self.finished = app.getComponent(IJobStorage) 36 | self.max_proc = self._get_max_proc(config) 37 | self.runner = config.get("runner", "scrapyd.runner") 38 | self.app = app 39 | 40 | def startService(self): 41 | log.info( 42 | "Scrapyd {version} started: max_proc={max_proc!r}, runner={runner!r}", 43 | version=__version__, 44 | max_proc=self.max_proc, 45 | runner=self.runner, 46 | log_system="Launcher", 47 | ) 48 | for slot in range(self.max_proc): 49 | self._get_message(slot) 50 | 51 | def _get_message(self, slot): 52 | poller = self.app.getComponent(IPoller) 53 | poller.next().addCallback(self._spawn_process, slot) 54 | log.debug("Process slot {slot} ready", slot=slot) 55 | 56 | def _spawn_process(self, message, slot): 57 | project = message["_project"] 58 | environment = self.app.getComponent(IEnvironment) 59 | message.setdefault("settings", {}) 60 | message["settings"].update(environment.get_settings(message)) 61 | 62 | env = environment.get_environment(message, slot) 63 | args = [sys.executable, "-m", self.runner, "crawl", *get_crawl_args(message)] 64 | 65 | process = ScrapyProcessProtocol(project, message["_spider"], message["_job"], env, args) 66 | process.deferred.addBoth(self._process_finished, slot) 67 | 68 | reactor.spawnProcess(process, sys.executable, args=args, env=env) 69 | self.processes[slot] = process 70 | log.debug("Process slot {slot} occupied", slot=slot) 71 | 72 | def _process_finished(self, _, slot): 73 | process = self.processes.pop(slot) 74 | process.end_time = datetime.datetime.now() 75 | self.finished.add(process) 76 | log.debug("Process slot {slot} vacated", slot=slot) 77 | 78 | self._get_message(slot) 79 | 80 | def _get_max_proc(self, config): 81 | max_proc = config.getint("max_proc", 0) 82 | if max_proc: 83 | return max_proc 84 | 85 | try: 86 | cpus = multiprocessing.cpu_count() 87 | except NotImplementedError: # Windows 17520a3 88 | cpus = 1 89 | return cpus * config.getint("max_proc_per_cpu", 4) 90 | 91 | 92 | # https://docs.twisted.org/en/stable/api/twisted.internet.protocol.ProcessProtocol.html 93 | class ScrapyProcessProtocol(protocol.ProcessProtocol): 94 | def __init__(self, project, spider, job, env, args): 95 | self.project = project 96 | self.spider = spider 97 | self.job = job 98 | self.pid = None 99 | self.start_time = datetime.datetime.now() 100 | self.end_time = None 101 | self.args = args 102 | self.env = env 103 | self.deferred = defer.Deferred() 104 | 105 | # For equality assertions in tests. 106 | def __eq__(self, other): 107 | return ( 108 | self.project == other.project 109 | and self.spider == other.spider 110 | and self.job == other.job 111 | and self.pid == other.pid 112 | and self.start_time == other.start_time 113 | and self.end_time == other.end_time 114 | and self.args == other.args 115 | and self.env == other.env 116 | ) 117 | 118 | # For error messages in tests. 
119 | def __repr__(self): 120 | return ( 121 | f"ScrapyProcessProtocol(project={self.project} spider={self.spider} job={self.job} pid={self.pid} " 122 | f"start_time={self.start_time} end_time={self.end_time} args={self.args} env={self.env})" 123 | ) 124 | 125 | def outReceived(self, data): 126 | log.info(data.rstrip(), log_system=f"Launcher,{self.pid}/stdout") 127 | 128 | def errReceived(self, data): 129 | log.error(data.rstrip(), log_system=f"Launcher,{self.pid}/stderr") 130 | 131 | def connectionMade(self): 132 | self.pid = self.transport.pid 133 | self.log("info", "Process started:") 134 | 135 | # https://docs.twisted.org/en/stable/core/howto/process.html#things-that-can-happen-to-your-processprotocol 136 | def processEnded(self, status): 137 | if isinstance(status.value, error.ProcessDone): 138 | self.log("info", "Process finished:") 139 | else: 140 | self.log("error", f"Process died: exitstatus={status.value.exitCode!r}") 141 | self.deferred.callback(self) 142 | 143 | def log(self, level, action): 144 | getattr(log, level)( 145 | "{action} project={project!r} spider={spider!r} job={job!r} pid={pid!r} args={args!r}", 146 | action=action, 147 | project=self.project, 148 | spider=self.spider, 149 | job=self.job, 150 | pid=self.pid, 151 | args=self.args, 152 | ) 153 | -------------------------------------------------------------------------------- /scrapyd/poller.py: -------------------------------------------------------------------------------- 1 | from twisted.internet.defer import DeferredQueue, inlineCallbacks, maybeDeferred 2 | from zope.interface import implementer 3 | 4 | from scrapyd.interfaces import IPoller 5 | from scrapyd.utils import get_spider_queues 6 | 7 | 8 | @implementer(IPoller) 9 | class QueuePoller: 10 | def __init__(self, config): 11 | self.config = config 12 | self.update_projects() 13 | self.dq = DeferredQueue() 14 | 15 | @inlineCallbacks 16 | def poll(self): 17 | for project, queue in self.queues.items(): 18 | while (yield maybeDeferred(queue.count)): 19 | # If the "waiting" backlog is empty (that is, if the maximum number of Scrapy processes are running): 20 | if not self.dq.waiting: 21 | return 22 | message = (yield maybeDeferred(queue.pop)).copy() 23 | # The message can be None if, for example, two Scrapyd instances share a spider queue database. 24 | if message is not None: 25 | message["_project"] = project 26 | message["_spider"] = message.pop("name") 27 | # Pop a dummy item from the "waiting" backlog. and fire the message's callbacks. 28 | self.dq.put(message) 29 | 30 | def next(self): 31 | """ 32 | Add a dummy item to the "waiting" backlog (based on Twisted's implementation of DeferredQueue). 33 | """ 34 | return self.dq.get() 35 | 36 | def update_projects(self): 37 | self.queues = get_spider_queues(self.config) 38 | -------------------------------------------------------------------------------- /scrapyd/runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | from contextlib import contextmanager 5 | 6 | import pkg_resources 7 | 8 | from scrapyd import Config 9 | from scrapyd.exceptions import BadEggError 10 | from scrapyd.utils import initialize_component 11 | 12 | 13 | def activate_egg(eggpath): 14 | """Activate a Scrapy egg file. This is meant to be used from egg runners 15 | to activate a Scrapy egg file. Don't use it from other code as it may 16 | leave unwanted side effects. 
17 | """ 18 | distributions = pkg_resources.find_distributions(eggpath) 19 | if isinstance(distributions, tuple): 20 | raise BadEggError 21 | 22 | try: 23 | distribution = next(distributions) 24 | except StopIteration: 25 | raise BadEggError from None 26 | 27 | distribution.activate() 28 | 29 | # setdefault() was added in https://github.com/scrapy/scrapyd/commit/0641a57. It's not clear why, since the egg 30 | # should control its settings module. That said, it is unlikely to already be set. 31 | os.environ.setdefault("SCRAPY_SETTINGS_MODULE", distribution.get_entry_info("scrapy", "settings").module_name) 32 | 33 | 34 | @contextmanager 35 | def project_environment(project): 36 | config = Config() 37 | eggstorage = initialize_component(config, "eggstorage", "scrapyd.eggstorage.FilesystemEggStorage") 38 | 39 | eggversion = os.environ.get("SCRAPYD_EGG_VERSION", None) 40 | sanitized_version, egg = eggstorage.get(project, eggversion) 41 | 42 | tmp = None 43 | # egg can be None if the project is not in egg storage: for example, if Scrapyd is invoked within a Scrapy project. 44 | if egg: 45 | try: 46 | if hasattr(egg, "name"): # for example, FileIO 47 | activate_egg(egg.name) 48 | else: # for example, BytesIO 49 | prefix = f"{project}-{sanitized_version}-" 50 | tmp = tempfile.NamedTemporaryFile(suffix=".egg", prefix=prefix, delete=False) 51 | shutil.copyfileobj(egg, tmp) 52 | tmp.close() 53 | activate_egg(tmp.name) 54 | finally: 55 | egg.close() 56 | 57 | try: 58 | yield 59 | finally: 60 | if tmp: 61 | os.remove(tmp.name) 62 | 63 | 64 | def main(): 65 | project = os.environ["SCRAPY_PROJECT"] 66 | with project_environment(project): 67 | from scrapy.cmdline import execute 68 | 69 | # This calls scrapy.utils.project.get_project_settings(). It uses SCRAPY_SETTINGS_MODULE if set. Otherwise, it 70 | # calls scrapy.utils.conf.init_env(), which reads Scrapy's configuration sources, looks for a project matching 71 | # SCRAPY_PROJECT in the [settings] section, and uses its value for SCRAPY_SETTINGS_MODULE. 
72 | # https://docs.scrapy.org/en/latest/topics/commands.html#configuration-settings 73 | execute() 74 | 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /scrapyd/scheduler.py: -------------------------------------------------------------------------------- 1 | from zope.interface import implementer 2 | 3 | from scrapyd.interfaces import ISpiderScheduler 4 | from scrapyd.utils import get_spider_queues 5 | 6 | 7 | @implementer(ISpiderScheduler) 8 | class SpiderScheduler: 9 | def __init__(self, config): 10 | self.config = config 11 | self.update_projects() 12 | 13 | def schedule(self, project, spider_name, priority=0.0, **spider_args): 14 | self.queues[project].add(spider_name, priority=priority, **spider_args) 15 | 16 | def list_projects(self): 17 | return list(self.queues) 18 | 19 | def update_projects(self): 20 | self.queues = get_spider_queues(self.config) 21 | -------------------------------------------------------------------------------- /scrapyd/spiderqueue.py: -------------------------------------------------------------------------------- 1 | from zope.interface import implementer 2 | 3 | from scrapyd import sqlite 4 | from scrapyd.interfaces import ISpiderQueue 5 | 6 | 7 | @implementer(ISpiderQueue) 8 | class SqliteSpiderQueue: 9 | def __init__(self, config, project, table="spider_queue"): 10 | self.q = sqlite.initialize(sqlite.JsonSqlitePriorityQueue, config, project, table) 11 | 12 | def add(self, name, priority=0.0, **spider_args): 13 | message = spider_args.copy() 14 | message["name"] = name 15 | self.q.put(message, priority=priority) 16 | 17 | def pop(self): 18 | return self.q.pop() 19 | 20 | def count(self): 21 | return len(self.q) 22 | 23 | def list(self): 24 | return [message for message, _ in self.q] 25 | 26 | def remove(self, func): 27 | return self.q.remove(func) 28 | 29 | def clear(self): 30 | self.q.clear() 31 | -------------------------------------------------------------------------------- /scrapyd/sqlite.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import os 4 | import sqlite3 5 | 6 | 7 | # The database argument is "jobs" (in SqliteJobStorage), or a project (in SqliteSpiderQueue) from get_spider_queues(), 8 | # which gets projects from get_project_list(), which gets projects from egg storage. We check for directory traversal 9 | # in egg storage, instead. 
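# For example, with the default dbs_dir of "dbs", initialize(JsonSqlitePriorityQueue, config, "myproject", "spider_queue") returns a queue backed by dbs/myproject.db ("myproject" is an illustrative project name).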
10 | def initialize(cls, config, database, table): 11 | dbs_dir = config.get("dbs_dir", "dbs") 12 | if dbs_dir == ":memory:": 13 | connection_string = dbs_dir 14 | else: 15 | if not os.path.exists(dbs_dir): 16 | os.makedirs(dbs_dir) 17 | connection_string = os.path.join(dbs_dir, f"{database}.db") 18 | 19 | return cls(connection_string, table) 20 | 21 | 22 | # https://docs.python.org/3/library/sqlite3.html#sqlite3-adapter-converter-recipes 23 | def adapt_datetime(val): 24 | return val.strftime("%Y-%m-%d %H:%M:%S.%f") 25 | 26 | 27 | def convert_datetime(val): 28 | return datetime.datetime.strptime(val.decode(), "%Y-%m-%d %H:%M:%S.%f") 29 | 30 | 31 | sqlite3.register_adapter(datetime.datetime, adapt_datetime) 32 | sqlite3.register_converter("datetime", convert_datetime) 33 | 34 | 35 | class SqliteMixin: 36 | def __init__(self, database, table): 37 | self.database = database or ":memory:" 38 | self.table = table 39 | # Regarding check_same_thread, see http://twistedmatrix.com/trac/ticket/4040 40 | self.conn = sqlite3.connect(self.database, check_same_thread=False) 41 | 42 | def __len__(self): 43 | return self.conn.execute(f"SELECT COUNT(*) FROM {self.table}").fetchone()[0] 44 | 45 | # SQLite JSON is enabled by default since 3.38.0 (2022-02-22), and JSONB is available since 3.45.0 (2024-01-15). 46 | # https://sqlite.org/json1.html 47 | def encode(self, obj): 48 | return sqlite3.Binary(json.dumps(obj).encode("ascii")) 49 | 50 | def decode(self, obj): 51 | return json.loads(bytes(obj).decode("ascii")) 52 | 53 | 54 | class JsonSqlitePriorityQueue(SqliteMixin): 55 | """ 56 | SQLite priority queue. It relies on SQLite concurrency support for providing atomic inter-process operations. 57 | 58 | .. versionadded:: 1.0.0 59 | """ 60 | 61 | def __init__(self, database=None, table="queue"): 62 | super().__init__(database, table) 63 | 64 | self.conn.execute( 65 | f"CREATE TABLE IF NOT EXISTS {table} (id integer PRIMARY KEY, priority real key, message blob)" 66 | ) 67 | 68 | def put(self, message, priority=0.0): 69 | self.conn.execute( 70 | f"INSERT INTO {self.table} (priority, message) VALUES (?, ?)", 71 | (priority, self.encode(message)), 72 | ) 73 | self.conn.commit() 74 | 75 | def pop(self): 76 | row = self.conn.execute(f"SELECT id, message FROM {self.table} ORDER BY priority DESC LIMIT 1").fetchone() 77 | if row is None: 78 | return None 79 | _id, message = row 80 | 81 | # If a row vanished, try again. 82 | if not self.conn.execute(f"DELETE FROM {self.table} WHERE id = ?", (_id,)).rowcount: 83 | self.conn.rollback() 84 | return self.pop() 85 | 86 | self.conn.commit() 87 | return self.decode(message) 88 | 89 | def remove(self, func): 90 | deleted = 0 91 | for _id, message in self.conn.execute(f"SELECT id, message FROM {self.table}"): 92 | if func(self.decode(message)): 93 | # If a row vanished, try again. 94 | if not self.conn.execute(f"DELETE FROM {self.table} WHERE id = ?", (_id,)).rowcount: 95 | self.conn.rollback() 96 | return self.remove(func) 97 | deleted += 1 98 | 99 | self.conn.commit() 100 | return deleted 101 | 102 | def clear(self): 103 | self.conn.execute(f"DELETE FROM {self.table}") 104 | self.conn.commit() 105 | 106 | def __iter__(self): 107 | return ( 108 | (self.decode(message), priority) 109 | for message, priority in self.conn.execute( 110 | f"SELECT message, priority FROM {self.table} ORDER BY priority DESC" 111 | ) 112 | ) 113 | 114 | 115 | class SqliteFinishedJobs(SqliteMixin): 116 | """ 117 | SQLite finished jobs. 118 | 119 | .. 
versionadded:: 1.3.0 120 | Job storage was previously in-memory only. 121 | """ 122 | 123 | def __init__(self, database=None, table="finished_jobs"): 124 | super().__init__(database, table) 125 | 126 | self.conn.execute( 127 | f"CREATE TABLE IF NOT EXISTS {table} " 128 | "(id integer PRIMARY KEY, project text, spider text, job text, start_time datetime, end_time datetime)" 129 | ) 130 | 131 | def add(self, job): 132 | self.conn.execute( 133 | f"INSERT INTO {self.table} (project, spider, job, start_time, end_time) VALUES (?, ?, ?, ?, ?)", 134 | (job.project, job.spider, job.job, job.start_time, job.end_time), 135 | ) 136 | self.conn.commit() 137 | 138 | def clear(self, finished_to_keep=None): 139 | where = "" 140 | if finished_to_keep: 141 | limit = len(self) - finished_to_keep 142 | if limit <= 0: 143 | return # nothing to delete 144 | where = f"WHERE id <= (SELECT max(id) FROM (SELECT id FROM {self.table} ORDER BY end_time LIMIT {limit}))" 145 | 146 | self.conn.execute(f"DELETE FROM {self.table} {where}") 147 | self.conn.commit() 148 | 149 | def __iter__(self): 150 | return ( 151 | ( 152 | project, 153 | spider, 154 | job, 155 | datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S.%f"), 156 | datetime.datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S.%f"), 157 | ) 158 | for project, spider, job, start_time, end_time in self.conn.execute( 159 | f"SELECT project, spider, job, start_time, end_time FROM {self.table} ORDER BY end_time DESC" 160 | ) 161 | ) 162 | -------------------------------------------------------------------------------- /scrapyd/txapp.py: -------------------------------------------------------------------------------- 1 | # this file is used to start scrapyd with twistd -y 2 | from scrapyd import get_application 3 | 4 | application = get_application() 5 | -------------------------------------------------------------------------------- /scrapyd/utils.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | from scrapy.utils.misc import load_object 4 | from twisted.python import filepath 5 | 6 | from scrapyd.exceptions import DirectoryTraversalError 7 | 8 | 9 | def initialize_component(config, setting, default, *args): 10 | path = config.get(setting, default) 11 | cls = load_object(path) 12 | return cls(config, *args) 13 | 14 | 15 | def local_items(items_dir, parsed): 16 | return items_dir and parsed.scheme.lower() in ("", "file", os.path.splitdrive(items_dir)[0].rstrip(":").lower()) 17 | 18 | 19 | def get_file_path(directory, project, spider, job, extension): 20 | # https://docs.twisted.org/en/stable/api/twisted.python.filepath.FilePath.html 21 | try: 22 | return filepath.FilePath(directory).child(project).child(spider).child(f"{job}.{extension}") 23 | except filepath.InsecurePath as e: 24 | raise DirectoryTraversalError(os.path.join(project, spider, f"{job}.{extension}")) from e 25 | 26 | 27 | def get_spider_queues(config): 28 | """Return a dict of Spider Queues keyed by project name""" 29 | spiderqueue_cls = load_object(config.get("spiderqueue", "scrapyd.spiderqueue.SqliteSpiderQueue")) 30 | return {project: spiderqueue_cls(config, project) for project in get_project_list(config)} 31 | 32 | 33 | def get_project_list(config): 34 | """Get list of projects by inspecting the eggs storage and the ones defined in 35 | the scrapy.cfg [settings] section 36 | """ 37 | 38 | # The poller and scheduler use this function (via get_spider_queues), and they aren't initialized with the 39 | # application. 
So, we need to re-initialize this component here. 40 | eggstorage = initialize_component(config, "eggstorage", "scrapyd.eggstorage.FilesystemEggStorage") 41 | return eggstorage.list_projects() + [project for project, _ in config.items("settings", default=[])] 42 | -------------------------------------------------------------------------------- /scrapyd/webservice.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | import json 5 | import os 6 | import sys 7 | import traceback 8 | import uuid 9 | import zipfile 10 | from collections import defaultdict 11 | from io import BytesIO 12 | from subprocess import PIPE, Popen 13 | from typing import ClassVar 14 | 15 | from twisted.logger import Logger 16 | from twisted.web import error, http, resource 17 | 18 | from scrapyd.exceptions import EggNotFoundError, ProjectNotFoundError, RunnerError 19 | 20 | log = Logger() 21 | 22 | 23 | def param( 24 | decoded: str, 25 | *, 26 | dest: str | None = None, 27 | required: bool = True, 28 | default=None, 29 | multiple: bool = False, 30 | type=str, # noqa: A002 like Click 31 | ): 32 | encoded = decoded.encode() 33 | if dest is None: 34 | dest = decoded 35 | 36 | def decorator(func): 37 | @functools.wraps(func) 38 | def wrapper(self, txrequest, *args, **kwargs): 39 | default_value = default() if callable(default) else default 40 | 41 | if encoded not in txrequest.args: 42 | if required: 43 | raise error.Error(code=http.OK, message=b"'%b' parameter is required" % encoded) 44 | 45 | value = default_value 46 | else: 47 | values = (value.decode() if type is str else type(value) for value in txrequest.args.pop(encoded)) 48 | try: 49 | value = list(values) if multiple else next(values) 50 | except (UnicodeDecodeError, ValueError) as e: 51 | raise error.Error(code=http.OK, message=b"%b is invalid: %b" % (encoded, str(e).encode())) from e 52 | 53 | kwargs[dest] = value 54 | 55 | return func(self, txrequest, *args, **kwargs) 56 | 57 | return wrapper 58 | 59 | return decorator 60 | 61 | 62 | class SpiderList: 63 | cache: ClassVar = defaultdict(dict) 64 | 65 | def get(self, project, version, *, runner): 66 | """Return the ``scrapy list`` output for the project and version, using a cache if possible.""" 67 | try: 68 | return self.cache[project][version] 69 | except KeyError: 70 | return self.set(project, version, runner=runner) 71 | 72 | def set(self, project, version, *, runner): 73 | """Calculate, cache and return the ``scrapy list`` output for the project and version, bypassing the cache.""" 74 | 75 | env = os.environ.copy() 76 | env["PYTHONIOENCODING"] = "UTF-8" 77 | env["SCRAPY_PROJECT"] = project 78 | # If the version is not provided, then the runner uses the default version, determined by egg storage. 79 | if version: 80 | env["SCRAPYD_EGG_VERSION"] = version 81 | 82 | args = [sys.executable, "-m", runner, "list", "-s", "LOG_STDOUT=0"] 83 | process = Popen(args, stdout=PIPE, stderr=PIPE, env=env) 84 | stdout, stderr = process.communicate() 85 | if process.returncode: 86 | raise RunnerError((stderr or stdout or b"").decode()) 87 | 88 | spiders = stdout.decode().splitlines() 89 | 90 | # Note: If the cache is empty, that doesn't mean that this is the project's only version; it simply means that 91 | # this is the first version called in this Scrapyd process. 
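# For example (illustrative names), after set("myproject", "r2", runner=...) the cache maps
# project -> version -> spider names: {"myproject": {"r2": ["spider1", "spider2"]}}.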
92 | 93 | # Evict the return value of version=None calls, since we can't determine whether this version is the default 94 | # version (in which case we would overwrite it) or not (in which case we would keep it). 95 | self.cache[project].pop(None, None) 96 | self.cache[project][version] = spiders 97 | return spiders 98 | 99 | def delete(self, project, version=None): 100 | if version is None: 101 | self.cache.pop(project, None) 102 | else: 103 | # Evict the return value of version=None calls, since we can't determine whether this version is the 104 | # default version (in which case we would pop it) or not (in which case we would keep it). 105 | self.cache[project].pop(None, None) 106 | self.cache[project].pop(version, None) 107 | 108 | 109 | spider_list = SpiderList() 110 | 111 | 112 | # WebserviceResource 113 | class WsResource(resource.Resource): 114 | """ 115 | .. versionchanged:: 1.1.0 116 | Add ``node_name`` to the response in all subclasses. 117 | """ 118 | 119 | json_encoder = json.JSONEncoder() 120 | 121 | def __init__(self, root): 122 | super().__init__() 123 | self.root = root 124 | 125 | def render(self, txrequest): 126 | try: 127 | data = super().render(txrequest) 128 | except Exception as e: # noqa: BLE001 129 | log.failure("") 130 | 131 | if isinstance(e, error.Error): 132 | txrequest.setResponseCode(int(e.status)) 133 | 134 | if self.root.debug: 135 | return traceback.format_exc().encode() 136 | 137 | message = e.message.decode() if isinstance(e, error.Error) else f"{type(e).__name__}: {e}" 138 | data = {"status": "error", "message": message} 139 | else: 140 | if data is not None: 141 | data["status"] = "ok" 142 | 143 | if data is None: # render_OPTIONS 144 | content = b"" 145 | else: 146 | data["node_name"] = self.root.node_name 147 | content = self.json_encoder.encode(data).encode() + b"\n" 148 | txrequest.setHeader("Content-Type", "application/json") 149 | 150 | # https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS#preflighted_requests 151 | txrequest.setHeader("Access-Control-Allow-Origin", "*") 152 | txrequest.setHeader("Access-Control-Allow-Methods", self.methods) 153 | txrequest.setHeader("Access-Control-Allow-Headers", "X-Requested-With") 154 | txrequest.setHeader("Content-Length", str(len(content))) 155 | return content 156 | 157 | def render_OPTIONS(self, txrequest): 158 | txrequest.setHeader("Allow", self.methods) 159 | txrequest.setResponseCode(http.NO_CONTENT) 160 | 161 | @functools.cached_property 162 | def methods(self): 163 | methods = ["OPTIONS", "HEAD"] 164 | if hasattr(self, "render_GET"): 165 | methods.append("GET") 166 | if hasattr(self, "render_POST"): 167 | methods.append("POST") 168 | return ", ".join(methods) 169 | 170 | 171 | class DaemonStatus(WsResource): 172 | """ 173 | .. versionadded:: 1.2.0 174 | """ 175 | 176 | def render_GET(self, txrequest): 177 | return { 178 | "pending": sum(queue.count() for queue in self.root.poller.queues.values()), 179 | "running": len(self.root.launcher.processes), 180 | "finished": len(self.root.launcher.finished), 181 | } 182 | 183 | 184 | class Schedule(WsResource): 185 | """ 186 | .. versionchanged:: 1.2.0 187 | Add ``_version`` and ``jobid`` parameters. 188 | .. versionchanged:: 1.3.0 189 | Add ``priority`` parameter. 
190 | """ 191 | 192 | @param("project") 193 | @param("spider") 194 | @param("_version", dest="version", required=False, default=None) 195 | # See https://github.com/scrapy/scrapyd/pull/215 196 | @param("jobid", required=False, default=lambda: uuid.uuid1().hex) 197 | @param("priority", required=False, default=0, type=float) 198 | @param("setting", required=False, default=list, multiple=True) 199 | def render_POST(self, txrequest, project, spider, version, jobid, priority, setting): 200 | if project not in self.root.poller.queues: 201 | raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) 202 | 203 | if version and self.root.eggstorage.get(project, version) == (None, None): 204 | raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode()) 205 | 206 | spiders = spider_list.get(project, version, runner=self.root.runner) 207 | if spider not in spiders: 208 | raise error.Error(code=http.OK, message=b"spider '%b' not found" % spider.encode()) 209 | 210 | args = {key.decode(): values[0].decode() for key, values in txrequest.args.items()} 211 | if version is not None: 212 | args["_version"] = version 213 | 214 | self.root.scheduler.schedule( 215 | project, 216 | spider, 217 | priority=priority, 218 | settings=dict(s.split("=", 1) for s in setting), 219 | _job=jobid, 220 | **args, 221 | ) 222 | return {"jobid": jobid} 223 | 224 | 225 | class Cancel(WsResource): 226 | @param("project") 227 | @param("job") 228 | # Instead of os.name, use sys.platform, which disambiguates Cygwin, which implements SIGINT not SIGBREAK. 229 | # https://cygwin.com/cygwin-ug-net/kill.html 230 | # https://github.com/scrapy/scrapy/blob/06f9c28/tests/test_crawler.py#L886 231 | @param("signal", required=False, default="INT" if sys.platform != "win32" else "BREAK") 232 | def render_POST(self, txrequest, project, job, signal): 233 | if project not in self.root.poller.queues: 234 | raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) 235 | 236 | prevstate = None 237 | 238 | if self.root.poller.queues[project].remove(lambda message: message["_job"] == job): 239 | prevstate = "pending" 240 | 241 | for process in self.root.launcher.processes.values(): 242 | if process.project == project and process.job == job: 243 | process.transport.signalProcess(signal) 244 | prevstate = "running" 245 | 246 | return {"prevstate": prevstate} 247 | 248 | 249 | class AddVersion(WsResource): 250 | @param("project") 251 | @param("version") 252 | @param("egg", type=bytes) 253 | def render_POST(self, txrequest, project, version, egg): 254 | if not zipfile.is_zipfile(BytesIO(egg)): 255 | raise error.Error( 256 | code=http.OK, message=b"egg is not a ZIP file (if using curl, use egg=@path not egg=path)" 257 | ) 258 | 259 | self.root.eggstorage.put(BytesIO(egg), project, version) 260 | self.root.update_projects() 261 | 262 | spiders = spider_list.set(project, version, runner=self.root.runner) 263 | return {"project": project, "version": version, "spiders": len(spiders)} 264 | 265 | 266 | class ListProjects(WsResource): 267 | def render_GET(self, txrequest): 268 | return {"projects": self.root.scheduler.list_projects()} 269 | 270 | 271 | class ListVersions(WsResource): 272 | @param("project") 273 | def render_GET(self, txrequest, project): 274 | return {"versions": self.root.eggstorage.list(project)} 275 | 276 | 277 | class ListSpiders(WsResource): 278 | """ 279 | .. versionchanged:: 1.2.0 280 | Add ``_version`` parameter. 
281 | """ 282 | 283 | @param("project") 284 | @param("_version", dest="version", required=False, default=None) 285 | def render_GET(self, txrequest, project, version): 286 | if project not in self.root.poller.queues: 287 | raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) 288 | 289 | if version and self.root.eggstorage.get(project, version) == (None, None): 290 | raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode()) 291 | 292 | return {"spiders": spider_list.get(project, version, runner=self.root.runner)} 293 | 294 | 295 | class Status(WsResource): 296 | """ 297 | .. versionadded:: 1.5.0 298 | """ 299 | 300 | @param("job") 301 | @param("project", required=False) 302 | def render_GET(self, txrequest, job, project): 303 | queues = self.root.poller.queues 304 | if project is not None and project not in queues: 305 | raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) 306 | 307 | result = {"currstate": None} 308 | 309 | for finished in self.root.launcher.finished: 310 | if (project is None or finished.project == project) and finished.job == job: 311 | result["currstate"] = "finished" 312 | return result 313 | 314 | for process in self.root.launcher.processes.values(): 315 | if (project is None or process.project == project) and process.job == job: 316 | result["currstate"] = "running" 317 | return result 318 | 319 | for queue_name in queues if project is None else [project]: 320 | for message in queues[queue_name].list(): 321 | if message["_job"] == job: 322 | result["currstate"] = "pending" 323 | return result 324 | 325 | return result 326 | 327 | 328 | class ListJobs(WsResource): 329 | """ 330 | .. versionchanged:: 1.1.0 331 | Add ``start_time`` to running jobs in the response. 332 | .. versionchanged:: 1.2.0 333 | Add ``pid`` to running jobs in the response. 334 | .. versionchanged:: 1.3.0 335 | The ``project`` parameter is optional. Add ``project`` to all jobs in the response. 336 | .. versionchanged:: 1.4.0 337 | Add ``log_url`` and ``items_url`` to finished jobs in the response. 338 | .. versionchanged:: 1.5.0 339 | Add ``version``, ``settings`` and ``args`` to pending jobs in the response. 
340 | """ 341 | 342 | @param("project", required=False) 343 | def render_GET(self, txrequest, project): 344 | queues = self.root.poller.queues 345 | if project is not None and project not in queues: 346 | raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) 347 | 348 | return { 349 | "pending": [ 350 | { 351 | "id": message["_job"], 352 | "project": queue_name, 353 | "spider": message["name"], 354 | "version": message.get("_version"), 355 | "settings": message.get("settings", {}), 356 | "args": {k: v for k, v in message.items() if k not in ("name", "_job", "_version", "settings")}, 357 | } 358 | for queue_name in (queues if project is None else [project]) 359 | for message in queues[queue_name].list() 360 | ], 361 | "running": [ 362 | { 363 | "id": process.job, 364 | "project": process.project, 365 | "spider": process.spider, 366 | "pid": process.pid, 367 | "start_time": str(process.start_time), 368 | "log_url": self.root.get_log_url(process), 369 | "items_url": self.root.get_item_url(process), 370 | } 371 | for process in self.root.launcher.processes.values() 372 | if project is None or process.project == project 373 | ], 374 | "finished": [ 375 | { 376 | "id": finished.job, 377 | "project": finished.project, 378 | "spider": finished.spider, 379 | "start_time": str(finished.start_time), 380 | "end_time": str(finished.end_time), 381 | "log_url": self.root.get_log_url(finished), 382 | "items_url": self.root.get_item_url(finished), 383 | } 384 | for finished in self.root.launcher.finished 385 | if project is None or finished.project == project 386 | ], 387 | } 388 | 389 | 390 | class DeleteProject(WsResource): 391 | @param("project") 392 | def render_POST(self, txrequest, project): 393 | self._delete_version(project) 394 | spider_list.delete(project) 395 | return {} 396 | 397 | def _delete_version(self, project, version=None): 398 | try: 399 | self.root.eggstorage.delete(project, version) 400 | except ProjectNotFoundError as e: 401 | raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) from e 402 | except EggNotFoundError as e: 403 | raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode()) from e 404 | else: 405 | self.root.update_projects() 406 | 407 | 408 | class DeleteVersion(DeleteProject): 409 | @param("project") 410 | @param("version") 411 | def render_POST(self, txrequest, project, version): 412 | self._delete_version(project, version) 413 | spider_list.delete(project, version) 414 | return {} 415 | -------------------------------------------------------------------------------- /scrapyd/website.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import socket 3 | from datetime import datetime, timedelta 4 | from html import escape 5 | from textwrap import dedent, indent 6 | from urllib.parse import quote, urlsplit 7 | 8 | from scrapy.utils.misc import load_object 9 | from twisted.application.service import IServiceCollection 10 | from twisted.python import filepath 11 | from twisted.web import resource, static 12 | 13 | from scrapyd.interfaces import IEggStorage, IPoller, ISpiderScheduler 14 | from scrapyd.utils import local_items 15 | 16 | 17 | # Use local DirectoryLister class. 18 | class File(static.File): 19 | def directoryListing(self): 20 | path = self.path 21 | names = self.listNames() 22 | return DirectoryLister(path, names, self.contentTypes, self.contentEncodings, self.defaultType) 23 | 24 | 25 | # Add "Last modified" column. 
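# DirectoryLister below overrides the template and linePattern of twisted.web.static.DirectoryLister so that
# each row gains a "Last modified" cell, filled from the modification time in _getFilesAndDirectories().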
26 | class DirectoryLister(static.DirectoryLister): 27 | template = """<html> 28 | <head> 29 | <title>%(header)s</title> 30 | <style> 31 | .even-dir { background-color: #efe0ef } 32 | .even { background-color: #eee } 33 | .odd-dir {background-color: #f0d0ef } 34 | .odd { background-color: #dedede } 35 | .icon { text-align: center } 36 | .listing { 37 | margin-left: auto; 38 | margin-right: auto; 39 | width: 50%%; 40 | padding: 0.1em; 41 | } 42 | 
43 | body { border: 0; padding: 0; margin: 0; background-color: #efefef; } 44 | h1 {padding: 0.1em; background-color: #777; color: white; border-bottom: thin white dashed;} 45 | 46 | </style> 47 | </head> 48 | 49 | <body> 50 | <h1>%(header)s</h1> 51 | 
52 | <table> 53 | <thead> 54 | <tr> 55 | <th>Filename</th> 56 | <th>Size</th> 57 | <th>Last modified</th> 58 | <th>Content type</th> 59 | <th>Content encoding</th> 60 | </tr> 61 | </thead> 62 | <tbody> 63 | %(tableContent)s 64 | </tbody> 65 | </table>
66 | 67 | 68 | 69 | """ 70 | 71 | linePattern = """ 72 | %(text)s 73 | %(size)s 74 | %(modified)s 75 | %(type)s 76 | %(encoding)s 77 | 78 | """ 79 | 80 | def _getFilesAndDirectories(self, directory): 81 | files = [] 82 | dirs = [] 83 | 84 | for path in directory: 85 | if isinstance(path, bytes): 86 | path = path.decode() # noqa: PLW2901 from Twisted 87 | 88 | url = quote(path, "/") 89 | escaped_path = escape(path) 90 | child_path = filepath.FilePath(self.path).child(path) 91 | modified = datetime.fromtimestamp(child_path.getModificationTime()).strftime("%Y-%m-%d %H:%M") # NEW 92 | 93 | if child_path.isdir(): 94 | dirs.append( 95 | { 96 | "text": escaped_path + "/", 97 | "href": url + "/", 98 | "size": "", 99 | "type": "[Directory]", 100 | "encoding": "", 101 | "modified": modified, # NEW 102 | } 103 | ) 104 | else: 105 | mimetype, encoding = static.getTypeAndEncoding( 106 | path, self.contentTypes, self.contentEncodings, self.defaultType 107 | ) 108 | try: 109 | size = child_path.getsize() 110 | except OSError: 111 | continue 112 | files.append( 113 | { 114 | "text": escaped_path, 115 | "href": url, 116 | "type": f"[{mimetype}]", 117 | "encoding": (encoding and f"[{encoding}]" or ""), 118 | "size": static.formatFileSize(size), 119 | "modified": modified, # NEW 120 | } 121 | ) 122 | return dirs, files 123 | 124 | 125 | def _get_file_url(base, directory, job, extension): 126 | if os.path.exists(os.path.join(directory, job.project, job.spider, f"{job.job}.{extension}")): 127 | return f"/{base}/{job.project}/{job.spider}/{job.job}.{extension}" 128 | return None 129 | 130 | 131 | class Root(resource.Resource): 132 | def __init__(self, config, app): 133 | super().__init__() 134 | 135 | self.app = app 136 | self.logs_dir = config.get("logs_dir", "logs") 137 | self.items_dir = config.get("items_dir", "") 138 | self.debug = config.getboolean("debug", False) 139 | self.runner = config.get("runner", "scrapyd.runner") 140 | self.prefix_header = config.get("prefix_header", "x-forwarded-prefix") 141 | self.local_items = local_items(self.items_dir, urlsplit(self.items_dir)) 142 | self.node_name = config.get("node_name", socket.gethostname()) 143 | 144 | if self.logs_dir: 145 | self.putChild(b"logs", File(self.logs_dir, "text/plain")) 146 | if self.local_items: 147 | self.putChild(b"items", File(self.items_dir, "text/plain")) 148 | 149 | for service_name, service_path in config.items("services", default=[]): 150 | if service_path: 151 | service_cls = load_object(service_path) 152 | self.putChild(service_name.encode(), service_cls(self)) 153 | 154 | # Add web UI last, since its behavior can depend on others' presence. 
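# For example, the Jobs page adds a Cancel column only if a cancel.json webservice was registered above,
# and an Items column only if items_dir is a local path.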
155 | self.putChild(b"", Home(self)) 156 | self.putChild(b"jobs", Jobs(self)) 157 | 158 | def update_projects(self): 159 | self.poller.update_projects() 160 | self.scheduler.update_projects() 161 | 162 | def get_log_url(self, job): 163 | return _get_file_url("logs", self.logs_dir, job, "log") 164 | 165 | def get_item_url(self, job): 166 | if self.local_items: 167 | return _get_file_url("items", self.items_dir, job, "jl") 168 | return None 169 | 170 | @property 171 | def launcher(self): 172 | return IServiceCollection(self.app, self.app).getServiceNamed("launcher") 173 | 174 | @property 175 | def scheduler(self): 176 | return self.app.getComponent(ISpiderScheduler) 177 | 178 | @property 179 | def eggstorage(self): 180 | return self.app.getComponent(IEggStorage) 181 | 182 | @property 183 | def poller(self): 184 | return self.app.getComponent(IPoller) 185 | 186 | 187 | class PrefixHeaderMixin: 188 | def get_base_path(self, txrequest): 189 | return txrequest.getHeader(self.root.prefix_header) or "" 190 | 191 | 192 | class Home(PrefixHeaderMixin, resource.Resource): 193 | def __init__(self, root): 194 | super().__init__() 195 | self.root = root 196 | 197 | def prepare_projects(self): 198 | if projects := self.root.scheduler.list_projects(): 199 | lis = "\n".join(f"
  • {escape(project_name)}
  • " for project_name in sorted(projects)) 200 | return f"

    Scrapy projects:

    \n
      \n{indent(lis, ' ')}\n
    " 201 | return "

    No Scrapy projects yet.

    " 202 | 203 | def render_GET(self, txrequest): 204 | base_path = self.get_base_path(txrequest) 205 | 206 | content = dedent( 207 | f"""\ 208 | 209 | 210 | 211 | 212 | 213 | Scrapyd 214 | 217 | 218 | 219 |

    Scrapyd

    220 | 221 | 227 | 228 | {indent(self.prepare_projects(), " ")} 229 | 230 |

    231 | This web UI is for monitoring only. To upload projects and schedule crawls, use the API. 232 | For example, using curl: 233 |

    234 | 235 |

    236 | curl http://localhost:6800/schedule.json -d project=default -d spider=somespider 237 |

    238 | 239 |

    240 | See the Scrapyd documentation for details. 241 |

    242 | 243 | 244 | """ 245 | ) 246 | content = content.encode() 247 | 248 | txrequest.setHeader("Content-Type", "text/html; charset=utf-8") 249 | txrequest.setHeader("Content-Length", str(len(content))) 250 | return content 251 | 252 | 253 | def no_microseconds(timelike): 254 | # microsecond for datetime, microseconds for timedelta. 255 | ms = timelike.microsecond if hasattr(timelike, "microsecond") else timelike.microseconds 256 | return timelike - timedelta(microseconds=ms) 257 | 258 | 259 | class Jobs(PrefixHeaderMixin, resource.Resource): 260 | def __init__(self, root): 261 | super().__init__() 262 | self.root = root 263 | 264 | self.headers = [ 265 | "Project", 266 | "Spider", 267 | "Job", 268 | "PID", 269 | "Start", 270 | "Runtime", 271 | "Finish", 272 | "Log", 273 | ] 274 | # Hide the Items column if items_dir isn't local. 275 | if self.root.local_items: 276 | self.headers.append("Items") 277 | # Hide the Cancel column if no cancel.json webservice. 278 | if b"cancel.json" in self.root.children: 279 | self.headers.append("Cancel") 280 | 281 | def cancel_button(self, project, job): 282 | return dedent( 283 | f""" 284 |
    285 | 286 | 287 | 288 |
    289 | """ 290 | ) 291 | 292 | def html_log_url(self, job): 293 | if url := self.root.get_log_url(job): 294 | return f'Log' 295 | return None 296 | 297 | def html_item_url(self, job): 298 | if url := self.root.get_item_url(job): 299 | return f'Items' 300 | return None 301 | 302 | def prepare_headers(self): 303 | ths = "\n".join(f"{header}" for header in self.headers) 304 | return f"\n{indent(ths, ' ')}\n" 305 | 306 | def prepare_row(self, row): 307 | tds = "\n".join(f"{'' if row.get(header) is None else row[header]}" for header in self.headers) 308 | return f"\n{indent(tds, ' ')}\n" 309 | 310 | def prepare_pending(self): 311 | return "\n".join( 312 | self.prepare_row( 313 | { 314 | "Project": escape(project), 315 | "Spider": escape(message["name"]), 316 | "Job": escape(message["_job"]), 317 | "Cancel": self.cancel_button(project, message["_job"]), 318 | } 319 | ) 320 | for project, queue in self.root.poller.queues.items() 321 | for message in queue.list() 322 | ) 323 | 324 | def prepare_running(self): 325 | return "\n".join( 326 | self.prepare_row( 327 | { 328 | "Project": escape(process.project), 329 | "Spider": escape(process.spider), 330 | "Job": escape(process.job), 331 | "PID": process.pid, 332 | "Start": no_microseconds(process.start_time), 333 | "Runtime": no_microseconds(datetime.now() - process.start_time), 334 | "Log": self.html_log_url(process), 335 | "Items": self.html_item_url(process), 336 | "Cancel": self.cancel_button(process.project, process.job), 337 | } 338 | ) 339 | for process in self.root.launcher.processes.values() 340 | ) 341 | 342 | def prepare_finished(self): 343 | return "\n".join( 344 | self.prepare_row( 345 | { 346 | "Project": escape(job.project), 347 | "Spider": escape(job.spider), 348 | "Job": escape(job.job), 349 | "Start": no_microseconds(job.start_time), 350 | "Runtime": no_microseconds(job.end_time - job.start_time), 351 | "Finish": no_microseconds(job.end_time), 352 | "Log": self.html_log_url(job), 353 | "Items": self.html_item_url(job), 354 | } 355 | ) 356 | for job in self.root.launcher.finished 357 | ) 358 | 359 | def render_GET(self, txrequest): 360 | self.base_path = self.get_base_path(txrequest) 361 | 362 | content = dedent( 363 | f"""\ 364 | 365 | 366 | 367 | 368 | 369 | Scrapyd 370 | 378 | 379 | 380 |

    Jobs

    381 |

    Go up

    382 | 383 | 384 | {indent(self.prepare_headers(), " ")} 385 | 386 | 387 | 388 | 389 | 390 | {indent(self.prepare_pending(), " ")} 391 | 392 | 393 | 394 | 395 | 396 | {indent(self.prepare_running(), " ")} 397 | 398 | 399 | 400 | 401 | 402 | {indent(self.prepare_finished(), " ")} 403 | 404 |
    Pending
    Running
    Finished
    405 | 406 | 407 | """ 408 | ).encode() 409 | 410 | txrequest.setHeader("Content-Type", "text/html; charset=utf-8") 411 | txrequest.setHeader("Content-Length", str(len(content))) 412 | return content 413 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import io 3 | import os.path 4 | import pkgutil 5 | 6 | from twisted.logger import eventAsText 7 | 8 | from scrapyd.launcher import ScrapyProcessProtocol 9 | 10 | 11 | def touch(path): 12 | path.parent.mkdir(parents=True) 13 | path.touch() 14 | 15 | 16 | def get_egg_data(basename): 17 | return pkgutil.get_data("tests", f"fixtures/{basename}.egg") 18 | 19 | 20 | def has_settings(): 21 | return os.path.exists("scrapy.cfg") 22 | 23 | 24 | def root_add_version(root, project, version, basename): 25 | root.eggstorage.put(io.BytesIO(get_egg_data(basename)), project, version) 26 | 27 | 28 | def get_message(captured): 29 | return eventAsText(captured[0]).split(" ", 1)[1] 30 | 31 | 32 | def get_finished_job(project="p1", spider="s1", job="j1", start_time=None, end_time=None): 33 | if start_time is None: 34 | start_time = datetime.datetime.now() 35 | if end_time is None: 36 | end_time = datetime.datetime.now() 37 | process = ScrapyProcessProtocol(project, spider, job, env={}, args=[]) 38 | process.start_time = start_time 39 | process.end_time = end_time 40 | return process 41 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import shutil 3 | 4 | import pytest 5 | from twisted.web import http 6 | from twisted.web.http import Request 7 | from twisted.web.test.requesthelper import DummyChannel 8 | 9 | from scrapyd import Config 10 | from scrapyd.app import application 11 | from scrapyd.interfaces import IEnvironment 12 | from scrapyd.webservice import spider_list 13 | from scrapyd.website import Root 14 | from tests import root_add_version 15 | 16 | BASEDIR = os.path.abspath(os.path.dirname(__file__)) 17 | 18 | 19 | @pytest.fixture(autouse=True) 20 | def _clear_spider_list_cache(): 21 | spider_list.cache.clear() 22 | 23 | 24 | @pytest.fixture() 25 | def txrequest(): 26 | http_channel = http.HTTPChannel() 27 | http_channel.makeConnection(DummyChannel.TCP()) 28 | return Request(http_channel) 29 | 30 | 31 | # Use this fixture when testing the Scrapyd web UI or API or writing configuration files. 
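# It changes the working directory to a fresh tmp_path, so scrapy.cfg and the directories Scrapyd creates
# relative to the working directory (such as dbs, eggs and logs) stay out of the repository checkout.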
32 | @pytest.fixture() 33 | def chdir(monkeypatch, tmp_path): 34 | monkeypatch.chdir(tmp_path) 35 | return tmp_path 36 | 37 | 38 | @pytest.fixture( 39 | params=[ 40 | None, 41 | (("items_dir", "items"), ("jobstorage", "scrapyd.jobstorage.SqliteJobStorage")), 42 | ], 43 | ids=["default", "custom"], 44 | ) 45 | def config(request, chdir): 46 | if request.param: 47 | shutil.copytree(os.path.join(BASEDIR, "fixtures", "filesystem"), chdir, dirs_exist_ok=True) 48 | config = Config() 49 | if request.param: 50 | for key, value in request.param: 51 | config.cp.set(Config.SECTION, key, value) 52 | return config 53 | 54 | 55 | @pytest.fixture() 56 | def app(config): 57 | return application(config) 58 | 59 | 60 | @pytest.fixture() 61 | def environ(app): 62 | return app.getComponent(IEnvironment) 63 | 64 | 65 | @pytest.fixture() 66 | def root(config, app): 67 | return Root(config, app) 68 | 69 | 70 | @pytest.fixture() 71 | def root_with_egg(root): 72 | root_add_version(root, "quotesbot", "0.1", "quotesbot") 73 | root.update_projects() 74 | return root 75 | -------------------------------------------------------------------------------- /tests/fixtures/entrypoint_missing.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/entrypoint_missing.egg -------------------------------------------------------------------------------- /tests/fixtures/filesystem/localproject/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/filesystem/localproject/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/filesystem/localproject/settings.py: -------------------------------------------------------------------------------- 1 | BOT_NAME = "localproject" 2 | SPIDER_MODULES = ["localproject.spiders"] 3 | NEWSPIDER_MODULE = "localproject.spiders" 4 | ROBOTSTXT_OBEY = True 5 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" 6 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" 7 | FEED_EXPORT_ENCODING = "utf-8" 8 | -------------------------------------------------------------------------------- /tests/fixtures/filesystem/localproject/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /tests/fixtures/filesystem/localproject/spiders/example.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class ExampleSpider(scrapy.Spider): 5 | name = "example" 6 | 7 | def start_requests(self): 8 | pass 9 | -------------------------------------------------------------------------------- /tests/fixtures/filesystem/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | localproject = localproject.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = localproject 12 | -------------------------------------------------------------------------------- /tests/fixtures/mybot.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/mybot.egg -------------------------------------------------------------------------------- /tests/fixtures/mybot2.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/mybot2.egg -------------------------------------------------------------------------------- /tests/fixtures/quotesbot.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/quotesbot.egg -------------------------------------------------------------------------------- /tests/fixtures/settings_asyncioreactor.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/settings_asyncioreactor.egg -------------------------------------------------------------------------------- /tests/fixtures/settings_log_stdout.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/settings_log_stdout.egg -------------------------------------------------------------------------------- /tests/fixtures/settings_raise.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/settings_raise.egg -------------------------------------------------------------------------------- /tests/fixtures/spiders_utf8.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/scrapyd/8f354fa794a61a4ab6c4000b9452cb55cd3a25f5/tests/fixtures/spiders_utf8.egg -------------------------------------------------------------------------------- /tests/mockapp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | from twisted.application import app 5 | from twisted.internet import reactor 6 | from twisted.python import log 7 | 8 | from scrapyd import Config 9 | from scrapyd.app import application 10 | 11 | if __name__ == 
"__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("http_port") 14 | parser.add_argument("--username") 15 | parser.add_argument("--password") 16 | args = parser.parse_args() 17 | 18 | config = Config() 19 | config.cp.set(Config.SECTION, "http_port", args.http_port) 20 | if args.username and args.password: 21 | config.cp.set(Config.SECTION, "username", args.username) 22 | config.cp.set(Config.SECTION, "password", args.password) 23 | 24 | log.startLogging(sys.stdout) 25 | 26 | app.startApplication(application(config=config), save=False) 27 | 28 | reactor.run() 29 | -------------------------------------------------------------------------------- /tests/mockserver.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import re 3 | import socket 4 | import sys 5 | from subprocess import PIPE, Popen 6 | from urllib.parse import urljoin 7 | 8 | BASEDIR = os.path.abspath(os.path.dirname(__file__)) 9 | 10 | 11 | def get_ephemeral_port(): 12 | # Somehow getting random high port doesn't work on pypy 13 | if re.search("PyPy", sys.version): 14 | return str(9112) 15 | s = socket.socket() 16 | s.bind(("", 0)) 17 | return str(s.getsockname()[1]) 18 | 19 | 20 | class MockScrapydServer: 21 | def __init__(self, username=None, password=None): 22 | self.username = username 23 | self.password = password 24 | 25 | def __enter__(self): 26 | self.http_port = get_ephemeral_port() 27 | command = [sys.executable, os.path.join(BASEDIR, "mockapp.py"), self.http_port] 28 | if self.username and self.password: 29 | command.extend([f"--username={self.username}", f"--password={self.password}"]) 30 | 31 | self.process = Popen(command, stdout=PIPE) 32 | 33 | # The loop is expected to run 3 times. 34 | # 2001-02-03 04:05:06-0000 [-] Log opened. 
35 | # 2001-02-03 04:05:06-0000 [-] Basic authentication disabled as either `username` or `password` is unset 36 | # 2001-02-03 04:05:06-0000 [-] Scrapyd web console available at http://127.0.0.1:53532/ 37 | self.head = [] 38 | for _ in range(10): 39 | line = self.process.stdout.readline() 40 | self.head.append(line) 41 | if address := re.search("available at (.+/)", line.decode()): 42 | self.url = address.group(1) 43 | break 44 | 45 | return self 46 | 47 | def __exit__(self, exc_type, exc_value, traceback): 48 | self.process.terminate() 49 | self.stdout, _ = self.process.communicate() 50 | self.stdout = b"".join(self.head) + self.stdout 51 | 52 | def urljoin(self, path): 53 | return urljoin(self.url, path) 54 | 55 | 56 | if __name__ == "__main__": 57 | with MockScrapydServer() as server: 58 | while True: 59 | pass 60 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | from configparser import NoOptionError, NoSectionError 2 | 3 | import pytest 4 | 5 | from scrapyd import get_application 6 | from scrapyd.app import application 7 | from scrapyd.config import Config 8 | from scrapyd.exceptions import InvalidUsernameError 9 | 10 | 11 | def test_items_no_section(): 12 | with pytest.raises(NoSectionError): 13 | Config().items("nonexistent") 14 | 15 | 16 | def test_get_no_section(): 17 | with pytest.raises(NoOptionError): 18 | Config().get("nonexistent") 19 | 20 | 21 | def test_get_no_option(): 22 | config = Config() 23 | config.cp.set("scrapyd", "http_port", "8000") 24 | 25 | with pytest.raises(NoOptionError): 26 | config.get("nonexistent") 27 | 28 | 29 | def test_closest_scrapy_cfg(monkeypatch, tmp_path): 30 | monkeypatch.chdir(tmp_path) 31 | (tmp_path / "scrapy.cfg").write_text("[scrapyd]\nhttp_port = 1234") 32 | 33 | assert Config().getint("http_port") == 1234 34 | 35 | 36 | def test_invalid_username(): 37 | config = Config() 38 | config.cp.set("scrapyd", "username", "invalid:") 39 | 40 | with pytest.raises(InvalidUsernameError) as exc: 41 | application(config) 42 | 43 | assert ( 44 | str(exc.value) 45 | == "The `username` option contains illegal character ':'. Check and update the Scrapyd configuration file." 46 | ) 47 | 48 | 49 | def test_invalid_username_sys(): 50 | config = Config() 51 | config.cp.set("scrapyd", "username", "invalid:") 52 | 53 | with pytest.raises(SystemExit) as exc: 54 | get_application(config) 55 | 56 | assert ( 57 | str(exc.value) 58 | == "The `username` option contains illegal character ':'. Check and update the Scrapyd configuration file." 
59 | ) 60 | -------------------------------------------------------------------------------- /tests/test_eggstorage.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os.path 3 | from contextlib import closing 4 | 5 | import pytest 6 | from zope.interface import implementer 7 | from zope.interface.verify import verifyObject 8 | 9 | from scrapyd.app import application 10 | from scrapyd.config import Config 11 | from scrapyd.eggstorage import FilesystemEggStorage, sorted_versions 12 | from scrapyd.exceptions import DirectoryTraversalError, EggNotFoundError, ProjectNotFoundError 13 | from scrapyd.interfaces import IEggStorage 14 | 15 | 16 | @implementer(IEggStorage) 17 | class MockEggStorage: 18 | def __init__(self, config): 19 | self.config = config 20 | 21 | def put(self, eggfile, project, version): 22 | pass 23 | 24 | def get(self, project, version=None): 25 | pass 26 | 27 | def list(self, project): 28 | pass 29 | 30 | def list_projects(self): 31 | return ["hello_world"] 32 | 33 | def delete(self, project, version=None): 34 | pass 35 | 36 | 37 | @pytest.fixture() 38 | def eggstorage(tmpdir): 39 | return FilesystemEggStorage(Config(values={"eggs_dir": tmpdir})) 40 | 41 | 42 | @pytest.mark.parametrize( 43 | ("versions", "expected"), 44 | [ 45 | # letter 46 | (["zzz", "b", "ddd", "a", "x"], ["a", "b", "ddd", "x", "zzz"]), 47 | # number 48 | (["10", "1", "9"], ["1", "9", "10"]), 49 | # "r" number 50 | (["r10", "r1", "r9"], ["r1", "r10", "r9"]), 51 | # version 52 | (["2.11", "2.01", "2.9"], ["2.01", "2.9", "2.11"]), 53 | # number and letter 54 | (["123456789", "b3b8fd2"], ["123456789", "b3b8fd2"]), 55 | ], 56 | ) 57 | def test_sorted_versions(versions, expected): 58 | assert sorted_versions(versions) == expected 59 | 60 | 61 | def test_config(chdir): 62 | config = Config() 63 | config.cp.set("scrapyd", "eggstorage", "tests.test_eggstorage.MockEggStorage") 64 | 65 | app = application(config) 66 | eggstorage = app.getComponent(IEggStorage) 67 | 68 | assert isinstance(eggstorage, MockEggStorage) 69 | assert eggstorage.list_projects() == ["hello_world"] 70 | 71 | 72 | def test_interface(eggstorage): 73 | verifyObject(IEggStorage, eggstorage) 74 | 75 | 76 | def test_put_secure(eggstorage): 77 | with pytest.raises(DirectoryTraversalError) as exc: 78 | eggstorage.put(io.BytesIO(b"data"), "../p", "v") # version is sanitized 79 | 80 | assert str(exc.value) == "../p" 81 | 82 | 83 | def test_get_secure(eggstorage): 84 | with pytest.raises(DirectoryTraversalError) as exc: 85 | eggstorage.get("../p", "v") # version is sanitized 86 | 87 | assert str(exc.value) == "../p" 88 | 89 | 90 | def test_list_secure_join(eggstorage): 91 | with pytest.raises(DirectoryTraversalError) as exc: 92 | eggstorage.list("../p") 93 | 94 | assert str(exc.value) == "../p" 95 | 96 | 97 | def test_list_secure_glob(eggstorage): 98 | eggstorage.put(io.BytesIO(b"data"), "mybot", "01") 99 | 100 | assert eggstorage.list("*") == [] # ["01"] if * weren't escaped 101 | 102 | 103 | def test_delete_secure(eggstorage): 104 | with pytest.raises(DirectoryTraversalError) as exc: 105 | eggstorage.delete("../p", "v") # version is sanitized 106 | 107 | assert str(exc.value) == "../p" 108 | 109 | 110 | @pytest.mark.parametrize( 111 | ("version", "expected"), 112 | [ 113 | (None, (None, None)), 114 | ("nonexistent", (None, None)), 115 | ("01", (None, None)), 116 | ], 117 | ) 118 | def test_get_empty(eggstorage, version, expected): 119 | assert eggstorage.get("mybot", version) == expected 120 | 
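# get() with version=None is expected to return the highest version according to sorted_versions()
# ("03" among the eggs added below); an explicit version returns that exact egg, if present.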
121 | 122 | @pytest.mark.parametrize( 123 | ("version", "expected"), 124 | [ 125 | (None, ("03", b"egg03")), 126 | ("nonexistent", (None, None)), 127 | ("01", ("01", b"egg01")), 128 | ], 129 | ) 130 | def test_get_many(eggstorage, version, expected): 131 | eggstorage.put(io.BytesIO(b"egg01"), "mybot", "01") 132 | eggstorage.put(io.BytesIO(b"egg03"), "mybot", "03") 133 | eggstorage.put(io.BytesIO(b"egg02"), "mybot", "02") 134 | 135 | version, data = eggstorage.get("mybot", version) 136 | if data is not None: 137 | with closing(data): 138 | data = data.read() 139 | 140 | assert (version, data) == expected 141 | 142 | 143 | @pytest.mark.parametrize( 144 | ("versions", "expected"), 145 | [(["ddd", "abc", "bcaa"], ["abc", "bcaa", "ddd"]), (["9", "2", "200", "3", "4"], ["2", "3", "4", "9", "200"])], 146 | ) 147 | def test_list(eggstorage, versions, expected): 148 | assert eggstorage.list("mybot") == [] 149 | 150 | for version in versions: 151 | eggstorage.put(io.BytesIO(b"egg01"), "mybot", version) 152 | 153 | assert eggstorage.list("mybot") == expected 154 | 155 | 156 | def test_list_glob(eggstorage): 157 | directory = os.path.join(eggstorage.basedir, "mybot") 158 | os.makedirs(directory) 159 | with open(os.path.join(directory, "other"), "wb") as f: 160 | f.write(b"") 161 | 162 | assert eggstorage.list("mybot") == [] # "other" without "*.egg" glob 163 | 164 | 165 | def test_list_projects(eggstorage): 166 | with open(os.path.join(eggstorage.basedir, "other"), "wb") as f: 167 | f.write(b"") 168 | 169 | assert eggstorage.list_projects() == [] # "other" without isdir() filter 170 | 171 | eggstorage.put(io.BytesIO(b"egg01"), "mybot", "01") 172 | 173 | assert eggstorage.list_projects() == ["mybot"] 174 | 175 | 176 | def test_delete_project(eggstorage): 177 | eggstorage.put(io.BytesIO(b"egg01"), "mybot", "01") 178 | eggstorage.put(io.BytesIO(b"egg03"), "mybot", "03") 179 | eggstorage.put(io.BytesIO(b"egg02"), "mybot", "02") 180 | 181 | assert eggstorage.list("mybot") == ["01", "02", "03"] 182 | 183 | eggstorage.delete("mybot") 184 | 185 | assert eggstorage.list("mybot") == [] 186 | 187 | 188 | def test_delete_vesrion(eggstorage): 189 | eggstorage.put(io.BytesIO(b"egg01"), "mybot", "01") 190 | eggstorage.put(io.BytesIO(b"egg03"), "mybot", "03") 191 | eggstorage.put(io.BytesIO(b"egg02"), "mybot", "02") 192 | 193 | assert eggstorage.list("mybot") == ["01", "02", "03"] 194 | 195 | eggstorage.delete("mybot", "02") 196 | 197 | assert eggstorage.list("mybot") == ["01", "03"] 198 | 199 | eggstorage.delete("mybot", "03") 200 | 201 | assert eggstorage.list("mybot") == ["01"] 202 | 203 | eggstorage.delete("mybot", "01") 204 | 205 | assert eggstorage.list("mybot") == [] 206 | assert not os.path.exists(os.path.join(eggstorage.basedir, "mybot")) 207 | 208 | 209 | def test_delete_nonexistent_project(eggstorage): 210 | with pytest.raises(ProjectNotFoundError): 211 | eggstorage.delete("mybot") 212 | 213 | 214 | def test_delete_nonexistent_version(eggstorage): 215 | with pytest.raises(EggNotFoundError): 216 | eggstorage.delete("mybot", "01") 217 | 218 | eggstorage.put(io.BytesIO(b"egg01"), "mybot", "01") 219 | 220 | with pytest.raises(EggNotFoundError): 221 | eggstorage.delete("mybot", "02") 222 | -------------------------------------------------------------------------------- /tests/test_environ.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | from unittest.mock import patch 5 | 6 | import pytest 7 | from zope.interface.verify import 
verifyObject 8 | 9 | from scrapyd.config import Config 10 | from scrapyd.environ import Environment 11 | from scrapyd.exceptions import DirectoryTraversalError 12 | from scrapyd.interfaces import IEnvironment 13 | from tests import has_settings 14 | 15 | 16 | def test_interface(environ): 17 | verifyObject(IEnvironment, environ) 18 | 19 | 20 | def test_get_settings(environ): 21 | settings = environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"}) 22 | 23 | assert re.search(r"^\S+j1\.log$", settings["LOG_FILE"]) 24 | 25 | if environ.items_dir: 26 | feeds = json.loads(settings.pop("FEEDS")) 27 | path, value = feeds.popitem() 28 | 29 | assert list(settings) == ["LOG_FILE"] 30 | assert feeds == {} 31 | assert re.search(r"^file:///\S+j1\.jl$", path) 32 | assert value == {"format": "jsonlines"} 33 | 34 | 35 | @pytest.mark.parametrize( 36 | ("items_dir", "pattern"), 37 | [ 38 | ( 39 | "https://host.example/path?query=value#fragment", 40 | r"https://host\.example/path/p1/s1/j1\.jl\?query=value#fragment", 41 | ), 42 | ( 43 | "https://host.example/path/", 44 | "https://host.example/path/p1/s1/j1.jl", # no double slashes 45 | ), 46 | ( 47 | "file:/root.dir/path?ignored#ignored", 48 | r"file:///([A-Z]:/)?root\.dir/path/p1/s1/j1\.jl", 49 | ), 50 | ( 51 | "file://hostname/root.dir/path?ignored#ignored", 52 | r"file:///([A-Z]:/)?root.dir/path/p1/s1/j1.jl", 53 | ), 54 | ( 55 | "file:///root.dir/path?ignored#ignored", 56 | r"file:///([A-Z]:/)?root.dir/path/p1/s1/j1.jl", 57 | ), 58 | ], 59 | ) 60 | @patch("os.listdir", lambda _: []) 61 | @patch("os.makedirs", lambda _: _) 62 | def test_get_settings_url(items_dir, pattern): 63 | config = Config(values={"logs_dir": "", "items_dir": items_dir}) 64 | environ = Environment(config, initenv={}) 65 | 66 | settings = environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"}) 67 | feeds = json.loads(settings.pop("FEEDS")) 68 | path, value = feeds.popitem() 69 | 70 | assert settings == {} 71 | assert feeds == {} 72 | assert re.search(pattern, path) 73 | assert value == {"format": "jsonlines"} 74 | 75 | 76 | @pytest.mark.parametrize("values", [{"items_dir": "../items"}, {"logs_dir": "../logs"}]) 77 | @pytest.mark.parametrize(("key", "value"), [("_project", "../p"), ("_spider", "../s"), ("_job", "../j")]) 78 | def test_get_settings_secure(values, key, value): 79 | config = Config(values=values) 80 | environ = Environment(config, initenv={}) 81 | 82 | with pytest.raises(DirectoryTraversalError) as exc: 83 | environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1", key: value}) 84 | 85 | assert str(exc.value) == ( 86 | f"{value if key == '_project' else 'p1'}{os.sep}" 87 | f"{value if key == '_spider' else 's1'}{os.sep}" 88 | f"{value if key == '_job' else 'j1'}.log" 89 | ) 90 | 91 | 92 | def test_jobs_to_keep(chdir): 93 | config = Config(values={"jobs_to_keep": "2"}) 94 | environ = Environment(config, initenv={}) 95 | directory = chdir / "logs" / "p1" / "s1" 96 | 97 | assert not directory.exists() 98 | 99 | environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"}) 100 | 101 | assert directory.exists() 102 | 103 | (directory / "j1.a").touch() 104 | (directory / "j2.b").touch() 105 | os.utime(directory / "j1.a", (1000000000, 1000000000)) 106 | os.utime(directory / "j2.b", (1000000000, 1000000000)) 107 | (directory / "j3.c").touch() 108 | (directory / "j4.d").touch() 109 | 110 | environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"}) 111 | 112 | assert not (directory / "j1.a").exists() 113 | assert not 
(directory / "j2.b").exists() 114 | 115 | 116 | @pytest.mark.parametrize( 117 | ("message", "run_only_if_has_settings"), 118 | [ 119 | ({"_project": "mybot"}, False), 120 | ({"_project": "mybot", "_version": "v1"}, False), 121 | ({"_project": "localproject"}, True), 122 | ], 123 | ) 124 | def test_get_environment(monkeypatch, environ, message, run_only_if_has_settings): 125 | if run_only_if_has_settings and not has_settings(): 126 | pytest.skip("[settings] section is not set") 127 | 128 | monkeypatch.setenv("CUSTOM", "value") 129 | env = environ.get_environment(message, 3) 130 | 131 | assert env["CUSTOM"] == "value" 132 | assert env["SCRAPY_PROJECT"] == message["_project"] 133 | 134 | if "_version" in message: 135 | assert env["SCRAPYD_EGG_VERSION"] == "v1" 136 | else: 137 | assert "SCRAPYD_EGG_VERSION" not in env 138 | 139 | if run_only_if_has_settings: 140 | assert env["SCRAPY_SETTINGS_MODULE"] == "localproject.settings" 141 | else: 142 | assert "SCRAPY_SETTINGS_MODULE" not in env 143 | -------------------------------------------------------------------------------- /tests/test_interfaces.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from twisted.cred.checkers import ICredentialsChecker 3 | from twisted.cred.portal import IRealm 4 | from zope.interface.verify import verifyClass 5 | 6 | from scrapyd.basicauth import PublicHTMLRealm, StringCredentialsChecker 7 | from scrapyd.eggstorage import FilesystemEggStorage 8 | from scrapyd.environ import Environment 9 | from scrapyd.interfaces import IEggStorage, IEnvironment, IJobStorage, IPoller, ISpiderQueue, ISpiderScheduler 10 | from scrapyd.jobstorage import MemoryJobStorage, SqliteJobStorage 11 | from scrapyd.poller import QueuePoller 12 | from scrapyd.scheduler import SpiderScheduler 13 | from scrapyd.spiderqueue import SqliteSpiderQueue 14 | 15 | 16 | @pytest.mark.parametrize( 17 | ("cls", "interface"), 18 | [ 19 | (PublicHTMLRealm, IRealm), 20 | (StringCredentialsChecker, ICredentialsChecker), 21 | (FilesystemEggStorage, IEggStorage), 22 | (Environment, IEnvironment), 23 | (MemoryJobStorage, IJobStorage), 24 | (SqliteJobStorage, IJobStorage), 25 | (QueuePoller, IPoller), 26 | (SpiderScheduler, ISpiderScheduler), 27 | (SqliteSpiderQueue, ISpiderQueue), 28 | ], 29 | ) 30 | def test_interface(cls, interface): 31 | verifyClass(interface, cls) 32 | -------------------------------------------------------------------------------- /tests/test_jobstorage.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from zope.interface.verify import verifyObject 4 | 5 | from scrapyd.config import Config 6 | from scrapyd.interfaces import IJobStorage 7 | from scrapyd.jobstorage import MemoryJobStorage, SqliteJobStorage 8 | from tests import get_finished_job 9 | 10 | job1 = get_finished_job("p1", "s1", "j1", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 7)) 11 | job2 = get_finished_job("p2", "s2", "j2", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 8)) 12 | job3 = get_finished_job("p3", "s3", "j3", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 9)) 13 | 14 | 15 | def pytest_generate_tests(metafunc): 16 | idlist = [] 17 | argvalues = [] 18 | for scenario, cls in metafunc.cls.scenarios: 19 | idlist.append(scenario) 20 | argnames = ["cls"] 21 | argvalues.append([cls]) 22 | metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class") 23 | 24 | 25 | def config(tmpdir): 26 | return Config(values={"dbs_dir": tmpdir, "finished_to_keep": 
"2"}) 27 | 28 | 29 | class TestJobStorage: 30 | scenarios = (("sqlite", SqliteJobStorage), ("memory", MemoryJobStorage)) 31 | 32 | def test_interface(self, cls, tmpdir): 33 | verifyObject(IJobStorage, cls(config(tmpdir))) 34 | 35 | def test_add(self, cls, tmpdir): 36 | jobstorage = cls(config(tmpdir)) 37 | 38 | assert len(jobstorage) == 0 39 | 40 | jobstorage.add(job1) 41 | jobstorage.add(job2) 42 | jobstorage.add(job3) 43 | actual = jobstorage.list() 44 | 45 | assert len(jobstorage) == 2 46 | assert actual == list(jobstorage) 47 | assert actual == [job3, job2] 48 | 49 | def test_iter(self, cls, tmpdir): 50 | jobstorage = cls(config(tmpdir)) 51 | 52 | assert len(jobstorage) == 0 53 | 54 | jobstorage.add(job1) 55 | jobstorage.add(job2) 56 | jobstorage.add(job3) 57 | actual = jobstorage.list() 58 | 59 | assert len(jobstorage) == 2 60 | assert actual == list(jobstorage) 61 | assert actual == [job3, job2] 62 | -------------------------------------------------------------------------------- /tests/test_launcher.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import re 3 | 4 | import pytest 5 | from twisted.internet import defer, error 6 | from twisted.logger import LogLevel, capturedLogs 7 | from twisted.python import failure 8 | 9 | from scrapyd import __version__ 10 | from scrapyd.config import Config 11 | from scrapyd.launcher import Launcher, get_crawl_args 12 | from tests import get_message, has_settings 13 | 14 | 15 | def remove_debug_messages(captured): 16 | return [message for message in captured if message["log_level"] != LogLevel.debug] 17 | 18 | 19 | @pytest.fixture() 20 | def launcher(app): 21 | return Launcher(Config(), app) 22 | 23 | 24 | @pytest.fixture() 25 | def process(launcher): 26 | launcher._spawn_process({"_project": "p1", "_spider": "s1", "_job": "j1"}, 0) # noqa: SLF001 27 | return launcher.processes[0] 28 | 29 | 30 | @pytest.mark.parametrize( 31 | ("message", "expected"), 32 | [ 33 | ({"_project": "p1", "_spider": "s1"}, ["s1"]), 34 | ({"_project": "p1", "_spider": "s1", "settings": {"ONE": "two"}}, ["s1", "-s", "ONE=two"]), 35 | ({"_project": "p1", "_spider": "s1", "arg1": "val1"}, ["s1", "-a", "arg1=val1"]), 36 | ( 37 | {"_project": "p1", "_spider": "s1", "arg1": "val1", "settings": {"ONE": "two"}}, 38 | ["s1", "-s", "ONE=two", "-a", "arg1=val1"], 39 | ), 40 | ], 41 | ) 42 | def test_get_crawl_args(message, expected): 43 | assert get_crawl_args(message) == expected 44 | 45 | 46 | def test_start_service(launcher): 47 | with capturedLogs() as captured: 48 | launcher.startService() 49 | captured = remove_debug_messages(captured) 50 | 51 | assert len(captured) == 1 52 | assert captured[0]["log_level"] == LogLevel.info 53 | assert re.search( 54 | f"\\[Launcher\\] Scrapyd {__version__} started: max_proc=\\d+, runner='scrapyd.runner'", get_message(captured) 55 | ) 56 | 57 | 58 | def test_start_service_max_proc(app): 59 | config = Config() 60 | config.cp.set(Config.SECTION, "max_proc", "8") 61 | launcher = Launcher(config, app) 62 | 63 | with capturedLogs() as captured: 64 | launcher.startService() 65 | captured = remove_debug_messages(captured) 66 | 67 | assert len(captured) == 1 68 | assert captured[0]["log_level"] == LogLevel.info 69 | assert re.search( 70 | f"\\[Launcher\\] Scrapyd {__version__} started: max_proc=8, runner='scrapyd.runner'", get_message(captured) 71 | ) 72 | 73 | 74 | @pytest.mark.parametrize( 75 | ("message", "expected"), 76 | [ 77 | ({}, {}), 78 | ({"_version": "v1"}, {"SCRAPYD_EGG_VERSION": 
"v1"}), 79 | ], 80 | ) 81 | def test_spawn_process(launcher, message, expected): 82 | launcher._spawn_process({"_project": "localproject", "_spider": "s1", "_job": "j1", **message}, 1) # noqa: SLF001 83 | 84 | process = launcher.processes[1] 85 | 86 | assert isinstance(process.pid, int) 87 | assert process.project == "localproject" 88 | assert process.spider == "s1" 89 | assert process.job == "j1" 90 | assert isinstance(process.start_time, datetime.datetime) 91 | assert process.end_time is None 92 | assert isinstance(process.args, list) # see tests below 93 | assert isinstance(process.deferred, defer.Deferred) 94 | 95 | # scrapyd.environ.Environ.get_environment 96 | assert process.env["SCRAPY_PROJECT"] == "localproject" 97 | for key, value in expected.items(): 98 | assert process.env[key] == value 99 | if "SCRAPYD_EGG_VERSION" not in expected: 100 | assert "SCRAPYD_EGG_VERSION" not in process.env 101 | if has_settings(): 102 | assert process.env["SCRAPY_SETTINGS_MODULE"] == "localproject.settings" 103 | else: 104 | assert "SCRAPY_SETTINGS_MODULE" not in process.env 105 | 106 | 107 | def test_out_received(process): 108 | with capturedLogs() as captured: 109 | process.outReceived(b"out\n") 110 | 111 | assert len(captured) == 1 112 | assert captured[0]["log_level"] == LogLevel.info 113 | assert get_message(captured) == f"[Launcher,{process.pid}/stdout] out" 114 | 115 | 116 | def test_err_received(process): 117 | with capturedLogs() as captured: 118 | process.errReceived(b"err\n") 119 | 120 | assert len(captured) == 1 121 | assert captured[0]["log_level"] == LogLevel.error 122 | assert get_message(captured) == f"[Launcher,{process.pid}/stderr] err" 123 | 124 | 125 | def test_connection_made(environ, process): 126 | pid = process.pid 127 | with capturedLogs() as captured: 128 | process.connectionMade() 129 | 130 | assert len(captured) == 1 131 | assert captured[0]["log_level"] == LogLevel.info 132 | if environ.items_dir: 133 | assert re.match( 134 | f"\\[scrapyd\\.launcher#info\\] Process started: project='p1' spider='s1' job='j1' pid={pid} " 135 | "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-s', " 136 | """'FEEDS={"file:///\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""", 137 | get_message(captured), 138 | ) 139 | else: 140 | assert re.match( 141 | f"\\[scrapyd\\.launcher#info\\] Process started: project='p1' spider='s1' job='j1' pid={pid} " 142 | "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-a', '_job=j1'\\]", 143 | get_message(captured), 144 | ) 145 | 146 | 147 | def test_process_ended_done(environ, process): 148 | pid = process.pid 149 | with capturedLogs() as captured: 150 | process.processEnded(failure.Failure(error.ProcessDone(0))) 151 | captured = remove_debug_messages(captured) 152 | 153 | assert len(captured) == 1 154 | assert captured[0]["log_level"] == LogLevel.info 155 | if environ.items_dir: 156 | assert re.match( 157 | f"\\[scrapyd\\.launcher#info\\] Process finished: project='p1' spider='s1' job='j1' pid={pid} " 158 | "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-s', " 159 | """'FEEDS={"file:///\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""", 160 | get_message(captured), 161 | ) 162 | else: 163 | assert re.match( 164 | f"\\[scrapyd\\.launcher#info\\] Process finished: project='p1' spider='s1' job='j1' pid={pid} " 165 | "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-a', 
'_job=j1'\\]", 166 | get_message(captured), 167 | ) 168 | 169 | 170 | def test_process_ended_terminated(environ, process): 171 | pid = process.pid 172 | with capturedLogs() as captured: 173 | process.processEnded(failure.Failure(error.ProcessTerminated(1))) 174 | captured = remove_debug_messages(captured) 175 | 176 | assert len(captured) == 1 177 | assert captured[0]["log_level"] == LogLevel.error 178 | if environ.items_dir: 179 | assert re.match( 180 | f"\\[scrapyd\\.launcher#error\\] Process died: exitstatus=1 project='p1' spider='s1' job='j1' pid={pid} " 181 | "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-s', " 182 | """'FEEDS={"file:///\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""", 183 | get_message(captured), 184 | ) 185 | else: 186 | assert re.match( 187 | f"\\[scrapyd\\.launcher#error\\] Process died: exitstatus=1 project='p1' spider='s1' job='j1' pid={pid} " 188 | "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+', '-a', '_job=j1'\\]", 189 | get_message(captured), 190 | ) 191 | 192 | 193 | def test_repr(process): 194 | assert repr(process).startswith(f"ScrapyProcessProtocol(project=p1 spider=s1 job=j1 pid={process.pid} start_time=") 195 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pytest 4 | 5 | from scrapyd import __version__ 6 | from scrapyd.__main__ import main 7 | 8 | 9 | def test_version(capsys, monkeypatch): 10 | monkeypatch.setattr(sys, "argv", ["scrapyd", "junk", "--version", "junk"]) 11 | main() 12 | 13 | assert capsys.readouterr().out == f"Scrapyd {__version__}\n" 14 | 15 | 16 | def test_v(capsys, monkeypatch): 17 | monkeypatch.setattr(sys, "argv", ["scrapyd", "junk", "-v", "junk"]) 18 | main() 19 | 20 | assert capsys.readouterr().out == f"Scrapyd {__version__}\n" 21 | 22 | 23 | def test_help(capsys, monkeypatch): 24 | monkeypatch.setattr(sys, "argv", ["scrapyd", "--help"]) 25 | 26 | with pytest.raises(SystemExit) as exc: 27 | main() 28 | 29 | captured = capsys.readouterr() 30 | 31 | assert exc.value.code == 0 32 | assert captured.out.startswith("Usage: scrapyd [options]\n") 33 | assert "--nodaemon" in captured.out 34 | assert "python" not in captured.out 35 | assert "rundir" not in captured.out 36 | assert "ftp" not in captured.out 37 | assert "Commands:" not in captured.out 38 | assert captured.err == "" 39 | -------------------------------------------------------------------------------- /tests/test_poller.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from twisted.internet.defer import Deferred 5 | from zope.interface.verify import verifyObject 6 | 7 | from scrapyd.config import Config 8 | from scrapyd.interfaces import IPoller 9 | from scrapyd.poller import QueuePoller 10 | from scrapyd.utils import get_spider_queues 11 | 12 | 13 | @pytest.fixture() 14 | def poller(tmpdir): 15 | eggs_dir = os.path.join(tmpdir, "eggs") 16 | dbs_dir = os.path.join(tmpdir, "dbs") 17 | config = Config(values={"eggs_dir": eggs_dir, "dbs_dir": dbs_dir}) 18 | os.makedirs(os.path.join(eggs_dir, "mybot1")) 19 | os.makedirs(os.path.join(eggs_dir, "mybot2")) 20 | return QueuePoller(config) 21 | 22 | 23 | def test_interface(poller): 24 | verifyObject(IPoller, poller) 25 | 26 | 27 | # Need sorted(), because os.listdir() in FilesystemEggStorage.list_projects() uses 
an arbitrary order. 28 | def test_list_projects_update_projects(poller): 29 | assert sorted(poller.queues) == ["mybot1", "mybot2"] 30 | 31 | os.makedirs(os.path.join(poller.config.get("eggs_dir"), "settings_raise")) 32 | 33 | assert sorted(poller.queues) == ["mybot1", "mybot2"] 34 | 35 | poller.update_projects() 36 | 37 | assert sorted(poller.queues) == ["mybot1", "mybot2", "settings_raise"] 38 | 39 | 40 | def test_poll_next(poller): 41 | queues = get_spider_queues(poller.config) 42 | 43 | scenario = {"mybot1": "spider1", "mybot2": "spider2"} 44 | for project, spider in scenario.items(): 45 | queues[project].add(spider) 46 | 47 | deferred1 = poller.next() 48 | deferred2 = poller.next() 49 | 50 | assert isinstance(deferred1, Deferred) 51 | assert not hasattr(deferred1, "result") 52 | assert isinstance(deferred2, Deferred) 53 | assert not hasattr(deferred2, "result") 54 | 55 | value = poller.poll() 56 | 57 | assert isinstance(value, Deferred) 58 | assert hasattr(value, "result") 59 | assert getattr(value, "called", False) 60 | assert value.result is None 61 | 62 | assert hasattr(deferred1, "result") 63 | assert getattr(deferred1, "called", False) 64 | assert hasattr(deferred2, "result") 65 | assert getattr(deferred2, "called", False) 66 | 67 | # os.listdir() in FilesystemEggStorage.list_projects() uses an arbitrary order. 68 | project_a = deferred1.result["_project"] 69 | spider_a = scenario.pop(project_a) 70 | project_b, spider_b = scenario.popitem() 71 | 72 | assert deferred1.result["_spider"] == spider_a 73 | assert deferred2.result == {"_project": project_b, "_spider": spider_b} 74 | 75 | 76 | def test_poll_empty(poller): 77 | value = poller.poll() 78 | 79 | assert isinstance(value, Deferred) 80 | assert hasattr(value, "result") 81 | assert getattr(value, "called", False) 82 | assert value.result is None 83 | -------------------------------------------------------------------------------- /tests/test_runner.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os.path 3 | import sys 4 | from unittest.mock import patch 5 | 6 | import pytest 7 | from zope.interface import implementer 8 | 9 | from scrapyd.exceptions import BadEggError 10 | from scrapyd.interfaces import IEggStorage 11 | from scrapyd.runner import main 12 | 13 | BASEDIR = os.path.abspath(os.path.dirname(__file__)) 14 | 15 | 16 | @implementer(IEggStorage) 17 | class MockEggStorage: 18 | def __init__(self, config): 19 | self.config = config 20 | 21 | def put(self, eggfile, project, version): 22 | pass 23 | 24 | def get(self, project, version=None): 25 | if project == "bytesio": 26 | with open(os.path.join(BASEDIR, "fixtures", "quotesbot.egg"), "rb") as f: 27 | return version, io.BytesIO(f.read()) 28 | if project == "noentrypoint": 29 | # Identical to quotesbot.egg, except EGG-INFO/entry_points.txt doesn't set `settings` under [scrapy]. 
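# (For context: eggs built with scrapyd-client's scrapyd-deploy normally declare the project's
# settings module in EGG-INFO/entry_points.txt, roughly:
#     [scrapy]
#     settings = quotesbot.settings
# entrypoint_missing.egg omits that `settings` key, which is the case this branch exercises.)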
30 | with open(os.path.join(BASEDIR, "fixtures", "entrypoint_missing.egg"), "rb") as f: 31 | return version, io.BytesIO(f.read()) 32 | if project == "badegg": 33 | return version, io.BytesIO(b"badegg") 34 | return None, None 35 | 36 | def list(self, project): 37 | pass 38 | 39 | def list_projects(self): 40 | return [] 41 | 42 | def delete(self, project, version=None): 43 | pass 44 | 45 | 46 | @pytest.mark.parametrize( 47 | "module", 48 | [ 49 | "scrapy.utils.project", 50 | "scrapy.utils.conf", 51 | "scrapyd.interfaces", 52 | "scrapyd.runner", 53 | ], 54 | ) 55 | def test_no_load_scrapy_conf(module): 56 | __import__(module) 57 | 58 | assert "scrapy.conf" not in sys.modules, f"module {module!r} must not cause the scrapy.conf module to be loaded" 59 | 60 | 61 | @pytest.mark.skipif(sys.platform == "win32", reason="The temporary file encounters a PermissionError") 62 | def test_bytesio(monkeypatch, capsys, chdir): 63 | (chdir / "scrapyd.conf").write_text("[scrapyd]\neggstorage = tests.test_runner.MockEggStorage") 64 | monkeypatch.setenv("SCRAPY_PROJECT", "bytesio") 65 | 66 | with patch.object(sys, "argv", ["scrapy", "list"]), pytest.raises(SystemExit) as exc: 67 | main() 68 | 69 | # main() sets SCRAPY_SETTINGS_MODULE, which interferes with other tests. 70 | del os.environ["SCRAPY_SETTINGS_MODULE"] 71 | 72 | captured = capsys.readouterr() 73 | 74 | assert exc.value.code == 0 75 | assert captured.out == "toscrape-css\ntoscrape-xpath\n" 76 | assert captured.err == "" 77 | 78 | 79 | def test_badegg(monkeypatch, capsys, chdir): 80 | (chdir / "scrapyd.conf").write_text("[scrapyd]\neggstorage = tests.test_runner.MockEggStorage") 81 | monkeypatch.setenv("SCRAPY_PROJECT", "badegg") 82 | 83 | with patch.object(sys, "argv", ["scrapy", "list"]), pytest.raises(BadEggError) as exc: 84 | main() 85 | 86 | # main() sets SCRAPY_SETTINGS_MODULE, which interferes with other tests. 87 | os.environ.pop("SCRAPY_SETTINGS_MODULE", None) 88 | 89 | captured = capsys.readouterr() 90 | 91 | assert str(exc.value) == "" 92 | assert captured.out == "" 93 | assert captured.err == "" 94 | 95 | 96 | # This confirms that entry_points are required, as documented. 97 | @pytest.mark.filterwarnings("ignore:Module quotesbot was already imported from:UserWarning") # fixture reuses module 98 | def test_noentrypoint(monkeypatch, capsys, chdir): 99 | (chdir / "scrapyd.conf").write_text("[scrapyd]\neggstorage = tests.test_runner.MockEggStorage") 100 | monkeypatch.setenv("SCRAPY_PROJECT", "noentrypoint") 101 | 102 | with patch.object(sys, "argv", ["scrapy", "list"]), pytest.raises(AttributeError) as exc: 103 | main() 104 | 105 | # main() sets SCRAPY_SETTINGS_MODULE, which interferes with other tests. 
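# (pop() with a default is used here, unlike the del in test_bytesio, since main() may fail
# before exporting the variable when the egg lacks its entry point.)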
106 | os.environ.pop("SCRAPY_SETTINGS_MODULE", None) 107 | 108 | captured = capsys.readouterr() 109 | 110 | assert str(exc.value) 111 | assert captured.out == "" 112 | assert captured.err == "" 113 | -------------------------------------------------------------------------------- /tests/test_scheduler.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from zope.interface.verify import verifyObject 5 | 6 | from scrapyd.config import Config 7 | from scrapyd.interfaces import ISpiderScheduler 8 | from scrapyd.scheduler import SpiderScheduler 9 | from scrapyd.utils import get_spider_queues 10 | 11 | 12 | @pytest.fixture() 13 | def scheduler(tmpdir): 14 | eggs_dir = os.path.join(tmpdir, "eggs") 15 | dbs_dir = os.path.join(tmpdir, "dbs") 16 | config = Config(values={"eggs_dir": eggs_dir, "dbs_dir": dbs_dir}) 17 | os.makedirs(os.path.join(eggs_dir, "mybot1")) 18 | os.makedirs(os.path.join(eggs_dir, "mybot2")) 19 | return SpiderScheduler(config) 20 | 21 | 22 | def test_interface(scheduler): 23 | verifyObject(ISpiderScheduler, scheduler) 24 | 25 | 26 | # Need sorted(), because os.listdir() in FilesystemEggStorage.list_projects() uses an arbitrary order. 27 | def test_list_projects_update_projects(scheduler): 28 | assert sorted(scheduler.list_projects()) == ["mybot1", "mybot2"] 29 | 30 | os.makedirs(os.path.join(scheduler.config.get("eggs_dir"), "settings_raise")) 31 | 32 | assert sorted(scheduler.list_projects()) == ["mybot1", "mybot2"] 33 | 34 | scheduler.update_projects() 35 | 36 | assert sorted(scheduler.list_projects()) == ["mybot1", "mybot2", "settings_raise"] 37 | 38 | 39 | def test_schedule(scheduler): 40 | queues = get_spider_queues(scheduler.config) 41 | mybot1_queue = queues["mybot1"] 42 | mybot2_queue = queues["mybot2"] 43 | 44 | assert not mybot1_queue.count() 45 | assert not mybot2_queue.count() 46 | 47 | scheduler.schedule("mybot1", "myspider1", 2, a="b") 48 | scheduler.schedule("mybot2", "myspider2", 1, c="d") 49 | scheduler.schedule("mybot2", "myspider3", 10, e="f") 50 | 51 | assert mybot1_queue.pop() == {"name": "myspider1", "a": "b"} 52 | assert mybot2_queue.pop() == {"name": "myspider3", "e": "f"} 53 | assert mybot2_queue.pop() == {"name": "myspider2", "c": "d"} 54 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import re 4 | 5 | import pytest 6 | import requests 7 | 8 | from tests import get_egg_data 9 | from tests.mockserver import MockScrapydServer 10 | 11 | 12 | @pytest.fixture() 13 | def mock_scrapyd(chdir): 14 | with MockScrapydServer() as server: 15 | yield server 16 | 17 | 18 | def test_urljoin(mock_scrapyd): 19 | assert mock_scrapyd.urljoin("foo") == f"{mock_scrapyd.url}foo" 20 | 21 | 22 | def test_auth(): 23 | with MockScrapydServer(username="bob", password="hunter2") as server: 24 | assert requests.get(server.url).status_code == 401 25 | 26 | res = requests.get(server.url, auth=("bob", "hunter2")) 27 | 28 | assert res.status_code == 200 29 | assert re.search("use the API", res.text) 30 | 31 | res = requests.get(server.url, auth=("bob", "invalid")) 32 | 33 | assert res.status_code == 401 34 | 35 | stdout = server.stdout.decode() 36 | 37 | # scrapyd.basicauth 38 | assert f" [-] Basic authentication enabled{os.linesep}" in stdout 39 | # scrapyd.app 40 | assert f" [-] Scrapyd web console available at 
http://127.0.0.1:{server.http_port}/" in stdout 41 | 42 | 43 | def test_noauth(): 44 | with MockScrapydServer() as server: 45 | pass 46 | 47 | # scrapyd.basicauth 48 | assert ( 49 | f" [-] Basic authentication disabled as either `username` or `password` is unset{os.linesep}" 50 | in server.stdout.decode() 51 | ) 52 | 53 | 54 | def test_error(): 55 | with MockScrapydServer() as server: 56 | requests.get(server.urljoin("listversions.json"), params={"project": [b"\xc3\x28"]}) 57 | 58 | stdout = server.stdout.decode() 59 | 60 | # scrapyd.webservice 61 | assert f" [-] Unhandled Error{os.linesep}" in stdout 62 | assert f"\tTraceback (most recent call last):{os.linesep}" in stdout 63 | assert "\ttwisted.web.error.Error: 200 project is invalid: " in stdout 64 | 65 | 66 | @pytest.mark.parametrize( 67 | ("method", "basename"), 68 | [ 69 | ("GET", "daemonstatus"), 70 | ("POST", "addversion"), 71 | ("POST", "schedule"), 72 | ("POST", "cancel"), 73 | ("GET", "status"), 74 | ("GET", "listprojects"), 75 | ("GET", "listversions"), 76 | ("GET", "listspiders"), 77 | ("GET", "listjobs"), 78 | ("POST", "delversion"), 79 | ("POST", "delproject"), 80 | ], 81 | ) 82 | def test_options(mock_scrapyd, method, basename): 83 | response = requests.options(mock_scrapyd.urljoin(f"{basename}.json")) 84 | 85 | assert response.status_code == 204, f"204 != {response.status_code}" 86 | assert response.content == b"" 87 | assert response.headers["Allow"] == f"OPTIONS, HEAD, {method}" 88 | 89 | 90 | # https://github.com/scrapy/scrapyd/issues/377 91 | def test_other_reactors(mock_scrapyd): 92 | response = requests.post( 93 | mock_scrapyd.urljoin("addversion.json"), 94 | data={b"project": b"quotesbot", b"version": b"0.01"}, 95 | # Identical to quotesbot.egg, except quotesbot/settings.py sets 96 | # `TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"`. 
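# (Scrapyd keeps its own default Twisted reactor; each crawl runs in a separate
# `python -m scrapyd.runner` subprocess, which is what lets a project select a different
# reactor, such as AsyncioSelectorReactor, without conflicting with the daemon.)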
97 | files={b"egg": io.BytesIO(get_egg_data("settings_asyncioreactor"))}, 98 | ) 99 | 100 | assert response.status_code == 200 101 | assert response.json()["status"] == "ok" 102 | -------------------------------------------------------------------------------- /tests/test_spiderqueue.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from twisted.internet.defer import inlineCallbacks, maybeDeferred 3 | from zope.interface.verify import verifyObject 4 | 5 | from scrapyd.config import Config 6 | from scrapyd.interfaces import ISpiderQueue 7 | from scrapyd.spiderqueue import SqliteSpiderQueue 8 | 9 | spider_args = { 10 | "arg1": "val1", 11 | "arg2": 2, 12 | "arg3": "\N{SNOWMAN}", 13 | } 14 | expected = spider_args.copy() 15 | expected["name"] = "spider1" 16 | 17 | 18 | @pytest.fixture() 19 | def spiderqueue(): 20 | return SqliteSpiderQueue(Config(values={"dbs_dir": ":memory:"}), "quotesbot") 21 | 22 | 23 | def test_interface(spiderqueue): 24 | verifyObject(ISpiderQueue, spiderqueue) 25 | 26 | 27 | @inlineCallbacks 28 | def test_pop(spiderqueue): 29 | yield maybeDeferred(spiderqueue.add, "spider0", 5) 30 | yield maybeDeferred(spiderqueue.add, "spider1", 10, **spider_args) 31 | yield maybeDeferred(spiderqueue.add, "spider1", 0) 32 | 33 | assert (yield maybeDeferred(spiderqueue.count)) == 3 34 | 35 | assert (yield maybeDeferred(spiderqueue.pop)) == expected 36 | 37 | assert (yield maybeDeferred(spiderqueue.count)) == 2 38 | 39 | 40 | @inlineCallbacks 41 | def test_list(spiderqueue): 42 | assert (yield maybeDeferred(spiderqueue.list)) == [] 43 | 44 | yield maybeDeferred(spiderqueue.add, "spider1", 10, **spider_args) 45 | yield maybeDeferred(spiderqueue.add, "spider1", 10, **spider_args) 46 | 47 | assert (yield maybeDeferred(spiderqueue.list)) == [expected, expected] 48 | 49 | 50 | @inlineCallbacks 51 | def test_remove(spiderqueue): 52 | yield maybeDeferred(spiderqueue.add, "spider0", 5) 53 | yield maybeDeferred(spiderqueue.add, "spider1", 10, **spider_args) 54 | yield maybeDeferred(spiderqueue.add, "spider1", 0) 55 | 56 | assert (yield maybeDeferred(spiderqueue.count)) == 3 57 | 58 | assert (yield maybeDeferred(spiderqueue.remove, lambda message: message["name"] == "spider1")) == 2 59 | 60 | assert (yield maybeDeferred(spiderqueue.count)) == 1 61 | 62 | 63 | @inlineCallbacks 64 | def test_clear(spiderqueue): 65 | assert (yield maybeDeferred(spiderqueue.count)) == 0 66 | 67 | yield maybeDeferred(spiderqueue.add, "spider1", 10, **spider_args) 68 | yield maybeDeferred(spiderqueue.add, "spider1", 10, **spider_args) 69 | 70 | assert (yield maybeDeferred(spiderqueue.count)) == 2 71 | 72 | yield maybeDeferred(spiderqueue.clear) 73 | 74 | assert (yield maybeDeferred(spiderqueue.count)) == 0 75 | -------------------------------------------------------------------------------- /tests/test_sqlite.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pytest 4 | 5 | from scrapyd.sqlite import JsonSqlitePriorityQueue, SqliteFinishedJobs 6 | from tests import get_finished_job 7 | 8 | 9 | @pytest.fixture() 10 | def jsonsqlitepriorityqueue(): 11 | return JsonSqlitePriorityQueue() 12 | 13 | 14 | @pytest.fixture() 15 | def sqlitefinishedjobs(): 16 | q = SqliteFinishedJobs(":memory:") 17 | q.add(get_finished_job("p1", "s1", "j1", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 7))) 18 | q.add(get_finished_job("p2", "s2", "j2", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 8))) 19 | 
q.add(get_finished_job("p3", "s3", "j3", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 9))) 20 | return q 21 | 22 | 23 | def test_jsonsqlitepriorityqueue_empty(jsonsqlitepriorityqueue): 24 | assert jsonsqlitepriorityqueue.pop() is None 25 | 26 | 27 | def test_jsonsqlitepriorityqueue_one(jsonsqlitepriorityqueue): 28 | msg = "a message" 29 | jsonsqlitepriorityqueue.put(msg) 30 | 31 | assert "_id" not in msg 32 | assert jsonsqlitepriorityqueue.pop() == msg 33 | assert jsonsqlitepriorityqueue.pop() is None 34 | 35 | 36 | def test_jsonsqlitepriorityqueue_multiple(jsonsqlitepriorityqueue): 37 | msg1 = "first message" 38 | msg2 = "second message" 39 | jsonsqlitepriorityqueue.put(msg1) 40 | jsonsqlitepriorityqueue.put(msg2) 41 | out = [] 42 | out.append(jsonsqlitepriorityqueue.pop()) 43 | out.append(jsonsqlitepriorityqueue.pop()) 44 | 45 | assert msg1 in out 46 | assert msg2 in out 47 | assert jsonsqlitepriorityqueue.pop() is None 48 | 49 | 50 | def test_jsonsqlitepriorityqueue_priority(jsonsqlitepriorityqueue): 51 | msg1 = "message 1" 52 | msg2 = "message 2" 53 | msg3 = "message 3" 54 | msg4 = "message 4" 55 | jsonsqlitepriorityqueue.put(msg1, priority=1.0) 56 | jsonsqlitepriorityqueue.put(msg2, priority=5.0) 57 | jsonsqlitepriorityqueue.put(msg3, priority=3.0) 58 | jsonsqlitepriorityqueue.put(msg4, priority=2.0) 59 | 60 | assert jsonsqlitepriorityqueue.pop() == msg2 61 | assert jsonsqlitepriorityqueue.pop() == msg3 62 | assert jsonsqlitepriorityqueue.pop() == msg4 63 | assert jsonsqlitepriorityqueue.pop() == msg1 64 | 65 | 66 | def test_jsonsqlitepriorityqueue_iter_len_clear(jsonsqlitepriorityqueue): 67 | assert len(jsonsqlitepriorityqueue) == 0 68 | assert list(jsonsqlitepriorityqueue) == [] 69 | 70 | msg1 = "message 1" 71 | msg2 = "message 2" 72 | msg3 = "message 3" 73 | msg4 = "message 4" 74 | jsonsqlitepriorityqueue.put(msg1, priority=1.0) 75 | jsonsqlitepriorityqueue.put(msg2, priority=5.0) 76 | jsonsqlitepriorityqueue.put(msg3, priority=3.0) 77 | jsonsqlitepriorityqueue.put(msg4, priority=2.0) 78 | 79 | assert len(jsonsqlitepriorityqueue) == 4 80 | assert list(jsonsqlitepriorityqueue) == [(msg2, 5.0), (msg3, 3.0), (msg4, 2.0), (msg1, 1.0)] 81 | 82 | jsonsqlitepriorityqueue.clear() 83 | 84 | assert len(jsonsqlitepriorityqueue) == 0 85 | assert list(jsonsqlitepriorityqueue) == [] 86 | 87 | 88 | def test_jsonsqlitepriorityqueue_remove(jsonsqlitepriorityqueue): 89 | assert len(jsonsqlitepriorityqueue) == 0 90 | assert list(jsonsqlitepriorityqueue) == [] 91 | 92 | msg1 = "good message 1" 93 | msg2 = "bad message 2" 94 | msg3 = "good message 3" 95 | msg4 = "bad message 4" 96 | jsonsqlitepriorityqueue.put(msg1) 97 | jsonsqlitepriorityqueue.put(msg2) 98 | jsonsqlitepriorityqueue.put(msg3) 99 | jsonsqlitepriorityqueue.put(msg4) 100 | jsonsqlitepriorityqueue.remove(lambda x: x.startswith("bad")) 101 | 102 | assert list(jsonsqlitepriorityqueue) == [(msg1, 0.0), (msg3, 0.0)] 103 | 104 | 105 | @pytest.mark.parametrize( 106 | "value", 107 | [ 108 | "native ascii str", 109 | "\xa3", 110 | 123, 111 | 1.2, 112 | True, 113 | ["a", "list", 1], 114 | {"a": "dict"}, 115 | ], 116 | ) 117 | def test_jsonsqlitepriorityqueue_types(jsonsqlitepriorityqueue, value): 118 | jsonsqlitepriorityqueue.put(value) 119 | 120 | assert jsonsqlitepriorityqueue.pop() == value 121 | 122 | 123 | def test_sqlitefinishedjobs_add(sqlitefinishedjobs): 124 | assert len(sqlitefinishedjobs) == 3 125 | 126 | 127 | def test_sqlitefinishedjobs_clear_all(sqlitefinishedjobs): 128 | sqlitefinishedjobs.clear() 129 | 130 | assert 
len(sqlitefinishedjobs) == 0 131 | 132 | 133 | def test_sqlitefinishedjobs_clear_keep_0(sqlitefinishedjobs): 134 | sqlitefinishedjobs.clear(finished_to_keep=0) 135 | 136 | assert len(sqlitefinishedjobs) == 0 137 | 138 | 139 | def test_sqlitefinishedjobs_clear_keep_2(sqlitefinishedjobs): 140 | sqlitefinishedjobs.clear(finished_to_keep=2) 141 | 142 | assert len(sqlitefinishedjobs) == 2 143 | 144 | 145 | def test_sqlitefinishedjobs__iter__(sqlitefinishedjobs): 146 | actual = list(sqlitefinishedjobs) 147 | 148 | assert (actual[0][0], actual[0][1]) == ("p3", "s3") 149 | assert (actual[1][0], actual[1][1]) == ("p2", "s2") 150 | assert (actual[2][0], actual[2][1]) == ("p1", "s1") 151 | -------------------------------------------------------------------------------- /tests/test_website.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from html_checker.validator import ValidatorInterface 5 | from twisted.web import http_headers, resource 6 | from twisted.web.test._util import _render 7 | from twisted.web.test.requesthelper import DummyRequest 8 | 9 | from scrapyd.app import application 10 | from scrapyd.launcher import ScrapyProcessProtocol 11 | from scrapyd.website import Root 12 | from tests import get_finished_job, has_settings, root_add_version, touch 13 | 14 | 15 | def assert_headers(txrequest): 16 | headers = dict(txrequest.responseHeaders.getAllRawHeaders()) 17 | content_length = headers.pop(b"Content-Length") 18 | 19 | assert len(content_length) == 1 20 | assert isinstance(content_length[0], bytes) 21 | assert int(content_length[0]) 22 | assert headers == {b"Content-Type": [b"text/html; charset=utf-8"]} 23 | 24 | 25 | def assert_hrefs(urls, text, header): 26 | for href, name in urls: 27 | if header: 28 | assert f'{name}' in text 29 | else: 30 | assert f'{name}' in text 31 | 32 | 33 | # Derived from test_emptyChildUnicodeParent. 34 | # https://github.com/twisted/twisted/blob/trunk/src/twisted/web/test/test_static.py 35 | def test_logs_dir(txrequest, root): 36 | os.makedirs(os.path.join("logs", "quotesbot")) 37 | 38 | file = root.children[b"logs"] 39 | request = DummyRequest([b""]) 40 | child = resource.getChildForRequest(file, request) 41 | 42 | content = child.render(request) 43 | 44 | assert list(request.responseHeaders.getAllRawHeaders()) == [(b"Content-Type", [b"text/html; charset=utf-8"])] 45 | assert b"Last modified" in content 46 | assert b'quotesbot/' in content 47 | 48 | 49 | # Derived from test_indexNames. 
50 | # https://github.com/twisted/twisted/blob/trunk/src/twisted/web/test/test_static.py 51 | def test_logs_file(txrequest, root): 52 | os.makedirs(os.path.join("logs", "quotesbot")) 53 | with open(os.path.join("logs", "foo.txt"), "wb") as f: 54 | f.write(b"baz") 55 | 56 | file = root.children[b"logs"] 57 | request = DummyRequest([b"foo.txt"]) 58 | child = resource.getChildForRequest(file, request) 59 | 60 | d = _render(child, request) 61 | 62 | def cbRendered(ignored): 63 | assert list(request.responseHeaders.getAllRawHeaders()) == [ 64 | (b"Accept-Ranges", [b"bytes"]), 65 | (b"Content-Length", [b"3"]), 66 | (b"Content-Type", [b"text/plain"]), 67 | ] 68 | assert b"".join(request.written) == b"baz" 69 | 70 | d.addCallback(cbRendered) 71 | return d 72 | 73 | 74 | @pytest.mark.parametrize("cancel", [True, False], ids=["cancel", "no_cancel"]) 75 | @pytest.mark.parametrize("header", [True, False], ids=["header", "no_header"]) 76 | @pytest.mark.parametrize("exists", [True, False], ids=["exists", "no_exists"]) 77 | def test_jobs(txrequest, config, cancel, header, exists, chdir): 78 | if not cancel: 79 | config.cp.remove_option("services", "cancel.json") 80 | 81 | root = Root(config, application(config)) 82 | root_add_version(root, "quotesbot", "0.1", "quotesbot") 83 | root.update_projects() 84 | 85 | urls = [ 86 | ("logs/p1/s1/j1-finished.log", "Log"), 87 | ("logs/p2/s2/j2-running.log", "Log"), 88 | ("logs/p3/s3/j3-pending.log", "Log"), 89 | ] 90 | if root.local_items: 91 | urls.extend( 92 | [ 93 | ("items/p1/s1/j1-finished.jl", "Items"), 94 | ("items/p2/s2/j2-running.jl", "Items"), 95 | ("items/p3/s3/j3-pending.jl", "Items"), 96 | ] 97 | ) 98 | if exists: 99 | touch(chdir / "logs" / "p1" / "s1" / "j1-finished.log") 100 | touch(chdir / "logs" / "p2" / "s2" / "j2-running.log") 101 | exist = urls[0:2] 102 | no_exist = urls[2:3] 103 | 104 | if root.local_items: 105 | touch(chdir / "items" / "p1" / "s1" / "j1-finished.jl") 106 | touch(chdir / "items" / "p2" / "s2" / "j2-running.jl") 107 | exist += urls[3:5] 108 | no_exist += urls[5:6] 109 | else: 110 | exist = [] 111 | no_exist = urls 112 | 113 | root.launcher.finished.add(get_finished_job("p1", "s1", "j1-finished")) 114 | root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j2-running", env={}, args=[]) 115 | root.poller.queues["quotesbot"].add("quotesbot", _job="j3-pending") 116 | 117 | if header: 118 | txrequest.requestHeaders = http_headers.Headers({b"X-Forwarded-Prefix": [b"/path/to"]}) 119 | txrequest.method = "GET" 120 | content = root.children[b"jobs"].render(txrequest) 121 | text = content.decode() 122 | 123 | assert_headers(txrequest) 124 | assert_hrefs(exist, text, header) 125 | for url, _ in no_exist: 126 | assert url not in text 127 | 128 | if root.local_items: 129 | assert b"Items" in content 130 | else: 131 | assert b"Items" not in content 132 | 133 | if cancel: 134 | assert b"Cancel" in content 135 | if header: 136 | assert b' action="/path/to/cancel.json">' in content 137 | else: 138 | assert b' action="/cancel.json">' in content 139 | for job in ("j2-running", "j3-pending"): 140 | assert f' value="{job}">' in text 141 | else: 142 | assert b"Cancel" not in content 143 | assert b'/cancel.json">' not in content 144 | assert b' value="j1-finished">' not in content 145 | 146 | 147 | @pytest.mark.parametrize("with_egg", [True, False]) 148 | @pytest.mark.parametrize("header", [True, False]) 149 | def test_home(txrequest, root, with_egg, header): 150 | if with_egg: 151 | root_add_version(root, "quotesbot", "0.1", "quotesbot") 
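# (root_add_version is a helper imported from the tests package above; registering an egg
# version and then calling update_projects() below is what makes "quotesbot" appear in the
# rendered project list that the assertions at the end of this test check.)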
152 | root.update_projects() 153 | 154 | if header: 155 | txrequest.requestHeaders = http_headers.Headers({b"X-Forwarded-Prefix": [b"/path/to"]}) 156 | txrequest.method = "GET" 157 | content = root.children[b""].render(txrequest) 158 | text = content.decode() 159 | 160 | urls = [("jobs", "Jobs"), ("logs/", "Logs")] 161 | if root.local_items: 162 | urls.append(("items/", "Items")) 163 | 164 | assert_headers(txrequest) 165 | assert_hrefs(urls, text, header) 166 | 167 | if root.local_items: 168 | assert b'/items/">Items' in content 169 | else: 170 | assert b'/items/">Items' not in content 171 | 172 | projects = [] 173 | if with_egg: 174 | projects.append("quotesbot") 175 | if has_settings(): 176 | projects.append("localproject") 177 | 178 | if projects: 179 | assert b"
<p>Scrapy projects:</p>" in content 180 | for project in projects: 181 | assert f"<li>{project}</li>" in text 182 | else: 183 | assert b"<p>No Scrapy projects yet.</p>" in content 184 | for project in projects: 185 | assert f"<li>{project}</li>" not in text 186 | 187 | @pytest.mark.parametrize("basename", ["", "jobs"]) 188 | def test_validate(tmp_path, txrequest, root, basename, caplog): 189 | txrequest.method = "GET" 190 | content = root.children[basename.encode()].render(txrequest) 191 | path = tmp_path / "page.html" 192 | path.write_bytes(content) 193 | report = ValidatorInterface().validate([str(path)]).registry[str(path)] 194 | 195 | assert report is None, repr(report) 196 | 197 | --------------------------------------------------------------------------------