├── .bumpversion.cfg ├── .github └── workflows │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── CHANGES.rst ├── LICENSE ├── MANIFEST.in ├── README.md ├── setup.cfg ├── setup.py ├── sh_scrapy ├── __init__.py ├── commands │ ├── __init__.py │ └── shub_image_info.py ├── compat.py ├── crawl.py ├── diskquota.py ├── env.py ├── exceptions.py ├── extension.py ├── hsref.py ├── log.py ├── middlewares.py ├── settings.py ├── stats.py ├── utils.py └── writer.py ├── tests ├── __init__.py ├── conftest.py ├── test_command.py ├── test_compat.py ├── test_crawl.py ├── test_diskquota.py ├── test_env.py ├── test_extension.py ├── test_hsref.py ├── test_log.py ├── test_middlewares.py ├── test_settings.py ├── test_stats.py ├── test_utils.py ├── test_writer.py └── utils.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.17.5 3 | commit = True 4 | tag = True 5 | tag_name = v{new_version} 6 | 7 | [bumpversion:file:sh_scrapy/__init__.py] 8 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | publish: 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v4 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: 3.9 17 | 18 | - name: Publish to PyPI 19 | run: | 20 | pip install --upgrade pip 21 | pip install --upgrade setuptools wheel twine 22 | python setup.py sdist bdist_wheel 23 | export TWINE_USERNAME=__token__ 24 | export TWINE_PASSWORD=${{ secrets.PYPI_TOKEN }} 25 | twine upload dist/* 26 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | tests-ubuntu: 7 | name: "Test: py${{ matrix.python-version }}, Ubuntu" 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | include: 13 | - python-version: "3.6" 14 | os: ubuntu-20.04 15 | env: 16 | TOXENV: py36-scrapy16 17 | - python-version: "3.6" 18 | os: ubuntu-20.04 19 | env: 20 | TOXENV: py 21 | - python-version: "3.7" 22 | os: ubuntu-22.04 23 | env: 24 | TOXENV: py 25 | - python-version: "3.8" 26 | os: ubuntu-latest 27 | env: 28 | TOXENV: py 29 | - python-version: "3.9" 30 | os: ubuntu-latest 31 | env: 32 | TOXENV: py 33 | - python-version: "3.10" 34 | os: ubuntu-latest 35 | env: 36 | TOXENV: py 37 | - python-version: "3.11" 38 | os: ubuntu-latest 39 | env: 40 | TOXENV: py 41 | - python-version: "3.12" 42 | os: ubuntu-latest 43 | env: 44 | TOXENV: py 45 | - python-version: "3.13" 46 | os: ubuntu-latest 47 | env: 48 | TOXENV: py 49 | 50 | steps: 51 | - uses: actions/checkout@v4 52 | 53 | - name: Set up Python ${{ matrix.python-version }} 54 | uses: actions/setup-python@v5 55 | with: 56 | python-version: ${{ matrix.python-version }} 57 | 58 | - name: Update pip & install tox 59 | run: | 60 | pip install -U pip 61 | pip install tox 62 | 63 | - name: Run tests 64 | env: ${{ matrix.env }} 65 | run: tox 66 | 67 | - name: Upload coverage report 68 | uses: codecov/codecov-action@v5 69 | with: 70 | token: ${{ secrets.CODECOV_TOKEN }} 71 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | build 2 | dist 3 | *.egg-info 4 | *.pyc 5 | .eggs/ 6 | .tox 7 | /.coverage 8 | coverage.xml 9 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Changes 3 | ======= 4 | 5 | 0.17.4 (2024-07-08) 6 | =================== 7 | 8 | - Fixed an exception when running scripts with importlib_ installed, 9 | introduced in 0.17.3. 10 | 11 | 12 | 0.17.3 (2024-06-17) 13 | =================== 14 | 15 | - Replaced a use of the deprecated pkg_resources_ module with importlib_. 16 | 17 | .. _pkg_resources: https://setuptools.pypa.io/en/latest/pkg_resources.html 18 | .. _importlib: https://docs.python.org/3/library/importlib.html 19 | 20 | 21 | 0.17.2 (2024-02-20) 22 | =================== 23 | 24 | - Added official support for Python 3.11 and 3.12. 25 | 26 | - Added support for centralized request fingerprints on Scrapy 2.7 and 27 | higher. 28 | 29 | - Started this change log. Check `GitHub releases`_ for older releases until 30 | 0.12.0, and the `commit history`_ for the complete history. 31 | 32 | .. _commit history: https://github.com/scrapinghub/scrapinghub-entrypoint-scrapy/commits/master/ 33 | .. _GitHub releases: https://github.com/scrapinghub/scrapinghub-entrypoint-scrapy/releases 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Scrapinghub 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of extruct nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # scrapinghub-entrypoint-scrapy 3 | 4 | [![version](https://img.shields.io/pypi/v/scrapinghub-entrypoint-scrapy.svg)](https://pypi.python.org/pypi/scrapinghub-entrypoint-scrapy) 5 | [![pyversions](https://img.shields.io/pypi/pyversions/scrapinghub-entrypoint-scrapy.svg)](https://pypi.python.org/pypi/scrapinghub-entrypoint-scrapy) 6 | [![actions](https://github.com/scrapinghub/scrapinghub-entrypoint-scrapy/workflows/Tests/badge.svg)](https://github.com/scrapinghub/scrapinghub-entrypoint-scrapy/actions) 7 | [![codecov](https://codecov.io/gh/scrapinghub/scrapinghub-entrypoint-scrapy/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapinghub/scrapinghub-entrypoint-scrapy) 8 | 9 | Scrapy entrypoint for the Scrapinghub job runner. 10 | 11 | The package implements a base wrapper layer that extracts job data from 12 | the environment, parses and prepares it properly, and executes the job 13 | using Scrapy or a custom executor. 14 | 15 | 16 | ## Features 17 | 18 | - parsing job data from the environment 19 | - processing job args and settings 20 | - running a job with Scrapy 21 | - collecting stats 22 | - advanced logging & error handling 23 | - full hubstorage support 24 | - custom scripts support 25 | 26 | 27 | ## Install 28 | 29 | pip install scrapinghub-entrypoint-scrapy 30 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | [sdist_dsc] 4 | Package: scrapinghub-entrypoint-scrapy 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | from sh_scrapy import __version__ 4 | 5 | 6 | setup( 7 | name='scrapinghub-entrypoint-scrapy', 8 | version=__version__, 9 | license='BSD', 10 | description='Scrapy entrypoint for Scrapinghub job runner', 11 | long_description=open('README.md').read(), 12 | packages=find_packages(), 13 | install_requires=[ 14 | 'Scrapy>=1.6', 15 | 'scrapinghub>=2.1.0', 16 | ], 17 | entry_points={ 18 | 'console_scripts': [ 19 | 'start-crawl = sh_scrapy.crawl:main', 20 | 'list-spiders = sh_scrapy.crawl:list_spiders', 21 | 'shub-image-info = sh_scrapy.crawl:shub_image_info', 22 | ], 23 | }, 24 | python_requires='>=3.6', 25 | classifiers=[ 26 | 'Framework :: Scrapy', 27 | 'Development Status :: 5 - Production/Stable', 28 | 'Intended Audience :: Developers', 29 | 'License :: OSI Approved :: BSD License', 30 | 'Operating System :: OS Independent', 31 | 'Programming Language :: Python', 32 | 'Programming Language :: Python :: 3', 33 | 'Programming Language :: Python :: 3.6', 34 | 'Programming Language :: Python :: 3.7', 35 | 'Programming Language :: Python :: 3.8', 36 | 'Programming Language :: Python :: 3.9', 37 | 'Programming Language :: Python :: 3.10', 38 | 'Programming Language :: Python :: 3.11', 39 | 'Programming Language :: Python :: 3.12', 40 | 'Programming Language :: Python :: 3.13', 41 | 'Topic :: Utilities', 42 | ], 43 | ) 44 |
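A project deployed with this entrypoint is expected to expose its Scrapy settings module through a `scrapy` entry point, which `sh_scrapy.crawl` looks up at runtime (see the SCRAPY_SETTINGS_ENTRYPOINT_NOT_FOUND message in sh_scrapy/crawl.py below). A minimal sketch of a user project's setup.py illustrating that contract; the project name, settings module and script path are hypothetical placeholders, not part of this repository:

# Hypothetical setup.py of a project deployed with this entrypoint (names are placeholders).
from setuptools import setup, find_packages

setup(
    name='myproject',
    version='1.0',
    packages=find_packages(),
    # The 'scrapy' entry point is what sh_scrapy.crawl searches for to locate settings.
    entry_points={'scrapy': ['settings = myproject.settings']},
    # Presumably how custom scripts are bundled so they can be run as jobs via the
    # 'py:' prefix handled in sh_scrapy.crawl._run_pkgscript.
    scripts=['bin/my_script.py'],
)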
-------------------------------------------------------------------------------- /sh_scrapy/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.17.5" 2 | -------------------------------------------------------------------------------- /sh_scrapy/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /sh_scrapy/commands/shub_image_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function 3 | import json 4 | import subprocess 5 | 6 | from scrapy.commands import ScrapyCommand 7 | 8 | 9 | class Command(ScrapyCommand): 10 | requires_project = True 11 | default_settings = {'LOG_ENABLED': False} 12 | 13 | IMAGE_INFO_CMD = ' && '.join([ 14 | "printf 'Linux packages:\n'", "dpkg -l", 15 | "printf '\nPython packages:\n'", "pip freeze", 16 | ]) 17 | 18 | def short_desc(self): 19 | return "Print JSON-encoded project metadata." 20 | 21 | def add_options(self, parser): 22 | super(Command, self).add_options(parser) 23 | # backward compatibility for optparse/argparse 24 | try: 25 | add_argument = parser.add_argument 26 | except AttributeError: 27 | add_argument = parser.add_option 28 | add_argument( 29 | "--debug", 30 | action="store_true", 31 | help="add debugging information such as list of " 32 | "installed Debian packages and Python packages.", 33 | ) 34 | 35 | def run(self, args, opts): 36 | result = { 37 | 'project_type': 'scrapy', 38 | 'spiders': sorted(self.crawler_process.spider_loader.list()), 39 | } 40 | try: 41 | from scrapy_spider_metadata import get_spider_metadata 42 | except ImportError: 43 | pass 44 | else: 45 | result['metadata'] = {} 46 | for spider_name in result['spiders']: 47 | spider_cls = self.crawler_process.spider_loader.load(spider_name) 48 | metadata_dict = get_spider_metadata(spider_cls, normalize=True) 49 | try: 50 | # make sure it's serializable 51 | json.dumps(metadata_dict) 52 | except (TypeError, ValueError): 53 | continue 54 | result['metadata'][spider_name] = metadata_dict 55 | if opts.debug: 56 | output = subprocess.check_output( 57 | ['bash', '-c', self.IMAGE_INFO_CMD], 58 | stderr=subprocess.STDOUT, 59 | universal_newlines=True, 60 | ) 61 | result['debug'] = output 62 | print(json.dumps(result)) 63 | -------------------------------------------------------------------------------- /sh_scrapy/compat.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from scrapy.exceptions import ScrapyDeprecationWarning 4 | from scrapy.utils.decorators import deprecated 5 | from scrapy.utils.python import ( 6 | to_bytes as scrapy_to_bytes, 7 | to_unicode as scrapy_to_unicode, 8 | ) 9 | 10 | 11 | IS_PYTHON2 = False 12 | STRING_TYPE = str 13 | TEXT_TYPE = str 14 | BINARY_TYPE = bytes 15 | 16 | 17 | warnings.warn( 18 | "The sh_scrapy.compat module is deprecated, use the functions in scrapy.utils.python instead", 19 | category=ScrapyDeprecationWarning, 20 | stacklevel=2, 21 | ) 22 | 23 | 24 | def is_string(var): 25 | warnings.warn( 26 | "is_string(var) is deprecated, please use isinstance(var, str) instead", 27 | category=ScrapyDeprecationWarning, 28 | stacklevel=2, 29 | ) 30 | return isinstance(var, str) 31 | 32 | 33 | @deprecated("scrapy.utils.python.to_bytes") 34 | def to_bytes(text, encoding=None, 
errors='strict'): 35 | return scrapy_to_bytes(text, encoding, errors) 36 | 37 | 38 | @deprecated("scrapy.utils.python.to_unicode") 39 | def to_native_str(text, encoding=None, errors='strict'): 40 | return scrapy_to_unicode(text, encoding, errors) 41 | 42 | 43 | @deprecated("scrapy.utils.python.to_unicode") 44 | def to_unicode(text, encoding=None, errors='strict'): 45 | return scrapy_to_unicode(text, encoding, errors) 46 | -------------------------------------------------------------------------------- /sh_scrapy/crawl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # --------------------- DO NOT ADD IMPORTS HERE ------------------------- 3 | # Add them below so that any import errors are caught and sent to sentry 4 | # ----------------------------------------------------------------------- 5 | from __future__ import print_function 6 | import os 7 | import sys 8 | import socket 9 | import logging 10 | import datetime 11 | import warnings 12 | from contextlib import contextmanager 13 | # XXX: Do not use atexit to close the Hubstorage client! 14 | # why: functions registered with atexit are called when run_script() finishes, 15 | # and at that point the main() function hasn't completed yet, leading to lost 16 | # log messages. 17 | 18 | from sh_scrapy.exceptions import SHScrapyDeprecationWarning 19 | 20 | # Keep a reference to standard output/error as they are redirected 21 | # at log initialization 22 | _sys_stderr = sys.stderr # stderr and stdout are redirected to HS later 23 | _sys_stdout = sys.stdout 24 | # Sentry DSN is passed by environment variable 25 | _hworker_sentry_dsn = os.environ.pop('HWORKER_SENTRY_DSN', None) 26 | _sentry_dsn = os.environ.pop('SENTRY_DSN', _hworker_sentry_dsn) 27 | 28 | # Set default socket timeout for code that doesn't set one 29 | socket.setdefaulttimeout(60.0) 30 | 31 | 32 | SCRAPY_SETTINGS_ENTRYPOINT_NOT_FOUND = """ 33 | Scrapy distribution with `scrapy.settings` entrypoint is not found. 34 | The entrypoint should be specified in your project setup.py, please make sure 35 | you specified it in the following format: 36 | setup( 37 | ..., 38 | entry_points = {'scrapy': ['settings = your_project.settings']}, 39 | ... 40 | ) 41 | Check the link for more details: 42 | https://setuptools.readthedocs.io/en/latest/pkg_resources.html#entry-points 43 | """ 44 | 45 | 46 | @contextmanager 47 | def ignore_warnings(**kwargs): 48 | """Context manager that creates a temporary filter to ignore warnings. 49 | 50 | This context manager behaves similarly to warnings.catch_warnings, though 51 | filtered warnings aren't recorded and you can ignore them by criteria 52 | matching warnings.simplefilter arguments. 53 | 54 | Like warnings.catch_warnings, this context manager is not thread-safe. 55 | """ 56 | warnings.warn( 57 | "The sh_scrapy.crawl.ignore_warnings function is deprecated.", 58 | category=SHScrapyDeprecationWarning, 59 | stacklevel=2, 60 | ) 61 | _filters = warnings.filters[:] 62 | warnings.filterwarnings('ignore', **kwargs) 63 | yield 64 | warnings.filters = _filters 65 | 66 | 67 | def _fatalerror(): 68 | # Log error to hworker slotN.out 69 | # Inspired by logging.Handler.handleError() 70 | # 71 | # Capture exc_info early on, so that an error in the handler doesn't 72 | # overwrite it.
73 | import traceback 74 | ei = sys.exc_info() 75 | 76 | if _sentry_dsn: 77 | try: 78 | from raven import Client 79 | except ImportError: 80 | # Do not fail here, previous error is more important 81 | print('HWORKER_SENTRY_DSN is set but python-raven ' 82 | 'is not installed', file=_sys_stderr) 83 | else: 84 | try: 85 | Client(_sentry_dsn).captureException() 86 | except Exception as err: 87 | print(datetime.datetime.utcnow().isoformat(), 88 | "Error when sending fatal error to sentry:", err, 89 | file=_sys_stderr) 90 | 91 | # Log error to hworker slotN.out 92 | # Inspired by logging.Handler.handleError() 93 | try: 94 | print(datetime.datetime.utcnow().isoformat(), end=' ', 95 | file=_sys_stderr) 96 | traceback.print_exception(ei[0], ei[1], ei[2], None, _sys_stderr) 97 | except IOError: 98 | pass 99 | finally: 100 | del ei 101 | 102 | 103 | def _get_apisettings(): 104 | from sh_scrapy.env import decode_uri 105 | return decode_uri(envvar='SHUB_SETTINGS') or {} 106 | 107 | 108 | def _run(args, settings): 109 | if args[0] == 'scrapy': 110 | _run_scrapy(args, settings) 111 | else: 112 | _run_pkgscript(args) 113 | 114 | 115 | def _run_scrapy(argv, settings): 116 | from scrapy.cmdline import execute 117 | sys.argv = argv 118 | execute(settings=settings) 119 | 120 | 121 | def _run_pkgscript(argv): 122 | if argv[0].startswith('py:'): 123 | argv[0] = argv[0][3:] 124 | scriptname = argv[0] 125 | sys.argv = argv 126 | 127 | try: 128 | import importlib.metadata 129 | has_importlib = True 130 | except ImportError: 131 | import pkg_resources 132 | has_importlib = False 133 | 134 | def get_distribution(): 135 | if has_importlib: 136 | eps = importlib.metadata.entry_points(group='scrapy') 137 | else: 138 | eps = pkg_resources.WorkingSet().iter_entry_points('scrapy') 139 | 140 | for ep in eps: 141 | if ep.name == 'settings': 142 | return ep.dist 143 | 144 | d = get_distribution() 145 | if not d: 146 | raise ValueError(SCRAPY_SETTINGS_ENTRYPOINT_NOT_FOUND) 147 | ns = {"__name__": "__main__"} 148 | if has_importlib: 149 | _run_script(d, scriptname, ns) 150 | else: 151 | d.run_script(scriptname, ns) 152 | 153 | 154 | def _run_script(dist, script_name, namespace): 155 | # An importlib-based replacement for pkg_resources.NullProvider.run_script(). 156 | # It's possible that this doesn't support all cases that pkg_resources does, 157 | # so it may need to be improved when those are discovered. 158 | # Using a private attribute (dist._path) seems to be necessary to get the 159 | # full file path, but it's only needed for diagnostic messages so it should 160 | # be easy to fix this by moving to relative paths if this API is removed. 
161 | script = "scripts/" + script_name 162 | source = dist.read_text(script) 163 | if not source: 164 | raise ValueError( 165 | f"Script {script!r} not found in metadata at {dist._path!r}" 166 | ) 167 | script_filename = dist._path.joinpath(script) 168 | code = compile(source, str(script_filename), "exec") 169 | exec(code, namespace, namespace) 170 | 171 | 172 | def _run_usercode(spider, args, apisettings_func, 173 | log_handler=None, commands_module=None): 174 | try: 175 | from scrapy.exceptions import ScrapyDeprecationWarning 176 | from sh_scrapy.settings import populate_settings 177 | 178 | with warnings.catch_warnings(): 179 | warnings.filterwarnings("ignore", category=ScrapyDeprecationWarning) 180 | settings = populate_settings(apisettings_func(), spider) 181 | if commands_module: 182 | settings.set('COMMANDS_MODULE', commands_module, priority=40) 183 | if log_handler is not None: 184 | log_handler.setLevel(settings['LOG_LEVEL']) 185 | except Exception: 186 | logging.exception('Settings initialization failed') 187 | raise 188 | 189 | try: 190 | _run(args, settings) 191 | except Exception: 192 | logging.exception('Job runtime exception') 193 | raise 194 | 195 | 196 | def _launch(): 197 | try: 198 | from scrapy.exceptions import ScrapyDeprecationWarning 199 | warnings.filterwarnings( 200 | 'ignore', category=ScrapyDeprecationWarning, module='^sh_scrapy') 201 | from sh_scrapy.env import get_args_and_env, decode_uri 202 | job = decode_uri(envvar='SHUB_JOB_DATA') 203 | assert job, 'SHUB_JOB_DATA must be set' 204 | args, env = get_args_and_env(job) 205 | os.environ.update(env) 206 | 207 | from sh_scrapy.log import initialize_logging 208 | from sh_scrapy.settings import populate_settings # NOQA 209 | from sh_scrapy.env import setup_environment 210 | loghdlr = initialize_logging() 211 | setup_environment() 212 | except: 213 | _fatalerror() 214 | raise 215 | 216 | _run_usercode(job['spider'], args, _get_apisettings, loghdlr) 217 | 218 | 219 | def list_spiders(): 220 | """ An entrypoint for list-spiders.""" 221 | warnings.warn( 222 | "The sh_scrapy.crawl.list_spiders function is deprecated.", 223 | category=SHScrapyDeprecationWarning, 224 | stacklevel=2, 225 | ) 226 | try: 227 | from scrapy.exceptions import ScrapyDeprecationWarning 228 | warnings.filterwarnings( 229 | 'ignore', category=ScrapyDeprecationWarning, module='^sh_scrapy') 230 | from sh_scrapy.env import setup_environment 231 | setup_environment() 232 | except: 233 | _fatalerror() 234 | raise 235 | 236 | _run_usercode(None, ['scrapy', 'list'], _get_apisettings) 237 | 238 | 239 | def shub_image_info(): 240 | """shub-image-info command 241 | 242 | http://shub.readthedocs.io/en/latest/custom-images-contract.html#contract-statements 243 | 244 | """ 245 | try: 246 | from scrapy.exceptions import ScrapyDeprecationWarning 247 | warnings.filterwarnings( 248 | 'ignore', category=ScrapyDeprecationWarning, module='^sh_scrapy') 249 | from sh_scrapy.env import setup_environment 250 | setup_environment() 251 | except: 252 | _fatalerror() 253 | raise 254 | 255 | _run_usercode(None, ['scrapy', 'shub_image_info'] + sys.argv[1:], 256 | _get_apisettings, commands_module='sh_scrapy.commands') 257 | 258 | 259 | def main(): 260 | try: 261 | from sh_scrapy.writer import pipe_writer 262 | pipe_writer.open() 263 | except Exception: 264 | _fatalerror() 265 | return 1 266 | try: 267 | _launch() 268 | except SystemExit as e: 269 | return e.code 270 | except: 271 | # exception was already handled and logged inside _launch() 272 | return 1 273 | finally: 274 | 
sys.stderr = _sys_stderr 275 | sys.stdout = _sys_stdout 276 | return 0 277 | 278 | 279 | if __name__ == '__main__': 280 | sys.exit(main()) 281 | -------------------------------------------------------------------------------- /sh_scrapy/diskquota.py: -------------------------------------------------------------------------------- 1 | """ 2 | DiskQuota downloader and spider middlewares. 3 | The goal is to catch disk quota errors and stop spider gently. 4 | """ 5 | 6 | from scrapy.exceptions import NotConfigured 7 | 8 | 9 | class DiskQuota(object): 10 | 11 | def __init__(self, crawler): 12 | if not crawler.settings.getbool('DISK_QUOTA_STOP_ON_ERROR'): 13 | raise NotConfigured 14 | self.crawler = crawler 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | return cls(crawler) 19 | 20 | def _is_disk_quota_error(self, error): 21 | return isinstance(error, (OSError, IOError)) and error.errno == 122 22 | 23 | 24 | class DiskQuotaDownloaderMiddleware(DiskQuota): 25 | 26 | def process_exception(self, request, exception, spider): 27 | if self._is_disk_quota_error(exception): 28 | self.crawler.engine.close_spider(spider, 'diskusage_exceeded') 29 | 30 | 31 | class DiskQuotaSpiderMiddleware(DiskQuota): 32 | 33 | def process_spider_exception(self, response, exception, spider): 34 | if self._is_disk_quota_error(exception): 35 | self.crawler.engine.close_spider(spider, 'diskusage_exceeded') 36 | -------------------------------------------------------------------------------- /sh_scrapy/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import codecs 4 | from base64 import b64decode 5 | 6 | from scrapy.utils.python import to_bytes, to_unicode 7 | 8 | 9 | def _make_scrapy_args(arg, args_dict): 10 | if not args_dict: 11 | return [] 12 | args = [] 13 | for k, v in sorted(dict(args_dict).items()): 14 | args += [arg, "{}={}".format( 15 | to_unicode(k), to_unicode(v) if isinstance(v, str) else v)] 16 | return args 17 | 18 | 19 | def _scrapy_crawl_args_and_env(msg): 20 | args = ['scrapy', 'crawl', str(msg['spider'])] + \ 21 | _make_scrapy_args('-a', msg.get('spider_args')) + \ 22 | _make_scrapy_args('-s', msg.get('settings')) 23 | env = { 24 | 'SCRAPY_JOB': msg['key'], 25 | 'SCRAPY_SPIDER': msg['spider'], 26 | 'SCRAPY_PROJECT_ID': msg['key'].split('/')[0], 27 | # the following should be considered deprecated 28 | 'SHUB_SPIDER_TYPE': msg.get('spider_type', '') 29 | } 30 | return args, env 31 | 32 | 33 | def _job_args_and_env(msg): 34 | env = msg.get('job_env') 35 | if not isinstance(env, dict): 36 | env = {} 37 | cmd = msg.get('job_cmd') 38 | if not isinstance(cmd, list): 39 | cmd = [str(cmd)] 40 | return cmd, {to_unicode(k): to_unicode(v) if isinstance(v, str) else v 41 | for k, v in sorted(dict(env).items())} 42 | 43 | 44 | def _jobname(msg): 45 | if 'job_name' in msg: 46 | return msg['job_name'] 47 | elif 'spider' in msg: 48 | return msg['spider'] 49 | else: 50 | return msg['job_cmd'][0] 51 | 52 | 53 | def _jobauth(msg): 54 | auth_data = to_bytes('{0[key]}:{0[auth]}'.format(msg)) 55 | return to_unicode(codecs.encode(auth_data, 'hex_codec')) 56 | 57 | 58 | def get_args_and_env(msg): 59 | envf = _job_args_and_env if 'job_cmd' in msg else _scrapy_crawl_args_and_env 60 | args, env = envf(msg) 61 | if 'api_url' in msg: 62 | env['SHUB_APIURL'] = msg.get('api_url') 63 | 64 | env.update({ 65 | 'SHUB_JOBKEY': msg['key'], 66 | 'SHUB_JOBAUTH': _jobauth(msg), 67 | 'SHUB_JOBNAME': _jobname(msg), 68 | 'SHUB_JOB_TAGS': ','.join(msg.get('tags') 
or ()), # DEPRECATED? 69 | }) 70 | return args, env 71 | 72 | 73 | def decode_uri(uri=None, envvar=None): 74 | """Return content for a data: or file: URI 75 | 76 | >>> decode_uri('data:application/json;charset=utf8;base64,ImhlbGxvIHdvcmxkIg==') 77 | u'hello world' 78 | >>> decode_uri('data:;base64,ImhlbGxvIHdvcmxkIg==') 79 | u'hello world' 80 | >>> decode_uri('{"spider": "hello"}') 81 | {u'spider': u'hello'} 82 | 83 | """ 84 | if envvar is not None: 85 | uri = os.getenv(envvar, '') 86 | elif uri is None: 87 | raise ValueError('An uri or envvar is required') 88 | 89 | mime_type = 'application/json' 90 | 91 | # data:[][;charset=][;base64], 92 | if uri.startswith('data:'): 93 | prefix, _, data = uri.rpartition(',') 94 | mods = {} 95 | for idx, value in enumerate(prefix[5:].split(';')): 96 | if idx == 0: 97 | mime_type = value or mime_type 98 | elif '=' in value: 99 | k, _, v = value.partition('=') 100 | mods[k] = v 101 | else: 102 | mods[value] = None 103 | 104 | if 'base64' in mods: 105 | data = b64decode(data) 106 | if mime_type == 'application/json': 107 | data = data.decode(mods.get('charset', 'utf-8')) 108 | return json.loads(data) 109 | else: 110 | return data 111 | 112 | if uri.startswith('{'): 113 | return json.loads(uri) 114 | 115 | if uri.startswith('/'): 116 | uri = 'file://' + uri 117 | if uri.startswith('file://'): 118 | reader = codecs.getreader("utf-8") 119 | with open(uri[7:], 'rb') as data_file: 120 | return json.load(reader(data_file)) 121 | 122 | 123 | def setup_environment(): 124 | # scrapy.cfg is required by scrapy.utils.project.data_path 125 | # FIXME: drop this requirement 126 | if not os.path.exists('scrapy.cfg'): 127 | open('scrapy.cfg', 'w').close() 128 | -------------------------------------------------------------------------------- /sh_scrapy/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | class SHScrapyDeprecationWarning(Warning): 5 | """Warning category for deprecated features, since the default 6 | DeprecationWarning is silenced on Python 2.7+ 7 | """ 8 | -------------------------------------------------------------------------------- /sh_scrapy/extension.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import suppress 3 | from warnings import warn 4 | from weakref import WeakKeyDictionary 5 | 6 | import scrapy 7 | from scrapy import signals 8 | from scrapy import version_info as SCRAPY_VERSION_INFO 9 | from scrapy.exporters import PythonItemExporter 10 | from scrapy.http import Request 11 | from scrapy.utils.deprecate import create_deprecated_class 12 | 13 | from sh_scrapy import hsref 14 | from sh_scrapy.exceptions import SHScrapyDeprecationWarning 15 | from sh_scrapy.middlewares import HS_PARENT_ID_KEY, request_id_sequence 16 | from sh_scrapy.writer import pipe_writer 17 | 18 | 19 | try: 20 | from itemadapter import ItemAdapter 21 | except ImportError: 22 | _base_item_cls = [dict, scrapy.item.Item] 23 | with suppress(AttributeError): 24 | _base_item_cls.append(scrapy.item.BaseItem) 25 | _base_item_cls = tuple(_base_item_cls) 26 | 27 | def is_item(item): 28 | return isinstance(item, _base_item_cls) 29 | else: 30 | def is_item(item): 31 | return ItemAdapter.is_item(item) 32 | 33 | 34 | class HubstorageExtension(object): 35 | """Extension to write scraped items to HubStorage""" 36 | 37 | def __init__(self, crawler): 38 | self.hsref = hsref.hsref 39 | self.pipe_writer = pipe_writer 40 | self.crawler = 
crawler 41 | self.logger = logging.getLogger(__name__) 42 | self._write_item = self.pipe_writer.write_item 43 | kwargs = {} 44 | if SCRAPY_VERSION_INFO < (2, 11): 45 | kwargs["binary"] = False 46 | self.exporter = PythonItemExporter(**kwargs) 47 | 48 | @classmethod 49 | def from_crawler(cls, crawler): 50 | o = cls(crawler) 51 | crawler.signals.connect(o.item_scraped, signals.item_scraped) 52 | crawler.signals.connect(o.spider_closed, signals.spider_closed) 53 | return o 54 | 55 | def item_scraped(self, item, spider): 56 | if not is_item(item): 57 | self.logger.error("Wrong item type: %s" % item) 58 | return 59 | type_ = type(item).__name__ 60 | item = self.exporter.export_item(item) 61 | item.setdefault("_type", type_) 62 | self._write_item(item) 63 | 64 | def spider_closed(self, spider, reason): 65 | self.pipe_writer.set_outcome(reason) 66 | 67 | 68 | _HUBSTORAGE_MIDDLEWARE_WARNING = """\ 69 | {cls} inherits from deprecated class {old} 70 | 71 | sh_scrapy.extension.HubstorageMiddleware functionality is now split between two new middlewares: 72 | 73 | - sh_scrapy.middlewares.HubstorageDownloaderMiddleware 74 | - sh_scrapy.middlewares.HubstorageSpiderMiddleware 75 | 76 | Please migrate to new middlewares. 77 | """ 78 | 79 | 80 | class HubstorageMiddleware: 81 | 82 | @classmethod 83 | def from_crawler(cls, crawler): 84 | try: 85 | result = cls(crawler) 86 | except TypeError: 87 | warn( 88 | ( 89 | "Subclasses of HubstorageMiddleware must now accept a " 90 | "crawler parameter in their __init__ method. This will " 91 | "become an error in the future." 92 | ), 93 | DeprecationWarning, 94 | ) 95 | result = cls() 96 | result._crawler = crawler 97 | result._load_fingerprinter() 98 | return result 99 | 100 | def _load_fingerprinter(self): 101 | if hasattr(self._crawler, "request_fingerprinter"): 102 | self._fingerprint = lambda request: self._crawler.request_fingerprinter.fingerprint(request).hex() 103 | else: 104 | from scrapy.utils.request import request_fingerprint 105 | self._fingerprint = request_fingerprint 106 | 107 | def __init__(self, crawler=None): 108 | self._seen = WeakKeyDictionary() 109 | self.hsref = hsref.hsref 110 | self.pipe_writer = pipe_writer 111 | self.request_id_sequence = request_id_sequence 112 | self._crawler = crawler 113 | if crawler: 114 | self._load_fingerprinter() 115 | 116 | def process_spider_input(self, response, spider): 117 | self.pipe_writer.write_request( 118 | url=response.url, 119 | status=response.status, 120 | method=response.request.method, 121 | rs=len(response.body), 122 | duration=response.meta.get('download_latency', 0) * 1000, 123 | parent=response.meta.get(HS_PARENT_ID_KEY), 124 | fp=self._fingerprint(response.request), 125 | ) 126 | self._seen[response] = next(self.request_id_sequence) 127 | 128 | def process_spider_output(self, response, result, spider): 129 | parent = self._seen.pop(response) 130 | for x in result: 131 | if isinstance(x, Request): 132 | x.meta[HS_PARENT_ID_KEY] = parent 133 | yield x 134 | 135 | 136 | HubstorageMiddleware = create_deprecated_class( 137 | "HubstorageMiddleware", 138 | HubstorageMiddleware, 139 | warn_category=SHScrapyDeprecationWarning, 140 | subclass_warn_message=_HUBSTORAGE_MIDDLEWARE_WARNING 141 | ) 142 | -------------------------------------------------------------------------------- /sh_scrapy/hsref.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module to hold a reference to singleton Hubstorage client and Job instance 3 | """ 4 | import os 5 | from codecs 
import decode 6 | 7 | from scrapy.utils.python import to_unicode 8 | 9 | 10 | class _HubstorageRef(object): 11 | 12 | def __init__(self): 13 | self.enabled = 'SHUB_JOBKEY' in os.environ 14 | self._client = None 15 | self._project = None 16 | self._job = None 17 | if self.enabled: 18 | self.jobkey = os.environ['SHUB_JOBKEY'] 19 | job_id = [int(id) for id in self.jobkey.split('/')] 20 | self._projectid, self._spiderid, self._jobcounter = job_id 21 | else: 22 | self._projectid = None 23 | self._spiderid = None 24 | self._jobcounter = None 25 | 26 | @property 27 | def auth(self): 28 | return to_unicode(decode(os.environ['SHUB_JOBAUTH'], 'hex_codec')) 29 | 30 | @property 31 | def endpoint(self): 32 | return os.environ.get('SHUB_STORAGE') 33 | 34 | @property 35 | def projectid(self): 36 | return self._projectid 37 | 38 | @property 39 | def spiderid(self): 40 | return self._spiderid 41 | 42 | @property 43 | def jobid(self): 44 | return self._jobcounter 45 | 46 | @property 47 | def client(self): 48 | from scrapinghub import HubstorageClient 49 | if self._client is None: 50 | user_agent = os.environ.get('SHUB_HS_USER_AGENT') 51 | self._client = HubstorageClient(endpoint=self.endpoint, 52 | auth=self.auth, 53 | user_agent=user_agent) 54 | return self._client 55 | 56 | @property 57 | def project(self): 58 | if self._project is None: 59 | self._project = self.client.get_project(str(self.projectid)) 60 | return self._project 61 | 62 | @property 63 | def job(self): 64 | if self._job is None: 65 | self._job = self.project.get_job((self.spiderid, self.jobid)) 66 | return self._job 67 | 68 | def close(self): 69 | if self._client is not None: 70 | self._client.close() 71 | 72 | hsref = _HubstorageRef() 73 | -------------------------------------------------------------------------------- /sh_scrapy/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import warnings 4 | 5 | from scrapy import __version__ 6 | from scrapy.utils.python import to_unicode 7 | from twisted.python import log as txlog 8 | 9 | from sh_scrapy.writer import pipe_writer 10 | 11 | 12 | # keep a global reference to stderr as it is redirected on log initialization 13 | _stdout = sys.stdout 14 | _stderr = sys.stderr 15 | 16 | 17 | def _logfn(level, message): 18 | """Wraps HS job logging function.""" 19 | try: 20 | pipe_writer.write_log(level=level, message=message) 21 | except UnicodeDecodeError: 22 | # workaround for messages that contain binary data 23 | message = repr(message)[1:-1] 24 | pipe_writer.write_log(level=level, message=message) 25 | 26 | 27 | def initialize_logging(): 28 | """Initialize logging to send messages to Hubstorage job logs 29 | 30 | it initializes: 31 | - Python logging 32 | - Twisted logging 33 | - Scrapy logging 34 | - Redirects standard output and stderr to job log at INFO level 35 | 36 | This duplicates some code with Scrapy log.start(), but it's required in 37 | order to avoid scrapy from starting the log twice. 
38 | """ 39 | # General python logging 40 | root = logging.getLogger() 41 | root.setLevel(logging.NOTSET) 42 | hdlr = HubstorageLogHandler() 43 | hdlr.setLevel(logging.INFO) 44 | hdlr.setFormatter(logging.Formatter('[%(name)s] %(message)s')) 45 | root.addHandler(hdlr) 46 | 47 | # Silence commonly used noisy libraries 48 | try: 49 | import boto # boto overrides its logger at import time 50 | except ImportError: 51 | pass 52 | 53 | nh = logging.NullHandler() 54 | for ln in ('boto', 'requests', 'hubstorage'): 55 | lg = logging.getLogger(ln) 56 | lg.propagate = 0 57 | lg.addHandler(nh) 58 | 59 | # Redirect standard output and error to HS log 60 | sys.stdout = StdoutLogger(0, 'utf-8') 61 | sys.stderr = StdoutLogger(1, 'utf-8') 62 | 63 | # Twisted specifics (includes Scrapy) 64 | obs = HubstorageLogObserver(hdlr) 65 | _oldshowwarning = warnings.showwarning 66 | txlog.startLoggingWithObserver(obs.emit, setStdout=False) 67 | warnings.showwarning = _oldshowwarning 68 | return hdlr 69 | 70 | 71 | class HubstorageLogHandler(logging.Handler): 72 | """Python logging handler that writes to HubStorage""" 73 | 74 | def emit(self, record): 75 | try: 76 | message = self.format(record) 77 | if message: 78 | _logfn(message=message, level=record.levelno) 79 | except (KeyboardInterrupt, SystemExit): 80 | raise 81 | except: 82 | self.handleError(record) 83 | 84 | def handleError(self, record): 85 | cur = sys.stderr 86 | try: 87 | sys.stderr = _stderr 88 | super(HubstorageLogHandler, self).handleError(record) 89 | finally: 90 | sys.stderr = cur 91 | 92 | 93 | class HubstorageLogObserver(object): 94 | """Twisted log observer with Scrapy specifics that writes to HubStorage""" 95 | 96 | def __init__(self, loghdlr): 97 | self._hs_loghdlr = loghdlr 98 | 99 | def emit(self, ev): 100 | logitem = self._get_log_item(ev) 101 | if logitem: 102 | _logfn(**logitem) 103 | 104 | def _get_log_item(self, ev): 105 | """Get HubStorage log item for the given Twisted event, or None if no 106 | document should be inserted 107 | """ 108 | if ev['system'] == 'scrapy': 109 | level = ev['logLevel'] 110 | else: 111 | if ev['isError']: 112 | level = logging.ERROR 113 | else: 114 | level = logging.INFO 115 | 116 | # It's important to access level trough handler instance, 117 | # min log level can change at any moment. 
118 | if level < self._hs_loghdlr.level: 119 | return 120 | 121 | msg = ev.get('message') 122 | if msg: 123 | msg = to_unicode(msg[0]) 124 | 125 | failure = ev.get('failure', None) 126 | if failure: 127 | msg = failure.getTraceback() 128 | 129 | why = ev.get('why', None) 130 | if why: 131 | msg = "%s\n%s" % (why, msg) 132 | 133 | fmt = ev.get('format') 134 | if fmt: 135 | try: 136 | msg = fmt % ev 137 | except: 138 | msg = "UNABLE TO FORMAT LOG MESSAGE: fmt=%r ev=%r" % (fmt, ev) 139 | level = logging.ERROR 140 | # to replicate typical scrapy log appearance 141 | msg = msg.replace('\n', '\n\t') 142 | return {'message': msg, 'level': level} 143 | 144 | 145 | class StdoutLogger(txlog.StdioOnnaStick): 146 | """This works like Twisted's StdioOnnaStick but prepends standard 147 | output/error messages with [stdout] and [stderr] 148 | """ 149 | 150 | def __init__(self, isError=0, encoding=None, loglevel=logging.INFO): 151 | txlog.StdioOnnaStick.__init__(self, isError, encoding) 152 | self.prefix = "[stderr] " if isError else "[stdout] " 153 | self.loglevel = loglevel 154 | 155 | def _logprefixed(self, msg): 156 | _logfn(message=self.prefix + msg, level=self.loglevel) 157 | 158 | def write(self, data): 159 | data = to_unicode(data, self.encoding) 160 | 161 | d = (self.buf + data).split('\n') 162 | self.buf = d[-1] 163 | messages = d[0:-1] 164 | for message in messages: 165 | self._logprefixed(message) 166 | 167 | def writelines(self, lines): 168 | for line in lines: 169 | line = to_unicode(line, self.encoding) 170 | self._logprefixed(line) 171 | -------------------------------------------------------------------------------- /sh_scrapy/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import itertools 3 | from warnings import warn 4 | from weakref import WeakKeyDictionary 5 | 6 | from scrapy import Request 7 | 8 | from sh_scrapy.writer import pipe_writer 9 | 10 | HS_REQUEST_ID_KEY = '_hsid' 11 | HS_PARENT_ID_KEY = '_hsparent' 12 | request_id_sequence = itertools.count() 13 | seen_requests = WeakKeyDictionary() 14 | 15 | 16 | class HubstorageSpiderMiddleware(object): 17 | """Hubstorage spider middleware. 18 | 19 | What it does: 20 | 21 | - Sets parent request ids to the requests coming out of the spider. 22 | 23 | """ 24 | 25 | def __init__(self): 26 | self._seen_requests = seen_requests 27 | 28 | def process_spider_output(self, response, result, spider): 29 | parent = self._seen_requests.pop(response.request, None) 30 | for x in result: 31 | if isinstance(x, Request): 32 | self._process_request(x, parent) 33 | yield x 34 | 35 | async def process_spider_output_async(self, response, result, spider): 36 | parent = self._seen_requests.pop(response.request, None) 37 | async for x in result: 38 | if isinstance(x, Request): 39 | self._process_request(x, parent) 40 | yield x 41 | 42 | def _process_request(self, request, parent): 43 | request.meta[HS_PARENT_ID_KEY] = parent 44 | # Remove request id if it was for some reason set in the request coming from the spider. 45 | request.meta.pop(HS_REQUEST_ID_KEY, None) 46 | 47 | 48 | class HubstorageDownloaderMiddleware: 49 | """Hubstorage downloader middleware. 50 | 51 | What it does: 52 | 53 | - Generates request ids for all downloaded requests. 54 | - Sets parent request ids for requests generated in downloader middlewares. 55 | - Stores all downloaded requests into Hubstorage.
56 | 57 | """ 58 | 59 | @classmethod 60 | def from_crawler(cls, crawler): 61 | try: 62 | result = cls(crawler) 63 | except TypeError: 64 | warn( 65 | ( 66 | "Subclasses of HubstorageDownloaderMiddleware must now " 67 | "accept a crawler parameter in their __init__ method. " 68 | "This will become an error in the future." 69 | ), 70 | DeprecationWarning, 71 | ) 72 | result = cls() 73 | result._crawler = crawler 74 | result._load_fingerprinter() 75 | return result 76 | 77 | def __init__(self, crawler): 78 | self._crawler = crawler 79 | self._seen_requests = seen_requests 80 | self.pipe_writer = pipe_writer 81 | self.request_id_sequence = request_id_sequence 82 | self._load_fingerprinter() 83 | 84 | def _load_fingerprinter(self): 85 | if hasattr(self._crawler, "request_fingerprinter"): 86 | self._fingerprint = lambda request: self._crawler.request_fingerprinter.fingerprint(request).hex() 87 | else: 88 | from scrapy.utils.request import request_fingerprint 89 | self._fingerprint = request_fingerprint 90 | 91 | def process_request(self, request, spider): 92 | # Check if request id is set, which usually happens for retries or redirects because 93 | # those requests are usually copied from the original one. 94 | request_id = request.meta.pop(HS_REQUEST_ID_KEY, None) 95 | if request_id is not None: 96 | # Set original request id or None as a parent request id. 97 | request.meta[HS_PARENT_ID_KEY] = request_id 98 | 99 | def process_response(self, request, response, spider): 100 | # This class of response check is intended to fix the bug described here 101 | # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/112 102 | if type(response).__name__ == "DummyResponse" and type(response).__module__.startswith("scrapy_poet"): 103 | return response 104 | 105 | self.pipe_writer.write_request( 106 | url=response.url, 107 | status=response.status, 108 | method=request.method, 109 | rs=len(response.body), 110 | duration=request.meta.get('download_latency', 0) * 1000, 111 | parent=request.meta.setdefault(HS_PARENT_ID_KEY), 112 | fp=self._fingerprint(request), 113 | ) 114 | # Generate and set request id. 
115 | request_id = next(self.request_id_sequence) 116 | self._seen_requests[request] = request_id 117 | request.meta[HS_REQUEST_ID_KEY] = request_id 118 | return response 119 | -------------------------------------------------------------------------------- /sh_scrapy/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | import tempfile 5 | 6 | from scrapy.settings import Settings 7 | from scrapy.utils.misc import load_object 8 | from scrapy.utils.project import get_project_settings 9 | from scrapy.utils.python import to_unicode 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | REPLACE_ADDONS_PATHS = { 14 | "hworker.bot.ext.page.PageStorageMiddleware": 15 | "scrapy_pagestorage.PageStorageMiddleware", 16 | "hworker.bot.ext.persistence.DotScrapyPersistence": 17 | "scrapy_dotpersistence.DotScrapyPersistence", 18 | "scrapylib.deltafetch.DeltaFetch": 19 | "scrapy_deltafetch.DeltaFetch", 20 | "scrapylib.magicfields.MagicFieldsMiddleware": 21 | "scrapy_magicfields.MagicFieldsMiddleware", 22 | "scrapylib.querycleaner.QueryCleanerMiddleware": 23 | "scrapy_querycleaner.QueryCleanerMiddleware", 24 | "scrapylib.splitvariants.SplitVariantsMiddleware": 25 | "scrapy_splitvariants.SplitVariantsMiddleware", 26 | "scrapy.contrib.throttle.AutoThrottle": 27 | "scrapy.extensions.throttle.AutoThrottle", 28 | } 29 | SLYBOT_SPIDER_MANAGER = 'slybot.spidermanager.ZipfileSlybotSpiderManager' 30 | SLYBOT_DUPE_FILTER = 'slybot.dupefilter.DupeFilterPipeline' 31 | SETTINGS_ORDERED_DICTS = [ 32 | "DOWNLOADER_MIDDLEWARES", "DOWNLOADER_MIDDLEWARES_BASE", 33 | "EXTENSIONS", "EXTENSIONS_BASE", 34 | "ITEM_PIPELINES", "ITEM_PIPELINES_BASE", 35 | "SPIDER_CONTRACTS", "SPIDER_CONTRACTS_BASE", 36 | "SPIDER_MIDDLEWARES", "SPIDER_MIDDLEWARES_BASE" 37 | ] 38 | 39 | try: 40 | from scrapy.utils.deprecate import update_classpath 41 | except ImportError: 42 | update_classpath = lambda x: x 43 | 44 | 45 | class EntrypointSettings(Settings): 46 | """ 47 | We need to convert settings to string since the S3 download handler 48 | doesn't work if the AWS keys are passed as unicode. Other code may 49 | also depend on settings being str. 
50 | """ 51 | 52 | def __init__(self): 53 | super(EntrypointSettings, self).__init__() 54 | self.attributes = {} 55 | 56 | def set(self, name, value, priority='project'): 57 | super(EntrypointSettings, self).set( 58 | to_unicode(name), 59 | value if isinstance(value, str) else value, 60 | priority=priority) 61 | 62 | def copy_to_dict(self): 63 | if hasattr(super(EntrypointSettings, self), 'copy_to_dict'): 64 | return super(EntrypointSettings, self).copy_to_dict() 65 | # Backward compatibility with older Scrapy versions w/o copy_to_dict 66 | settings = self.copy() 67 | return {key: settings[key] for key in settings.attributes} 68 | 69 | 70 | def _maybe_load_autoscraping_project(settings, priority=0): 71 | if os.environ.get('SHUB_SPIDER_TYPE') in ('auto', 'portia'): 72 | slybot_settings = {'ITEM_PIPELINES': {}, 73 | 'SLYDUPEFILTER_ENABLED': True, 74 | 'SLYCLOSE_SPIDER_ENABLED': True, 75 | 'SPIDER_MANAGER_CLASS': SLYBOT_SPIDER_MANAGER} 76 | settings.setdict(slybot_settings, priority=priority) 77 | settings['ITEM_PIPELINES'][SLYBOT_DUPE_FILTER] = 0 78 | settings.set("PROJECT_ZIPFILE", 'project-slybot.zip') 79 | 80 | 81 | def _get_component_base(settings, compkey): 82 | if settings.get(compkey + '_BASE') is not None: 83 | return compkey + '_BASE' 84 | return compkey 85 | 86 | 87 | def _get_action_on_missing_addons(settings): 88 | on_missing_addons = settings.get('ON_MISSING_ADDONS', 'warn') 89 | if on_missing_addons not in ['fail', 'error', 'warn']: 90 | logger.warning( 91 | "Wrong value for ON_MISSING_ADDONS: should be one of " 92 | "[fail,error,warn]. Set default 'warn' value.") 93 | on_missing_addons = 'warn' 94 | return on_missing_addons 95 | 96 | 97 | def _update_old_classpaths(settings): 98 | """Update user's project settings with proper class paths. 99 | 100 | Note that the method updates only settings with dicts as values: 101 | it's needed for proper dicts merge to avoid duplicates in paths. 102 | For all other cases Scrapy will handle it by itself. 
103 | """ 104 | for setting_key in settings.attributes.keys(): 105 | setting_value = settings[setting_key] 106 | # A workaround to make it work for: 107 | # - Scrapy==1.0.5 with dicts as values 108 | # - Scrapy>=1.1.0 with BaseSettings as values 109 | if hasattr(setting_value, 'copy_to_dict'): 110 | setting_value = setting_value.copy_to_dict() 111 | elif not isinstance(setting_value, dict): 112 | continue 113 | for path in setting_value.keys(): 114 | if not isinstance(path, str): 115 | continue 116 | updated_path = update_classpath(path) 117 | if updated_path != path: 118 | order = settings[setting_key].pop(path) 119 | settings[setting_key][updated_path] = order 120 | 121 | 122 | def _update_component_order(components, path, order): 123 | """Update component order only if it's not set yet""" 124 | updated_path = update_classpath(path) 125 | if updated_path not in components: 126 | components[updated_path] = order 127 | 128 | 129 | def _load_addons(addons, settings, merged_settings, priority=0): 130 | on_missing_addons = _get_action_on_missing_addons(merged_settings) 131 | for addon in addons: 132 | addon_path = addon['path'] 133 | if addon_path in REPLACE_ADDONS_PATHS: 134 | addon_path = REPLACE_ADDONS_PATHS[addon_path] 135 | try: 136 | load_object(addon_path) 137 | except (ImportError, NameError, ValueError) as exc: 138 | message = "Addon import error {}:\n {}".format(addon_path, exc) 139 | if on_missing_addons == 'warn': 140 | logger.warning(message) 141 | continue 142 | elif on_missing_addons == 'error': 143 | logger.error(message) 144 | continue 145 | raise 146 | skey = _get_component_base(settings, addon['type']) 147 | components = settings[skey] 148 | _update_component_order(components, addon_path, addon['order']) 149 | merged_settings.set(skey, components) 150 | merged_settings.setdict(addon['default_settings'], priority) 151 | 152 | 153 | def _merge_with_keeping_order(settings, updates): 154 | for setting_key, value in updates.items(): 155 | if not isinstance(value, dict): 156 | settings.set(setting_key, value, priority='cmdline') 157 | continue 158 | if setting_key in SETTINGS_ORDERED_DICTS: 159 | components = settings[setting_key] 160 | for path, order in value.items(): 161 | _update_component_order(components, path, order) 162 | else: 163 | settings.set(setting_key, value) 164 | 165 | 166 | def _populate_settings_base(apisettings, defaults_func, spider=None): 167 | """Populate and merge project settings with other ones. 168 | 169 | Important note: Scrapy doesn't really copy values on set/setdict methods, 170 | changing a dict in merged settings means mutating it in original settings. 
171 | """ 172 | assert 'scrapy.conf' not in sys.modules, "Scrapy settings already loaded" 173 | settings = get_project_settings().copy() 174 | _update_old_classpaths(settings) 175 | merged_settings = EntrypointSettings() 176 | 177 | enabled_addons = apisettings.setdefault('enabled_addons', []) 178 | project_settings = apisettings.setdefault('project_settings', {}) 179 | organization_settings = apisettings.setdefault('organization_settings', {}) 180 | spider_settings = apisettings.setdefault('spider_settings', {}) 181 | job_settings = apisettings.setdefault('job_settings', {}) 182 | 183 | defaults_func(settings) 184 | merged_settings.setdict(project_settings, priority=10) 185 | merged_settings.setdict(organization_settings, priority=20) 186 | if spider: 187 | merged_settings.setdict(spider_settings, priority=30) 188 | _maybe_load_autoscraping_project(merged_settings, priority=0) 189 | merged_settings.set('JOBDIR', tempfile.mkdtemp(prefix='jobdata-'), 190 | priority=40) 191 | merged_settings.setdict(job_settings, priority=40) 192 | # Load addons only after we gather all settings 193 | _load_addons(enabled_addons, settings, merged_settings, priority=0) 194 | _merge_with_keeping_order(settings, merged_settings.copy_to_dict()) 195 | _enforce_required_settings(settings) 196 | return settings 197 | 198 | 199 | def _load_default_settings(settings): 200 | downloader_middlewares = { 201 | 'sh_scrapy.diskquota.DiskQuotaDownloaderMiddleware': -10000, # closest to the engine 202 | 'sh_scrapy.middlewares.HubstorageDownloaderMiddleware': 10000, # closest to the downloader 203 | } 204 | spider_middlewares = { 205 | 'sh_scrapy.diskquota.DiskQuotaSpiderMiddleware': -10001, # closest to the engine 206 | 'sh_scrapy.middlewares.HubstorageSpiderMiddleware': -10000, # right after disk quota middleware 207 | } 208 | extensions = { 209 | 'scrapy.extensions.debug.StackTraceDump': 0, 210 | 'sh_scrapy.extension.HubstorageExtension': 100, 211 | } 212 | 213 | try: 214 | import slybot 215 | except ImportError: 216 | pass 217 | else: 218 | extensions['slybot.closespider.SlybotCloseSpider'] = 0 219 | 220 | settings.get('DOWNLOADER_MIDDLEWARES_BASE').update(downloader_middlewares) 221 | settings.get('EXTENSIONS_BASE').update(extensions) 222 | settings.get('SPIDER_MIDDLEWARES_BASE').update(spider_middlewares) 223 | memory_limit = int(os.environ.get('SHUB_JOB_MEMORY_LIMIT', 950)) 224 | settings.setdict({ 225 | 'STATS_CLASS': 'sh_scrapy.stats.HubStorageStatsCollector', 226 | 'MEMUSAGE_ENABLED': True, 227 | 'MEMUSAGE_LIMIT_MB': memory_limit, 228 | 'DISK_QUOTA_STOP_ON_ERROR': True, 229 | 'WEBSERVICE_ENABLED': False, 230 | 'LOG_LEVEL': 'INFO', 231 | 'LOG_ENABLED': False, 232 | 'TELNETCONSOLE_HOST': '0.0.0.0', # to access telnet console from host 233 | }, priority='cmdline') 234 | 235 | 236 | def _enforce_required_settings(settings): 237 | settings.setdict({ 238 | # breaks logging and useless in scrapy cloud 239 | 'LOG_STDOUT': False, 240 | }, priority='cmdline') 241 | 242 | 243 | def populate_settings(apisettings, spider=None): 244 | return _populate_settings_base(apisettings, _load_default_settings, spider) 245 | -------------------------------------------------------------------------------- /sh_scrapy/stats.py: -------------------------------------------------------------------------------- 1 | from twisted.internet import task 2 | from scrapy.statscollectors import StatsCollector 3 | 4 | from sh_scrapy import hsref 5 | from sh_scrapy.writer import pipe_writer 6 | 7 | 8 | class HubStorageStatsCollector(StatsCollector): 9 | 10 
| INTERVAL = 30 11 | 12 | def __init__(self, crawler): 13 | super(HubStorageStatsCollector, self).__init__(crawler) 14 | self.hsref = hsref.hsref 15 | self.pipe_writer = pipe_writer 16 | 17 | def _upload_stats(self): 18 | self.pipe_writer.write_stats(self._stats) 19 | 20 | def open_spider(self, spider): 21 | self._setup_looping_call(now=True) 22 | 23 | def _setup_looping_call(self, _ignored=None, **kwargs): 24 | self._samplestask = task.LoopingCall(self._upload_stats) 25 | d = self._samplestask.start(self.INTERVAL, **kwargs) 26 | d.addErrback(self._setup_looping_call, now=False) 27 | 28 | def close_spider(self, spider, reason): 29 | super(HubStorageStatsCollector, self).close_spider(spider, reason) 30 | if self._samplestask.running: 31 | self._samplestask.stop() 32 | self._upload_stats() 33 | -------------------------------------------------------------------------------- /sh_scrapy/utils.py: -------------------------------------------------------------------------------- 1 | from sh_scrapy.settings import populate_settings 2 | from sh_scrapy.crawl import _get_apisettings 3 | 4 | 5 | def get_project_settings(): 6 | return populate_settings(_get_apisettings()) 7 | -------------------------------------------------------------------------------- /sh_scrapy/writer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import os 4 | import threading 5 | 6 | from scrapinghub.hubstorage.serialization import jsondefault 7 | from scrapinghub.hubstorage.utils import millitime 8 | 9 | 10 | def _not_configured(*args, **kwargs): 11 | raise RuntimeError("Pipe writer is misconfigured, named pipe path is not set") 12 | 13 | 14 | class _PipeWriter(object): 15 | """Writer for the Scrapinghub named pipe. 16 | 17 | It's not safe to instantiate and use multiple writers, only one writer 18 | should be instantiated and used, otherwise data may be corrupted. 19 | 20 | The object is thread safe. 
21 | 22 | :ivar path: Named pipe path 23 | 24 | """ 25 | 26 | def __init__(self, path): 27 | self.path = path or '' 28 | self._lock = threading.Lock() 29 | self._pipe = None 30 | if not self.path: 31 | self._write = _not_configured 32 | self.open = _not_configured 33 | self.close = _not_configured 34 | 35 | def open(self): 36 | with self._lock: 37 | self._pipe = open(self.path, 'wb') 38 | 39 | def _write(self, command, payload): 40 | # binary command 41 | command = command.encode('utf-8') 42 | # binary payload 43 | encoded_payload = json.dumps( 44 | payload, 45 | separators=(',', ':'), 46 | default=jsondefault 47 | ).encode('utf-8') 48 | # write needs to be locked because write can be called from multiple threads 49 | with self._lock: 50 | self._pipe.write(command) 51 | self._pipe.write(b' ') 52 | self._pipe.write(encoded_payload) 53 | self._pipe.write(b'\n') 54 | self._pipe.flush() 55 | 56 | def write_log(self, level, message): 57 | log = { 58 | 'time': millitime(), 59 | 'level': level, 60 | 'message': message 61 | } 62 | self._write('LOG', log) 63 | 64 | def write_request(self, url, status, method, rs, duration, parent, fp): 65 | request = { 66 | 'url': url, 67 | 'status': int(status), 68 | 'method': method, 69 | 'rs': int(rs), 70 | 'duration': int(duration), 71 | 'parent': parent, 72 | 'time': millitime(), 73 | 'fp': fp, 74 | } 75 | self._write('REQ', request) 76 | 77 | def write_item(self, item): 78 | self._write('ITM', item) 79 | 80 | def write_stats(self, stats): 81 | self._write('STA', {'time': millitime(), 'stats': stats}) 82 | 83 | def set_outcome(self, outcome): 84 | self._write('FIN', {'outcome': outcome}) 85 | 86 | def close(self): 87 | with self._lock: 88 | self._pipe.close() 89 | 90 | 91 | pipe_writer = _PipeWriter(os.environ.get('SHUB_FIFO_PATH', '')) 92 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/scrapinghub-entrypoint-scrapy/aeea6fc61827fc5ff8f871a3f988588d338f8185/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import codecs 3 | import os 4 | import shutil 5 | import tempfile 6 | 7 | import pytest 8 | from scrapy.utils.python import to_unicode, to_bytes 9 | 10 | TEMP_DIR = tempfile.mkdtemp() 11 | SHUB_FIFO_PATH = os.path.join(TEMP_DIR, 'scrapinghub') 12 | os.environ['SHUB_FIFO_PATH'] = SHUB_FIFO_PATH 13 | 14 | from sh_scrapy.writer import pipe_writer # should go after setting SHUB_FIFO_PATH 15 | 16 | 17 | TEST_AUTH = to_unicode(codecs.encode(to_bytes('1/2/3:authstr'), 'hex_codec')) 18 | 19 | 20 | @pytest.fixture(scope='session', autouse=True) 21 | def clean_shub_fifo_path(): 22 | global TEMP_DIR 23 | pipe_writer.open() 24 | try: 25 | yield 26 | finally: 27 | shutil.rmtree(TEMP_DIR) 28 | 29 | 30 | @pytest.fixture(autouse=True) 31 | def set_jobkeyenvironment(monkeypatch): 32 | monkeypatch.setenv('SHUB_JOBKEY', '1/2/3') 33 | monkeypatch.setenv('SCRAPY_JOB', '1/2/3') 34 | monkeypatch.setenv('SHUB_JOBAUTH', TEST_AUTH) 35 | monkeypatch.setenv('SHUB_STORAGE', 'storage-url') 36 | 37 | 38 | # install the reactor explicitly, as Scrapy including scrapy.utils.test.get_crawler() assumes it's installed 39 | from twisted.internet import reactor 40 | 
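A minimal usage sketch of the _PipeWriter protocol from sh_scrapy/writer.py above, not a file from this repository: each write_* call emits one "<COMMAND> <compact JSON>" line to the path given by SHUB_FIFO_PATH, which is why conftest.py can point the writer at a plain file in a temporary directory instead of a real named pipe. The path and sample values below are illustrative assumptions; only the _PipeWriter API itself comes from the source, and running it assumes sh_scrapy and its scrapinghub dependency are importable.

import os
import tempfile

from sh_scrapy.writer import _PipeWriter

path = os.path.join(tempfile.mkdtemp(), 'pipe')  # illustrative stand-in for SHUB_FIFO_PATH
writer = _PipeWriter(path)
writer.open()                                # opens the path in binary write mode
writer.write_log(level=20, message='hello')  # -> LOG {"time":...,"level":20,"message":"hello"}
writer.set_outcome('finished')               # -> FIN {"outcome":"finished"}
writer.close()

with open(path, 'rb') as fp:
    print(fp.read())  # two newline-terminated "<COMMAND> <JSON>" frames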
-------------------------------------------------------------------------------- /tests/test_command.py: -------------------------------------------------------------------------------- 1 | 2 | from argparse import ArgumentParser 3 | from optparse import OptionParser 4 | 5 | import pytest 6 | import scrapy 7 | from packaging import version 8 | 9 | from sh_scrapy.commands.shub_image_info import Command 10 | 11 | 12 | @pytest.fixture 13 | def command(): 14 | command = Command() 15 | command.settings = scrapy.settings.Settings() 16 | return command 17 | 18 | 19 | @pytest.mark.skipif( 20 | version.parse(scrapy.__version__) >= version.parse("2.6"), 21 | reason="Scrapy>=2.6 uses argparse" 22 | ) 23 | def test_optparse(command): 24 | parser = OptionParser() 25 | command.add_options(parser) 26 | options = parser.parse_args(["--debug"]) 27 | assert options[0].debug 28 | 29 | 30 | @pytest.mark.skipif( 31 | version.parse(scrapy.__version__) < version.parse("2.6"), 32 | reason="Scrapy<2.6 uses optparse" 33 | ) 34 | def test_argparse(command): 35 | parser = ArgumentParser() 36 | command.add_options(parser) 37 | options = parser.parse_args(["--debug"]) 38 | assert options.debug 39 | -------------------------------------------------------------------------------- /tests/test_compat.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import pytest 4 | from scrapy.exceptions import ScrapyDeprecationWarning 5 | 6 | from sh_scrapy.compat import is_string, to_bytes, to_unicode, to_native_str 7 | 8 | 9 | # test deprecation messages 10 | 11 | def test_deprecated_is_string(): 12 | with warnings.catch_warnings(record=True) as caught: 13 | assert is_string("foo") 14 | assert not is_string(b"foo") 15 | assert not is_string(1) 16 | assert ( 17 | "is_string(var) is deprecated, please use isinstance(var, str) instead" 18 | == str(caught[0].message) 19 | ) 20 | assert caught[0].category is ScrapyDeprecationWarning 21 | 22 | 23 | def test_deprecated_to_unicode(): 24 | with warnings.catch_warnings(record=True) as caught: 25 | assert to_unicode("foo") == "foo" 26 | assert to_unicode(b"foo") == "foo" 27 | assert ( 28 | "Call to deprecated function to_unicode. Use scrapy.utils.python.to_unicode instead." 29 | == str(caught[0].message) 30 | ) 31 | assert caught[0].category is ScrapyDeprecationWarning 32 | 33 | 34 | def test_deprecated_to_native_str(): 35 | with warnings.catch_warnings(record=True) as caught: 36 | assert to_native_str("foo") == "foo" 37 | assert to_native_str(b"foo") == "foo" 38 | assert ( 39 | "Call to deprecated function to_native_str. Use scrapy.utils.python.to_unicode instead." 40 | == str(caught[0].message) 41 | ) 42 | assert caught[0].category is ScrapyDeprecationWarning 43 | 44 | 45 | def test_deprecated_to_bytes(): 46 | with warnings.catch_warnings(record=True) as caught: 47 | assert to_bytes("foo") == b"foo" 48 | assert to_bytes(b"foo") == b"foo" 49 | assert ( 50 | "Call to deprecated function to_bytes. Use scrapy.utils.python.to_bytes instead." 
51 | == str(caught[0].message) 52 | ) 53 | assert caught[0].category is ScrapyDeprecationWarning 54 | 55 | 56 | # Testing to_unicode conversion 57 | 58 | def test_to_str_an_utf8_encoded_string_to_str(): 59 | assert to_unicode(b'lel\xc3\xb1e') == u'lel\xf1e' 60 | 61 | 62 | def test_to_str_a_latin_1_encoded_string_to_str(): 63 | assert to_unicode(b'lel\xf1e', 'latin-1') == u'lel\xf1e' 64 | 65 | 66 | def test_to_str_a_unicode_to_str_should_return_the_same_object(): 67 | assert to_unicode(u'\xf1e\xf1e\xf1e') == u'\xf1e\xf1e\xf1e' 68 | 69 | 70 | def test_to_str_a_strange_object_should_raise_TypeError(): 71 | with pytest.raises(TypeError) as excinfo: 72 | to_unicode(123) 73 | 74 | 75 | def test_to_str_errors_argument(): 76 | assert to_unicode(b'a\xedb', 'utf-8', errors='replace') == u'a\ufffdb' 77 | 78 | 79 | # Testing to_bytes conversion 80 | 81 | def test_to_bytes_a_unicode_object_to_an_utf_8_encoded_string(): 82 | assert to_bytes(u'\xa3 49') == b'\xc2\xa3 49' 83 | 84 | 85 | def test_to_bytes_a_unicode_object_to_a_latin_1_encoded_string(): 86 | assert to_bytes(u'\xa3 49', 'latin-1') == b'\xa3 49' 87 | 88 | 89 | def test_to_bytes_a_regular_bytes_to_bytes_should_return_the_same_object(): 90 | assert to_bytes(b'lel\xf1e') == b'lel\xf1e' 91 | 92 | 93 | def test_to_bytes_a_strange_object_should_raise_TypeError(): 94 | with pytest.raises(TypeError): 95 | to_bytes(pytest) 96 | 97 | 98 | def test_to_bytes_errors_argument(): 99 | assert to_bytes(u'a\ufffdb', 'latin-1', errors='replace') == b'a?b' 100 | -------------------------------------------------------------------------------- /tests/test_crawl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import mock 5 | import pytest 6 | import unittest 7 | from scrapy.settings import Settings 8 | 9 | import sh_scrapy.crawl 10 | from sh_scrapy.crawl import _fatalerror 11 | from sh_scrapy.crawl import _get_apisettings 12 | from sh_scrapy.crawl import _run 13 | from sh_scrapy.crawl import _run_scrapy 14 | from sh_scrapy.crawl import _run_pkgscript 15 | from sh_scrapy.crawl import _run_usercode 16 | from sh_scrapy.crawl import _launch 17 | from sh_scrapy.crawl import list_spiders 18 | from sh_scrapy.crawl import main 19 | from sh_scrapy.log import HubstorageLogHandler 20 | from tests.utils import create_project, call_command 21 | 22 | 23 | try: 24 | from scrapy_spider_metadata import get_spider_metadata 25 | SPIDER_METADATA_AVAILABLE = True 26 | except: 27 | SPIDER_METADATA_AVAILABLE = False 28 | 29 | 30 | @mock.patch.dict(os.environ, {'HWORKER_SENTRY_DSN': 'hw-sentry-dsn', 31 | 'SENTRY_DSN': 'sentry-dsn'}) 32 | def test_init_module(): 33 | assert sh_scrapy.crawl._sys_stderr == sys.stderr 34 | assert sh_scrapy.crawl._sys_stdout == sys.stdout 35 | assert sh_scrapy.crawl.socket.getdefaulttimeout() == 60.0 36 | 37 | 38 | @mock.patch('traceback.print_exception') 39 | def test_fatal_error(trace_print): 40 | exception = ValueError('some exception') 41 | traceback = None 42 | try: 43 | raise exception 44 | except: 45 | # get traceback before we cleaned it with fatalerror 46 | traceback = sys.exc_info()[2] 47 | _fatalerror() 48 | assert trace_print.called 49 | trace_args = trace_print.call_args_list[0] 50 | assert trace_args[0][0] == ValueError 51 | assert trace_args[0][1] == exception 52 | assert trace_args[0][2] == traceback 53 | assert trace_args[0][3] is None 54 | assert trace_args[0][4] == sys.stderr 55 | 56 | 57 | @mock.patch('traceback.print_exception') 58 | def 
test_fatal_error_ignore_IOError(trace_print): 59 | trace_print.side_effect = IOError('some error') 60 | try: 61 | raise ValueError('some exception') 62 | except: 63 | _fatalerror() 64 | assert trace_print.called 65 | 66 | 67 | @mock.patch('sh_scrapy.crawl._sentry_dsn') 68 | def test_fatal_error_sentry_import_error(sentry_dsn): 69 | try: 70 | raise ValueError('some exception') 71 | except: 72 | _fatalerror() 73 | 74 | 75 | @mock.patch('sh_scrapy.crawl._sentry_dsn') 76 | def test_fatal_error_sentry_with_mock(sentry_dsn): 77 | raven_stub = type('raven', (object, ), {}) 78 | raven_stub.Client = mock.Mock() 79 | try: 80 | sys.modules['raven'] = raven_stub 81 | raise ValueError('some exception') 82 | except: 83 | _fatalerror() 84 | finally: 85 | del sys.modules['raven'] 86 | assert raven_stub.Client.called 87 | assert raven_stub.Client.call_args[0] == (sentry_dsn,) 88 | sentry_client = raven_stub.Client.return_value 89 | assert sentry_client.captureException.called 90 | 91 | 92 | @mock.patch('sh_scrapy.crawl._sentry_dsn') 93 | def test_fatal_error_sentry_with_mock_ignore_errors(sentry_dsn): 94 | raven_stub = type('raven', (object, ), {}) 95 | raven_stub.Client = mock.Mock() 96 | sentry_client = raven_stub.Client.return_value 97 | sentry_client.captureException.side_effect = IOError('error') 98 | try: 99 | sys.modules['raven'] = raven_stub 100 | raise ValueError('some exception') 101 | except: 102 | _fatalerror() 103 | finally: 104 | del sys.modules['raven'] 105 | 106 | 107 | def test_get_apisettings_empty(): 108 | assert _get_apisettings() == {} 109 | 110 | 111 | @mock.patch.dict(os.environ, { 112 | 'SHUB_SETTINGS': 'data:;base64,ImhlbGxvIHdvcmxkIg=='}) 113 | def test_get_apisettings_from_env(): 114 | assert _get_apisettings() == 'hello world' 115 | 116 | 117 | @mock.patch('sh_scrapy.crawl._run_pkgscript') 118 | def test_run_pkg_script(run_pkg_mock): 119 | _run(['py:script.py'], {'SETTING': 'VALUE'}) 120 | assert run_pkg_mock.called 121 | assert run_pkg_mock.call_args[0] == (['py:script.py'],) 122 | 123 | 124 | @unittest.skipIf(sys.version_info > (3,7), "Requires Python 3.7 or lower") 125 | @mock.patch('pkg_resources.WorkingSet') 126 | def test_run_pkg_script_distribution_not_found(working_set_class): 127 | fake_set = mock.Mock() 128 | fake_set.iter_entry_points.return_value = iter(()) 129 | working_set_class.return_value = fake_set 130 | with pytest.raises(ValueError): 131 | _run(['py:script.py'], {'SETTING': 'VALUE'}) 132 | 133 | @unittest.skipIf(sys.version_info < (3,8), "Requires Python 3.8 or higher") 134 | @mock.patch('importlib.metadata.entry_points') 135 | def test_run_pkg_script_distribution_not_found_python_3_8_plus(working_set_class): 136 | fake_set = mock.Mock() 137 | fake_set.iter_entry_points.return_value = iter(()) 138 | working_set_class.return_value = [fake_set] 139 | with pytest.raises(ValueError): 140 | _run(['py:script.py'], {'SETTING': 'VALUE'}) 141 | 142 | @mock.patch('sh_scrapy.crawl._run_scrapy') 143 | def test_run_scrapy_spider(run_scrapy_mock): 144 | _run(['scrapy', 'crawl', 'spider'], {'SETTING': 'VALUE'}) 145 | assert run_scrapy_mock.called 146 | assert run_scrapy_mock.call_args[0] == ( 147 | ['scrapy', 'crawl', 'spider'], {'SETTING': 'VALUE'}) 148 | 149 | 150 | @mock.patch('scrapy.cmdline.execute') 151 | def test_run_scrapy(execute_mock): 152 | _run_scrapy(['scrapy', 'crawl', 'spider'], {'SETTING': 'VALUE'}) 153 | assert execute_mock.called 154 | assert execute_mock.call_args == ( 155 | {'settings': {'SETTING': 'VALUE'}},) 156 | assert sys.argv == ['scrapy', 'crawl', 
'spider']
157 | 
158 | 
159 | def get_working_set(working_set_class):
160 |     """Helper to configure a fake working set with ep"""
161 |     working_set = working_set_class.return_value
162 |     ep = mock.Mock()
163 |     ep.name = 'settings'
164 |     working_set.iter_entry_points.return_value = [ep]
165 |     return working_set
166 | 
167 | 
168 | @unittest.skipIf(sys.version_info > (3,7), "Requires Python 3.7 or lower")
169 | @mock.patch('pkg_resources.WorkingSet')
170 | def test_run_pkgscript_base_usage(working_set_class):
171 |     working_set = get_working_set(working_set_class)
172 |     _run_pkgscript(['py:script.py', 'arg1', 'arg2'])
173 |     assert working_set.iter_entry_points.called
174 |     assert working_set.iter_entry_points.call_args[0] == ('scrapy',)
175 |     ep = working_set.iter_entry_points.return_value[0]
176 |     assert ep.dist.run_script.called
177 |     assert ep.dist.run_script.call_args[0] == (
178 |         'script.py', {'__name__': '__main__'})
179 |     assert sys.argv == ['script.py', 'arg1', 'arg2']
180 | 
181 | def get_entry_points_mock():
182 |     """Helper to configure a fake entry point"""
183 |     ep = mock.Mock()
184 |     ep.name = 'settings'
185 |     ep.dist.run_script = mock.Mock() # only for the pkg_resources code path
186 |     return [ep]
187 | 
188 | @unittest.skipIf(sys.version_info < (3,8), "Requires Python 3.8 or higher")
189 | @mock.patch('sh_scrapy.crawl._run_script')
190 | @mock.patch('importlib.metadata.entry_points')
191 | def test_run_pkgscript_base_usage_python_3_8_plus(entry_points_mock, mocked_run):
192 |     entry_points_mock.return_value = get_entry_points_mock()
193 |     _run_pkgscript(['py:script.py', 'arg1', 'arg2'])
194 |     assert entry_points_mock.called
195 |     assert entry_points_mock.call_args[1] == {'group': 'scrapy'}
196 |     assert mocked_run.called
197 |     assert mocked_run.call_args[0][1:] == ('script.py', {'__name__': '__main__'})
198 |     assert sys.argv == ['script.py', 'arg1', 'arg2']
199 | 
200 | 
201 | @mock.patch.dict(os.environ, {
202 |     'SHUB_SETTINGS': '{"project_settings": {"SETTING....'})
203 | @mock.patch('sh_scrapy.crawl._run')
204 | def test_run_usercode_bad_settings(mocked_run):
205 |     with pytest.raises(ValueError):
206 |         _run_usercode('py:script.py', ['py:script.py'], _get_apisettings)
207 |     assert not mocked_run.called
208 | 
209 | 
210 | @mock.patch.dict(os.environ, {
211 |     'SHUB_SETTINGS': '{"project_settings": {"SETTING_TEST": "VAL"}}'})
212 | @mock.patch('sh_scrapy.crawl._run')
213 | def test_run_usercode_run_exception(mocked_run):
214 |     mocked_run.side_effect = AttributeError('argA is missing')
215 |     with pytest.raises(AttributeError):
216 |         _run_usercode('py:script.py', ['py:script.py'], _get_apisettings)
217 |     assert mocked_run.called
218 | 
219 | 
220 | @mock.patch.dict(os.environ, {
221 |     'SHUB_SETTINGS': '{"project_settings": {"SETTING_TEST": "VAL"}}'})
222 | @mock.patch('sh_scrapy.crawl._run')
223 | def test_run_usercode(mocked_run):
224 |     _run_usercode('py:script.py', ['py:script.py', 'arg1'], _get_apisettings)
225 |     assert mocked_run.called
226 |     assert mocked_run.call_args[0][0] == ['py:script.py', 'arg1']
227 |     settings = mocked_run.call_args[0][1]
228 |     assert isinstance(settings, Settings)
229 |     assert settings['SETTING_TEST'] == 'VAL'
230 | 
231 | 
232 | @mock.patch.dict(os.environ, {
233 |     'SHUB_SETTINGS': '{"project_settings": {"LOG_LEVEL": 10}}'})
234 | @mock.patch('sh_scrapy.crawl._run')
235 | def test_run_usercode_with_loghandler(mocked_run):
236 |     loghandler = mock.Mock()
237 |     _run_usercode('py:script.py', ['py:script.py', 'arg1'],
238 |                   _get_apisettings, loghandler)
239 |     assert mocked_run.called
240 |     assert loghandler.setLevel.called
241 |     call_args = loghandler.setLevel.call_args[0]
242 |     assert len(call_args) == 1
243 |     assert call_args[0] == 10
244 | 
245 | 
246 | SPIDER_MSG = {
247 |     'key': '1/2/3', 'spider': 'test', 'spider_type': 'auto',
248 |     'auth': 'auths', 'spider_args': {'arg1': 'val1', 'arg2': 'val2'},
249 |     'settings': {'SETTING1': 'VAL1', 'SETTING2': 'VAL2'}
250 | }
251 | 
252 | 
253 | @mock.patch('sh_scrapy.crawl._fatalerror')
254 | def test_launch_handle_fatalerror(mocked_fatalerr):
255 |     with pytest.raises(AssertionError):
256 |         _launch()
257 |     assert mocked_fatalerr.called
258 | 
259 | 
260 | @mock.patch.dict(os.environ, {'SHUB_JOB_DATA': json.dumps(SPIDER_MSG)})
261 | @mock.patch('sh_scrapy.env.setup_environment')
262 | @mock.patch('sh_scrapy.crawl._run_usercode')
263 | def test_launch(mocked_run, mocked_setup):
264 |     _launch()
265 |     expected_env = {
266 |         'SCRAPY_SPIDER': 'test', 'SHUB_JOBNAME': 'test',
267 |         'SCRAPY_JOB': '1/2/3', 'SCRAPY_PROJECT_ID': '1',
268 |         'SHUB_JOBKEY': '1/2/3', 'SHUB_JOB_TAGS': '',
269 |         'SHUB_JOBAUTH': '312f322f333a6175746873',
270 |         'SHUB_SPIDER_TYPE': 'auto'}
271 |     for k, v in expected_env.items():
272 |         assert os.environ.get(k) == v
273 |     assert mocked_run.called
274 |     run_args = mocked_run.call_args[0]
275 |     assert run_args[0] == 'test'
276 |     expected_args = [
277 |         'scrapy', 'crawl', 'test', '-a', 'arg1=val1', '-a',
278 |         'arg2=val2', '-s', 'SETTING1=VAL1', '-s', 'SETTING2=VAL2']
279 |     assert run_args[1] == expected_args
280 |     assert run_args[2] == _get_apisettings
281 |     assert isinstance(run_args[3], HubstorageLogHandler)
282 |     assert mocked_setup.called
283 | 
284 | 
285 | @mock.patch('sh_scrapy.env.setup_environment')
286 | @mock.patch('sh_scrapy.crawl._run_usercode')
287 | def test_list_spiders(mocked_run, mocked_setup):
288 |     list_spiders()
289 |     assert mocked_run.called
290 |     run_args = mocked_run.call_args[0]
291 |     assert run_args[0] is None
292 |     expected_args = ['scrapy', 'list']
293 |     assert run_args[1] == expected_args
294 |     assert run_args[2] == _get_apisettings
295 |     assert mocked_setup.called
296 | 
297 | 
298 | @mock.patch('sh_scrapy.crawl._fatalerror')
299 | @mock.patch('sh_scrapy.env.setup_environment')
300 | def test_list_spiders_handle_fatalerror(mocked_setup, mocked_fatalerr):
301 |     mocked_setup.side_effect = AttributeError('some error')
302 |     with pytest.raises(AttributeError):
303 |         list_spiders()
304 |     assert mocked_fatalerr.called
305 | 
306 | 
307 | @mock.patch('sh_scrapy.writer.pipe_writer')
308 | @mock.patch('sh_scrapy.crawl._launch')
309 | def test_main(mocked_launch, pipe_writer):
310 |     main()
311 |     assert pipe_writer.open.called
312 |     assert mocked_launch.called
313 |     assert mocked_launch.call_args == ()
314 |     assert sys.stdout == sh_scrapy.crawl._sys_stdout
315 |     assert sys.stderr == sh_scrapy.crawl._sys_stderr
316 |     # Pipe writer file object is closed implicitly on program exit.
317 |     # This ensures that pipe is writable even if main program is finished -
318 |     # e.g. for threads that are not closed yet.
319 | assert not pipe_writer.close.called 320 | 321 | 322 | def test_image_info(tmp_path): 323 | project_dir = create_project(tmp_path) 324 | out, err = call_command(project_dir, "shub-image-info") 325 | # can't be asserted as it contains a SHScrapyDeprecationWarning 326 | # assert err == "" 327 | data = json.loads(out) 328 | expected = { 329 | "project_type": "scrapy", 330 | "spiders": ["myspider"], 331 | "metadata": {"myspider": {}}, 332 | } 333 | if not SPIDER_METADATA_AVAILABLE: 334 | del expected["metadata"] 335 | assert data == expected 336 | 337 | 338 | def test_image_info_metadata(tmp_path): 339 | project_dir = create_project(tmp_path, spider_text=""" 340 | from scrapy import Spider 341 | 342 | class MySpider(Spider): 343 | name = "myspider" 344 | metadata = {"foo": 42} 345 | """) 346 | out, _ = call_command(project_dir, "shub-image-info") 347 | data = json.loads(out) 348 | expected = { 349 | "project_type": "scrapy", 350 | "spiders": ["myspider"], 351 | "metadata": {"myspider": {"foo": 42}}, 352 | } 353 | if not SPIDER_METADATA_AVAILABLE: 354 | del expected["metadata"] 355 | assert data == expected 356 | 357 | 358 | def test_image_info_metadata_skip_broken(tmp_path): 359 | project_dir = create_project(tmp_path, spider_text=""" 360 | from scrapy import Spider 361 | 362 | class MySpider(Spider): 363 | name = "myspider" 364 | metadata = {"foo": Spider} 365 | """) 366 | out, _ = call_command(project_dir, "shub-image-info") 367 | data = json.loads(out) 368 | expected = { 369 | "project_type": "scrapy", 370 | "spiders": ["myspider"], 371 | "metadata": {}, 372 | } 373 | if not SPIDER_METADATA_AVAILABLE: 374 | del expected["metadata"] 375 | assert data == expected 376 | 377 | 378 | @pytest.mark.skipif(not SPIDER_METADATA_AVAILABLE, reason="scrapy-spider-metadata is not installed") 379 | def test_image_info_args(tmp_path): 380 | project_dir = create_project(tmp_path, spider_text=""" 381 | from enum import Enum 382 | from scrapy import Spider 383 | from scrapy_spider_metadata import Args 384 | from pydantic import BaseModel, Field 385 | 386 | class ToolEnum(Enum): 387 | spanner = "spanner" 388 | wrench = "wrench" 389 | 390 | class Parameters(BaseModel): 391 | tool: ToolEnum = ToolEnum.spanner 392 | 393 | class MySpider(Args[Parameters], Spider): 394 | name = "myspider" 395 | """) 396 | out, _ = call_command(project_dir, "shub-image-info") 397 | data = json.loads(out) 398 | expected = { 399 | "project_type": "scrapy", 400 | "spiders": ["myspider"], 401 | "metadata": { 402 | "myspider": { 403 | "param_schema": { 404 | "properties": { 405 | "tool": { 406 | "default": "spanner", 407 | "enum": ["spanner", "wrench"], 408 | "title": "Tool", 409 | "type": "string", 410 | }, 411 | }, 412 | "title": "Parameters", 413 | "type": "object", 414 | }, 415 | }, 416 | }, 417 | } 418 | if not SPIDER_METADATA_AVAILABLE: 419 | del expected["metadata"] 420 | assert data == expected 421 | 422 | 423 | @pytest.mark.skipif(not SPIDER_METADATA_AVAILABLE, reason="scrapy-spider-metadata is not installed") 424 | def test_image_info_args_metadata(tmp_path): 425 | project_dir = create_project(tmp_path, spider_text=""" 426 | from enum import Enum 427 | from scrapy import Spider 428 | from scrapy_spider_metadata import Args 429 | from pydantic import BaseModel, Field 430 | 431 | class ToolEnum(Enum): 432 | spanner = "spanner" 433 | wrench = "wrench" 434 | 435 | class Parameters(BaseModel): 436 | tool: ToolEnum = ToolEnum.spanner 437 | 438 | class MySpider(Args[Parameters], Spider): 439 | name = "myspider" 440 | metadata = 
{"foo": 42} 441 | """) 442 | out, _ = call_command(project_dir, "shub-image-info") 443 | data = json.loads(out) 444 | expected = { 445 | "project_type": "scrapy", 446 | "spiders": ["myspider"], 447 | "metadata": { 448 | "myspider": { 449 | "foo": 42, 450 | "param_schema": { 451 | "properties": { 452 | "tool": { 453 | "default": "spanner", 454 | "enum": ["spanner", "wrench"], 455 | "title": "Tool", 456 | "type": "string", 457 | }, 458 | }, 459 | "title": "Parameters", 460 | "type": "object", 461 | }, 462 | }, 463 | }, 464 | } 465 | if not SPIDER_METADATA_AVAILABLE: 466 | del expected["metadata"] 467 | assert data == expected 468 | -------------------------------------------------------------------------------- /tests/test_diskquota.py: -------------------------------------------------------------------------------- 1 | import mock 2 | import pytest 3 | from scrapy.utils.test import get_crawler 4 | from scrapy.exceptions import NotConfigured 5 | 6 | from sh_scrapy.diskquota import DiskQuota 7 | from sh_scrapy.diskquota import DiskQuotaDownloaderMiddleware 8 | from sh_scrapy.diskquota import DiskQuotaSpiderMiddleware 9 | 10 | 11 | def test_disk_quota_disabled(): 12 | crawler = get_crawler() 13 | with pytest.raises(NotConfigured): 14 | DiskQuota(crawler) 15 | 16 | 17 | @pytest.fixture 18 | def crawler(): 19 | return get_crawler(settings_dict={'DISK_QUOTA_STOP_ON_ERROR': True}) 20 | 21 | 22 | def test_disk_quota_init(crawler): 23 | dquota = DiskQuota(crawler) 24 | assert dquota.crawler == crawler 25 | 26 | 27 | def test_disk_quota_from_crawler(crawler): 28 | assert isinstance(DiskQuota.from_crawler(crawler), DiskQuota) 29 | 30 | 31 | def test_disk_quota_check_error(crawler): 32 | dquota = DiskQuota(crawler) 33 | assert not dquota._is_disk_quota_error(ValueError()) 34 | assert not dquota._is_disk_quota_error(IOError()) 35 | valid_error = IOError() 36 | valid_error.errno = 122 37 | assert dquota._is_disk_quota_error(valid_error) 38 | other_valid_error = OSError() 39 | other_valid_error.errno = 122 40 | assert dquota._is_disk_quota_error(other_valid_error) 41 | 42 | 43 | def test_downloaded_mware_process_not_stopped(crawler): 44 | crawler.engine = mock.Mock() 45 | mware = DiskQuotaDownloaderMiddleware(crawler) 46 | mware.process_exception('request', ValueError(), 'spider') 47 | assert not crawler.engine.close_spider.called 48 | 49 | 50 | def test_downloaded_mware_process_stopped(crawler): 51 | crawler.engine = mock.Mock() 52 | mware = DiskQuotaDownloaderMiddleware(crawler) 53 | error = IOError() 54 | error.errno = 122 55 | mware.process_exception('request', error, 'spider') 56 | assert crawler.engine.close_spider.called 57 | assert crawler.engine.close_spider.call_args[0] == ( 58 | 'spider', 'diskusage_exceeded') 59 | 60 | 61 | def test_spider_mware_process_not_stopped(crawler): 62 | crawler.engine = mock.Mock() 63 | mware = DiskQuotaSpiderMiddleware(crawler) 64 | mware.process_spider_exception('response', ValueError(), 'spider') 65 | assert not crawler.engine.close_spider.called 66 | 67 | 68 | def test_spider_mware_process_stopped(crawler): 69 | crawler.engine = mock.Mock() 70 | mware = DiskQuotaSpiderMiddleware(crawler) 71 | error = IOError() 72 | error.errno = 122 73 | mware.process_spider_exception('response', error, 'spider') 74 | assert crawler.engine.close_spider.called 75 | assert crawler.engine.close_spider.call_args[0] == ( 76 | 'spider', 'diskusage_exceeded') 77 | -------------------------------------------------------------------------------- /tests/test_env.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import mock 4 | import codecs 5 | import pytest 6 | import tempfile 7 | 8 | from scrapy.utils.python import to_bytes, to_unicode 9 | 10 | from sh_scrapy.env import _jobauth 11 | from sh_scrapy.env import _jobname 12 | from sh_scrapy.env import decode_uri 13 | from sh_scrapy.env import get_args_and_env 14 | from sh_scrapy.env import _job_args_and_env 15 | from sh_scrapy.env import _make_scrapy_args 16 | from sh_scrapy.env import setup_environment 17 | from sh_scrapy.env import _scrapy_crawl_args_and_env 18 | 19 | 20 | def test_make_scrapy_args(): 21 | assert _make_scrapy_args('-a', {}) == [] 22 | assert _make_scrapy_args('-a', {'test': 'val'}) == ['-a', 'test=val'] 23 | result1 = _make_scrapy_args('-s', [('k1', 'v1'), ('k2', 'v2')]) 24 | assert result1 == ['-s', 'k1=v1', '-s', 'k2=v2'] 25 | result2 = _make_scrapy_args('-s', [('arg1', 'val1'), ('arg2', 'val2')]) 26 | assert result2 == ['-s', 'arg1=val1', '-s', 'arg2=val2'] 27 | result3 = _make_scrapy_args('-s', [('arg1', 1), ('arg2', 2)]) 28 | assert result3 == ['-s', 'arg1=1', '-s', 'arg2=2'] 29 | 30 | 31 | def test_scrapy_crawl_args_and_env(): 32 | # test with minimal message 33 | result = _scrapy_crawl_args_and_env({'key': '1/2/3', 'spider': 'test'}) 34 | assert len(result) == 2 35 | assert result[0] == ['scrapy', 'crawl', 'test'] 36 | assert result[1] == {'SCRAPY_JOB': '1/2/3', 'SCRAPY_PROJECT_ID': '1', 37 | 'SCRAPY_SPIDER': 'test', 'SHUB_SPIDER_TYPE': ''} 38 | # test with full message 39 | result1 = _scrapy_crawl_args_and_env( 40 | {'key': '1/2/3', 'spider': 'test', 41 | 'spider_args': [('arg1', 'val1'), ('arg2', 'val2')], 42 | 'settings': [('SETTING1', 'VAL1'), ('SETTING2', 'VAL2')], 43 | 'spider_type': 'auto', 44 | 'extra_args': ['some', 'extra', 'args']}) 45 | assert result1[0] == ['scrapy', 'crawl', 'test', '-a', 'arg1=val1', 46 | '-a', 'arg2=val2', '-s', 'SETTING1=VAL1', 47 | '-s', 'SETTING2=VAL2'] 48 | assert result1[1] == {'SCRAPY_JOB': '1/2/3', 'SCRAPY_PROJECT_ID': '1', 49 | 'SCRAPY_SPIDER': 'test', 'SHUB_SPIDER_TYPE': 'auto'} 50 | 51 | 52 | def test_job_args_and_env(): 53 | result = _job_args_and_env({'job_cmd': ['custom.py', 'arg1']}) 54 | assert result == (['custom.py', 'arg1'], {}) 55 | result1 = _job_args_and_env({'job_cmd': ['custom.py', 'arg1'], 56 | 'job_env': {'some': 'env'}}) 57 | assert result1 == (['custom.py', 'arg1'], {'some': 'env'}) 58 | result2 = _job_args_and_env({'job_cmd': ('wrong', 'cmd', 'style')}) 59 | assert result2 == (["('wrong', 'cmd', 'style')"], {}) 60 | 61 | 62 | def test_jobname(): 63 | msg = {'job_name': 'jobn', 'spider': 'test', 64 | 'job_cmd': ['custom.py', 'arg1', 'arg2']} 65 | assert _jobname(msg) == 'jobn' 66 | msg.pop('job_name') 67 | assert _jobname(msg) == 'test' 68 | msg.pop('spider') 69 | assert _jobname(msg) == 'custom.py' 70 | 71 | 72 | def test_jobauth(): 73 | msg = {'key': '1/2/3', 'auth': 'authstring'} 74 | expected = codecs.encode(to_bytes('1/2/3:authstring'), 'hex_codec') 75 | assert _jobauth(msg) == to_unicode(expected) 76 | 77 | 78 | def test_get_args_and_env_run_spider(): 79 | msg = {'key': '1/2/3', 'spider': 'test', 'spider_type': 'auto', 80 | 'auth': 'auths', 'spider_args': {'arg1': 'val1', 'arg2': 'val2'}, 81 | 'settings': {'SETTING1': 'VAL1', 'SETTING2': 'VAL2'}} 82 | result = get_args_and_env(msg) 83 | expected_auth = codecs.encode(to_bytes('1/2/3:auths'), 'hex_codec') 84 | assert len(result) == 2 85 | assert result[0] == ['scrapy', 'crawl', 'test', '-a', 
'arg1=val1', 86 | '-a', 'arg2=val2', '-s', 'SETTING1=VAL1', '-s', 87 | 'SETTING2=VAL2'] 88 | assert result[1] == {'SCRAPY_JOB': '1/2/3', 89 | 'SCRAPY_PROJECT_ID': '1', 90 | 'SCRAPY_SPIDER': 'test', 91 | 'SHUB_JOBAUTH': to_unicode(expected_auth), 92 | 'SHUB_JOBKEY': '1/2/3', 93 | 'SHUB_JOBNAME': 'test', 94 | 'SHUB_JOB_TAGS': '', 95 | 'SHUB_SPIDER_TYPE': 'auto'} 96 | add_fields = {'tags': ['tagA', 'tagB'], 'api_url': 'some-api-url'} 97 | msg.update(add_fields) 98 | result1 = get_args_and_env(msg) 99 | assert len(result1) == 2 100 | assert result1[1]['SHUB_APIURL'] == 'some-api-url' 101 | assert result1[1]['SHUB_JOB_TAGS'] == 'tagA,tagB' 102 | 103 | 104 | def test_get_args_and_env_run_script(): 105 | msg = {'key': '1/2/3', 'job_cmd': ['custom.py', 'arg1'], 106 | 'auth': 'authstring'} 107 | result = get_args_and_env(msg) 108 | expected_auth = codecs.encode(to_bytes('1/2/3:authstring'), 'hex_codec') 109 | assert len(result) == 2 110 | assert result[0] == ['custom.py', 'arg1'] 111 | assert result[1] == { 112 | 'SHUB_JOBAUTH': to_unicode(expected_auth), 113 | 'SHUB_JOBKEY': '1/2/3', 114 | 'SHUB_JOBNAME': 'custom.py', 115 | 'SHUB_JOB_TAGS': ''} 116 | add_fields = {'tags': ['tagA', 'tagB'], 'api_url': 'some-api-url'} 117 | msg.update(add_fields) 118 | result1 = get_args_and_env(msg) 119 | assert len(result1) == 2 120 | assert result1[1]['SHUB_APIURL'] == 'some-api-url' 121 | assert result1[1]['SHUB_JOB_TAGS'] == 'tagA,tagB' 122 | 123 | 124 | def test_decode_uri_basic_usage(): 125 | assert decode_uri('{"spider": "hello"}') == {'spider': 'hello'} 126 | str1 = 'data:application/json;charset=utf8;base64,ImhlbGxvIHdvcmxkIg==' 127 | assert decode_uri(str1) == u'hello world' 128 | assert decode_uri('data:;base64,ImhlbGxvIHdvcmxkIg==') == 'hello world' 129 | str2 = 'data:custom-mime;charset=utf8;base64,ImhlbGxvIHdvcmxkIg==' 130 | assert decode_uri(str2) == b'"hello world"' 131 | 132 | 133 | @mock.patch.dict(os.environ, {'TEST_VAR': '{"spider": "hello"}'}) 134 | def test_decode_uri_from_env(): 135 | assert decode_uri(None, 'TEST_VAR') == {'spider': 'hello'} 136 | 137 | 138 | def test_decode_uri_var_or_env_is_needed(): 139 | with pytest.raises(ValueError): 140 | decode_uri() 141 | 142 | 143 | def test_decode_uri_from_file(): 144 | with tempfile.NamedTemporaryFile() as temp: 145 | temp.write('{"hello":"world"}'.encode('utf-8')) 146 | temp.flush() 147 | assert decode_uri(temp.name) == {'hello': 'world'} 148 | assert decode_uri('file://' + temp.name) == {'hello': 'world'} 149 | 150 | 151 | def test_setup_environment(): 152 | builtin_mod = '__builtin__' if sys.version_info < (3,) else 'builtins' 153 | with mock.patch(builtin_mod + '.open') as mock_open: 154 | setup_environment() 155 | assert mock_open.called 156 | -------------------------------------------------------------------------------- /tests/test_extension.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from weakref import WeakKeyDictionary 3 | 4 | import mock 5 | import pytest 6 | import scrapy 7 | from packaging import version 8 | from pytest import warns 9 | from scrapy import Spider 10 | from scrapy.exporters import PythonItemExporter 11 | from scrapy.http import Request, Response 12 | from scrapy.item import Item 13 | from scrapy.utils.test import get_crawler 14 | 15 | from sh_scrapy.extension import HubstorageExtension, HubstorageMiddleware 16 | from sh_scrapy.middlewares import HS_PARENT_ID_KEY 17 | 18 | 19 | @pytest.fixture 20 | def hs_ext(monkeypatch): 21 | 
    monkeypatch.setattr('sh_scrapy.extension.pipe_writer', mock.Mock())
22 |     monkeypatch.setattr('sh_scrapy.extension.hsref', mock.Mock())
23 |     crawler = get_crawler(Spider)
24 |     return HubstorageExtension.from_crawler(crawler)
25 | 
26 | 
27 | def test_hs_ext_init(hs_ext):
28 |     assert hs_ext.crawler
29 |     assert hs_ext._write_item == hs_ext.pipe_writer.write_item
30 |     assert isinstance(hs_ext.exporter, PythonItemExporter)
31 |     assert hs_ext.exporter.export_item({"a": "b"}) == {"a": "b"}
32 | 
33 | 
34 | @pytest.mark.skipif(sys.version_info < (3, 7), reason="requires python3.7")
35 | def test_hs_ext_dataclass_item_scraped(hs_ext):
36 |     from dataclasses import dataclass
37 | 
38 |     @dataclass
39 |     class DataclassItem:
40 |         pass
41 | 
42 |     hs_ext._write_item = mock.Mock()
43 |     item = DataclassItem()
44 |     spider = Spider('test')
45 |     hs_ext.item_scraped(item, spider)
46 |     assert hs_ext._write_item.call_count == 1
47 |     assert hs_ext._write_item.call_args[0] == ({'_type': 'DataclassItem'},)
48 | 
49 | 
50 | def test_hs_ext_attrs_item_scraped(hs_ext):
51 |     try:
52 |         import attr
53 |         import itemadapter
54 |     except ImportError:
55 |         pytest.skip('attrs or itemadapter not installed')
56 |         return
57 | 
58 |     @attr.s
59 |     class AttrsItem(object):
60 |         pass
61 | 
62 |     hs_ext._write_item = mock.Mock()
63 |     item = AttrsItem()
64 |     spider = Spider('test')
65 |     hs_ext.item_scraped(item, spider)
66 |     assert hs_ext._write_item.call_count == 1
67 |     assert hs_ext._write_item.call_args[0] == ({'_type': 'AttrsItem'},)
68 | 
69 | 
70 | def test_hs_ext_item_scraped(hs_ext):
71 |     hs_ext._write_item = mock.Mock()
72 |     item = Item()
73 |     spider = Spider('test')
74 |     hs_ext.item_scraped(item, spider)
75 |     assert hs_ext._write_item.call_count == 1
76 |     assert hs_ext._write_item.call_args[0] == ({'_type': 'Item'},)
77 | 
78 | 
79 | def test_hs_ext_item_scraped_skip_wrong_type(hs_ext):
80 |     hs_ext._write_item = mock.Mock()
81 |     spider = Spider('test')
82 |     for item in [None, [], 123]:
83 |         hs_ext.item_scraped(item, spider)
84 |     assert hs_ext._write_item.call_count == 0
85 | 
86 | 
87 | def test_hs_ext_spider_closed(hs_ext):
88 |     spider = Spider('test')
89 |     hs_ext.spider_closed(spider, 'killed')
90 |     assert hs_ext.pipe_writer.set_outcome.called
91 |     assert hs_ext.pipe_writer.set_outcome.call_args == mock.call('killed')
92 | 
93 | 
94 | @pytest.fixture
95 | def hs_mware(monkeypatch):
96 |     monkeypatch.setattr('sh_scrapy.extension.pipe_writer', mock.Mock())
97 |     crawler = get_crawler()
98 |     return HubstorageMiddleware.from_crawler(crawler)
99 | 
100 | 
101 | def test_hs_mware_init(hs_mware):
102 |     assert hs_mware._seen == {}
103 |     assert hs_mware.hsref
104 | 
105 | 
106 | def test_hs_mware_process_spider_input(hs_mware):
107 |     response = Response('http://resp-url')
108 |     response.request = Request('http://req-url')
109 |     hs_mware.process_spider_input(response, Spider('test'))
110 |     assert hs_mware.pipe_writer.write_request.call_count == 1
111 |     args = hs_mware.pipe_writer.write_request.call_args[1]
112 |     if hasattr(hs_mware._crawler, "request_fingerprinter"):
113 |         fp = "1c735665b072000e11b0169081bce5bbaeac09a7"
114 |     else:
115 |         fp = "a001a1eb4537acdc8525edf1250065cab2657152"
116 |     assert args == {
117 |         'duration': 0,
118 |         'fp': fp,
119 |         'method': 'GET',
120 |         'parent': None,
121 |         'rs': 0,
122 |         'status': 200,
123 |         'url': 'http://resp-url'
124 |     }
125 |     assert hs_mware._seen == WeakKeyDictionary({response: 0})
126 | 
127 | 
128 | def test_hs_mware_process_spider_output_void_result(hs_mware):
129 |     response = Response('http://resp-url')
130 |     hs_mware._seen =
WeakKeyDictionary({response: 'riq'}) 131 | assert list(hs_mware.process_spider_output( 132 | response, [], Spider('test'))) == [] 133 | 134 | 135 | def test_hs_mware_process_spider_output_filter_request(hs_mware): 136 | response = Response('http://resp-url') 137 | # provide a response and a new request in result 138 | child_response = Response('http://resp-url-child') 139 | child_response.request = Request('http://resp-url-child-req') 140 | child_request = Request('http://req-url-child') 141 | hs_mware._seen = WeakKeyDictionary({response: 'riq'}) 142 | result = list(hs_mware.process_spider_output( 143 | response, [child_response, child_request], Spider('test'))) 144 | assert len(result) == 2 145 | # make sure that we update hsparent meta only for requests 146 | assert result[0].meta.get(HS_PARENT_ID_KEY) is None 147 | assert result[1].meta[HS_PARENT_ID_KEY] == 'riq' 148 | 149 | 150 | @pytest.mark.skipif( 151 | version.parse(scrapy.__version__) < version.parse("2.7"), 152 | reason="Only Scrapy 2.7 and higher support centralized request fingerprints." 153 | ) 154 | def test_custom_fingerprinter(monkeypatch): 155 | monkeypatch.setattr('sh_scrapy.extension.pipe_writer', mock.Mock()) 156 | 157 | class CustomFingerprinter: 158 | def fingerprint(self, request): 159 | return b"foo" 160 | 161 | crawler = get_crawler(settings_dict={"REQUEST_FINGERPRINTER_CLASS": CustomFingerprinter}) 162 | mw = HubstorageMiddleware.from_crawler(crawler) 163 | 164 | response = Response('http://resp-url') 165 | response.request = Request('http://req-url') 166 | mw.process_spider_input(response, Spider('test')) 167 | assert mw.pipe_writer.write_request.call_args[1]["fp"] == b"foo".hex() 168 | 169 | 170 | def test_subclassing(): 171 | class CustomHubstorageMiddleware(HubstorageMiddleware): 172 | def __init__(self): 173 | super().__init__() 174 | self.foo = "bar" 175 | 176 | crawler = get_crawler() 177 | with warns( 178 | DeprecationWarning, 179 | match="must now accept a crawler parameter in their __init__ method", 180 | ): 181 | mw = CustomHubstorageMiddleware.from_crawler(crawler) 182 | 183 | assert mw.foo == "bar" 184 | assert hasattr(mw, "_fingerprint") 185 | -------------------------------------------------------------------------------- /tests/test_hsref.py: -------------------------------------------------------------------------------- 1 | import mock 2 | import pytest 3 | from sh_scrapy.hsref import _HubstorageRef 4 | 5 | 6 | def test_init_disabled(monkeypatch): 7 | monkeypatch.delenv('SHUB_JOBKEY') 8 | hsref = _HubstorageRef() 9 | assert not hsref._client 10 | assert not hsref._project 11 | assert not hsref._job 12 | assert not hsref.enabled 13 | assert not hasattr(hsref, 'jobkey') 14 | assert not hsref._projectid 15 | assert not hsref._spiderid 16 | assert not hsref._jobcounter 17 | 18 | 19 | @pytest.fixture 20 | @pytest.mark.usefixtures('set_environment') 21 | def hsref(): 22 | return _HubstorageRef() 23 | 24 | 25 | @pytest.fixture 26 | def hsc_class(monkeypatch): 27 | hsc_class = mock.Mock() 28 | monkeypatch.setattr('scrapinghub.HubstorageClient', hsc_class) 29 | return hsc_class 30 | 31 | 32 | def test_init(hsref): 33 | assert not hsref._client 34 | assert not hsref._project 35 | assert not hsref._job 36 | assert hsref.enabled 37 | assert hsref.jobkey == '1/2/3' 38 | assert hsref._projectid == 1 39 | assert hsref._spiderid == 2 40 | assert hsref._jobcounter == 3 41 | 42 | 43 | def test_auth(hsref): 44 | assert hsref.auth == '1/2/3:authstr' 45 | 46 | 47 | def test_endpoint(hsref): 48 | assert hsref.endpoint 
== 'storage-url' 49 | 50 | 51 | def test_job_ids(hsref): 52 | assert hsref.projectid == 1 53 | assert hsref.spiderid == 2 54 | assert hsref.jobid == 3 55 | 56 | 57 | def test_client(hsref, hsc_class): 58 | assert not hsref._client 59 | assert hsref.client == hsc_class.return_value 60 | hsc_class.assert_called_with(endpoint='storage-url', 61 | auth='1/2/3:authstr', 62 | user_agent=None) 63 | assert hsref._client 64 | assert hsref.client == hsref._client 65 | 66 | 67 | def test_client_custom_ua(hsref, hsc_class, monkeypatch): 68 | monkeypatch.setenv('SHUB_HS_USER_AGENT', 'testUA') 69 | assert not hsref._client 70 | assert hsref.client == hsc_class.return_value 71 | hsc_class.assert_called_with(endpoint='storage-url', 72 | auth='1/2/3:authstr', 73 | user_agent='testUA') 74 | assert hsref._client 75 | assert hsref.client == hsref._client 76 | 77 | 78 | def test_project(hsref): 79 | hsc = mock.Mock() 80 | hsc.get_project.return_value = 'Project' 81 | hsref._client = hsc 82 | 83 | assert not hsref._project 84 | assert hsref.project == 'Project' 85 | hsc.get_project.assert_called_with('1') 86 | assert hsref._project == hsref.project 87 | 88 | 89 | def test_job(hsref): 90 | project = mock.Mock() 91 | project.get_job.return_value = 'Job' 92 | hsref._project = project 93 | 94 | assert not hsref._job 95 | assert hsref.job == 'Job' 96 | project.get_job.assert_called_with((2, 3)) 97 | assert hsref._job == hsref.job 98 | 99 | 100 | def test_close(hsref): 101 | assert not hsref._client 102 | hsref.close() 103 | client = mock.Mock() 104 | hsref._client = client 105 | hsref.close() 106 | client.close.assert_called_with() 107 | -------------------------------------------------------------------------------- /tests/test_log.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import mock 4 | import pytest 5 | import sys 6 | import zlib 7 | 8 | from sh_scrapy.log import _stdout, _stderr 9 | from sh_scrapy.log import initialize_logging 10 | from sh_scrapy.log import HubstorageLogHandler 11 | from sh_scrapy.log import HubstorageLogObserver 12 | from sh_scrapy.log import StdoutLogger 13 | 14 | 15 | @pytest.fixture(autouse=True) 16 | def reset_std_streams(): 17 | sys.stdout = _stdout 18 | sys.stderr = _stderr 19 | 20 | 21 | @mock.patch('twisted.python.log.startLoggingWithObserver') 22 | @mock.patch('sh_scrapy.log.HubstorageLogObserver') 23 | def test_initialize_logging_dont_fail(observer, txlog_start): 24 | loghandler = initialize_logging() 25 | 26 | rootlogger = logging.getLogger() 27 | assert rootlogger.level == logging.NOTSET 28 | 29 | # check if null handler is set for libs 30 | for lib in ('boto', 'requests', 'hubstorage'): 31 | lg = logging.getLogger(lib) 32 | assert lg.propagate == 0 33 | assert any([hdl for hdl in lg.handlers 34 | if isinstance(hdl, logging.NullHandler)]) 35 | 36 | # check standard out/err redirection 37 | assert isinstance(sys.stdout, StdoutLogger) 38 | assert sys.stdout.encoding == 'utf-8' 39 | assert isinstance(sys.stderr, StdoutLogger) 40 | assert sys.stderr.encoding == 'utf-8' 41 | 42 | # check twisted specific 43 | assert observer.called 44 | observer.assert_called_with(loghandler) 45 | emit_method = observer.return_value.emit 46 | assert txlog_start.called 47 | txlog_start.assert_called_with(emit_method, setStdout=False) 48 | 49 | # check returned handler 50 | assert isinstance(loghandler, HubstorageLogHandler) 51 | assert loghandler.level == logging.INFO 52 | assert loghandler.formatter._fmt == '[%(name)s] 
%(message)s' 53 | 54 | @mock.patch('sh_scrapy.log.pipe_writer') 55 | def test_hs_loghandler_emit_ok(pipe_writer): 56 | hdlr = HubstorageLogHandler() 57 | record = logging.makeLogRecord({'msg': 'test-record'}) 58 | hdlr.emit(record) 59 | assert pipe_writer.write_log.called 60 | pipe_writer.write_log.assert_called_with(message='test-record', level=None) 61 | 62 | 63 | @mock.patch('sh_scrapy.log.pipe_writer') 64 | def test_hs_loghandler_emit_handle_interrupt(pipe_writer): 65 | pipe_writer.write_log.side_effect = KeyboardInterrupt 66 | hdlr = HubstorageLogHandler() 67 | record = logging.makeLogRecord({'msg': 'test-record'}) 68 | with pytest.raises(KeyboardInterrupt): 69 | hdlr.emit(record) 70 | 71 | 72 | @mock.patch('logging.Handler.handleError') 73 | @mock.patch('sh_scrapy.log.pipe_writer') 74 | def test_hs_loghandler_emit_handle_exception(pipe_writer, handleError): 75 | pipe_writer.write_log.side_effect = ValueError 76 | hdlr = HubstorageLogHandler() 77 | record = logging.makeLogRecord({'msg': 'test-record'}) 78 | hdlr.emit(record) 79 | assert handleError.called 80 | assert handleError.call_args == mock.call(record) 81 | 82 | 83 | @pytest.fixture 84 | def hs_observer(): 85 | hdlr = mock.Mock() 86 | return HubstorageLogObserver(hdlr) 87 | 88 | 89 | def test_hs_logobserver_init(hs_observer): 90 | assert isinstance(hs_observer._hs_loghdlr, mock.Mock) 91 | 92 | 93 | def test_hs_logobserver_get_log_item_low_level(hs_observer): 94 | hs_observer._hs_loghdlr.level = 20 95 | event = {'system': 'scrapy', 'logLevel': 10} 96 | assert not hs_observer._get_log_item(event) 97 | 98 | 99 | def test_hs_logobserver_get_log_item_system(hs_observer): 100 | hs_observer._hs_loghdlr.level = 20 101 | event = {'system': 'scrapy', 'logLevel': 30, 'message': ['test']} 102 | assert hs_observer._get_log_item(event) == { 103 | 'level': 30, 'message': 'test'} 104 | 105 | 106 | def test_hs_logobserver_get_log_item_info(hs_observer): 107 | hs_observer._hs_loghdlr.level = 20 108 | event = {'system': 'other', 'message': ['test'], 'isError': False} 109 | assert hs_observer._get_log_item(event) == { 110 | 'level': 20, 'message': 'test'} 111 | 112 | 113 | def test_hs_logobserver_get_log_item_error(hs_observer): 114 | hs_observer._hs_loghdlr.level = 20 115 | event = {'system': 'other', 'message': ['test'], 'isError': True} 116 | assert hs_observer._get_log_item(event) == { 117 | 'level': 40, 'message': 'test'} 118 | 119 | 120 | def test_hs_logobserver_get_log_item_failure(hs_observer): 121 | hs_observer._hs_loghdlr.level = 20 122 | failure = mock.Mock() 123 | failure.getTraceback.return_value = 'some-traceback' 124 | event = {'system': 'other', 'failure': failure, 'isError': False} 125 | assert hs_observer._get_log_item(event) == { 126 | 'level': 20, 'message': 'some-traceback'} 127 | 128 | 129 | def test_hs_logobserver_get_log_item_why(hs_observer): 130 | hs_observer._hs_loghdlr.level = 20 131 | event = {'system': 'other', 'message': ['test'], 132 | 'why': 'why-msg', 'isError': False} 133 | assert hs_observer._get_log_item(event) == { 134 | 'level': 20, 'message': 'why-msg\n\ttest'} 135 | 136 | 137 | def test_hs_logobserver_get_log_item_format(hs_observer): 138 | hs_observer._hs_loghdlr.level = 20 139 | event = {'system': 'other', 'message': ['test'], 'data': 'raw', 140 | 'format': 'formatted/%(data)s', 'isError': False} 141 | assert hs_observer._get_log_item(event) == { 142 | 'level': 20, 'message': 'formatted/raw'} 143 | 144 | 145 | def test_hs_logobserver_get_log_item_format_error(hs_observer): 146 | 
hs_observer._hs_loghdlr.level = 20 147 | event = {'system': 'other', 'message': ['test'], 'data': 'raw', 148 | 'format': 'formatted/%(data)%%', 'isError': False} 149 | expected_template = "UNABLE TO FORMAT LOG MESSAGE: fmt=%r ev=%r" 150 | assert hs_observer._get_log_item(event) == { 151 | 'level': 40, 'message': expected_template % (event['format'], event)} 152 | 153 | 154 | @mock.patch('sh_scrapy.log.pipe_writer') 155 | def test_hs_logobserver_emit_filter_events(pipe_writer, hs_observer): 156 | hs_observer._hs_loghdlr.level = 20 157 | event = {'system': 'scrapy', 'logLevel': 10} 158 | hs_observer.emit(event) 159 | assert not pipe_writer.write_log.called 160 | 161 | 162 | @mock.patch('sh_scrapy.log.pipe_writer') 163 | def test_hs_logobserver_emit_logitem(pipe_writer, hs_observer): 164 | hs_observer._hs_loghdlr.level = 20 165 | event = {'system': 'other', 'message': ['test'], 'isError': False} 166 | hs_observer.emit(event) 167 | assert pipe_writer.write_log.called 168 | pipe_writer.write_log.assert_called_with(level=20, message='test') 169 | 170 | 171 | def stdout_logger_init_stdout(): 172 | logger_out = StdoutLogger(0, 'utf-8') 173 | assert logger_out.prefix == '[stdout]' 174 | assert logger_out.loglevel == logging.INFO 175 | 176 | 177 | def stdout_logger_init_stderr(): 178 | logger_out = StdoutLogger(1, 'utf-8', loglevel=logging.ERROR) 179 | assert logger_out.prefix == '[stderr]' 180 | assert logger_out.loglevel == logging.ERROR 181 | 182 | 183 | @mock.patch('sh_scrapy.log.pipe_writer') 184 | def test_stdout_logger_logprefixed(pipe_writer): 185 | logger = StdoutLogger(0, 'utf-8') 186 | logger._logprefixed('message') 187 | assert pipe_writer.write_log.called 188 | pipe_writer.write_log.assert_called_with(level=20, message='[stdout] message') 189 | 190 | 191 | @mock.patch('sh_scrapy.log.pipe_writer') 192 | def test_stdout_logger_write(pipe_writer): 193 | logger = StdoutLogger(0, 'utf-8') 194 | logger.write('some-string\nother-string\nlast-string') 195 | assert pipe_writer.write_log.called 196 | assert pipe_writer.write_log.call_args_list[0] == mock.call( 197 | level=20, 198 | message='[stdout] some-string' 199 | ) 200 | assert pipe_writer.write_log.call_args_list[1] == mock.call( 201 | level=20, 202 | message='[stdout] other-string' 203 | ) 204 | assert logger.buf == 'last-string' 205 | 206 | 207 | def test_stdout_logger_writelines_empty(): 208 | logger = StdoutLogger(0, 'utf-8') 209 | logger.writelines([]) 210 | 211 | 212 | @mock.patch('sh_scrapy.log.pipe_writer') 213 | def test_stdout_logger_writelines(pipe_writer): 214 | logger = StdoutLogger(0, 'utf-8') 215 | logger.writelines(['test-line']) 216 | assert pipe_writer.write_log.called 217 | pipe_writer.write_log.assert_called_with(level=20, message='[stdout] test-line') 218 | 219 | 220 | @pytest.mark.skipif(sys.version_info[0] == 3, reason="requires python2") 221 | @mock.patch('sh_scrapy.log.pipe_writer._pipe') 222 | def test_unicode_decode_error_handling(pipe_mock): 223 | hdlr = HubstorageLogHandler() 224 | message = 'value=%s' % zlib.compress('value') 225 | record = logging.makeLogRecord({'msg': message, 'levelno': 10}) 226 | hdlr.emit(record) 227 | assert pipe_mock.write.called 228 | payload = json.loads(pipe_mock.write.call_args_list[2][0][0]) 229 | assert isinstance(payload.pop('time'), int) 230 | assert payload == { 231 | 'message': r'value=x\x9c+K\xcc)M\x05\x00\x06j\x02\x1e', 232 | 'level': 10 233 | } 234 | -------------------------------------------------------------------------------- /tests/test_middlewares.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from weakref import WeakKeyDictionary 3 | import itertools 4 | import pytest 5 | import sys 6 | from scrapy import Spider, Request, Item 7 | from scrapy.http import Response 8 | from scrapy.utils.test import get_crawler 9 | from typing import Optional 10 | 11 | from sh_scrapy.middlewares import ( 12 | HubstorageSpiderMiddleware, HubstorageDownloaderMiddleware, 13 | HS_REQUEST_ID_KEY, HS_PARENT_ID_KEY 14 | ) 15 | 16 | 17 | @pytest.fixture() 18 | def monkeypatch_globals(monkeypatch): 19 | monkeypatch.setattr('sh_scrapy.middlewares.request_id_sequence', itertools.count()) 20 | monkeypatch.setattr('sh_scrapy.middlewares.seen_requests', WeakKeyDictionary()) 21 | 22 | 23 | @pytest.fixture() 24 | def hs_spider_middleware(monkeypatch_globals): 25 | return HubstorageSpiderMiddleware() 26 | 27 | 28 | @pytest.fixture() 29 | def hs_downloader_middleware(monkeypatch_globals): 30 | crawler = get_crawler() 31 | return HubstorageDownloaderMiddleware.from_crawler(crawler) 32 | 33 | 34 | def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware): 35 | assert hs_spider_middleware._seen_requests == WeakKeyDictionary() 36 | assert hs_downloader_middleware._seen_requests == WeakKeyDictionary() 37 | assert hs_spider_middleware._seen_requests is hs_downloader_middleware._seen_requests 38 | 39 | spider = Spider('test') 40 | url = 'http://resp-url' 41 | request_0 = Request(url) 42 | response_0 = Response(url) 43 | 44 | hs_downloader_middleware.process_request(request_0, spider) 45 | 46 | assert HS_REQUEST_ID_KEY not in request_0.meta 47 | assert HS_PARENT_ID_KEY not in request_0.meta 48 | assert len(hs_spider_middleware._seen_requests) == 0 49 | assert len(hs_downloader_middleware._seen_requests) == 0 50 | 51 | hs_downloader_middleware.process_response(request_0, response_0, spider) 52 | 53 | assert request_0.meta[HS_REQUEST_ID_KEY] == 0 54 | assert request_0.meta[HS_PARENT_ID_KEY] is None 55 | assert hs_spider_middleware._seen_requests[request_0] == 0 56 | 57 | response_0.request = request_0 58 | request_1 = Request(url) 59 | request_2 = Request(url) 60 | item1 = {} 61 | item2 = Item() 62 | output = [request_1, request_2, item1, item2] 63 | processed_output = list(hs_spider_middleware.process_spider_output(response_0, output, spider)) 64 | 65 | assert processed_output[0] is request_1 66 | assert request_1.meta[HS_PARENT_ID_KEY] == 0 67 | assert processed_output[1] is request_2 68 | assert request_2.meta[HS_PARENT_ID_KEY] == 0 69 | assert processed_output[2] is item1 70 | assert processed_output[3] is item2 71 | 72 | response_1 = Response(url) 73 | hs_downloader_middleware.process_request(request_1, spider) 74 | hs_downloader_middleware.process_response(request_1, response_1, spider) 75 | assert request_1.meta[HS_REQUEST_ID_KEY] == 1 76 | assert request_1.meta[HS_PARENT_ID_KEY] == 0 77 | 78 | response_2 = Response(url) 79 | hs_downloader_middleware.process_request(request_2, spider) 80 | hs_downloader_middleware.process_response(request_2, response_2, spider) 81 | assert request_2.meta[HS_REQUEST_ID_KEY] == 2 82 | assert request_2.meta[HS_PARENT_ID_KEY] == 0 83 | 84 | 85 | @pytest.mark.skipif(sys.version_info < (3, 7), reason="requires python3.7") 86 | def test_hs_middlewares_dummy_response(hs_downloader_middleware, hs_spider_middleware): 87 | from dataclasses import dataclass 88 | 89 | @dataclass(unsafe_hash=True) 90 | class DummyResponse(Response): 91 | __module__: str = "scrapy_poet.api" 
92 | 93 | def __init__(self, url: str, request: Optional[Request] = None): 94 | super().__init__(url=url, request=request) 95 | 96 | spider = Spider('test') 97 | url = 'http://resp-url' 98 | 99 | # cleaning log file 100 | hs_downloader_middleware.pipe_writer.open() 101 | 102 | request = Request(url) 103 | response_1 = DummyResponse(url, request) 104 | response_2 = Response(url) 105 | hs_downloader_middleware.process_request(request, spider) 106 | hs_downloader_middleware.process_response(request, response_1, spider) 107 | 108 | with open(hs_downloader_middleware.pipe_writer.path, 'r') as tmp_file: 109 | assert tmp_file.readline() == "" 110 | assert request.meta == {} 111 | 112 | hs_downloader_middleware.process_response(request, response_2, spider) 113 | with open(hs_downloader_middleware.pipe_writer.path, 'r') as tmp_file: 114 | assert tmp_file.readline().startswith('REQ') 115 | 116 | assert request.meta[HS_REQUEST_ID_KEY] == 0 117 | assert request.meta[HS_PARENT_ID_KEY] is None 118 | 119 | 120 | @pytest.mark.skipif(sys.version_info < (3, 7), reason="requires python3.7") 121 | def test_hs_middlewares_retry(hs_downloader_middleware, hs_spider_middleware): 122 | from dataclasses import dataclass 123 | 124 | @dataclass(unsafe_hash=True) 125 | class DummyResponse(Response): 126 | __module__: str = "scrapy_poet.api" 127 | 128 | def __init__(self, url: str, request: Optional[Request] = None): 129 | super().__init__(url=url, request=request) 130 | 131 | spider = Spider('test') 132 | url = 'http://resp-url' 133 | request_0 = Request(url) 134 | response_0 = Response(url) 135 | 136 | hs_downloader_middleware.process_request(request_0, spider) 137 | 138 | assert HS_REQUEST_ID_KEY not in request_0.meta 139 | assert HS_PARENT_ID_KEY not in request_0.meta 140 | assert len(hs_spider_middleware._seen_requests) == 0 141 | assert len(hs_downloader_middleware._seen_requests) == 0 142 | 143 | hs_downloader_middleware.process_response(request_0, response_0, spider) 144 | 145 | assert request_0.meta[HS_REQUEST_ID_KEY] == 0 146 | assert request_0.meta[HS_PARENT_ID_KEY] is None 147 | assert hs_spider_middleware._seen_requests[request_0] == 0 148 | 149 | request_1 = request_0.copy() 150 | response_1 = Response(url) 151 | assert request_1.meta[HS_REQUEST_ID_KEY] == 0 152 | assert request_1.meta[HS_PARENT_ID_KEY] is None 153 | 154 | hs_downloader_middleware.process_request(request_1, spider) 155 | 156 | assert HS_REQUEST_ID_KEY not in request_1.meta 157 | assert request_1.meta[HS_PARENT_ID_KEY] == 0 158 | 159 | hs_downloader_middleware.process_response(request_1, response_1, spider) 160 | 161 | assert request_1.meta[HS_REQUEST_ID_KEY] == 1 162 | assert request_1.meta[HS_PARENT_ID_KEY] == 0 163 | 164 | request_2 = request_1.copy() 165 | response_2_1 = DummyResponse(url, request_2) 166 | response_2_2 = Response(url) 167 | 168 | hs_downloader_middleware.process_response(request_2, response_2_1, spider) 169 | 170 | assert request_2.meta[HS_REQUEST_ID_KEY] == 1 171 | assert request_2.meta[HS_PARENT_ID_KEY] == 0 172 | 173 | hs_downloader_middleware.process_response(request_2, response_2_2, spider) 174 | 175 | assert request_2.meta[HS_REQUEST_ID_KEY] == 2 176 | assert request_2.meta[HS_PARENT_ID_KEY] == 0 177 | -------------------------------------------------------------------------------- /tests/test_settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import mock 4 | 5 | import pytest 6 | from scrapy import version_info as scrapy_version 7 | from 
scrapy.settings import Settings 8 | from scrapy.utils.python import to_unicode 9 | 10 | from sh_scrapy.settings import EntrypointSettings 11 | from sh_scrapy.settings import _enforce_required_settings 12 | from sh_scrapy.settings import _maybe_load_autoscraping_project 13 | from sh_scrapy.settings import _get_component_base 14 | from sh_scrapy.settings import _get_action_on_missing_addons 15 | from sh_scrapy.settings import _load_addons 16 | from sh_scrapy.settings import _populate_settings_base 17 | from sh_scrapy.settings import _load_default_settings 18 | from sh_scrapy.settings import _update_old_classpaths 19 | from sh_scrapy.settings import populate_settings 20 | 21 | 22 | TEST_ADDON = { 23 | 'addon_id': 'test_addon', 24 | 'name': 'Fake test addon', 25 | 'description': 'Some description', 26 | 'settings': ('TEST_SETTING_A', 'TEST_SETTING_B'), 27 | 'default_settings': {}, 28 | 'type': 'SPIDER_MIDDLEWARES', 29 | 'order': 10, 30 | 'path': 'scrapy.utils.misc.load_object', 31 | 'builtin': False, 32 | 'needs_aws': False, 33 | } 34 | 35 | 36 | def test_update_settings_void_dictionaries(): 37 | test = EntrypointSettings() 38 | test.setdict({}, 10) 39 | assert len(test.attributes) == 0 40 | 41 | 42 | def test_update_settings_base_test(): 43 | test = EntrypointSettings() 44 | test.setdict({'a': 'b'}, 10) 45 | assert test['a'] == 'b' 46 | 47 | 48 | def test_update_settings_base_test2(): 49 | test = EntrypointSettings() 50 | test.setdict({'a': 'b', 'c': 'd'}, 10) 51 | assert len(test.attributes) == 2 52 | 53 | 54 | def test_update_settings_dont_fail_on_non_string(): 55 | test = EntrypointSettings() 56 | test.setdict({'a': 3}, 10) 57 | assert test['a'] == 3 58 | 59 | 60 | def test_update_settings_update_existing_value(): 61 | test = EntrypointSettings() 62 | test.setdict({'a': 'b', 'c': 'd'}, priority=10) 63 | test.setdict({'c': 'e', 'f': 'g'}, 10) 64 | assert len(test.attributes) == 3 65 | assert test['a'] == 'b' 66 | assert test['c'] == 'e' 67 | assert test['f'] == 'g' 68 | 69 | 70 | def test_update_settings_per_key_priorities_old_behavior(): 71 | test = EntrypointSettings() 72 | test.set('ITEM_PIPELINES', {'path.one': 100}) 73 | test.set('ITEM_PIPELINES', {'path.two': 200}) 74 | assert test['ITEM_PIPELINES'] == {'path.two': 200} 75 | 76 | 77 | @pytest.mark.skipif(scrapy_version < (1, 1), reason="requires Scrapy>=1.1") 78 | def test_update_settings_per_key_priorities_new_behaviour(): 79 | from scrapy.settings import BaseSettings 80 | test = EntrypointSettings() 81 | test.set('ITEM_PIPELINES', BaseSettings()) 82 | test['ITEM_PIPELINES'].update({'test.path1': 100}) 83 | test['ITEM_PIPELINES'].update({'test.path2': 200}) 84 | assert dict(test['ITEM_PIPELINES']) == { 85 | 'test.path1': 100, 'test.path2': 200} 86 | 87 | 88 | @pytest.mark.skipif(sys.version_info[0] == 3, reason="requires python2") 89 | def test_update_settings_check_unicode_in_py2_key(): 90 | # a dict entry is duplicated as unicode doesn't match native str value 91 | test = EntrypointSettings() 92 | test.setdict({'\xf1e\xf1e\xf1e': 'test'}, 10) 93 | assert test['\xf1e\xf1e\xf1e'] == 'test' 94 | assert test[to_unicode('\xf1e\xf1e\xf1e')] == 'test' 95 | 96 | 97 | @pytest.mark.skipif(sys.version_info[0] == 3, reason="requires python2") 98 | def test_update_settings_check_unicode_in_py2_key_value(): 99 | # a dict entry is duplicated as unicode doesn't match native str value 100 | test = EntrypointSettings() 101 | test.setdict({'\xf1e\xf1e\xf1e': '\xf1e\xf1e'}, 10) 102 | assert test['\xf1e\xf1e\xf1e'] == '\xf1e\xf1e' 103 | native_key = 
to_unicode('\xf1e\xf1e\xf1e') 104 | assert test[native_key] == to_unicode('\xf1e\xf1e') 105 | 106 | 107 | @pytest.mark.skipif(sys.version_info < (3,), reason="requires python3") 108 | def test_update_settings_check_unicode_in_py3(): 109 | test = EntrypointSettings() 110 | test.setdict({'\xf1e\xf1e\xf1e': 'test'}, 10) 111 | assert test['\xf1e\xf1e\xf1e'] == 'test' 112 | 113 | 114 | def test_maybe_load_autoscraping_project_no_spider_type_env(): 115 | result = {} 116 | _maybe_load_autoscraping_project(result) 117 | assert result == {} 118 | 119 | 120 | @mock.patch.dict(os.environ, {'SHUB_SPIDER_TYPE': 'custom'}) 121 | def test_maybe_load_autoscraping_project_custom_type(): 122 | result = {} 123 | _maybe_load_autoscraping_project(result) 124 | assert result == {} 125 | 126 | 127 | @mock.patch.dict(os.environ, {'SHUB_SPIDER_TYPE': 'auto'}) 128 | def test_maybe_load_autoscraping_project_ok(): 129 | result = EntrypointSettings() 130 | result.setdict({'SPIDER_MANAGER_CLASS': 'test.class'}) 131 | _maybe_load_autoscraping_project(result) 132 | assert result['ITEM_PIPELINES'] == { 133 | 'slybot.dupefilter.DupeFilterPipeline': 0} 134 | assert result['PROJECT_ZIPFILE'] == 'project-slybot.zip' 135 | assert result['SLYCLOSE_SPIDER_ENABLED'] 136 | assert result['SLYDUPEFILTER_ENABLED'] 137 | assert result['SPIDER_MANAGER_CLASS'] == 'test.class' 138 | 139 | 140 | def test_get_component_base(): 141 | assert _get_component_base({}, 'TEST') == 'TEST' 142 | assert _get_component_base({'SOME_SETTING': 'VAL'}, 'TEST') == 'TEST' 143 | assert _get_component_base({'TEST_BASE': 'VAL'}, 'TEST') == 'TEST_BASE' 144 | 145 | 146 | def test_get_action_on_missing_addons_default(): 147 | o = EntrypointSettings() 148 | assert _get_action_on_missing_addons(o) == 'warn' 149 | 150 | 151 | def test_get_action_on_missing_addons_base(): 152 | o = EntrypointSettings() 153 | o.setdict({'ON_MISSING_ADDONS': 'fail'}) 154 | assert _get_action_on_missing_addons(o) == 'fail' 155 | 156 | 157 | def test_get_action_on_missing_addons_warn_if_wrong_value(): 158 | o = EntrypointSettings() 159 | o.setdict({'ON_MISSING_ADDONS': 'wrong'}) 160 | assert _get_action_on_missing_addons(o) == 'warn' 161 | 162 | 163 | def test_load_addons_void(): 164 | addons = [] 165 | settings, o = EntrypointSettings(), EntrypointSettings() 166 | _load_addons(addons, settings, o) 167 | assert addons == [] 168 | assert settings.attributes == o.attributes == {} 169 | 170 | 171 | def test_load_addons_basic_usage(): 172 | addons = [TEST_ADDON] 173 | settings = EntrypointSettings() 174 | settings.setdict({'SPIDER_MIDDLEWARES': {}}) 175 | o = EntrypointSettings() 176 | _load_addons(addons, settings, o) 177 | assert settings['SPIDER_MIDDLEWARES'] == {TEST_ADDON['path']: 10} 178 | assert o['SPIDER_MIDDLEWARES'] == {TEST_ADDON['path']: 10} 179 | 180 | 181 | def test_load_addons_basic_with_defaults(): 182 | addons = [TEST_ADDON.copy()] 183 | addons[0]['default_settings'] = {'TEST_SETTING_A': 'TEST'} 184 | settings = {'SPIDER_MIDDLEWARES_BASE': { 185 | 'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50, 186 | 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500}} 187 | o = EntrypointSettings() 188 | o.setdict({'ON_MISSING_ADDONS': 'warn'}) 189 | _load_addons(addons, settings, o) 190 | assert settings == {'SPIDER_MIDDLEWARES_BASE': { 191 | TEST_ADDON['path']: 10, 192 | 'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50, 193 | 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500 194 | }} 195 | assert len(o.attributes) == 3 196 | assert 
o['TEST_SETTING_A'] == 'TEST' 197 | assert o['ON_MISSING_ADDONS'] == 'warn' 198 | assert len(o['SPIDER_MIDDLEWARES_BASE']) == 3 199 | 200 | 201 | def test_load_addons_hworker_fail_on_import(): 202 | addons = [TEST_ADDON.copy()] 203 | addons[0]['path'] = 'hworker.some.module' 204 | settings = EntrypointSettings() 205 | settings.setdict({'SPIDER_MIDDLEWARES': {}}) 206 | o = EntrypointSettings() 207 | o.setdict({'ON_MISSING_ADDONS': 'fail'}) 208 | with pytest.raises(ImportError): 209 | _load_addons(addons, settings, o) 210 | 211 | 212 | def test_load_addons_hworker_error_on_import(): 213 | addons = [TEST_ADDON.copy()] 214 | addons[0]['path'] = 'hworker.some.module' 215 | settings = {'SPIDER_MIDDLEWARES': {}} 216 | o = EntrypointSettings() 217 | o.setdict({'ON_MISSING_ADDONS': 'error'}) 218 | _load_addons(addons, settings, o) 219 | assert len(o.attributes) == 1 220 | assert o['ON_MISSING_ADDONS'] == 'error' 221 | assert settings == {'SPIDER_MIDDLEWARES': {}} 222 | 223 | 224 | def test_load_addons_hworker_warning_on_import(): 225 | addons = [TEST_ADDON.copy()] 226 | addons[0]['path'] = 'hworker.some.module' 227 | settings = {'SPIDER_MIDDLEWARES': {}} 228 | o = EntrypointSettings() 229 | o.setdict({'ON_MISSING_ADDONS': 'warn'}) 230 | _load_addons(addons, settings, o) 231 | assert len(o.attributes) == 1 232 | assert o['ON_MISSING_ADDONS'] == 'warn' 233 | assert settings == {'SPIDER_MIDDLEWARES': {}} 234 | 235 | 236 | @mock.patch.dict('sh_scrapy.settings.REPLACE_ADDONS_PATHS', 237 | {TEST_ADDON['path']: 'scrapy.utils.misc.arg_to_iter'}) 238 | def test_load_addons_hworker_import_replace(): 239 | addons = [TEST_ADDON] 240 | settings = {'SPIDER_MIDDLEWARES': {}} 241 | o = EntrypointSettings() 242 | _load_addons(addons, settings, o) 243 | assert len(o.attributes) == 1 244 | assert o['SPIDER_MIDDLEWARES'] == {'scrapy.utils.misc.arg_to_iter': 10} 245 | 246 | 247 | def test_populate_settings_dont_fail(): 248 | result = _populate_settings_base({}, lambda x: x) 249 | assert isinstance(result, Settings) 250 | 251 | 252 | def test_populate_settings_with_default(): 253 | def default_test(s): 254 | s.set('TEST_SETTING_A', 'test') 255 | result = _populate_settings_base({}, default_test) 256 | assert result 257 | assert result['TEST_SETTING_A'] == 'test' 258 | 259 | 260 | def test_populate_settings_addons(): 261 | addon = TEST_ADDON.copy() 262 | addon['default_settings'] = {'TEST_SETTING_A': 'by_addon'} 263 | msg = {'enabled_addons': [addon]} 264 | result = _populate_settings_base(msg, lambda x: x) 265 | assert result 266 | assert result['TEST_SETTING_A'] == 'by_addon' 267 | 268 | 269 | def test_populate_settings_override_settings(): 270 | msg = {} 271 | for section in ['project_settings', 272 | 'organization_settings', 273 | 'job_settings']: 274 | msg[section] = {'TEST_SETTING_A': 'from_' + section} 275 | result = _populate_settings_base(msg, lambda x: x) 276 | assert result 277 | assert result['TEST_SETTING_A'] == 'from_' + section 278 | 279 | 280 | def test_populate_settings_with_spider(): 281 | msg = {'project_settings': {'JOBDIR': 'by_project'}, 282 | 'spider_settings': {'TEST_SETTING_A': 'test'}} 283 | result = _populate_settings_base(msg, lambda x: x, spider=True) 284 | assert result 285 | assert result['TEST_SETTING_A'] == 'test' 286 | assert result['JOBDIR'].split('/')[-1].startswith('jobdata-') 287 | assert not result.get('PROJECT_ZIPFILE') 288 | 289 | 290 | def test_populate_settings_with_spider_override(): 291 | msg = {'job_settings': {'JOBDIR': 'by_job'}} 292 | result = _populate_settings_base(msg, 
lambda x: x, spider=True) 293 | assert result 294 | assert result['JOBDIR'] == 'by_job' 295 | 296 | 297 | @mock.patch.dict(os.environ, {'SHUB_SPIDER_TYPE': 'portia'}) 298 | def test_populate_settings_with_spider_autoscraping(): 299 | result = _populate_settings_base({}, lambda x: x, spider=True) 300 | assert result 301 | assert result['PROJECT_ZIPFILE'] == 'project-slybot.zip' 302 | 303 | 304 | @mock.patch('sh_scrapy.settings.get_project_settings') 305 | def test_populate_settings_keep_user_priorities(get_settings_mock): 306 | get_settings_mock.return_value = Settings({ 307 | 'EXTENSIONS_BASE': { 308 | 'sh_scrapy.extension.HubstorageExtension': None, 309 | 'scrapy.spidermiddlewares.depth.DepthMiddleware': 10}, 310 | 'SPIDER_MIDDLEWARES_BASE': {'scrapy.utils.misc.load_object': 1}}) 311 | addon = TEST_ADDON.copy() 312 | api_settings = { 313 | 'project_settings': { 314 | 'EXTENSIONS_BASE': {'sh_scrapy.extension.HubstorageExtension': 300, 315 | 'scrapy.contrib.throttle.AutoThrottle': 5}}, 316 | 'enabled_addons': [addon]} 317 | result = _populate_settings_base(api_settings, lambda x: x, spider=True) 318 | assert result.getdict('SPIDER_MIDDLEWARES_BASE')[ 319 | 'scrapy.utils.misc.load_object'] == 1 320 | assert result.getdict('EXTENSIONS_BASE')[ 321 | 'sh_scrapy.extension.HubstorageExtension'] is None 322 | autothrottles = [k for k in result.getdict('EXTENSIONS_BASE') 323 | if 'AutoThrottle' in k] 324 | assert result.getdict('EXTENSIONS_BASE')[autothrottles[0]] == 5 325 | 326 | 327 | def test_populate_settings_unique_update_dict(): 328 | monitoring_dict = {u'SPIDER_OPENED': {u'failed_actions': []}} 329 | msg = {'spider_settings': {'DASH_MONITORING': monitoring_dict}} 330 | result = _populate_settings_base(msg, lambda x: x, spider=True) 331 | assert result['DASH_MONITORING'] == monitoring_dict 332 | 333 | 334 | @mock.patch('sh_scrapy.settings.get_project_settings') 335 | def test_populate_settings_keep_user_priorities_oldpath(get_settings_mock): 336 | get_settings_mock.return_value = Settings({ 337 | 'EXTENSIONS_BASE': {'scrapy.contrib.throttle.AutoThrottle': 0}}) 338 | api_settings = { 339 | 'project_settings': { 340 | 'EXTENSIONS_BASE': {'scrapy.contrib.throttle.AutoThrottle': 5}}} 341 | result = _populate_settings_base(api_settings, lambda x: x, spider=True) 342 | autothrottles = [k for k in result.getdict('EXTENSIONS_BASE') 343 | if 'AutoThrottle' in k] 344 | assert len(autothrottles) == 1 345 | assert result.getdict('EXTENSIONS_BASE')[autothrottles[0]] == 0 346 | 347 | 348 | def test_load_default_settings(): 349 | result = Settings({'EXTENSIONS_BASE': { 350 | 'sh_scrapy.extension.HubstorageExtension': 50}, 351 | 'SPIDER_MIDDLEWARES_BASE': {}}) 352 | _load_default_settings(result) 353 | extensions = result['EXTENSIONS_BASE'] 354 | assert extensions['scrapy.extensions.debug.StackTraceDump'] == 0 355 | assert extensions['sh_scrapy.extension.HubstorageExtension'] == 100 356 | assert 'slybot.closespider.SlybotCloseSpider' not in extensions 357 | spider_middlewares = result['SPIDER_MIDDLEWARES_BASE'] 358 | assert 'sh_scrapy.middlewares.HubstorageSpiderMiddleware' in spider_middlewares 359 | downloader_middlewares = result['DOWNLOADER_MIDDLEWARES_BASE'] 360 | assert 'sh_scrapy.middlewares.HubstorageDownloaderMiddleware' in downloader_middlewares 361 | assert result['MEMUSAGE_LIMIT_MB'] == 950 362 | 363 | 364 | @mock.patch.dict(os.environ, {'SHUB_JOB_MEMORY_LIMIT': '200'}) 365 | def test_load_default_settings_mem_limit(): 366 | result = Settings({'EXTENSIONS_BASE': {}, 367 | 
'SPIDER_MIDDLEWARES_BASE': {}}) 368 | _load_default_settings(result) 369 | assert result['MEMUSAGE_LIMIT_MB'] == 200 370 | 371 | 372 | def test_enforce_required_settings_default(): 373 | settings = Settings({}) 374 | _enforce_required_settings(settings) 375 | assert settings['LOG_STDOUT'] is False 376 | 377 | 378 | def test_enforce_required_settings_rewrite(): 379 | settings = Settings({'LOG_STDOUT': True}) 380 | _enforce_required_settings(settings) 381 | assert settings['LOG_STDOUT'] is False 382 | 383 | 384 | def test_populate_settings_dont_fail_no_spider(): 385 | result = populate_settings({}) 386 | assert isinstance(result, Settings) 387 | # check one of the settings provided by default by sh_scrapy 388 | assert result['TELNETCONSOLE_HOST'] == '0.0.0.0' 389 | 390 | 391 | def test_populate_settings_dont_fail_with_spider(): 392 | result = populate_settings({}, True) 393 | assert isinstance(result, Settings) 394 | # check one of the settings provided by default by sh_scrapy 395 | assert result['TELNETCONSOLE_HOST'] == '0.0.0.0' 396 | 397 | 398 | def test_populate_settings_check_required(): 399 | result = populate_settings({'LOG_STDOUT': True}) 400 | assert isinstance(result, Settings) 401 | # check that some settings fallback to required values 402 | assert result['LOG_STDOUT'] is False 403 | 404 | 405 | def test_update_old_classpaths_not_string(): 406 | 407 | class CustomObject(object): 408 | pass 409 | 410 | test_value = {'scrapy.exporter.CustomExporter': 1, 411 | 123: 2, CustomObject: 3} 412 | test_settings = Settings({'SOME_SETTING': test_value}) 413 | _update_old_classpaths(test_settings) 414 | expected = test_settings['SOME_SETTING'] 415 | assert len(expected) == 3 416 | assert 123 in expected 417 | assert CustomObject in expected 418 | assert 'scrapy.exporter.CustomExporter' in expected 419 | -------------------------------------------------------------------------------- /tests/test_stats.py: -------------------------------------------------------------------------------- 1 | import mock 2 | import pytest 3 | 4 | from scrapy.spiders import Spider 5 | from scrapy.utils.test import get_crawler 6 | 7 | from sh_scrapy import stats 8 | 9 | 10 | @pytest.fixture 11 | def collector(monkeypatch): 12 | monkeypatch.setattr('sh_scrapy.stats.pipe_writer', mock.Mock()) 13 | crawler = get_crawler(Spider) 14 | return stats.HubStorageStatsCollector(crawler) 15 | 16 | 17 | def test_collector_class_vars(collector): 18 | assert collector.INTERVAL == 30 19 | 20 | 21 | def test_collector_upload_stats(collector): 22 | stats = {'item_scraped_count': 10, 'scheduler/enqueued': 20} 23 | collector.set_stats(stats.copy()) 24 | collector._upload_stats() 25 | assert collector.pipe_writer.write_stats.call_count == 1 26 | collector.pipe_writer.write_stats.assert_called_with(stats.copy()) 27 | 28 | 29 | @mock.patch('twisted.internet.task.LoopingCall') 30 | def test_collector_open_spider(lcall, collector): 31 | collector.open_spider('spider') 32 | lcall.assert_called_with(collector._upload_stats) 33 | lcall.return_value.start.assert_called_with(collector.INTERVAL, now=True) 34 | dcall = lcall.return_value.start.return_value 35 | dcall.addErrback.assert_called_with( 36 | collector._setup_looping_call, now=False) 37 | 38 | 39 | def test_collector_close_spider(collector): 40 | collector._samplestask = mock.Mock() 41 | collector._samplestask.running = True 42 | stats = {'item_scraped_count': 10} 43 | collector.set_stats(stats.copy()) 44 | collector.close_spider('spider', 'reason') 45 | assert collector._samplestask.stop.called 
46 | collector.pipe_writer.write_stats.assert_called_with(stats.copy()) 47 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | from mock import patch 3 | 4 | from pytest import raises 5 | from scrapy.settings import Settings 6 | 7 | from sh_scrapy.utils import get_project_settings 8 | 9 | 10 | def test_get_project_settings_class(): 11 | settings = get_project_settings() 12 | assert isinstance(settings, Settings) 13 | 14 | 15 | def test_get_project_settings_default(): 16 | settings = get_project_settings() 17 | assert settings['TELNETCONSOLE_HOST'] == '0.0.0.0' 18 | 19 | 20 | @patch.dict( 21 | environ, 22 | { 23 | 'SHUB_SETTINGS': '{"project_settings": {"SETTING_TEST": "VAL"}}', 24 | } 25 | ) 26 | def test_get_project_settings_setting(): 27 | settings = get_project_settings() 28 | assert settings['SETTING_TEST'] == 'VAL' 29 | 30 | 31 | @patch.dict( 32 | environ, 33 | { 34 | 'SHUB_SETTINGS': '{"project_settings": {"SETTING....', 35 | } 36 | ) 37 | def test_get_project_settings_bad_setting(): 38 | with raises(ValueError): 39 | get_project_settings() 40 | -------------------------------------------------------------------------------- /tests/test_writer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import os 5 | import threading 6 | from queue import Queue 7 | 8 | import pytest 9 | 10 | from sh_scrapy.writer import _PipeWriter 11 | 12 | 13 | @pytest.fixture 14 | def fifo(tmpdir): 15 | path = os.path.join(str(tmpdir.mkdir('fifo')), 'scrapinghub') 16 | os.mkfifo(path) 17 | return path 18 | 19 | 20 | @pytest.fixture 21 | def queue(): 22 | return Queue() 23 | 24 | 25 | @pytest.fixture 26 | def reader(fifo, queue): 27 | def read_from_fifo(): 28 | with open(fifo) as f: 29 | for line in iter(f.readline, ''): 30 | queue.put(line) 31 | 32 | reader_thread = threading.Thread(target=read_from_fifo) 33 | reader_thread.start() 34 | try: 35 | yield reader_thread 36 | finally: 37 | reader_thread.join(timeout=1) 38 | 39 | 40 | @pytest.fixture 41 | def writer(fifo, reader): 42 | w = _PipeWriter(fifo) 43 | w.open() 44 | try: 45 | yield w 46 | finally: 47 | w.close() 48 | 49 | 50 | def test_close(writer): 51 | assert writer._pipe.closed is False 52 | writer.close() 53 | assert writer._pipe.closed is True 54 | 55 | 56 | def _parse_data_line(msg): 57 | assert msg.endswith('\n') 58 | cmd, _, payload = msg.strip().partition(' ') 59 | return cmd, json.loads(payload) 60 | 61 | 62 | def test_write_item(writer, queue): 63 | writer.write_item({'foo': 'bar'}) 64 | line = queue.get(timeout=1) 65 | assert queue.empty() 66 | cmd, payload = _parse_data_line(line) 67 | assert cmd == 'ITM' 68 | assert payload == {'foo': 'bar'} 69 | 70 | 71 | def test_write_request(writer, queue): 72 | writer.write_request( 73 | url='http://example.com/', 74 | status=200, 75 | method='GET', 76 | rs=1024, 77 | duration=102, 78 | parent=None, 79 | fp='fingerprint', 80 | ) 81 | line = queue.get(timeout=1) 82 | assert queue.empty() 83 | cmd, payload = _parse_data_line(line) 84 | assert cmd == 'REQ' 85 | assert isinstance(payload.pop('time'), int) 86 | assert payload == { 87 | 'url': 'http://example.com/', 88 | 'status': 200, 89 | 'method': 'GET', 90 | 'rs': 1024, 91 | 'duration': 102, 92 | 'parent': None, 93 | 'fp': 'fingerprint', 94 | } 95 | 96 | 97 | def test_write_log(writer, 
queue): 98 | writer.write_log( 99 | level=logging.INFO, 100 | message='text', 101 | ) 102 | line = queue.get(timeout=1) 103 | assert queue.empty() 104 | cmd, payload = _parse_data_line(line) 105 | assert cmd == 'LOG' 106 | assert isinstance(payload.pop('time'), int) 107 | assert payload == { 108 | 'message': 'text', 109 | 'level': logging.INFO 110 | } 111 | 112 | 113 | def test_write_stats(writer, queue): 114 | stats = {'item_scraped_count': 10, 'scheduler/enqueued': 20} 115 | writer.write_stats(stats.copy()) 116 | line = queue.get(timeout=1) 117 | assert queue.empty() 118 | cmd, payload = _parse_data_line(line) 119 | assert cmd == 'STA' 120 | assert isinstance(payload.pop('time'), int) 121 | assert payload == { 122 | 'stats': stats.copy() 123 | } 124 | 125 | 126 | def test_set_outcome(writer, queue): 127 | outcome = 'custom_outcome' 128 | writer.set_outcome(outcome) 129 | line = queue.get(timeout=1) 130 | assert queue.empty() 131 | cmd, payload = _parse_data_line(line) 132 | assert cmd == 'FIN' 133 | assert payload == { 134 | 'outcome': outcome 135 | } 136 | 137 | 138 | def test_writer_raises_runtime_error_if_not_configured(): 139 | error_msg = "Pipe writer is misconfigured, named pipe path is not set" 140 | w = _PipeWriter('') 141 | with pytest.raises(RuntimeError) as exc_info: 142 | w.write_log(10, 'message') 143 | assert exc_info.value.args[0] == error_msg 144 | with pytest.raises(RuntimeError) as exc_info: 145 | w.close() 146 | assert exc_info.value.args[0] == error_msg 147 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | from pathlib import Path 5 | from typing import Tuple, Optional, Union 6 | 7 | 8 | def call_command(cwd: Union[str, os.PathLike], *args: str) -> Tuple[str, str]: 9 | result = subprocess.run( 10 | args, 11 | cwd=str(cwd), 12 | stdout=subprocess.PIPE, 13 | stderr=subprocess.PIPE, 14 | universal_newlines=True, 15 | ) 16 | assert result.returncode == 0, result.stderr 17 | return result.stdout, result.stderr 18 | 19 | 20 | def call_scrapy_command(cwd: Union[str, os.PathLike], *args: str) -> Tuple[str, str]: 21 | args = (sys.executable, "-m", "scrapy.cmdline") + args 22 | return call_command(cwd, *args) 23 | 24 | 25 | def create_project(topdir: Path, spider_text: Optional[str] = None) -> Path: 26 | project_name = "foo" 27 | cwd = topdir 28 | call_scrapy_command(str(cwd), "startproject", project_name) 29 | cwd /= project_name 30 | (cwd / project_name / "spiders" / "spider.py").write_text(spider_text or """ 31 | from scrapy import Spider 32 | 33 | class MySpider(Spider): 34 | name = "myspider" 35 | """) 36 | return cwd 37 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # tox.ini 2 | [tox] 3 | envlist = py36-scrapy16, py 4 | requires = 5 | # https://github.com/pypa/virtualenv/issues/2550 6 | virtualenv<=20.21.1 7 | 8 | [testenv] 9 | deps = 10 | pytest 11 | pytest-cov 12 | mock 13 | hubstorage 14 | packaging 15 | py36-scrapy16: Scrapy==1.6 16 | scrapy-spider-metadata>=0.1.1; python_version >= "3.8" 17 | pydantic>=2; python_version >= "3.8" 18 | 19 | commands = 20 | pytest --verbose --cov=sh_scrapy --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: sh_scrapy tests} 21 | 
--------------------------------------------------------------------------------