├── .bumpversion.cfg ├── .github └── workflows │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── CHANGES.rst ├── LICENSE ├── MANIFEST.in ├── README.md ├── setup.cfg ├── setup.py ├── sh_scrapy ├── __init__.py ├── commands │ ├── __init__.py │ └── shub_image_info.py ├── compat.py ├── crawl.py ├── diskquota.py ├── env.py ├── exceptions.py ├── extension.py ├── hsref.py ├── log.py ├── middlewares.py ├── settings.py ├── stats.py ├── utils.py └── writer.py ├── tests ├── __init__.py ├── conftest.py ├── test_command.py ├── test_compat.py ├── test_crawl.py ├── test_diskquota.py ├── test_env.py ├── test_extension.py ├── test_hsref.py ├── test_log.py ├── test_middlewares.py ├── test_settings.py ├── test_stats.py ├── test_utils.py ├── test_writer.py └── utils.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.17.5 3 | commit = True 4 | tag = True 5 | tag_name = v{new_version} 6 | 7 | [bumpversion:file:sh_scrapy/__init__.py] 8 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | publish: 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v4 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: 3.9 17 | 18 | - name: Publish to PyPI 19 | run: | 20 | pip install --upgrade pip 21 | pip install --upgrade setuptools wheel twine 22 | python setup.py sdist bdist_wheel 23 | export TWINE_USERNAME=__token__ 24 | export TWINE_PASSWORD=${{ secrets.PYPI_TOKEN }} 25 | twine upload dist/* 26 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | tests-ubuntu: 7 | name: "Test: py${{ matrix.python-version }}, Ubuntu" 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | include: 13 | - python-version: "3.6" 14 | os: ubuntu-20.04 15 | env: 16 | TOXENV: py36-scrapy16 17 | - python-version: "3.6" 18 | os: ubuntu-20.04 19 | env: 20 | TOXENV: py 21 | - python-version: "3.7" 22 | os: ubuntu-22.04 23 | env: 24 | TOXENV: py 25 | - python-version: "3.8" 26 | os: ubuntu-latest 27 | env: 28 | TOXENV: py 29 | - python-version: "3.9" 30 | os: ubuntu-latest 31 | env: 32 | TOXENV: py 33 | - python-version: "3.10" 34 | os: ubuntu-latest 35 | env: 36 | TOXENV: py 37 | - python-version: "3.11" 38 | os: ubuntu-latest 39 | env: 40 | TOXENV: py 41 | - python-version: "3.12" 42 | os: ubuntu-latest 43 | env: 44 | TOXENV: py 45 | - python-version: "3.13" 46 | os: ubuntu-latest 47 | env: 48 | TOXENV: py 49 | 50 | steps: 51 | - uses: actions/checkout@v4 52 | 53 | - name: Set up Python ${{ matrix.python-version }} 54 | uses: actions/setup-python@v5 55 | with: 56 | python-version: ${{ matrix.python-version }} 57 | 58 | - name: Update pip & install tox 59 | run: | 60 | pip install -U pip 61 | pip install tox 62 | 63 | - name: Run tests 64 | env: ${{ matrix.env }} 65 | run: tox 66 | 67 | - name: Upload coverage report 68 | uses: codecov/codecov-action@v5 69 | with: 70 | token: ${{ secrets.CODECOV_TOKEN }} 71 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | build 2 | dist 3 | *.egg-info 4 | *.pyc 5 | .eggs/ 6 | .tox 7 | /.coverage 8 | coverage.xml 9 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Changes 3 | ======= 4 | 5 | 0.17.4 (2024-07-08) 6 | =================== 7 | 8 | - Fixed an exception when running scripts with importlib_ installed, 9 | introduced in 0.17.3. 10 | 11 | 12 | 0.17.3 (2024-06-17) 13 | =================== 14 | 15 | - Replaced a use of the deprecated pkg_resources_ module with importlib_. 16 | 17 | .. _pkg_resources: https://setuptools.pypa.io/en/latest/pkg_resources.html 18 | .. _importlib: https://docs.python.org/3/library/importlib.html 19 | 20 | 21 | 0.17.2 (2024-02-20) 22 | =================== 23 | 24 | - Added official support for Python 3.11 and 3.12. 25 | 26 | - Added support for centralized request fingerprints on Scrapy 2.7 and 27 | higher. 28 | 29 | - Started this change log. Check `GitHub releases`_ for older releases until 30 | 0.12.0, and the `commit history`_ for the complete history. 31 | 32 | .. _commit history: https://github.com/scrapinghub/scrapinghub-entrypoint-scrapy/commits/master/ 33 | .. _GitHub releases: https://github.com/scrapinghub/scrapinghub-entrypoint-scrapy/releases 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Scrapinghub 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of extruct nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # scrapinghub-entrypoint-scrapy 3 | 4 | [![version](https://img.shields.io/pypi/v/scrapinghub-entrypoint-scrapy.svg)](https://pypi.python.org/pypi/scrapinghub-entrypoint-scrapy) 5 | [![pyversions](https://img.shields.io/pypi/pyversions/scrapinghub-entrypoint-scrapy.svg)](https://pypi.python.org/pypi/scrapinghub-entrypoint-scrapy) 6 | [![actions](https://github.com/scrapinghub/scrapinghub-entrypoint-scrapy/workflows/Tests/badge.svg)](https://github.com/scrapinghub/scrapinghub-entrypoint-scrapy/actions) 7 | [![codecov](https://codecov.io/gh/scrapinghub/scrapinghub-entrypoint-scrapy/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapinghub/scrapinghub-entrypoint-scrapy) 8 | 9 | Scrapy entrypoint for the Scrapinghub job runner. 10 | 11 | The package implements a base wrapper layer that extracts job data from 12 | the environment, parses and prepares it properly, and executes the job 13 | using Scrapy or a custom executor. 14 | 15 | 16 | ## Features 17 | 18 | - parsing job data from the environment 19 | - processing job args and settings 20 | - running a job with Scrapy 21 | - collecting stats 22 | - advanced logging & error handling 23 | - full hubstorage support 24 | - custom scripts support 25 | 26 | 27 | ## Install 28 | 29 | pip install scrapinghub-entrypoint-scrapy 30 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | [sdist_dsc] 4 | Package: scrapinghub-entrypoint-scrapy 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | from sh_scrapy import __version__ 4 | 5 | 6 | setup( 7 | name='scrapinghub-entrypoint-scrapy', 8 | version=__version__, 9 | license='BSD', 10 | description='Scrapy entrypoint for Scrapinghub job runner', 11 | long_description=open('README.md').read(), 12 | packages=find_packages(), 13 | install_requires=[ 14 | 'Scrapy>=1.6', 15 | 'scrapinghub>=2.1.0', 16 | ], 17 | entry_points={ 18 | 'console_scripts': [ 19 | 'start-crawl = sh_scrapy.crawl:main', 20 | 'list-spiders = sh_scrapy.crawl:list_spiders', 21 | 'shub-image-info = sh_scrapy.crawl:shub_image_info', 22 | ], 23 | }, 24 | python_requires='>=3.6', 25 | classifiers=[ 26 | 'Framework :: Scrapy', 27 | 'Development Status :: 5 - Production/Stable', 28 | 'Intended Audience :: Developers', 29 | 'License :: OSI Approved :: BSD License', 30 | 'Operating System :: OS Independent', 31 | 'Programming Language :: Python', 32 | 'Programming Language :: Python :: 3', 33 | 'Programming Language :: Python :: 3.6', 34 | 'Programming Language :: Python :: 3.7', 35 | 'Programming Language :: Python :: 3.8', 36 | 'Programming Language :: Python :: 3.9', 37 | 'Programming Language :: Python :: 3.10', 38 | 'Programming Language :: Python :: 3.11', 39 | 'Programming Language :: Python :: 3.12', 40 | 'Programming Language :: Python :: 3.13', 41 | 'Topic :: Utilities', 42 | ], 43 | ) 44 |
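A project deployed with this entrypoint is expected to expose its Scrapy settings module through a `scrapy` entry point, which `sh_scrapy.crawl` looks up at runtime (see the SCRAPY_SETTINGS_ENTRYPOINT_NOT_FOUND message in sh_scrapy/crawl.py below). A minimal sketch of a user project's setup.py illustrating that contract; the project name, settings module and script path are hypothetical placeholders, not part of this repository:

# Hypothetical setup.py of a project deployed with this entrypoint (names are placeholders).
from setuptools import setup, find_packages

setup(
    name='myproject',
    version='1.0',
    packages=find_packages(),
    # The 'scrapy' entry point is what sh_scrapy.crawl searches for to locate settings.
    entry_points={'scrapy': ['settings = myproject.settings']},
    # Presumably how custom scripts are bundled so they can be run as jobs via the
    # 'py:' prefix handled in sh_scrapy.crawl._run_pkgscript.
    scripts=['bin/my_script.py'],
)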
-------------------------------------------------------------------------------- /sh_scrapy/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.17.5" 2 | -------------------------------------------------------------------------------- /sh_scrapy/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /sh_scrapy/commands/shub_image_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function 3 | import json 4 | import subprocess 5 | 6 | from scrapy.commands import ScrapyCommand 7 | 8 | 9 | class Command(ScrapyCommand): 10 | requires_project = True 11 | default_settings = {'LOG_ENABLED': False} 12 | 13 | IMAGE_INFO_CMD = ' && '.join([ 14 | "printf 'Linux packages:\n'", "dpkg -l", 15 | "printf '\nPython packages:\n'", "pip freeze", 16 | ]) 17 | 18 | def short_desc(self): 19 | return "Print JSON-encoded project metadata." 20 | 21 | def add_options(self, parser): 22 | super(Command, self).add_options(parser) 23 | # backward compatibility for optparse/argparse 24 | try: 25 | add_argument = parser.add_argument 26 | except AttributeError: 27 | add_argument = parser.add_option 28 | add_argument( 29 | "--debug", 30 | action="store_true", 31 | help="add debugging information such as list of " 32 | "installed Debian packages and Python packages.", 33 | ) 34 | 35 | def run(self, args, opts): 36 | result = { 37 | 'project_type': 'scrapy', 38 | 'spiders': sorted(self.crawler_process.spider_loader.list()), 39 | } 40 | try: 41 | from scrapy_spider_metadata import get_spider_metadata 42 | except ImportError: 43 | pass 44 | else: 45 | result['metadata'] = {} 46 | for spider_name in result['spiders']: 47 | spider_cls = self.crawler_process.spider_loader.load(spider_name) 48 | metadata_dict = get_spider_metadata(spider_cls, normalize=True) 49 | try: 50 | # make sure it's serializable 51 | json.dumps(metadata_dict) 52 | except (TypeError, ValueError): 53 | continue 54 | result['metadata'][spider_name] = metadata_dict 55 | if opts.debug: 56 | output = subprocess.check_output( 57 | ['bash', '-c', self.IMAGE_INFO_CMD], 58 | stderr=subprocess.STDOUT, 59 | universal_newlines=True, 60 | ) 61 | result['debug'] = output 62 | print(json.dumps(result)) 63 | -------------------------------------------------------------------------------- /sh_scrapy/compat.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from scrapy.exceptions import ScrapyDeprecationWarning 4 | from scrapy.utils.decorators import deprecated 5 | from scrapy.utils.python import ( 6 | to_bytes as scrapy_to_bytes, 7 | to_unicode as scrapy_to_unicode, 8 | ) 9 | 10 | 11 | IS_PYTHON2 = False 12 | STRING_TYPE = str 13 | TEXT_TYPE = str 14 | BINARY_TYPE = bytes 15 | 16 | 17 | warnings.warn( 18 | "The sh_scrapy.compat module is deprecated, use the functions in scrapy.utils.python instead", 19 | category=ScrapyDeprecationWarning, 20 | stacklevel=2, 21 | ) 22 | 23 | 24 | def is_string(var): 25 | warnings.warn( 26 | "is_string(var) is deprecated, please use isinstance(var, str) instead", 27 | category=ScrapyDeprecationWarning, 28 | stacklevel=2, 29 | ) 30 | return isinstance(var, str) 31 | 32 | 33 | @deprecated("scrapy.utils.python.to_bytes") 34 | def to_bytes(text, encoding=None, 
errors='strict'): 35 | return scrapy_to_bytes(text, encoding, errors) 36 | 37 | 38 | @deprecated("scrapy.utils.python.to_unicode") 39 | def to_native_str(text, encoding=None, errors='strict'): 40 | return scrapy_to_unicode(text, encoding, errors) 41 | 42 | 43 | @deprecated("scrapy.utils.python.to_unicode") 44 | def to_unicode(text, encoding=None, errors='strict'): 45 | return scrapy_to_unicode(text, encoding, errors) 46 | -------------------------------------------------------------------------------- /sh_scrapy/crawl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # --------------------- DO NOT ADD IMPORTS HERE ------------------------- 3 | # Add them below so that any import errors are caught and sent to sentry 4 | # ----------------------------------------------------------------------- 5 | from __future__ import print_function 6 | import os 7 | import sys 8 | import socket 9 | import logging 10 | import datetime 11 | import warnings 12 | from contextlib import contextmanager 13 | # XXX: Do not use atexit to close the Hubstorage client! 14 | # why: functions registered with atexit are called when run_script() finishes, 15 | # and at that point the main() function hasn't completed yet, leading to lost 16 | # log messages. 17 | 18 | from sh_scrapy.exceptions import SHScrapyDeprecationWarning 19 | 20 | # Keep a reference to standard output/error as they are redirected 21 | # at log initialization 22 | _sys_stderr = sys.stderr # stderr and stdout are redirected to HS later 23 | _sys_stdout = sys.stdout 24 | # Sentry DSN is passed by environment variable 25 | _hworker_sentry_dsn = os.environ.pop('HWORKER_SENTRY_DSN', None) 26 | _sentry_dsn = os.environ.pop('SENTRY_DSN', _hworker_sentry_dsn) 27 | 28 | # Set default socket timeout for code that doesn't set one 29 | socket.setdefaulttimeout(60.0) 30 | 31 | 32 | SCRAPY_SETTINGS_ENTRYPOINT_NOT_FOUND = """ 33 | Scrapy distribution with `scrapy.settings` entrypoint is not found. 34 | The entrypoint should be specified in your project setup.py, please make sure 35 | you specified it in the following format: 36 | setup( 37 | ..., 38 | entry_points = {'scrapy': ['settings = your_project.settings']}, 39 | ... 40 | ) 41 | Check the link for more details: 42 | https://setuptools.readthedocs.io/en/latest/pkg_resources.html#entry-points 43 | """ 44 | 45 | 46 | @contextmanager 47 | def ignore_warnings(**kwargs): 48 | """Context manager that creates a temporary filter to ignore warnings. 49 | 50 | This context manager behaves similarly to warnings.catch_warnings, though 51 | filtered warnings aren't recorded and you can ignore them by criteria 52 | matching warnings.simplefilter arguments. 53 | 54 | Like warnings.catch_warnings, this context manager is not thread-safe. 55 | """ 56 | warnings.warn( 57 | "The sh_scrapy.crawl.ignore_warnings function is deprecated.", 58 | category=SHScrapyDeprecationWarning, 59 | stacklevel=2, 60 | ) 61 | _filters = warnings.filters[:] 62 | warnings.filterwarnings('ignore', **kwargs) 63 | yield 64 | warnings.filters = _filters 65 | 66 | 67 | def _fatalerror(): 68 | # Log error to hworker slotN.out 69 | # Inspired by logging.Handler.handleError() 70 | # 71 | # Capture exc_info early on, so that an error in the handler doesn't 72 | # overwrite it.
73 | import traceback 74 | ei = sys.exc_info() 75 | 76 | if _sentry_dsn: 77 | try: 78 | from raven import Client 79 | except ImportError: 80 | # Do not fail here, previous error is more important 81 | print('HWORKER_SENTRY_DSN is set but python-raven ' 82 | 'is not installed', file=_sys_stderr) 83 | else: 84 | try: 85 | Client(_sentry_dsn).captureException() 86 | except Exception as err: 87 | print(datetime.datetime.utcnow().isoformat(), 88 | "Error when sending fatal error to sentry:", err, 89 | file=_sys_stderr) 90 | 91 | # Log error to hworker slotN.out 92 | # Inspired by logging.Handler.handleError() 93 | try: 94 | print(datetime.datetime.utcnow().isoformat(), end=' ', 95 | file=_sys_stderr) 96 | traceback.print_exception(ei[0], ei[1], ei[2], None, _sys_stderr) 97 | except IOError: 98 | pass 99 | finally: 100 | del ei 101 | 102 | 103 | def _get_apisettings(): 104 | from sh_scrapy.env import decode_uri 105 | return decode_uri(envvar='SHUB_SETTINGS') or {} 106 | 107 | 108 | def _run(args, settings): 109 | if args[0] == 'scrapy': 110 | _run_scrapy(args, settings) 111 | else: 112 | _run_pkgscript(args) 113 | 114 | 115 | def _run_scrapy(argv, settings): 116 | from scrapy.cmdline import execute 117 | sys.argv = argv 118 | execute(settings=settings) 119 | 120 | 121 | def _run_pkgscript(argv): 122 | if argv[0].startswith('py:'): 123 | argv[0] = argv[0][3:] 124 | scriptname = argv[0] 125 | sys.argv = argv 126 | 127 | try: 128 | import importlib.metadata 129 | has_importlib = True 130 | except ImportError: 131 | import pkg_resources 132 | has_importlib = False 133 | 134 | def get_distribution(): 135 | if has_importlib: 136 | eps = importlib.metadata.entry_points(group='scrapy') 137 | else: 138 | eps = pkg_resources.WorkingSet().iter_entry_points('scrapy') 139 | 140 | for ep in eps: 141 | if ep.name == 'settings': 142 | return ep.dist 143 | 144 | d = get_distribution() 145 | if not d: 146 | raise ValueError(SCRAPY_SETTINGS_ENTRYPOINT_NOT_FOUND) 147 | ns = {"__name__": "__main__"} 148 | if has_importlib: 149 | _run_script(d, scriptname, ns) 150 | else: 151 | d.run_script(scriptname, ns) 152 | 153 | 154 | def _run_script(dist, script_name, namespace): 155 | # An importlib-based replacement for pkg_resources.NullProvider.run_script(). 156 | # It's possible that this doesn't support all cases that pkg_resources does, 157 | # so it may need to be improved when those are discovered. 158 | # Using a private attribute (dist._path) seems to be necessary to get the 159 | # full file path, but it's only needed for diagnostic messages so it should 160 | # be easy to fix this by moving to relative paths if this API is removed. 
161 | script = "scripts/" + script_name 162 | source = dist.read_text(script) 163 | if not source: 164 | raise ValueError( 165 | f"Script {script!r} not found in metadata at {dist._path!r}" 166 | ) 167 | script_filename = dist._path.joinpath(script) 168 | code = compile(source, str(script_filename), "exec") 169 | exec(code, namespace, namespace) 170 | 171 | 172 | def _run_usercode(spider, args, apisettings_func, 173 | log_handler=None, commands_module=None): 174 | try: 175 | from scrapy.exceptions import ScrapyDeprecationWarning 176 | from sh_scrapy.settings import populate_settings 177 | 178 | with warnings.catch_warnings(): 179 | warnings.filterwarnings("ignore", category=ScrapyDeprecationWarning) 180 | settings = populate_settings(apisettings_func(), spider) 181 | if commands_module: 182 | settings.set('COMMANDS_MODULE', commands_module, priority=40) 183 | if log_handler is not None: 184 | log_handler.setLevel(settings['LOG_LEVEL']) 185 | except Exception: 186 | logging.exception('Settings initialization failed') 187 | raise 188 | 189 | try: 190 | _run(args, settings) 191 | except Exception: 192 | logging.exception('Job runtime exception') 193 | raise 194 | 195 | 196 | def _launch(): 197 | try: 198 | from scrapy.exceptions import ScrapyDeprecationWarning 199 | warnings.filterwarnings( 200 | 'ignore', category=ScrapyDeprecationWarning, module='^sh_scrapy') 201 | from sh_scrapy.env import get_args_and_env, decode_uri 202 | job = decode_uri(envvar='SHUB_JOB_DATA') 203 | assert job, 'SHUB_JOB_DATA must be set' 204 | args, env = get_args_and_env(job) 205 | os.environ.update(env) 206 | 207 | from sh_scrapy.log import initialize_logging 208 | from sh_scrapy.settings import populate_settings # NOQA 209 | from sh_scrapy.env import setup_environment 210 | loghdlr = initialize_logging() 211 | setup_environment() 212 | except: 213 | _fatalerror() 214 | raise 215 | 216 | _run_usercode(job['spider'], args, _get_apisettings, loghdlr) 217 | 218 | 219 | def list_spiders(): 220 | """ An entrypoint for list-spiders.""" 221 | warnings.warn( 222 | "The sh_scrapy.crawl.list_spiders function is deprecated.", 223 | category=SHScrapyDeprecationWarning, 224 | stacklevel=2, 225 | ) 226 | try: 227 | from scrapy.exceptions import ScrapyDeprecationWarning 228 | warnings.filterwarnings( 229 | 'ignore', category=ScrapyDeprecationWarning, module='^sh_scrapy') 230 | from sh_scrapy.env import setup_environment 231 | setup_environment() 232 | except: 233 | _fatalerror() 234 | raise 235 | 236 | _run_usercode(None, ['scrapy', 'list'], _get_apisettings) 237 | 238 | 239 | def shub_image_info(): 240 | """shub-image-info command 241 | 242 | http://shub.readthedocs.io/en/latest/custom-images-contract.html#contract-statements 243 | 244 | """ 245 | try: 246 | from scrapy.exceptions import ScrapyDeprecationWarning 247 | warnings.filterwarnings( 248 | 'ignore', category=ScrapyDeprecationWarning, module='^sh_scrapy') 249 | from sh_scrapy.env import setup_environment 250 | setup_environment() 251 | except: 252 | _fatalerror() 253 | raise 254 | 255 | _run_usercode(None, ['scrapy', 'shub_image_info'] + sys.argv[1:], 256 | _get_apisettings, commands_module='sh_scrapy.commands') 257 | 258 | 259 | def main(): 260 | try: 261 | from sh_scrapy.writer import pipe_writer 262 | pipe_writer.open() 263 | except Exception: 264 | _fatalerror() 265 | return 1 266 | try: 267 | _launch() 268 | except SystemExit as e: 269 | return e.code 270 | except: 271 | # exception was already handled and logged inside _launch() 272 | return 1 273 | finally: 274 | 
sys.stderr = _sys_stderr 275 | sys.stdout = _sys_stdout 276 | return 0 277 | 278 | 279 | if __name__ == '__main__': 280 | sys.exit(main()) 281 | -------------------------------------------------------------------------------- /sh_scrapy/diskquota.py: -------------------------------------------------------------------------------- 1 | """ 2 | DiskQuota downloader and spider middlewares. 3 | The goal is to catch disk quota errors and stop spider gently. 4 | """ 5 | 6 | from scrapy.exceptions import NotConfigured 7 | 8 | 9 | class DiskQuota(object): 10 | 11 | def __init__(self, crawler): 12 | if not crawler.settings.getbool('DISK_QUOTA_STOP_ON_ERROR'): 13 | raise NotConfigured 14 | self.crawler = crawler 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | return cls(crawler) 19 | 20 | def _is_disk_quota_error(self, error): 21 | return isinstance(error, (OSError, IOError)) and error.errno == 122 22 | 23 | 24 | class DiskQuotaDownloaderMiddleware(DiskQuota): 25 | 26 | def process_exception(self, request, exception, spider): 27 | if self._is_disk_quota_error(exception): 28 | self.crawler.engine.close_spider(spider, 'diskusage_exceeded') 29 | 30 | 31 | class DiskQuotaSpiderMiddleware(DiskQuota): 32 | 33 | def process_spider_exception(self, response, exception, spider): 34 | if self._is_disk_quota_error(exception): 35 | self.crawler.engine.close_spider(spider, 'diskusage_exceeded') 36 | -------------------------------------------------------------------------------- /sh_scrapy/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import codecs 4 | from base64 import b64decode 5 | 6 | from scrapy.utils.python import to_bytes, to_unicode 7 | 8 | 9 | def _make_scrapy_args(arg, args_dict): 10 | if not args_dict: 11 | return [] 12 | args = [] 13 | for k, v in sorted(dict(args_dict).items()): 14 | args += [arg, "{}={}".format( 15 | to_unicode(k), to_unicode(v) if isinstance(v, str) else v)] 16 | return args 17 | 18 | 19 | def _scrapy_crawl_args_and_env(msg): 20 | args = ['scrapy', 'crawl', str(msg['spider'])] + \ 21 | _make_scrapy_args('-a', msg.get('spider_args')) + \ 22 | _make_scrapy_args('-s', msg.get('settings')) 23 | env = { 24 | 'SCRAPY_JOB': msg['key'], 25 | 'SCRAPY_SPIDER': msg['spider'], 26 | 'SCRAPY_PROJECT_ID': msg['key'].split('/')[0], 27 | # the following should be considered deprecated 28 | 'SHUB_SPIDER_TYPE': msg.get('spider_type', '') 29 | } 30 | return args, env 31 | 32 | 33 | def _job_args_and_env(msg): 34 | env = msg.get('job_env') 35 | if not isinstance(env, dict): 36 | env = {} 37 | cmd = msg.get('job_cmd') 38 | if not isinstance(cmd, list): 39 | cmd = [str(cmd)] 40 | return cmd, {to_unicode(k): to_unicode(v) if isinstance(v, str) else v 41 | for k, v in sorted(dict(env).items())} 42 | 43 | 44 | def _jobname(msg): 45 | if 'job_name' in msg: 46 | return msg['job_name'] 47 | elif 'spider' in msg: 48 | return msg['spider'] 49 | else: 50 | return msg['job_cmd'][0] 51 | 52 | 53 | def _jobauth(msg): 54 | auth_data = to_bytes('{0[key]}:{0[auth]}'.format(msg)) 55 | return to_unicode(codecs.encode(auth_data, 'hex_codec')) 56 | 57 | 58 | def get_args_and_env(msg): 59 | envf = _job_args_and_env if 'job_cmd' in msg else _scrapy_crawl_args_and_env 60 | args, env = envf(msg) 61 | if 'api_url' in msg: 62 | env['SHUB_APIURL'] = msg.get('api_url') 63 | 64 | env.update({ 65 | 'SHUB_JOBKEY': msg['key'], 66 | 'SHUB_JOBAUTH': _jobauth(msg), 67 | 'SHUB_JOBNAME': _jobname(msg), 68 | 'SHUB_JOB_TAGS': ','.join(msg.get('tags') 
or ()), # DEPRECATED? 69 | }) 70 | return args, env 71 | 72 | 73 | def decode_uri(uri=None, envvar=None): 74 | """Return content for a data: or file: URI 75 | 76 | >>> decode_uri('data:application/json;charset=utf8;base64,ImhlbGxvIHdvcmxkIg==') 77 | u'hello world' 78 | >>> decode_uri('data:;base64,ImhlbGxvIHdvcmxkIg==') 79 | u'hello world' 80 | >>> decode_uri('{"spider": "hello"}') 81 | {u'spider': u'hello'} 82 | 83 | """ 84 | if envvar is not None: 85 | uri = os.getenv(envvar, '') 86 | elif uri is None: 87 | raise ValueError('An uri or envvar is required') 88 | 89 | mime_type = 'application/json' 90 | 91 | # data:[][;charset=][;base64], 92 | if uri.startswith('data:'): 93 | prefix, _, data = uri.rpartition(',') 94 | mods = {} 95 | for idx, value in enumerate(prefix[5:].split(';')): 96 | if idx == 0: 97 | mime_type = value or mime_type 98 | elif '=' in value: 99 | k, _, v = value.partition('=') 100 | mods[k] = v 101 | else: 102 | mods[value] = None 103 | 104 | if 'base64' in mods: 105 | data = b64decode(data) 106 | if mime_type == 'application/json': 107 | data = data.decode(mods.get('charset', 'utf-8')) 108 | return json.loads(data) 109 | else: 110 | return data 111 | 112 | if uri.startswith('{'): 113 | return json.loads(uri) 114 | 115 | if uri.startswith('/'): 116 | uri = 'file://' + uri 117 | if uri.startswith('file://'): 118 | reader = codecs.getreader("utf-8") 119 | with open(uri[7:], 'rb') as data_file: 120 | return json.load(reader(data_file)) 121 | 122 | 123 | def setup_environment(): 124 | # scrapy.cfg is required by scrapy.utils.project.data_path 125 | # FIXME: drop this requirement 126 | if not os.path.exists('scrapy.cfg'): 127 | open('scrapy.cfg', 'w').close() 128 | -------------------------------------------------------------------------------- /sh_scrapy/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | class SHScrapyDeprecationWarning(Warning): 5 | """Warning category for deprecated features, since the default 6 | DeprecationWarning is silenced on Python 2.7+ 7 | """ 8 | -------------------------------------------------------------------------------- /sh_scrapy/extension.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import suppress 3 | from warnings import warn 4 | from weakref import WeakKeyDictionary 5 | 6 | import scrapy 7 | from scrapy import signals 8 | from scrapy import version_info as SCRAPY_VERSION_INFO 9 | from scrapy.exporters import PythonItemExporter 10 | from scrapy.http import Request 11 | from scrapy.utils.deprecate import create_deprecated_class 12 | 13 | from sh_scrapy import hsref 14 | from sh_scrapy.exceptions import SHScrapyDeprecationWarning 15 | from sh_scrapy.middlewares import HS_PARENT_ID_KEY, request_id_sequence 16 | from sh_scrapy.writer import pipe_writer 17 | 18 | 19 | try: 20 | from itemadapter import ItemAdapter 21 | except ImportError: 22 | _base_item_cls = [dict, scrapy.item.Item] 23 | with suppress(AttributeError): 24 | _base_item_cls.append(scrapy.item.BaseItem) 25 | _base_item_cls = tuple(_base_item_cls) 26 | 27 | def is_item(item): 28 | return isinstance(item, _base_item_cls) 29 | else: 30 | def is_item(item): 31 | return ItemAdapter.is_item(item) 32 | 33 | 34 | class HubstorageExtension(object): 35 | """Extension to write scraped items to HubStorage""" 36 | 37 | def __init__(self, crawler): 38 | self.hsref = hsref.hsref 39 | self.pipe_writer = pipe_writer 40 | self.crawler = 
crawler 41 | self.logger = logging.getLogger(__name__) 42 | self._write_item = self.pipe_writer.write_item 43 | kwargs = {} 44 | if SCRAPY_VERSION_INFO < (2, 11): 45 | kwargs["binary"] = False 46 | self.exporter = PythonItemExporter(**kwargs) 47 | 48 | @classmethod 49 | def from_crawler(cls, crawler): 50 | o = cls(crawler) 51 | crawler.signals.connect(o.item_scraped, signals.item_scraped) 52 | crawler.signals.connect(o.spider_closed, signals.spider_closed) 53 | return o 54 | 55 | def item_scraped(self, item, spider): 56 | if not is_item(item): 57 | self.logger.error("Wrong item type: %s" % item) 58 | return 59 | type_ = type(item).__name__ 60 | item = self.exporter.export_item(item) 61 | item.setdefault("_type", type_) 62 | self._write_item(item) 63 | 64 | def spider_closed(self, spider, reason): 65 | self.pipe_writer.set_outcome(reason) 66 | 67 | 68 | _HUBSTORAGE_MIDDLEWARE_WARNING = """\ 69 | {cls} inherits from deprecated class {old} 70 | 71 | sh_scrapy.extension.HubstorageMiddleware functionality is now split between two new middlewares: 72 | 73 | - sh_scrapy.middlewares.HubstorageDownloaderMiddleware 74 | - sh_scrapy.middlewares.HubstorageSpiderMiddleware 75 | 76 | Please migrate to new middlewares. 77 | """ 78 | 79 | 80 | class HubstorageMiddleware: 81 | 82 | @classmethod 83 | def from_crawler(cls, crawler): 84 | try: 85 | result = cls(crawler) 86 | except TypeError: 87 | warn( 88 | ( 89 | "Subclasses of HubstorageMiddleware must now accept a " 90 | "crawler parameter in their __init__ method. This will " 91 | "become an error in the future." 92 | ), 93 | DeprecationWarning, 94 | ) 95 | result = cls() 96 | result._crawler = crawler 97 | result._load_fingerprinter() 98 | return result 99 | 100 | def _load_fingerprinter(self): 101 | if hasattr(self._crawler, "request_fingerprinter"): 102 | self._fingerprint = lambda request: self._crawler.request_fingerprinter.fingerprint(request).hex() 103 | else: 104 | from scrapy.utils.request import request_fingerprint 105 | self._fingerprint = request_fingerprint 106 | 107 | def __init__(self, crawler=None): 108 | self._seen = WeakKeyDictionary() 109 | self.hsref = hsref.hsref 110 | self.pipe_writer = pipe_writer 111 | self.request_id_sequence = request_id_sequence 112 | self._crawler = crawler 113 | if crawler: 114 | self._load_fingerprinter() 115 | 116 | def process_spider_input(self, response, spider): 117 | self.pipe_writer.write_request( 118 | url=response.url, 119 | status=response.status, 120 | method=response.request.method, 121 | rs=len(response.body), 122 | duration=response.meta.get('download_latency', 0) * 1000, 123 | parent=response.meta.get(HS_PARENT_ID_KEY), 124 | fp=self._fingerprint(response.request), 125 | ) 126 | self._seen[response] = next(self.request_id_sequence) 127 | 128 | def process_spider_output(self, response, result, spider): 129 | parent = self._seen.pop(response) 130 | for x in result: 131 | if isinstance(x, Request): 132 | x.meta[HS_PARENT_ID_KEY] = parent 133 | yield x 134 | 135 | 136 | HubstorageMiddleware = create_deprecated_class( 137 | "HubstorageMiddleware", 138 | HubstorageMiddleware, 139 | warn_category=SHScrapyDeprecationWarning, 140 | subclass_warn_message=_HUBSTORAGE_MIDDLEWARE_WARNING 141 | ) 142 | -------------------------------------------------------------------------------- /sh_scrapy/hsref.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module to hold a reference to singleton Hubstorage client and Job instance 3 | """ 4 | import os 5 | from codecs 
import decode 6 | 7 | from scrapy.utils.python import to_unicode 8 | 9 | 10 | class _HubstorageRef(object): 11 | 12 | def __init__(self): 13 | self.enabled = 'SHUB_JOBKEY' in os.environ 14 | self._client = None 15 | self._project = None 16 | self._job = None 17 | if self.enabled: 18 | self.jobkey = os.environ['SHUB_JOBKEY'] 19 | job_id = [int(id) for id in self.jobkey.split('/')] 20 | self._projectid, self._spiderid, self._jobcounter = job_id 21 | else: 22 | self._projectid = None 23 | self._spiderid = None 24 | self._jobcounter = None 25 | 26 | @property 27 | def auth(self): 28 | return to_unicode(decode(os.environ['SHUB_JOBAUTH'], 'hex_codec')) 29 | 30 | @property 31 | def endpoint(self): 32 | return os.environ.get('SHUB_STORAGE') 33 | 34 | @property 35 | def projectid(self): 36 | return self._projectid 37 | 38 | @property 39 | def spiderid(self): 40 | return self._spiderid 41 | 42 | @property 43 | def jobid(self): 44 | return self._jobcounter 45 | 46 | @property 47 | def client(self): 48 | from scrapinghub import HubstorageClient 49 | if self._client is None: 50 | user_agent = os.environ.get('SHUB_HS_USER_AGENT') 51 | self._client = HubstorageClient(endpoint=self.endpoint, 52 | auth=self.auth, 53 | user_agent=user_agent) 54 | return self._client 55 | 56 | @property 57 | def project(self): 58 | if self._project is None: 59 | self._project = self.client.get_project(str(self.projectid)) 60 | return self._project 61 | 62 | @property 63 | def job(self): 64 | if self._job is None: 65 | self._job = self.project.get_job((self.spiderid, self.jobid)) 66 | return self._job 67 | 68 | def close(self): 69 | if self._client is not None: 70 | self._client.close() 71 | 72 | hsref = _HubstorageRef() 73 | -------------------------------------------------------------------------------- /sh_scrapy/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import warnings 4 | 5 | from scrapy import __version__ 6 | from scrapy.utils.python import to_unicode 7 | from twisted.python import log as txlog 8 | 9 | from sh_scrapy.writer import pipe_writer 10 | 11 | 12 | # keep a global reference to stderr as it is redirected on log initialization 13 | _stdout = sys.stdout 14 | _stderr = sys.stderr 15 | 16 | 17 | def _logfn(level, message): 18 | """Wraps HS job logging function.""" 19 | try: 20 | pipe_writer.write_log(level=level, message=message) 21 | except UnicodeDecodeError: 22 | # workaround for messages that contain binary data 23 | message = repr(message)[1:-1] 24 | pipe_writer.write_log(level=level, message=message) 25 | 26 | 27 | def initialize_logging(): 28 | """Initialize logging to send messages to Hubstorage job logs 29 | 30 | it initializes: 31 | - Python logging 32 | - Twisted logging 33 | - Scrapy logging 34 | - Redirects standard output and stderr to job log at INFO level 35 | 36 | This duplicates some code with Scrapy log.start(), but it's required in 37 | order to avoid scrapy from starting the log twice. 
38 | """ 39 | # General python logging 40 | root = logging.getLogger() 41 | root.setLevel(logging.NOTSET) 42 | hdlr = HubstorageLogHandler() 43 | hdlr.setLevel(logging.INFO) 44 | hdlr.setFormatter(logging.Formatter('[%(name)s] %(message)s')) 45 | root.addHandler(hdlr) 46 | 47 | # Silence commonly used noisy libraries 48 | try: 49 | import boto # boto overrides its logger at import time 50 | except ImportError: 51 | pass 52 | 53 | nh = logging.NullHandler() 54 | for ln in ('boto', 'requests', 'hubstorage'): 55 | lg = logging.getLogger(ln) 56 | lg.propagate = 0 57 | lg.addHandler(nh) 58 | 59 | # Redirect standard output and error to HS log 60 | sys.stdout = StdoutLogger(0, 'utf-8') 61 | sys.stderr = StdoutLogger(1, 'utf-8') 62 | 63 | # Twisted specifics (includes Scrapy) 64 | obs = HubstorageLogObserver(hdlr) 65 | _oldshowwarning = warnings.showwarning 66 | txlog.startLoggingWithObserver(obs.emit, setStdout=False) 67 | warnings.showwarning = _oldshowwarning 68 | return hdlr 69 | 70 | 71 | class HubstorageLogHandler(logging.Handler): 72 | """Python logging handler that writes to HubStorage""" 73 | 74 | def emit(self, record): 75 | try: 76 | message = self.format(record) 77 | if message: 78 | _logfn(message=message, level=record.levelno) 79 | except (KeyboardInterrupt, SystemExit): 80 | raise 81 | except: 82 | self.handleError(record) 83 | 84 | def handleError(self, record): 85 | cur = sys.stderr 86 | try: 87 | sys.stderr = _stderr 88 | super(HubstorageLogHandler, self).handleError(record) 89 | finally: 90 | sys.stderr = cur 91 | 92 | 93 | class HubstorageLogObserver(object): 94 | """Twisted log observer with Scrapy specifics that writes to HubStorage""" 95 | 96 | def __init__(self, loghdlr): 97 | self._hs_loghdlr = loghdlr 98 | 99 | def emit(self, ev): 100 | logitem = self._get_log_item(ev) 101 | if logitem: 102 | _logfn(**logitem) 103 | 104 | def _get_log_item(self, ev): 105 | """Get HubStorage log item for the given Twisted event, or None if no 106 | document should be inserted 107 | """ 108 | if ev['system'] == 'scrapy': 109 | level = ev['logLevel'] 110 | else: 111 | if ev['isError']: 112 | level = logging.ERROR 113 | else: 114 | level = logging.INFO 115 | 116 | # It's important to access level trough handler instance, 117 | # min log level can change at any moment. 
118 | if level < self._hs_loghdlr.level: 119 | return 120 | 121 | msg = ev.get('message') 122 | if msg: 123 | msg = to_unicode(msg[0]) 124 | 125 | failure = ev.get('failure', None) 126 | if failure: 127 | msg = failure.getTraceback() 128 | 129 | why = ev.get('why', None) 130 | if why: 131 | msg = "%s\n%s" % (why, msg) 132 | 133 | fmt = ev.get('format') 134 | if fmt: 135 | try: 136 | msg = fmt % ev 137 | except: 138 | msg = "UNABLE TO FORMAT LOG MESSAGE: fmt=%r ev=%r" % (fmt, ev) 139 | level = logging.ERROR 140 | # to replicate typical scrapy log appearance 141 | msg = msg.replace('\n', '\n\t') 142 | return {'message': msg, 'level': level} 143 | 144 | 145 | class StdoutLogger(txlog.StdioOnnaStick): 146 | """This works like Twisted's StdioOnnaStick but prepends standard 147 | output/error messages with [stdout] and [stderr] 148 | """ 149 | 150 | def __init__(self, isError=0, encoding=None, loglevel=logging.INFO): 151 | txlog.StdioOnnaStick.__init__(self, isError, encoding) 152 | self.prefix = "[stderr] " if isError else "[stdout] " 153 | self.loglevel = loglevel 154 | 155 | def _logprefixed(self, msg): 156 | _logfn(message=self.prefix + msg, level=self.loglevel) 157 | 158 | def write(self, data): 159 | data = to_unicode(data, self.encoding) 160 | 161 | d = (self.buf + data).split('\n') 162 | self.buf = d[-1] 163 | messages = d[0:-1] 164 | for message in messages: 165 | self._logprefixed(message) 166 | 167 | def writelines(self, lines): 168 | for line in lines: 169 | line = to_unicode(line, self.encoding) 170 | self._logprefixed(line) 171 | -------------------------------------------------------------------------------- /sh_scrapy/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import itertools 3 | from warnings import warn 4 | from weakref import WeakKeyDictionary 5 | 6 | from scrapy import Request 7 | 8 | from sh_scrapy.writer import pipe_writer 9 | 10 | HS_REQUEST_ID_KEY = '_hsid' 11 | HS_PARENT_ID_KEY = '_hsparent' 12 | request_id_sequence = itertools.count() 13 | seen_requests = WeakKeyDictionary() 14 | 15 | 16 | class HubstorageSpiderMiddleware(object): 17 | """Hubstorage spider middleware. 18 | 19 | What it does: 20 | 21 | - Sets parent request ids to the requests coming out of the spider. 22 | 23 | """ 24 | 25 | def __init__(self): 26 | self._seen_requests = seen_requests 27 | 28 | def process_spider_output(self, response, result, spider): 29 | parent = self._seen_requests.pop(response.request, None) 30 | for x in result: 31 | if isinstance(x, Request): 32 | self._process_request(x, parent) 33 | yield x 34 | 35 | async def process_spider_output_async(self, response, result, spider): 36 | parent = self._seen_requests.pop(response.request, None) 37 | async for x in result: 38 | if isinstance(x, Request): 39 | self._process_request(x, parent) 40 | yield x 41 | 42 | def _process_request(self, request, parent): 43 | request.meta[HS_PARENT_ID_KEY] = parent 44 | # Remove request id if it was for some reason set in the request coming from the spider. 45 | request.meta.pop(HS_REQUEST_ID_KEY, None) 46 | 47 | 48 | class HubstorageDownloaderMiddleware: 49 | """Hubstorage downloader middleware. 50 | 51 | What it does: 52 | 53 | - Generates request ids for all downloaded requests. 54 | - Sets parent request ids for requests generated in downloader middlewares. 55 | - Stores all downloaded requests into Hubstorage.
56 | 57 | """ 58 | 59 | @classmethod 60 | def from_crawler(cls, crawler): 61 | try: 62 | result = cls(crawler) 63 | except TypeError: 64 | warn( 65 | ( 66 | "Subclasses of HubstorageDownloaderMiddleware must now " 67 | "accept a crawler parameter in their __init__ method. " 68 | "This will become an error in the future." 69 | ), 70 | DeprecationWarning, 71 | ) 72 | result = cls() 73 | result._crawler = crawler 74 | result._load_fingerprinter() 75 | return result 76 | 77 | def __init__(self, crawler): 78 | self._crawler = crawler 79 | self._seen_requests = seen_requests 80 | self.pipe_writer = pipe_writer 81 | self.request_id_sequence = request_id_sequence 82 | self._load_fingerprinter() 83 | 84 | def _load_fingerprinter(self): 85 | if hasattr(self._crawler, "request_fingerprinter"): 86 | self._fingerprint = lambda request: self._crawler.request_fingerprinter.fingerprint(request).hex() 87 | else: 88 | from scrapy.utils.request import request_fingerprint 89 | self._fingerprint = request_fingerprint 90 | 91 | def process_request(self, request, spider): 92 | # Check if request id is set, which usually happens for retries or redirects because 93 | # those requests are usually copied from the original one. 94 | request_id = request.meta.pop(HS_REQUEST_ID_KEY, None) 95 | if request_id is not None: 96 | # Set original request id or None as a parent request id. 97 | request.meta[HS_PARENT_ID_KEY] = request_id 98 | 99 | def process_response(self, request, response, spider): 100 | # This class of response check is intended to fix the bug described here 101 | # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/112 102 | if type(response).__name__ == "DummyResponse" and type(response).__module__.startswith("scrapy_poet"): 103 | return response 104 | 105 | self.pipe_writer.write_request( 106 | url=response.url, 107 | status=response.status, 108 | method=request.method, 109 | rs=len(response.body), 110 | duration=request.meta.get('download_latency', 0) * 1000, 111 | parent=request.meta.setdefault(HS_PARENT_ID_KEY), 112 | fp=self._fingerprint(request), 113 | ) 114 | # Generate and set request id. 
115 | request_id = next(self.request_id_sequence) 116 | self._seen_requests[request] = request_id 117 | request.meta[HS_REQUEST_ID_KEY] = request_id 118 | return response 119 | -------------------------------------------------------------------------------- /sh_scrapy/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | import tempfile 5 | 6 | from scrapy.settings import Settings 7 | from scrapy.utils.misc import load_object 8 | from scrapy.utils.project import get_project_settings 9 | from scrapy.utils.python import to_unicode 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | REPLACE_ADDONS_PATHS = { 14 | "hworker.bot.ext.page.PageStorageMiddleware": 15 | "scrapy_pagestorage.PageStorageMiddleware", 16 | "hworker.bot.ext.persistence.DotScrapyPersistence": 17 | "scrapy_dotpersistence.DotScrapyPersistence", 18 | "scrapylib.deltafetch.DeltaFetch": 19 | "scrapy_deltafetch.DeltaFetch", 20 | "scrapylib.magicfields.MagicFieldsMiddleware": 21 | "scrapy_magicfields.MagicFieldsMiddleware", 22 | "scrapylib.querycleaner.QueryCleanerMiddleware": 23 | "scrapy_querycleaner.QueryCleanerMiddleware", 24 | "scrapylib.splitvariants.SplitVariantsMiddleware": 25 | "scrapy_splitvariants.SplitVariantsMiddleware", 26 | "scrapy.contrib.throttle.AutoThrottle": 27 | "scrapy.extensions.throttle.AutoThrottle", 28 | } 29 | SLYBOT_SPIDER_MANAGER = 'slybot.spidermanager.ZipfileSlybotSpiderManager' 30 | SLYBOT_DUPE_FILTER = 'slybot.dupefilter.DupeFilterPipeline' 31 | SETTINGS_ORDERED_DICTS = [ 32 | "DOWNLOADER_MIDDLEWARES", "DOWNLOADER_MIDDLEWARES_BASE", 33 | "EXTENSIONS", "EXTENSIONS_BASE", 34 | "ITEM_PIPELINES", "ITEM_PIPELINES_BASE", 35 | "SPIDER_CONTRACTS", "SPIDER_CONTRACTS_BASE", 36 | "SPIDER_MIDDLEWARES", "SPIDER_MIDDLEWARES_BASE" 37 | ] 38 | 39 | try: 40 | from scrapy.utils.deprecate import update_classpath 41 | except ImportError: 42 | update_classpath = lambda x: x 43 | 44 | 45 | class EntrypointSettings(Settings): 46 | """ 47 | We need to convert settings to string since the S3 download handler 48 | doesn't work if the AWS keys are passed as unicode. Other code may 49 | also depend on settings being str. 
50 | """ 51 | 52 | def __init__(self): 53 | super(EntrypointSettings, self).__init__() 54 | self.attributes = {} 55 | 56 | def set(self, name, value, priority='project'): 57 | super(EntrypointSettings, self).set( 58 | to_unicode(name), 59 | value if isinstance(value, str) else value, 60 | priority=priority) 61 | 62 | def copy_to_dict(self): 63 | if hasattr(super(EntrypointSettings, self), 'copy_to_dict'): 64 | return super(EntrypointSettings, self).copy_to_dict() 65 | # Backward compatibility with older Scrapy versions w/o copy_to_dict 66 | settings = self.copy() 67 | return {key: settings[key] for key in settings.attributes} 68 | 69 | 70 | def _maybe_load_autoscraping_project(settings, priority=0): 71 | if os.environ.get('SHUB_SPIDER_TYPE') in ('auto', 'portia'): 72 | slybot_settings = {'ITEM_PIPELINES': {}, 73 | 'SLYDUPEFILTER_ENABLED': True, 74 | 'SLYCLOSE_SPIDER_ENABLED': True, 75 | 'SPIDER_MANAGER_CLASS': SLYBOT_SPIDER_MANAGER} 76 | settings.setdict(slybot_settings, priority=priority) 77 | settings['ITEM_PIPELINES'][SLYBOT_DUPE_FILTER] = 0 78 | settings.set("PROJECT_ZIPFILE", 'project-slybot.zip') 79 | 80 | 81 | def _get_component_base(settings, compkey): 82 | if settings.get(compkey + '_BASE') is not None: 83 | return compkey + '_BASE' 84 | return compkey 85 | 86 | 87 | def _get_action_on_missing_addons(settings): 88 | on_missing_addons = settings.get('ON_MISSING_ADDONS', 'warn') 89 | if on_missing_addons not in ['fail', 'error', 'warn']: 90 | logger.warning( 91 | "Wrong value for ON_MISSING_ADDONS: should be one of " 92 | "[fail,error,warn]. Set default 'warn' value.") 93 | on_missing_addons = 'warn' 94 | return on_missing_addons 95 | 96 | 97 | def _update_old_classpaths(settings): 98 | """Update user's project settings with proper class paths. 99 | 100 | Note that the method updates only settings with dicts as values: 101 | it's needed for proper dicts merge to avoid duplicates in paths. 102 | For all other cases Scrapy will handle it by itself. 
103 | """ 104 | for setting_key in settings.attributes.keys(): 105 | setting_value = settings[setting_key] 106 | # A workaround to make it work for: 107 | # - Scrapy==1.0.5 with dicts as values 108 | # - Scrapy>=1.1.0 with BaseSettings as values 109 | if hasattr(setting_value, 'copy_to_dict'): 110 | setting_value = setting_value.copy_to_dict() 111 | elif not isinstance(setting_value, dict): 112 | continue 113 | for path in setting_value.keys(): 114 | if not isinstance(path, str): 115 | continue 116 | updated_path = update_classpath(path) 117 | if updated_path != path: 118 | order = settings[setting_key].pop(path) 119 | settings[setting_key][updated_path] = order 120 | 121 | 122 | def _update_component_order(components, path, order): 123 | """Update component order only if it's not set yet""" 124 | updated_path = update_classpath(path) 125 | if updated_path not in components: 126 | components[updated_path] = order 127 | 128 | 129 | def _load_addons(addons, settings, merged_settings, priority=0): 130 | on_missing_addons = _get_action_on_missing_addons(merged_settings) 131 | for addon in addons: 132 | addon_path = addon['path'] 133 | if addon_path in REPLACE_ADDONS_PATHS: 134 | addon_path = REPLACE_ADDONS_PATHS[addon_path] 135 | try: 136 | load_object(addon_path) 137 | except (ImportError, NameError, ValueError) as exc: 138 | message = "Addon import error {}:\n {}".format(addon_path, exc) 139 | if on_missing_addons == 'warn': 140 | logger.warning(message) 141 | continue 142 | elif on_missing_addons == 'error': 143 | logger.error(message) 144 | continue 145 | raise 146 | skey = _get_component_base(settings, addon['type']) 147 | components = settings[skey] 148 | _update_component_order(components, addon_path, addon['order']) 149 | merged_settings.set(skey, components) 150 | merged_settings.setdict(addon['default_settings'], priority) 151 | 152 | 153 | def _merge_with_keeping_order(settings, updates): 154 | for setting_key, value in updates.items(): 155 | if not isinstance(value, dict): 156 | settings.set(setting_key, value, priority='cmdline') 157 | continue 158 | if setting_key in SETTINGS_ORDERED_DICTS: 159 | components = settings[setting_key] 160 | for path, order in value.items(): 161 | _update_component_order(components, path, order) 162 | else: 163 | settings.set(setting_key, value) 164 | 165 | 166 | def _populate_settings_base(apisettings, defaults_func, spider=None): 167 | """Populate and merge project settings with other ones. 168 | 169 | Important note: Scrapy doesn't really copy values on set/setdict methods, 170 | changing a dict in merged settings means mutating it in original settings. 
171 | """ 172 | assert 'scrapy.conf' not in sys.modules, "Scrapy settings already loaded" 173 | settings = get_project_settings().copy() 174 | _update_old_classpaths(settings) 175 | merged_settings = EntrypointSettings() 176 | 177 | enabled_addons = apisettings.setdefault('enabled_addons', []) 178 | project_settings = apisettings.setdefault('project_settings', {}) 179 | organization_settings = apisettings.setdefault('organization_settings', {}) 180 | spider_settings = apisettings.setdefault('spider_settings', {}) 181 | job_settings = apisettings.setdefault('job_settings', {}) 182 | 183 | defaults_func(settings) 184 | merged_settings.setdict(project_settings, priority=10) 185 | merged_settings.setdict(organization_settings, priority=20) 186 | if spider: 187 | merged_settings.setdict(spider_settings, priority=30) 188 | _maybe_load_autoscraping_project(merged_settings, priority=0) 189 | merged_settings.set('JOBDIR', tempfile.mkdtemp(prefix='jobdata-'), 190 | priority=40) 191 | merged_settings.setdict(job_settings, priority=40) 192 | # Load addons only after we gather all settings 193 | _load_addons(enabled_addons, settings, merged_settings, priority=0) 194 | _merge_with_keeping_order(settings, merged_settings.copy_to_dict()) 195 | _enforce_required_settings(settings) 196 | return settings 197 | 198 | 199 | def _load_default_settings(settings): 200 | downloader_middlewares = { 201 | 'sh_scrapy.diskquota.DiskQuotaDownloaderMiddleware': -10000, # closest to the engine 202 | 'sh_scrapy.middlewares.HubstorageDownloaderMiddleware': 10000, # closest to the downloader 203 | } 204 | spider_middlewares = { 205 | 'sh_scrapy.diskquota.DiskQuotaSpiderMiddleware': -10001, # closest to the engine 206 | 'sh_scrapy.middlewares.HubstorageSpiderMiddleware': -10000, # right after disk quota middleware 207 | } 208 | extensions = { 209 | 'scrapy.extensions.debug.StackTraceDump': 0, 210 | 'sh_scrapy.extension.HubstorageExtension': 100, 211 | } 212 | 213 | try: 214 | import slybot 215 | except ImportError: 216 | pass 217 | else: 218 | extensions['slybot.closespider.SlybotCloseSpider'] = 0 219 | 220 | settings.get('DOWNLOADER_MIDDLEWARES_BASE').update(downloader_middlewares) 221 | settings.get('EXTENSIONS_BASE').update(extensions) 222 | settings.get('SPIDER_MIDDLEWARES_BASE').update(spider_middlewares) 223 | memory_limit = int(os.environ.get('SHUB_JOB_MEMORY_LIMIT', 950)) 224 | settings.setdict({ 225 | 'STATS_CLASS': 'sh_scrapy.stats.HubStorageStatsCollector', 226 | 'MEMUSAGE_ENABLED': True, 227 | 'MEMUSAGE_LIMIT_MB': memory_limit, 228 | 'DISK_QUOTA_STOP_ON_ERROR': True, 229 | 'WEBSERVICE_ENABLED': False, 230 | 'LOG_LEVEL': 'INFO', 231 | 'LOG_ENABLED': False, 232 | 'TELNETCONSOLE_HOST': '0.0.0.0', # to access telnet console from host 233 | }, priority='cmdline') 234 | 235 | 236 | def _enforce_required_settings(settings): 237 | settings.setdict({ 238 | # breaks logging and useless in scrapy cloud 239 | 'LOG_STDOUT': False, 240 | }, priority='cmdline') 241 | 242 | 243 | def populate_settings(apisettings, spider=None): 244 | return _populate_settings_base(apisettings, _load_default_settings, spider) 245 | -------------------------------------------------------------------------------- /sh_scrapy/stats.py: -------------------------------------------------------------------------------- 1 | from twisted.internet import task 2 | from scrapy.statscollectors import StatsCollector 3 | 4 | from sh_scrapy import hsref 5 | from sh_scrapy.writer import pipe_writer 6 | 7 | 8 | class HubStorageStatsCollector(StatsCollector): 9 | 10 
| INTERVAL = 30 11 | 12 | def __init__(self, crawler): 13 | super(HubStorageStatsCollector, self).__init__(crawler) 14 | self.hsref = hsref.hsref 15 | self.pipe_writer = pipe_writer 16 | 17 | def _upload_stats(self): 18 | self.pipe_writer.write_stats(self._stats) 19 | 20 | def open_spider(self, spider): 21 | self._setup_looping_call(now=True) 22 | 23 | def _setup_looping_call(self, _ignored=None, **kwargs): 24 | self._samplestask = task.LoopingCall(self._upload_stats) 25 | d = self._samplestask.start(self.INTERVAL, **kwargs) 26 | d.addErrback(self._setup_looping_call, now=False) 27 | 28 | def close_spider(self, spider, reason): 29 | super(HubStorageStatsCollector, self).close_spider(spider, reason) 30 | if self._samplestask.running: 31 | self._samplestask.stop() 32 | self._upload_stats() 33 | -------------------------------------------------------------------------------- /sh_scrapy/utils.py: -------------------------------------------------------------------------------- 1 | from sh_scrapy.settings import populate_settings 2 | from sh_scrapy.crawl import _get_apisettings 3 | 4 | 5 | def get_project_settings(): 6 | return populate_settings(_get_apisettings()) 7 | -------------------------------------------------------------------------------- /sh_scrapy/writer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import os 4 | import threading 5 | 6 | from scrapinghub.hubstorage.serialization import jsondefault 7 | from scrapinghub.hubstorage.utils import millitime 8 | 9 | 10 | def _not_configured(*args, **kwargs): 11 | raise RuntimeError("Pipe writer is misconfigured, named pipe path is not set") 12 | 13 | 14 | class _PipeWriter(object): 15 | """Writer for the Scrapinghub named pipe. 16 | 17 | It's not safe to instantiate and use multiple writers, only one writer 18 | should be instantiated and used, otherwise data may be corrupted. 19 | 20 | The object is thread safe. 
21 | 22 | :ivar path: Named pipe path 23 | 24 | """ 25 | 26 | def __init__(self, path): 27 | self.path = path or '' 28 | self._lock = threading.Lock() 29 | self._pipe = None 30 | if not self.path: 31 | self._write = _not_configured 32 | self.open = _not_configured 33 | self.close = _not_configured 34 | 35 | def open(self): 36 | with self._lock: 37 | self._pipe = open(self.path, 'wb') 38 | 39 | def _write(self, command, payload): 40 | # binary command 41 | command = command.encode('utf-8') 42 | # binary payload 43 | encoded_payload = json.dumps( 44 | payload, 45 | separators=(',', ':'), 46 | default=jsondefault 47 | ).encode('utf-8') 48 | # write needs to be locked because write can be called from multiple threads 49 | with self._lock: 50 | self._pipe.write(command) 51 | self._pipe.write(b' ') 52 | self._pipe.write(encoded_payload) 53 | self._pipe.write(b'\n') 54 | self._pipe.flush() 55 | 56 | def write_log(self, level, message): 57 | log = { 58 | 'time': millitime(), 59 | 'level': level, 60 | 'message': message 61 | } 62 | self._write('LOG', log) 63 | 64 | def write_request(self, url, status, method, rs, duration, parent, fp): 65 | request = { 66 | 'url': url, 67 | 'status': int(status), 68 | 'method': method, 69 | 'rs': int(rs), 70 | 'duration': int(duration), 71 | 'parent': parent, 72 | 'time': millitime(), 73 | 'fp': fp, 74 | } 75 | self._write('REQ', request) 76 | 77 | def write_item(self, item): 78 | self._write('ITM', item) 79 | 80 | def write_stats(self, stats): 81 | self._write('STA', {'time': millitime(), 'stats': stats}) 82 | 83 | def set_outcome(self, outcome): 84 | self._write('FIN', {'outcome': outcome}) 85 | 86 | def close(self): 87 | with self._lock: 88 | self._pipe.close() 89 | 90 | 91 | pipe_writer = _PipeWriter(os.environ.get('SHUB_FIFO_PATH', '')) 92 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/scrapinghub-entrypoint-scrapy/aeea6fc61827fc5ff8f871a3f988588d338f8185/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import codecs 3 | import os 4 | import shutil 5 | import tempfile 6 | 7 | import pytest 8 | from scrapy.utils.python import to_unicode, to_bytes 9 | 10 | TEMP_DIR = tempfile.mkdtemp() 11 | SHUB_FIFO_PATH = os.path.join(TEMP_DIR, 'scrapinghub') 12 | os.environ['SHUB_FIFO_PATH'] = SHUB_FIFO_PATH 13 | 14 | from sh_scrapy.writer import pipe_writer # should go after setting SHUB_FIFO_PATH 15 | 16 | 17 | TEST_AUTH = to_unicode(codecs.encode(to_bytes('1/2/3:authstr'), 'hex_codec')) 18 | 19 | 20 | @pytest.fixture(scope='session', autouse=True) 21 | def clean_shub_fifo_path(): 22 | global TEMP_DIR 23 | pipe_writer.open() 24 | try: 25 | yield 26 | finally: 27 | shutil.rmtree(TEMP_DIR) 28 | 29 | 30 | @pytest.fixture(autouse=True) 31 | def set_jobkeyenvironment(monkeypatch): 32 | monkeypatch.setenv('SHUB_JOBKEY', '1/2/3') 33 | monkeypatch.setenv('SCRAPY_JOB', '1/2/3') 34 | monkeypatch.setenv('SHUB_JOBAUTH', TEST_AUTH) 35 | monkeypatch.setenv('SHUB_STORAGE', 'storage-url') 36 | 37 | 38 | # install the reactor explicitly, as Scrapy including scrapy.utils.test.get_crawler() assumes it's installed 39 | from twisted.internet import reactor 40 | 
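A minimal usage sketch of the _PipeWriter protocol from sh_scrapy/writer.py above, not a file from this repository: each write_* call emits one "<COMMAND> <compact JSON>" line to the path given by SHUB_FIFO_PATH, which is why conftest.py can point the writer at a plain file in a temporary directory instead of a real named pipe. The path and sample values below are illustrative assumptions; only the _PipeWriter API itself comes from the source, and running it assumes sh_scrapy and its scrapinghub dependency are importable.

import os
import tempfile

from sh_scrapy.writer import _PipeWriter

path = os.path.join(tempfile.mkdtemp(), 'pipe')  # illustrative stand-in for SHUB_FIFO_PATH
writer = _PipeWriter(path)
writer.open()                                # opens the path in binary write mode
writer.write_log(level=20, message='hello')  # -> LOG {"time":...,"level":20,"message":"hello"}
writer.set_outcome('finished')               # -> FIN {"outcome":"finished"}
writer.close()

with open(path, 'rb') as fp:
    print(fp.read())  # two newline-terminated "<COMMAND> <JSON>" frames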
-------------------------------------------------------------------------------- /tests/test_command.py: -------------------------------------------------------------------------------- 1 | 2 | from argparse import ArgumentParser 3 | from optparse import OptionParser 4 | 5 | import pytest 6 | import scrapy 7 | from packaging import version 8 | 9 | from sh_scrapy.commands.shub_image_info import Command 10 | 11 | 12 | @pytest.fixture 13 | def command(): 14 | command = Command() 15 | command.settings = scrapy.settings.Settings() 16 | return command 17 | 18 | 19 | @pytest.mark.skipif( 20 | version.parse(scrapy.__version__) >= version.parse("2.6"), 21 | reason="Scrapy>=2.6 uses argparse" 22 | ) 23 | def test_optparse(command): 24 | parser = OptionParser() 25 | command.add_options(parser) 26 | options = parser.parse_args(["--debug"]) 27 | assert options[0].debug 28 | 29 | 30 | @pytest.mark.skipif( 31 | version.parse(scrapy.__version__) < version.parse("2.6"), 32 | reason="Scrapy<2.6 uses optparse" 33 | ) 34 | def test_argparse(command): 35 | parser = ArgumentParser() 36 | command.add_options(parser) 37 | options = parser.parse_args(["--debug"]) 38 | assert options.debug 39 | -------------------------------------------------------------------------------- /tests/test_compat.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import pytest 4 | from scrapy.exceptions import ScrapyDeprecationWarning 5 | 6 | from sh_scrapy.compat import is_string, to_bytes, to_unicode, to_native_str 7 | 8 | 9 | # test deprecation messages 10 | 11 | def test_deprecated_is_string(): 12 | with warnings.catch_warnings(record=True) as caught: 13 | assert is_string("foo") 14 | assert not is_string(b"foo") 15 | assert not is_string(1) 16 | assert ( 17 | "is_string(var) is deprecated, please use isinstance(var, str) instead" 18 | == str(caught[0].message) 19 | ) 20 | assert caught[0].category is ScrapyDeprecationWarning 21 | 22 | 23 | def test_deprecated_to_unicode(): 24 | with warnings.catch_warnings(record=True) as caught: 25 | assert to_unicode("foo") == "foo" 26 | assert to_unicode(b"foo") == "foo" 27 | assert ( 28 | "Call to deprecated function to_unicode. Use scrapy.utils.python.to_unicode instead." 29 | == str(caught[0].message) 30 | ) 31 | assert caught[0].category is ScrapyDeprecationWarning 32 | 33 | 34 | def test_deprecated_to_native_str(): 35 | with warnings.catch_warnings(record=True) as caught: 36 | assert to_native_str("foo") == "foo" 37 | assert to_native_str(b"foo") == "foo" 38 | assert ( 39 | "Call to deprecated function to_native_str. Use scrapy.utils.python.to_unicode instead." 40 | == str(caught[0].message) 41 | ) 42 | assert caught[0].category is ScrapyDeprecationWarning 43 | 44 | 45 | def test_deprecated_to_bytes(): 46 | with warnings.catch_warnings(record=True) as caught: 47 | assert to_bytes("foo") == b"foo" 48 | assert to_bytes(b"foo") == b"foo" 49 | assert ( 50 | "Call to deprecated function to_bytes. Use scrapy.utils.python.to_bytes instead." 
51 | == str(caught[0].message) 52 | ) 53 | assert caught[0].category is ScrapyDeprecationWarning 54 | 55 | 56 | # Testing to_unicode conversion 57 | 58 | def test_to_str_an_utf8_encoded_string_to_str(): 59 | assert to_unicode(b'lel\xc3\xb1e') == u'lel\xf1e' 60 | 61 | 62 | def test_to_str_a_latin_1_encoded_string_to_str(): 63 | assert to_unicode(b'lel\xf1e', 'latin-1') == u'lel\xf1e' 64 | 65 | 66 | def test_to_str_a_unicode_to_str_should_return_the_same_object(): 67 | assert to_unicode(u'\xf1e\xf1e\xf1e') == u'\xf1e\xf1e\xf1e' 68 | 69 | 70 | def test_to_str_a_strange_object_should_raise_TypeError(): 71 | with pytest.raises(TypeError) as excinfo: 72 | to_unicode(123) 73 | 74 | 75 | def test_to_str_errors_argument(): 76 | assert to_unicode(b'a\xedb', 'utf-8', errors='replace') == u'a\ufffdb' 77 | 78 | 79 | # Testing to_bytes conversion 80 | 81 | def test_to_bytes_a_unicode_object_to_an_utf_8_encoded_string(): 82 | assert to_bytes(u'\xa3 49') == b'\xc2\xa3 49' 83 | 84 | 85 | def test_to_bytes_a_unicode_object_to_a_latin_1_encoded_string(): 86 | assert to_bytes(u'\xa3 49', 'latin-1') == b'\xa3 49' 87 | 88 | 89 | def test_to_bytes_a_regular_bytes_to_bytes_should_return_the_same_object(): 90 | assert to_bytes(b'lel\xf1e') == b'lel\xf1e' 91 | 92 | 93 | def test_to_bytes_a_strange_object_should_raise_TypeError(): 94 | with pytest.raises(TypeError): 95 | to_bytes(pytest) 96 | 97 | 98 | def test_to_bytes_errors_argument(): 99 | assert to_bytes(u'a\ufffdb', 'latin-1', errors='replace') == b'a?b' 100 | -------------------------------------------------------------------------------- /tests/test_crawl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import mock 5 | import pytest 6 | import unittest 7 | from scrapy.settings import Settings 8 | 9 | import sh_scrapy.crawl 10 | from sh_scrapy.crawl import _fatalerror 11 | from sh_scrapy.crawl import _get_apisettings 12 | from sh_scrapy.crawl import _run 13 | from sh_scrapy.crawl import _run_scrapy 14 | from sh_scrapy.crawl import _run_pkgscript 15 | from sh_scrapy.crawl import _run_usercode 16 | from sh_scrapy.crawl import _launch 17 | from sh_scrapy.crawl import list_spiders 18 | from sh_scrapy.crawl import main 19 | from sh_scrapy.log import HubstorageLogHandler 20 | from tests.utils import create_project, call_command 21 | 22 | 23 | try: 24 | from scrapy_spider_metadata import get_spider_metadata 25 | SPIDER_METADATA_AVAILABLE = True 26 | except: 27 | SPIDER_METADATA_AVAILABLE = False 28 | 29 | 30 | @mock.patch.dict(os.environ, {'HWORKER_SENTRY_DSN': 'hw-sentry-dsn', 31 | 'SENTRY_DSN': 'sentry-dsn'}) 32 | def test_init_module(): 33 | assert sh_scrapy.crawl._sys_stderr == sys.stderr 34 | assert sh_scrapy.crawl._sys_stdout == sys.stdout 35 | assert sh_scrapy.crawl.socket.getdefaulttimeout() == 60.0 36 | 37 | 38 | @mock.patch('traceback.print_exception') 39 | def test_fatal_error(trace_print): 40 | exception = ValueError('some exception') 41 | traceback = None 42 | try: 43 | raise exception 44 | except: 45 | # get traceback before we cleaned it with fatalerror 46 | traceback = sys.exc_info()[2] 47 | _fatalerror() 48 | assert trace_print.called 49 | trace_args = trace_print.call_args_list[0] 50 | assert trace_args[0][0] == ValueError 51 | assert trace_args[0][1] == exception 52 | assert trace_args[0][2] == traceback 53 | assert trace_args[0][3] is None 54 | assert trace_args[0][4] == sys.stderr 55 | 56 | 57 | @mock.patch('traceback.print_exception') 58 | def 
test_fatal_error_ignore_IOError(trace_print): 59 | trace_print.side_effect = IOError('some error') 60 | try: 61 | raise ValueError('some exception') 62 | except: 63 | _fatalerror() 64 | assert trace_print.called 65 | 66 | 67 | @mock.patch('sh_scrapy.crawl._sentry_dsn') 68 | def test_fatal_error_sentry_import_error(sentry_dsn): 69 | try: 70 | raise ValueError('some exception') 71 | except: 72 | _fatalerror() 73 | 74 | 75 | @mock.patch('sh_scrapy.crawl._sentry_dsn') 76 | def test_fatal_error_sentry_with_mock(sentry_dsn): 77 | raven_stub = type('raven', (object, ), {}) 78 | raven_stub.Client = mock.Mock() 79 | try: 80 | sys.modules['raven'] = raven_stub 81 | raise ValueError('some exception') 82 | except: 83 | _fatalerror() 84 | finally: 85 | del sys.modules['raven'] 86 | assert raven_stub.Client.called 87 | assert raven_stub.Client.call_args[0] == (sentry_dsn,) 88 | sentry_client = raven_stub.Client.return_value 89 | assert sentry_client.captureException.called 90 | 91 | 92 | @mock.patch('sh_scrapy.crawl._sentry_dsn') 93 | def test_fatal_error_sentry_with_mock_ignore_errors(sentry_dsn): 94 | raven_stub = type('raven', (object, ), {}) 95 | raven_stub.Client = mock.Mock() 96 | sentry_client = raven_stub.Client.return_value 97 | sentry_client.captureException.side_effect = IOError('error') 98 | try: 99 | sys.modules['raven'] = raven_stub 100 | raise ValueError('some exception') 101 | except: 102 | _fatalerror() 103 | finally: 104 | del sys.modules['raven'] 105 | 106 | 107 | def test_get_apisettings_empty(): 108 | assert _get_apisettings() == {} 109 | 110 | 111 | @mock.patch.dict(os.environ, { 112 | 'SHUB_SETTINGS': 'data:;base64,ImhlbGxvIHdvcmxkIg=='}) 113 | def test_get_apisettings_from_env(): 114 | assert _get_apisettings() == 'hello world' 115 | 116 | 117 | @mock.patch('sh_scrapy.crawl._run_pkgscript') 118 | def test_run_pkg_script(run_pkg_mock): 119 | _run(['py:script.py'], {'SETTING': 'VALUE'}) 120 | assert run_pkg_mock.called 121 | assert run_pkg_mock.call_args[0] == (['py:script.py'],) 122 | 123 | 124 | @unittest.skipIf(sys.version_info > (3,7), "Requires Python 3.7 or lower") 125 | @mock.patch('pkg_resources.WorkingSet') 126 | def test_run_pkg_script_distribution_not_found(working_set_class): 127 | fake_set = mock.Mock() 128 | fake_set.iter_entry_points.return_value = iter(()) 129 | working_set_class.return_value = fake_set 130 | with pytest.raises(ValueError): 131 | _run(['py:script.py'], {'SETTING': 'VALUE'}) 132 | 133 | @unittest.skipIf(sys.version_info < (3,8), "Requires Python 3.8 or higher") 134 | @mock.patch('importlib.metadata.entry_points') 135 | def test_run_pkg_script_distribution_not_found_python_3_8_plus(working_set_class): 136 | fake_set = mock.Mock() 137 | fake_set.iter_entry_points.return_value = iter(()) 138 | working_set_class.return_value = [fake_set] 139 | with pytest.raises(ValueError): 140 | _run(['py:script.py'], {'SETTING': 'VALUE'}) 141 | 142 | @mock.patch('sh_scrapy.crawl._run_scrapy') 143 | def test_run_scrapy_spider(run_scrapy_mock): 144 | _run(['scrapy', 'crawl', 'spider'], {'SETTING': 'VALUE'}) 145 | assert run_scrapy_mock.called 146 | assert run_scrapy_mock.call_args[0] == ( 147 | ['scrapy', 'crawl', 'spider'], {'SETTING': 'VALUE'}) 148 | 149 | 150 | @mock.patch('scrapy.cmdline.execute') 151 | def test_run_scrapy(execute_mock): 152 | _run_scrapy(['scrapy', 'crawl', 'spider'], {'SETTING': 'VALUE'}) 153 | assert execute_mock.called 154 | assert execute_mock.call_args == ( 155 | {'settings': {'SETTING': 'VALUE'}},) 156 | assert sys.argv == ['scrapy', 'crawl', 
'spider']
157 | 
158 | 
159 | def get_working_set(working_set_class):
160 |     """Helper to configure a fake working set with ep"""
161 |     working_set = working_set_class.return_value
162 |     ep = mock.Mock()
163 |     ep.name = 'settings'
164 |     working_set.iter_entry_points.return_value = [ep]
165 |     return working_set
166 | 
167 | 
168 | @unittest.skipIf(sys.version_info > (3,7), "Requires Python 3.7 or lower")
169 | @mock.patch('pkg_resources.WorkingSet')
170 | def test_run_pkgscript_base_usage(working_set_class):
171 |     working_set = get_working_set(working_set_class)
172 |     _run_pkgscript(['py:script.py', 'arg1', 'arg2'])
173 |     assert working_set.iter_entry_points.called
174 |     assert working_set.iter_entry_points.call_args[0] == ('scrapy',)
175 |     ep = working_set.iter_entry_points.return_value[0]
176 |     assert ep.dist.run_script.called
177 |     assert ep.dist.run_script.call_args[0] == (
178 |         'script.py', {'__name__': '__main__'})
179 |     assert sys.argv == ['script.py', 'arg1', 'arg2']
180 | 
181 | def get_entry_points_mock():
182 |     """Helper to configure a fake entry point"""
183 |     ep = mock.Mock()
184 |     ep.name = 'settings'
185 |     ep.dist.run_script = mock.Mock() # only for the pkg_resources code path
186 |     return [ep]
187 | 
188 | @unittest.skipIf(sys.version_info < (3,8), "Requires Python 3.8 or higher")
189 | @mock.patch('sh_scrapy.crawl._run_script')
190 | @mock.patch('importlib.metadata.entry_points')
191 | def test_run_pkgscript_base_usage_python_3_8_plus(entry_points_mock, mocked_run):
192 |     entry_points_mock.return_value = get_entry_points_mock()
193 |     _run_pkgscript(['py:script.py', 'arg1', 'arg2'])
194 |     assert entry_points_mock.called
195 |     assert entry_points_mock.call_args[1] == {'group': 'scrapy'}
196 |     assert mocked_run.called
197 |     assert mocked_run.call_args[0][1:] == ('script.py', {'__name__': '__main__'})
198 |     assert sys.argv == ['script.py', 'arg1', 'arg2']
199 | 
200 | 
201 | @mock.patch.dict(os.environ, {
202 |     'SHUB_SETTINGS': '{"project_settings": {"SETTING....'})
203 | @mock.patch('sh_scrapy.crawl._run')
204 | def test_run_usercode_bad_settings(mocked_run):
205 |     with pytest.raises(ValueError):
206 |         _run_usercode('py:script.py', ['py:script.py'], _get_apisettings)
207 |     assert not mocked_run.called
208 | 
209 | 
210 | @mock.patch.dict(os.environ, {
211 |     'SHUB_SETTINGS': '{"project_settings": {"SETTING_TEST": "VAL"}}'})
212 | @mock.patch('sh_scrapy.crawl._run')
213 | def test_run_usercode_run_exception(mocked_run):
214 |     mocked_run.side_effect = AttributeError('argA is missing')
215 |     with pytest.raises(AttributeError):
216 |         _run_usercode('py:script.py', ['py:script.py'], _get_apisettings)
217 |     assert mocked_run.called
218 | 
219 | 
220 | @mock.patch.dict(os.environ, {
221 |     'SHUB_SETTINGS': '{"project_settings": {"SETTING_TEST": "VAL"}}'})
222 | @mock.patch('sh_scrapy.crawl._run')
223 | def test_run_usercode(mocked_run):
224 |     _run_usercode('py:script.py', ['py:script.py', 'arg1'], _get_apisettings)
225 |     assert mocked_run.called
226 |     assert mocked_run.call_args[0][0] == ['py:script.py', 'arg1']
227 |     settings = mocked_run.call_args[0][1]
228 |     assert isinstance(settings, Settings)
229 |     assert settings['SETTING_TEST'] == 'VAL'
230 | 
231 | 
232 | @mock.patch.dict(os.environ, {
233 |     'SHUB_SETTINGS': '{"project_settings": {"LOG_LEVEL": 10}}'})
234 | @mock.patch('sh_scrapy.crawl._run')
235 | def test_run_usercode_with_loghandler(mocked_run):
236 |     loghandler = mock.Mock()
237 |     _run_usercode('py:script.py', ['py:script.py', 'arg1'],
238 |                   _get_apisettings, loghandler)
239 |     assert mocked_run.called
240 |     assert loghandler.setLevel.called
241 |     call_args = loghandler.setLevel.call_args[0]
242 |     assert len(call_args) == 1
243 |     assert call_args[0] == 10
244 | 
245 | 
246 | SPIDER_MSG = {
247 |     'key': '1/2/3', 'spider': 'test', 'spider_type': 'auto',
248 |     'auth': 'auths', 'spider_args': {'arg1': 'val1', 'arg2': 'val2'},
249 |     'settings': {'SETTING1': 'VAL1', 'SETTING2': 'VAL2'}
250 | }
251 | 
252 | 
253 | @mock.patch('sh_scrapy.crawl._fatalerror')
254 | def test_launch_handle_fatalerror(mocked_fatalerr):
255 |     with pytest.raises(AssertionError):
256 |         _launch()
257 |     assert mocked_fatalerr.called
258 | 
259 | 
260 | @mock.patch.dict(os.environ, {'SHUB_JOB_DATA': json.dumps(SPIDER_MSG)})
261 | @mock.patch('sh_scrapy.env.setup_environment')
262 | @mock.patch('sh_scrapy.crawl._run_usercode')
263 | def test_launch(mocked_run, mocked_setup):
264 |     _launch()
265 |     expected_env = {
266 |         'SCRAPY_SPIDER': 'test', 'SHUB_JOBNAME': 'test',
267 |         'SCRAPY_JOB': '1/2/3', 'SCRAPY_PROJECT_ID': '1',
268 |         'SHUB_JOBKEY': '1/2/3', 'SHUB_JOB_TAGS': '',
269 |         'SHUB_JOBAUTH': '312f322f333a6175746873',
270 |         'SHUB_SPIDER_TYPE': 'auto'}
271 |     for k, v in expected_env.items():
272 |         assert os.environ.get(k) == v
273 |     assert mocked_run.called
274 |     run_args = mocked_run.call_args[0]
275 |     assert run_args[0] == 'test'
276 |     expected_args = [
277 |         'scrapy', 'crawl', 'test', '-a', 'arg1=val1', '-a',
278 |         'arg2=val2', '-s', 'SETTING1=VAL1', '-s', 'SETTING2=VAL2']
279 |     assert run_args[1] == expected_args
280 |     assert run_args[2] == _get_apisettings
281 |     assert isinstance(run_args[3], HubstorageLogHandler)
282 |     assert mocked_setup.called
283 | 
284 | 
285 | @mock.patch('sh_scrapy.env.setup_environment')
286 | @mock.patch('sh_scrapy.crawl._run_usercode')
287 | def test_list_spiders(mocked_run, mocked_setup):
288 |     list_spiders()
289 |     assert mocked_run.called
290 |     run_args = mocked_run.call_args[0]
291 |     assert run_args[0] is None
292 |     expected_args = ['scrapy', 'list']
293 |     assert run_args[1] == expected_args
294 |     assert run_args[2] == _get_apisettings
295 |     assert mocked_setup.called
296 | 
297 | 
298 | @mock.patch('sh_scrapy.crawl._fatalerror')
299 | @mock.patch('sh_scrapy.env.setup_environment')
300 | def test_list_spiders_handle_fatalerror(mocked_setup, mocked_fatalerr):
301 |     mocked_setup.side_effect = AttributeError('some error')
302 |     with pytest.raises(AttributeError):
303 |         list_spiders()
304 |     assert mocked_fatalerr.called
305 | 
306 | 
307 | @mock.patch('sh_scrapy.writer.pipe_writer')
308 | @mock.patch('sh_scrapy.crawl._launch')
309 | def test_main(mocked_launch, pipe_writer):
310 |     main()
311 |     assert pipe_writer.open.called
312 |     assert mocked_launch.called
313 |     assert mocked_launch.call_args == ()
314 |     assert sys.stdout == sh_scrapy.crawl._sys_stdout
315 |     assert sys.stderr == sh_scrapy.crawl._sys_stderr
316 |     # Pipe writer file object is closed implicitly on program exit.
317 |     # This ensures that pipe is writable even if main program is finished -
318 |     # e.g. for threads that are not closed yet.
319 | assert not pipe_writer.close.called 320 | 321 | 322 | def test_image_info(tmp_path): 323 | project_dir = create_project(tmp_path) 324 | out, err = call_command(project_dir, "shub-image-info") 325 | # can't be asserted as it contains a SHScrapyDeprecationWarning 326 | # assert err == "" 327 | data = json.loads(out) 328 | expected = { 329 | "project_type": "scrapy", 330 | "spiders": ["myspider"], 331 | "metadata": {"myspider": {}}, 332 | } 333 | if not SPIDER_METADATA_AVAILABLE: 334 | del expected["metadata"] 335 | assert data == expected 336 | 337 | 338 | def test_image_info_metadata(tmp_path): 339 | project_dir = create_project(tmp_path, spider_text=""" 340 | from scrapy import Spider 341 | 342 | class MySpider(Spider): 343 | name = "myspider" 344 | metadata = {"foo": 42} 345 | """) 346 | out, _ = call_command(project_dir, "shub-image-info") 347 | data = json.loads(out) 348 | expected = { 349 | "project_type": "scrapy", 350 | "spiders": ["myspider"], 351 | "metadata": {"myspider": {"foo": 42}}, 352 | } 353 | if not SPIDER_METADATA_AVAILABLE: 354 | del expected["metadata"] 355 | assert data == expected 356 | 357 | 358 | def test_image_info_metadata_skip_broken(tmp_path): 359 | project_dir = create_project(tmp_path, spider_text=""" 360 | from scrapy import Spider 361 | 362 | class MySpider(Spider): 363 | name = "myspider" 364 | metadata = {"foo": Spider} 365 | """) 366 | out, _ = call_command(project_dir, "shub-image-info") 367 | data = json.loads(out) 368 | expected = { 369 | "project_type": "scrapy", 370 | "spiders": ["myspider"], 371 | "metadata": {}, 372 | } 373 | if not SPIDER_METADATA_AVAILABLE: 374 | del expected["metadata"] 375 | assert data == expected 376 | 377 | 378 | @pytest.mark.skipif(not SPIDER_METADATA_AVAILABLE, reason="scrapy-spider-metadata is not installed") 379 | def test_image_info_args(tmp_path): 380 | project_dir = create_project(tmp_path, spider_text=""" 381 | from enum import Enum 382 | from scrapy import Spider 383 | from scrapy_spider_metadata import Args 384 | from pydantic import BaseModel, Field 385 | 386 | class ToolEnum(Enum): 387 | spanner = "spanner" 388 | wrench = "wrench" 389 | 390 | class Parameters(BaseModel): 391 | tool: ToolEnum = ToolEnum.spanner 392 | 393 | class MySpider(Args[Parameters], Spider): 394 | name = "myspider" 395 | """) 396 | out, _ = call_command(project_dir, "shub-image-info") 397 | data = json.loads(out) 398 | expected = { 399 | "project_type": "scrapy", 400 | "spiders": ["myspider"], 401 | "metadata": { 402 | "myspider": { 403 | "param_schema": { 404 | "properties": { 405 | "tool": { 406 | "default": "spanner", 407 | "enum": ["spanner", "wrench"], 408 | "title": "Tool", 409 | "type": "string", 410 | }, 411 | }, 412 | "title": "Parameters", 413 | "type": "object", 414 | }, 415 | }, 416 | }, 417 | } 418 | if not SPIDER_METADATA_AVAILABLE: 419 | del expected["metadata"] 420 | assert data == expected 421 | 422 | 423 | @pytest.mark.skipif(not SPIDER_METADATA_AVAILABLE, reason="scrapy-spider-metadata is not installed") 424 | def test_image_info_args_metadata(tmp_path): 425 | project_dir = create_project(tmp_path, spider_text=""" 426 | from enum import Enum 427 | from scrapy import Spider 428 | from scrapy_spider_metadata import Args 429 | from pydantic import BaseModel, Field 430 | 431 | class ToolEnum(Enum): 432 | spanner = "spanner" 433 | wrench = "wrench" 434 | 435 | class Parameters(BaseModel): 436 | tool: ToolEnum = ToolEnum.spanner 437 | 438 | class MySpider(Args[Parameters], Spider): 439 | name = "myspider" 440 | metadata = 
{"foo": 42} 441 | """) 442 | out, _ = call_command(project_dir, "shub-image-info") 443 | data = json.loads(out) 444 | expected = { 445 | "project_type": "scrapy", 446 | "spiders": ["myspider"], 447 | "metadata": { 448 | "myspider": { 449 | "foo": 42, 450 | "param_schema": { 451 | "properties": { 452 | "tool": { 453 | "default": "spanner", 454 | "enum": ["spanner", "wrench"], 455 | "title": "Tool", 456 | "type": "string", 457 | }, 458 | }, 459 | "title": "Parameters", 460 | "type": "object", 461 | }, 462 | }, 463 | }, 464 | } 465 | if not SPIDER_METADATA_AVAILABLE: 466 | del expected["metadata"] 467 | assert data == expected 468 | -------------------------------------------------------------------------------- /tests/test_diskquota.py: -------------------------------------------------------------------------------- 1 | import mock 2 | import pytest 3 | from scrapy.utils.test import get_crawler 4 | from scrapy.exceptions import NotConfigured 5 | 6 | from sh_scrapy.diskquota import DiskQuota 7 | from sh_scrapy.diskquota import DiskQuotaDownloaderMiddleware 8 | from sh_scrapy.diskquota import DiskQuotaSpiderMiddleware 9 | 10 | 11 | def test_disk_quota_disabled(): 12 | crawler = get_crawler() 13 | with pytest.raises(NotConfigured): 14 | DiskQuota(crawler) 15 | 16 | 17 | @pytest.fixture 18 | def crawler(): 19 | return get_crawler(settings_dict={'DISK_QUOTA_STOP_ON_ERROR': True}) 20 | 21 | 22 | def test_disk_quota_init(crawler): 23 | dquota = DiskQuota(crawler) 24 | assert dquota.crawler == crawler 25 | 26 | 27 | def test_disk_quota_from_crawler(crawler): 28 | assert isinstance(DiskQuota.from_crawler(crawler), DiskQuota) 29 | 30 | 31 | def test_disk_quota_check_error(crawler): 32 | dquota = DiskQuota(crawler) 33 | assert not dquota._is_disk_quota_error(ValueError()) 34 | assert not dquota._is_disk_quota_error(IOError()) 35 | valid_error = IOError() 36 | valid_error.errno = 122 37 | assert dquota._is_disk_quota_error(valid_error) 38 | other_valid_error = OSError() 39 | other_valid_error.errno = 122 40 | assert dquota._is_disk_quota_error(other_valid_error) 41 | 42 | 43 | def test_downloaded_mware_process_not_stopped(crawler): 44 | crawler.engine = mock.Mock() 45 | mware = DiskQuotaDownloaderMiddleware(crawler) 46 | mware.process_exception('request', ValueError(), 'spider') 47 | assert not crawler.engine.close_spider.called 48 | 49 | 50 | def test_downloaded_mware_process_stopped(crawler): 51 | crawler.engine = mock.Mock() 52 | mware = DiskQuotaDownloaderMiddleware(crawler) 53 | error = IOError() 54 | error.errno = 122 55 | mware.process_exception('request', error, 'spider') 56 | assert crawler.engine.close_spider.called 57 | assert crawler.engine.close_spider.call_args[0] == ( 58 | 'spider', 'diskusage_exceeded') 59 | 60 | 61 | def test_spider_mware_process_not_stopped(crawler): 62 | crawler.engine = mock.Mock() 63 | mware = DiskQuotaSpiderMiddleware(crawler) 64 | mware.process_spider_exception('response', ValueError(), 'spider') 65 | assert not crawler.engine.close_spider.called 66 | 67 | 68 | def test_spider_mware_process_stopped(crawler): 69 | crawler.engine = mock.Mock() 70 | mware = DiskQuotaSpiderMiddleware(crawler) 71 | error = IOError() 72 | error.errno = 122 73 | mware.process_spider_exception('response', error, 'spider') 74 | assert crawler.engine.close_spider.called 75 | assert crawler.engine.close_spider.call_args[0] == ( 76 | 'spider', 'diskusage_exceeded') 77 | -------------------------------------------------------------------------------- /tests/test_env.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import mock 4 | import codecs 5 | import pytest 6 | import tempfile 7 | 8 | from scrapy.utils.python import to_bytes, to_unicode 9 | 10 | from sh_scrapy.env import _jobauth 11 | from sh_scrapy.env import _jobname 12 | from sh_scrapy.env import decode_uri 13 | from sh_scrapy.env import get_args_and_env 14 | from sh_scrapy.env import _job_args_and_env 15 | from sh_scrapy.env import _make_scrapy_args 16 | from sh_scrapy.env import setup_environment 17 | from sh_scrapy.env import _scrapy_crawl_args_and_env 18 | 19 | 20 | def test_make_scrapy_args(): 21 | assert _make_scrapy_args('-a', {}) == [] 22 | assert _make_scrapy_args('-a', {'test': 'val'}) == ['-a', 'test=val'] 23 | result1 = _make_scrapy_args('-s', [('k1', 'v1'), ('k2', 'v2')]) 24 | assert result1 == ['-s', 'k1=v1', '-s', 'k2=v2'] 25 | result2 = _make_scrapy_args('-s', [('arg1', 'val1'), ('arg2', 'val2')]) 26 | assert result2 == ['-s', 'arg1=val1', '-s', 'arg2=val2'] 27 | result3 = _make_scrapy_args('-s', [('arg1', 1), ('arg2', 2)]) 28 | assert result3 == ['-s', 'arg1=1', '-s', 'arg2=2'] 29 | 30 | 31 | def test_scrapy_crawl_args_and_env(): 32 | # test with minimal message 33 | result = _scrapy_crawl_args_and_env({'key': '1/2/3', 'spider': 'test'}) 34 | assert len(result) == 2 35 | assert result[0] == ['scrapy', 'crawl', 'test'] 36 | assert result[1] == {'SCRAPY_JOB': '1/2/3', 'SCRAPY_PROJECT_ID': '1', 37 | 'SCRAPY_SPIDER': 'test', 'SHUB_SPIDER_TYPE': ''} 38 | # test with full message 39 | result1 = _scrapy_crawl_args_and_env( 40 | {'key': '1/2/3', 'spider': 'test', 41 | 'spider_args': [('arg1', 'val1'), ('arg2', 'val2')], 42 | 'settings': [('SETTING1', 'VAL1'), ('SETTING2', 'VAL2')], 43 | 'spider_type': 'auto', 44 | 'extra_args': ['some', 'extra', 'args']}) 45 | assert result1[0] == ['scrapy', 'crawl', 'test', '-a', 'arg1=val1', 46 | '-a', 'arg2=val2', '-s', 'SETTING1=VAL1', 47 | '-s', 'SETTING2=VAL2'] 48 | assert result1[1] == {'SCRAPY_JOB': '1/2/3', 'SCRAPY_PROJECT_ID': '1', 49 | 'SCRAPY_SPIDER': 'test', 'SHUB_SPIDER_TYPE': 'auto'} 50 | 51 | 52 | def test_job_args_and_env(): 53 | result = _job_args_and_env({'job_cmd': ['custom.py', 'arg1']}) 54 | assert result == (['custom.py', 'arg1'], {}) 55 | result1 = _job_args_and_env({'job_cmd': ['custom.py', 'arg1'], 56 | 'job_env': {'some': 'env'}}) 57 | assert result1 == (['custom.py', 'arg1'], {'some': 'env'}) 58 | result2 = _job_args_and_env({'job_cmd': ('wrong', 'cmd', 'style')}) 59 | assert result2 == (["('wrong', 'cmd', 'style')"], {}) 60 | 61 | 62 | def test_jobname(): 63 | msg = {'job_name': 'jobn', 'spider': 'test', 64 | 'job_cmd': ['custom.py', 'arg1', 'arg2']} 65 | assert _jobname(msg) == 'jobn' 66 | msg.pop('job_name') 67 | assert _jobname(msg) == 'test' 68 | msg.pop('spider') 69 | assert _jobname(msg) == 'custom.py' 70 | 71 | 72 | def test_jobauth(): 73 | msg = {'key': '1/2/3', 'auth': 'authstring'} 74 | expected = codecs.encode(to_bytes('1/2/3:authstring'), 'hex_codec') 75 | assert _jobauth(msg) == to_unicode(expected) 76 | 77 | 78 | def test_get_args_and_env_run_spider(): 79 | msg = {'key': '1/2/3', 'spider': 'test', 'spider_type': 'auto', 80 | 'auth': 'auths', 'spider_args': {'arg1': 'val1', 'arg2': 'val2'}, 81 | 'settings': {'SETTING1': 'VAL1', 'SETTING2': 'VAL2'}} 82 | result = get_args_and_env(msg) 83 | expected_auth = codecs.encode(to_bytes('1/2/3:auths'), 'hex_codec') 84 | assert len(result) == 2 85 | assert result[0] == ['scrapy', 'crawl', 'test', '-a', 
'arg1=val1', 86 | '-a', 'arg2=val2', '-s', 'SETTING1=VAL1', '-s', 87 | 'SETTING2=VAL2'] 88 | assert result[1] == {'SCRAPY_JOB': '1/2/3', 89 | 'SCRAPY_PROJECT_ID': '1', 90 | 'SCRAPY_SPIDER': 'test', 91 | 'SHUB_JOBAUTH': to_unicode(expected_auth), 92 | 'SHUB_JOBKEY': '1/2/3', 93 | 'SHUB_JOBNAME': 'test', 94 | 'SHUB_JOB_TAGS': '', 95 | 'SHUB_SPIDER_TYPE': 'auto'} 96 | add_fields = {'tags': ['tagA', 'tagB'], 'api_url': 'some-api-url'} 97 | msg.update(add_fields) 98 | result1 = get_args_and_env(msg) 99 | assert len(result1) == 2 100 | assert result1[1]['SHUB_APIURL'] == 'some-api-url' 101 | assert result1[1]['SHUB_JOB_TAGS'] == 'tagA,tagB' 102 | 103 | 104 | def test_get_args_and_env_run_script(): 105 | msg = {'key': '1/2/3', 'job_cmd': ['custom.py', 'arg1'], 106 | 'auth': 'authstring'} 107 | result = get_args_and_env(msg) 108 | expected_auth = codecs.encode(to_bytes('1/2/3:authstring'), 'hex_codec') 109 | assert len(result) == 2 110 | assert result[0] == ['custom.py', 'arg1'] 111 | assert result[1] == { 112 | 'SHUB_JOBAUTH': to_unicode(expected_auth), 113 | 'SHUB_JOBKEY': '1/2/3', 114 | 'SHUB_JOBNAME': 'custom.py', 115 | 'SHUB_JOB_TAGS': ''} 116 | add_fields = {'tags': ['tagA', 'tagB'], 'api_url': 'some-api-url'} 117 | msg.update(add_fields) 118 | result1 = get_args_and_env(msg) 119 | assert len(result1) == 2 120 | assert result1[1]['SHUB_APIURL'] == 'some-api-url' 121 | assert result1[1]['SHUB_JOB_TAGS'] == 'tagA,tagB' 122 | 123 | 124 | def test_decode_uri_basic_usage(): 125 | assert decode_uri('{"spider": "hello"}') == {'spider': 'hello'} 126 | str1 = 'data:application/json;charset=utf8;base64,ImhlbGxvIHdvcmxkIg==' 127 | assert decode_uri(str1) == u'hello world' 128 | assert decode_uri('data:;base64,ImhlbGxvIHdvcmxkIg==') == 'hello world' 129 | str2 = 'data:custom-mime;charset=utf8;base64,ImhlbGxvIHdvcmxkIg==' 130 | assert decode_uri(str2) == b'"hello world"' 131 | 132 | 133 | @mock.patch.dict(os.environ, {'TEST_VAR': '{"spider": "hello"}'}) 134 | def test_decode_uri_from_env(): 135 | assert decode_uri(None, 'TEST_VAR') == {'spider': 'hello'} 136 | 137 | 138 | def test_decode_uri_var_or_env_is_needed(): 139 | with pytest.raises(ValueError): 140 | decode_uri() 141 | 142 | 143 | def test_decode_uri_from_file(): 144 | with tempfile.NamedTemporaryFile() as temp: 145 | temp.write('{"hello":"world"}'.encode('utf-8')) 146 | temp.flush() 147 | assert decode_uri(temp.name) == {'hello': 'world'} 148 | assert decode_uri('file://' + temp.name) == {'hello': 'world'} 149 | 150 | 151 | def test_setup_environment(): 152 | builtin_mod = '__builtin__' if sys.version_info < (3,) else 'builtins' 153 | with mock.patch(builtin_mod + '.open') as mock_open: 154 | setup_environment() 155 | assert mock_open.called 156 | -------------------------------------------------------------------------------- /tests/test_extension.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from weakref import WeakKeyDictionary 3 | 4 | import mock 5 | import pytest 6 | import scrapy 7 | from packaging import version 8 | from pytest import warns 9 | from scrapy import Spider 10 | from scrapy.exporters import PythonItemExporter 11 | from scrapy.http import Request, Response 12 | from scrapy.item import Item 13 | from scrapy.utils.test import get_crawler 14 | 15 | from sh_scrapy.extension import HubstorageExtension, HubstorageMiddleware 16 | from sh_scrapy.middlewares import HS_PARENT_ID_KEY 17 | 18 | 19 | @pytest.fixture 20 | def hs_ext(monkeypatch): 21 | 
    monkeypatch.setattr('sh_scrapy.extension.pipe_writer', mock.Mock())
22 |     monkeypatch.setattr('sh_scrapy.extension.hsref', mock.Mock())
23 |     crawler = get_crawler(Spider)
24 |     return HubstorageExtension.from_crawler(crawler)
25 | 
26 | 
27 | def test_hs_ext_init(hs_ext):
28 |     assert hs_ext.crawler
29 |     assert hs_ext._write_item == hs_ext.pipe_writer.write_item
30 |     assert isinstance(hs_ext.exporter, PythonItemExporter)
31 |     assert hs_ext.exporter.export_item({"a": "b"}) == {"a": "b"}
32 | 
33 | 
34 | @pytest.mark.skipif(sys.version_info < (3, 7), reason="requires python3.7")
35 | def test_hs_ext_dataclass_item_scraped(hs_ext):
36 |     from dataclasses import dataclass
37 | 
38 |     @dataclass
39 |     class DataclassItem:
40 |         pass
41 | 
42 |     hs_ext._write_item = mock.Mock()
43 |     item = DataclassItem()
44 |     spider = Spider('test')
45 |     hs_ext.item_scraped(item, spider)
46 |     assert hs_ext._write_item.call_count == 1
47 |     assert hs_ext._write_item.call_args[0] == ({'_type': 'DataclassItem'},)
48 | 
49 | 
50 | def test_hs_ext_attrs_item_scraped(hs_ext):
51 |     try:
52 |         import attr
53 |         import itemadapter
54 |     except ImportError:
55 |         pytest.skip('attrs or itemadapter not installed')
56 |         return
57 | 
58 |     @attr.s
59 |     class AttrsItem(object):
60 |         pass
61 | 
62 |     hs_ext._write_item = mock.Mock()
63 |     item = AttrsItem()
64 |     spider = Spider('test')
65 |     hs_ext.item_scraped(item, spider)
66 |     assert hs_ext._write_item.call_count == 1
67 |     assert hs_ext._write_item.call_args[0] == ({'_type': 'AttrsItem'},)
68 | 
69 | 
70 | def test_hs_ext_item_scraped(hs_ext):
71 |     hs_ext._write_item = mock.Mock()
72 |     item = Item()
73 |     spider = Spider('test')
74 |     hs_ext.item_scraped(item, spider)
75 |     assert hs_ext._write_item.call_count == 1
76 |     assert hs_ext._write_item.call_args[0] == ({'_type': 'Item'},)
77 | 
78 | 
79 | def test_hs_ext_item_scraped_skip_wrong_type(hs_ext):
80 |     hs_ext._write_item = mock.Mock()
81 |     spider = Spider('test')
82 |     for item in [None, [], 123]:
83 |         hs_ext.item_scraped(item, spider)
84 |     assert hs_ext._write_item.call_count == 0
85 | 
86 | 
87 | def test_hs_ext_spider_closed(hs_ext):
88 |     spider = Spider('test')
89 |     hs_ext.spider_closed(spider, 'killed')
90 |     assert hs_ext.pipe_writer.set_outcome.called
91 |     assert hs_ext.pipe_writer.set_outcome.call_args == mock.call('killed')
92 | 
93 | 
94 | @pytest.fixture
95 | def hs_mware(monkeypatch):
96 |     monkeypatch.setattr('sh_scrapy.extension.pipe_writer', mock.Mock())
97 |     crawler = get_crawler()
98 |     return HubstorageMiddleware.from_crawler(crawler)
99 | 
100 | 
101 | def test_hs_mware_init(hs_mware):
102 |     assert hs_mware._seen == {}
103 |     assert hs_mware.hsref
104 | 
105 | 
106 | def test_hs_mware_process_spider_input(hs_mware):
107 |     response = Response('http://resp-url')
108 |     response.request = Request('http://req-url')
109 |     hs_mware.process_spider_input(response, Spider('test'))
110 |     assert hs_mware.pipe_writer.write_request.call_count == 1
111 |     args = hs_mware.pipe_writer.write_request.call_args[1]
112 |     if hasattr(hs_mware._crawler, "request_fingerprinter"):
113 |         fp = "1c735665b072000e11b0169081bce5bbaeac09a7"
114 |     else:
115 |         fp = "a001a1eb4537acdc8525edf1250065cab2657152"
116 |     assert args == {
117 |         'duration': 0,
118 |         'fp': fp,
119 |         'method': 'GET',
120 |         'parent': None,
121 |         'rs': 0,
122 |         'status': 200,
123 |         'url': 'http://resp-url'
124 |     }
125 |     assert hs_mware._seen == WeakKeyDictionary({response: 0})
126 | 
127 | 
128 | def test_hs_mware_process_spider_output_void_result(hs_mware):
129 |     response = Response('http://resp-url')
130 |     hs_mware._seen =
WeakKeyDictionary({response: 'riq'}) 131 | assert list(hs_mware.process_spider_output( 132 | response, [], Spider('test'))) == [] 133 | 134 | 135 | def test_hs_mware_process_spider_output_filter_request(hs_mware): 136 | response = Response('http://resp-url') 137 | # provide a response and a new request in result 138 | child_response = Response('http://resp-url-child') 139 | child_response.request = Request('http://resp-url-child-req') 140 | child_request = Request('http://req-url-child') 141 | hs_mware._seen = WeakKeyDictionary({response: 'riq'}) 142 | result = list(hs_mware.process_spider_output( 143 | response, [child_response, child_request], Spider('test'))) 144 | assert len(result) == 2 145 | # make sure that we update hsparent meta only for requests 146 | assert result[0].meta.get(HS_PARENT_ID_KEY) is None 147 | assert result[1].meta[HS_PARENT_ID_KEY] == 'riq' 148 | 149 | 150 | @pytest.mark.skipif( 151 | version.parse(scrapy.__version__) < version.parse("2.7"), 152 | reason="Only Scrapy 2.7 and higher support centralized request fingerprints." 153 | ) 154 | def test_custom_fingerprinter(monkeypatch): 155 | monkeypatch.setattr('sh_scrapy.extension.pipe_writer', mock.Mock()) 156 | 157 | class CustomFingerprinter: 158 | def fingerprint(self, request): 159 | return b"foo" 160 | 161 | crawler = get_crawler(settings_dict={"REQUEST_FINGERPRINTER_CLASS": CustomFingerprinter}) 162 | mw = HubstorageMiddleware.from_crawler(crawler) 163 | 164 | response = Response('http://resp-url') 165 | response.request = Request('http://req-url') 166 | mw.process_spider_input(response, Spider('test')) 167 | assert mw.pipe_writer.write_request.call_args[1]["fp"] == b"foo".hex() 168 | 169 | 170 | def test_subclassing(): 171 | class CustomHubstorageMiddleware(HubstorageMiddleware): 172 | def __init__(self): 173 | super().__init__() 174 | self.foo = "bar" 175 | 176 | crawler = get_crawler() 177 | with warns( 178 | DeprecationWarning, 179 | match="must now accept a crawler parameter in their __init__ method", 180 | ): 181 | mw = CustomHubstorageMiddleware.from_crawler(crawler) 182 | 183 | assert mw.foo == "bar" 184 | assert hasattr(mw, "_fingerprint") 185 | -------------------------------------------------------------------------------- /tests/test_hsref.py: -------------------------------------------------------------------------------- 1 | import mock 2 | import pytest 3 | from sh_scrapy.hsref import _HubstorageRef 4 | 5 | 6 | def test_init_disabled(monkeypatch): 7 | monkeypatch.delenv('SHUB_JOBKEY') 8 | hsref = _HubstorageRef() 9 | assert not hsref._client 10 | assert not hsref._project 11 | assert not hsref._job 12 | assert not hsref.enabled 13 | assert not hasattr(hsref, 'jobkey') 14 | assert not hsref._projectid 15 | assert not hsref._spiderid 16 | assert not hsref._jobcounter 17 | 18 | 19 | @pytest.fixture 20 | @pytest.mark.usefixtures('set_environment') 21 | def hsref(): 22 | return _HubstorageRef() 23 | 24 | 25 | @pytest.fixture 26 | def hsc_class(monkeypatch): 27 | hsc_class = mock.Mock() 28 | monkeypatch.setattr('scrapinghub.HubstorageClient', hsc_class) 29 | return hsc_class 30 | 31 | 32 | def test_init(hsref): 33 | assert not hsref._client 34 | assert not hsref._project 35 | assert not hsref._job 36 | assert hsref.enabled 37 | assert hsref.jobkey == '1/2/3' 38 | assert hsref._projectid == 1 39 | assert hsref._spiderid == 2 40 | assert hsref._jobcounter == 3 41 | 42 | 43 | def test_auth(hsref): 44 | assert hsref.auth == '1/2/3:authstr' 45 | 46 | 47 | def test_endpoint(hsref): 48 | assert hsref.endpoint 
== 'storage-url' 49 | 50 | 51 | def test_job_ids(hsref): 52 | assert hsref.projectid == 1 53 | assert hsref.spiderid == 2 54 | assert hsref.jobid == 3 55 | 56 | 57 | def test_client(hsref, hsc_class): 58 | assert not hsref._client 59 | assert hsref.client == hsc_class.return_value 60 | hsc_class.assert_called_with(endpoint='storage-url', 61 | auth='1/2/3:authstr', 62 | user_agent=None) 63 | assert hsref._client 64 | assert hsref.client == hsref._client 65 | 66 | 67 | def test_client_custom_ua(hsref, hsc_class, monkeypatch): 68 | monkeypatch.setenv('SHUB_HS_USER_AGENT', 'testUA') 69 | assert not hsref._client 70 | assert hsref.client == hsc_class.return_value 71 | hsc_class.assert_called_with(endpoint='storage-url', 72 | auth='1/2/3:authstr', 73 | user_agent='testUA') 74 | assert hsref._client 75 | assert hsref.client == hsref._client 76 | 77 | 78 | def test_project(hsref): 79 | hsc = mock.Mock() 80 | hsc.get_project.return_value = 'Project' 81 | hsref._client = hsc 82 | 83 | assert not hsref._project 84 | assert hsref.project == 'Project' 85 | hsc.get_project.assert_called_with('1') 86 | assert hsref._project == hsref.project 87 | 88 | 89 | def test_job(hsref): 90 | project = mock.Mock() 91 | project.get_job.return_value = 'Job' 92 | hsref._project = project 93 | 94 | assert not hsref._job 95 | assert hsref.job == 'Job' 96 | project.get_job.assert_called_with((2, 3)) 97 | assert hsref._job == hsref.job 98 | 99 | 100 | def test_close(hsref): 101 | assert not hsref._client 102 | hsref.close() 103 | client = mock.Mock() 104 | hsref._client = client 105 | hsref.close() 106 | client.close.assert_called_with() 107 | -------------------------------------------------------------------------------- /tests/test_log.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import mock 4 | import pytest 5 | import sys 6 | import zlib 7 | 8 | from sh_scrapy.log import _stdout, _stderr 9 | from sh_scrapy.log import initialize_logging 10 | from sh_scrapy.log import HubstorageLogHandler 11 | from sh_scrapy.log import HubstorageLogObserver 12 | from sh_scrapy.log import StdoutLogger 13 | 14 | 15 | @pytest.fixture(autouse=True) 16 | def reset_std_streams(): 17 | sys.stdout = _stdout 18 | sys.stderr = _stderr 19 | 20 | 21 | @mock.patch('twisted.python.log.startLoggingWithObserver') 22 | @mock.patch('sh_scrapy.log.HubstorageLogObserver') 23 | def test_initialize_logging_dont_fail(observer, txlog_start): 24 | loghandler = initialize_logging() 25 | 26 | rootlogger = logging.getLogger() 27 | assert rootlogger.level == logging.NOTSET 28 | 29 | # check if null handler is set for libs 30 | for lib in ('boto', 'requests', 'hubstorage'): 31 | lg = logging.getLogger(lib) 32 | assert lg.propagate == 0 33 | assert any([hdl for hdl in lg.handlers 34 | if isinstance(hdl, logging.NullHandler)]) 35 | 36 | # check standard out/err redirection 37 | assert isinstance(sys.stdout, StdoutLogger) 38 | assert sys.stdout.encoding == 'utf-8' 39 | assert isinstance(sys.stderr, StdoutLogger) 40 | assert sys.stderr.encoding == 'utf-8' 41 | 42 | # check twisted specific 43 | assert observer.called 44 | observer.assert_called_with(loghandler) 45 | emit_method = observer.return_value.emit 46 | assert txlog_start.called 47 | txlog_start.assert_called_with(emit_method, setStdout=False) 48 | 49 | # check returned handler 50 | assert isinstance(loghandler, HubstorageLogHandler) 51 | assert loghandler.level == logging.INFO 52 | assert loghandler.formatter._fmt == '[%(name)s] 
%(message)s' 53 | 54 | @mock.patch('sh_scrapy.log.pipe_writer') 55 | def test_hs_loghandler_emit_ok(pipe_writer): 56 | hdlr = HubstorageLogHandler() 57 | record = logging.makeLogRecord({'msg': 'test-record'}) 58 | hdlr.emit(record) 59 | assert pipe_writer.write_log.called 60 | pipe_writer.write_log.assert_called_with(message='test-record', level=None) 61 | 62 | 63 | @mock.patch('sh_scrapy.log.pipe_writer') 64 | def test_hs_loghandler_emit_handle_interrupt(pipe_writer): 65 | pipe_writer.write_log.side_effect = KeyboardInterrupt 66 | hdlr = HubstorageLogHandler() 67 | record = logging.makeLogRecord({'msg': 'test-record'}) 68 | with pytest.raises(KeyboardInterrupt): 69 | hdlr.emit(record) 70 | 71 | 72 | @mock.patch('logging.Handler.handleError') 73 | @mock.patch('sh_scrapy.log.pipe_writer') 74 | def test_hs_loghandler_emit_handle_exception(pipe_writer, handleError): 75 | pipe_writer.write_log.side_effect = ValueError 76 | hdlr = HubstorageLogHandler() 77 | record = logging.makeLogRecord({'msg': 'test-record'}) 78 | hdlr.emit(record) 79 | assert handleError.called 80 | assert handleError.call_args == mock.call(record) 81 | 82 | 83 | @pytest.fixture 84 | def hs_observer(): 85 | hdlr = mock.Mock() 86 | return HubstorageLogObserver(hdlr) 87 | 88 | 89 | def test_hs_logobserver_init(hs_observer): 90 | assert isinstance(hs_observer._hs_loghdlr, mock.Mock) 91 | 92 | 93 | def test_hs_logobserver_get_log_item_low_level(hs_observer): 94 | hs_observer._hs_loghdlr.level = 20 95 | event = {'system': 'scrapy', 'logLevel': 10} 96 | assert not hs_observer._get_log_item(event) 97 | 98 | 99 | def test_hs_logobserver_get_log_item_system(hs_observer): 100 | hs_observer._hs_loghdlr.level = 20 101 | event = {'system': 'scrapy', 'logLevel': 30, 'message': ['test']} 102 | assert hs_observer._get_log_item(event) == { 103 | 'level': 30, 'message': 'test'} 104 | 105 | 106 | def test_hs_logobserver_get_log_item_info(hs_observer): 107 | hs_observer._hs_loghdlr.level = 20 108 | event = {'system': 'other', 'message': ['test'], 'isError': False} 109 | assert hs_observer._get_log_item(event) == { 110 | 'level': 20, 'message': 'test'} 111 | 112 | 113 | def test_hs_logobserver_get_log_item_error(hs_observer): 114 | hs_observer._hs_loghdlr.level = 20 115 | event = {'system': 'other', 'message': ['test'], 'isError': True} 116 | assert hs_observer._get_log_item(event) == { 117 | 'level': 40, 'message': 'test'} 118 | 119 | 120 | def test_hs_logobserver_get_log_item_failure(hs_observer): 121 | hs_observer._hs_loghdlr.level = 20 122 | failure = mock.Mock() 123 | failure.getTraceback.return_value = 'some-traceback' 124 | event = {'system': 'other', 'failure': failure, 'isError': False} 125 | assert hs_observer._get_log_item(event) == { 126 | 'level': 20, 'message': 'some-traceback'} 127 | 128 | 129 | def test_hs_logobserver_get_log_item_why(hs_observer): 130 | hs_observer._hs_loghdlr.level = 20 131 | event = {'system': 'other', 'message': ['test'], 132 | 'why': 'why-msg', 'isError': False} 133 | assert hs_observer._get_log_item(event) == { 134 | 'level': 20, 'message': 'why-msg\n\ttest'} 135 | 136 | 137 | def test_hs_logobserver_get_log_item_format(hs_observer): 138 | hs_observer._hs_loghdlr.level = 20 139 | event = {'system': 'other', 'message': ['test'], 'data': 'raw', 140 | 'format': 'formatted/%(data)s', 'isError': False} 141 | assert hs_observer._get_log_item(event) == { 142 | 'level': 20, 'message': 'formatted/raw'} 143 | 144 | 145 | def test_hs_logobserver_get_log_item_format_error(hs_observer): 146 | 
hs_observer._hs_loghdlr.level = 20 147 | event = {'system': 'other', 'message': ['test'], 'data': 'raw', 148 | 'format': 'formatted/%(data)%%', 'isError': False} 149 | expected_template = "UNABLE TO FORMAT LOG MESSAGE: fmt=%r ev=%r" 150 | assert hs_observer._get_log_item(event) == { 151 | 'level': 40, 'message': expected_template % (event['format'], event)} 152 | 153 | 154 | @mock.patch('sh_scrapy.log.pipe_writer') 155 | def test_hs_logobserver_emit_filter_events(pipe_writer, hs_observer): 156 | hs_observer._hs_loghdlr.level = 20 157 | event = {'system': 'scrapy', 'logLevel': 10} 158 | hs_observer.emit(event) 159 | assert not pipe_writer.write_log.called 160 | 161 | 162 | @mock.patch('sh_scrapy.log.pipe_writer') 163 | def test_hs_logobserver_emit_logitem(pipe_writer, hs_observer): 164 | hs_observer._hs_loghdlr.level = 20 165 | event = {'system': 'other', 'message': ['test'], 'isError': False} 166 | hs_observer.emit(event) 167 | assert pipe_writer.write_log.called 168 | pipe_writer.write_log.assert_called_with(level=20, message='test') 169 | 170 | 171 | def stdout_logger_init_stdout(): 172 | logger_out = StdoutLogger(0, 'utf-8') 173 | assert logger_out.prefix == '[stdout]' 174 | assert logger_out.loglevel == logging.INFO 175 | 176 | 177 | def stdout_logger_init_stderr(): 178 | logger_out = StdoutLogger(1, 'utf-8', loglevel=logging.ERROR) 179 | assert logger_out.prefix == '[stderr]' 180 | assert logger_out.loglevel == logging.ERROR 181 | 182 | 183 | @mock.patch('sh_scrapy.log.pipe_writer') 184 | def test_stdout_logger_logprefixed(pipe_writer): 185 | logger = StdoutLogger(0, 'utf-8') 186 | logger._logprefixed('message') 187 | assert pipe_writer.write_log.called 188 | pipe_writer.write_log.assert_called_with(level=20, message='[stdout] message') 189 | 190 | 191 | @mock.patch('sh_scrapy.log.pipe_writer') 192 | def test_stdout_logger_write(pipe_writer): 193 | logger = StdoutLogger(0, 'utf-8') 194 | logger.write('some-string\nother-string\nlast-string') 195 | assert pipe_writer.write_log.called 196 | assert pipe_writer.write_log.call_args_list[0] == mock.call( 197 | level=20, 198 | message='[stdout] some-string' 199 | ) 200 | assert pipe_writer.write_log.call_args_list[1] == mock.call( 201 | level=20, 202 | message='[stdout] other-string' 203 | ) 204 | assert logger.buf == 'last-string' 205 | 206 | 207 | def test_stdout_logger_writelines_empty(): 208 | logger = StdoutLogger(0, 'utf-8') 209 | logger.writelines([]) 210 | 211 | 212 | @mock.patch('sh_scrapy.log.pipe_writer') 213 | def test_stdout_logger_writelines(pipe_writer): 214 | logger = StdoutLogger(0, 'utf-8') 215 | logger.writelines(['test-line']) 216 | assert pipe_writer.write_log.called 217 | pipe_writer.write_log.assert_called_with(level=20, message='[stdout] test-line') 218 | 219 | 220 | @pytest.mark.skipif(sys.version_info[0] == 3, reason="requires python2") 221 | @mock.patch('sh_scrapy.log.pipe_writer._pipe') 222 | def test_unicode_decode_error_handling(pipe_mock): 223 | hdlr = HubstorageLogHandler() 224 | message = 'value=%s' % zlib.compress('value') 225 | record = logging.makeLogRecord({'msg': message, 'levelno': 10}) 226 | hdlr.emit(record) 227 | assert pipe_mock.write.called 228 | payload = json.loads(pipe_mock.write.call_args_list[2][0][0]) 229 | assert isinstance(payload.pop('time'), int) 230 | assert payload == { 231 | 'message': r'value=x\x9c+K\xcc)M\x05\x00\x06j\x02\x1e', 232 | 'level': 10 233 | } 234 | -------------------------------------------------------------------------------- /tests/test_middlewares.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from weakref import WeakKeyDictionary 3 | import itertools 4 | import pytest 5 | import sys 6 | from scrapy import Spider, Request, Item 7 | from scrapy.http import Response 8 | from scrapy.utils.test import get_crawler 9 | from typing import Optional 10 | 11 | from sh_scrapy.middlewares import ( 12 | HubstorageSpiderMiddleware, HubstorageDownloaderMiddleware, 13 | HS_REQUEST_ID_KEY, HS_PARENT_ID_KEY 14 | ) 15 | 16 | 17 | @pytest.fixture() 18 | def monkeypatch_globals(monkeypatch): 19 | monkeypatch.setattr('sh_scrapy.middlewares.request_id_sequence', itertools.count()) 20 | monkeypatch.setattr('sh_scrapy.middlewares.seen_requests', WeakKeyDictionary()) 21 | 22 | 23 | @pytest.fixture() 24 | def hs_spider_middleware(monkeypatch_globals): 25 | return HubstorageSpiderMiddleware() 26 | 27 | 28 | @pytest.fixture() 29 | def hs_downloader_middleware(monkeypatch_globals): 30 | crawler = get_crawler() 31 | return HubstorageDownloaderMiddleware.from_crawler(crawler) 32 | 33 | 34 | def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware): 35 | assert hs_spider_middleware._seen_requests == WeakKeyDictionary() 36 | assert hs_downloader_middleware._seen_requests == WeakKeyDictionary() 37 | assert hs_spider_middleware._seen_requests is hs_downloader_middleware._seen_requests 38 | 39 | spider = Spider('test') 40 | url = 'http://resp-url' 41 | request_0 = Request(url) 42 | response_0 = Response(url) 43 | 44 | hs_downloader_middleware.process_request(request_0, spider) 45 | 46 | assert HS_REQUEST_ID_KEY not in request_0.meta 47 | assert HS_PARENT_ID_KEY not in request_0.meta 48 | assert len(hs_spider_middleware._seen_requests) == 0 49 | assert len(hs_downloader_middleware._seen_requests) == 0 50 | 51 | hs_downloader_middleware.process_response(request_0, response_0, spider) 52 | 53 | assert request_0.meta[HS_REQUEST_ID_KEY] == 0 54 | assert request_0.meta[HS_PARENT_ID_KEY] is None 55 | assert hs_spider_middleware._seen_requests[request_0] == 0 56 | 57 | response_0.request = request_0 58 | request_1 = Request(url) 59 | request_2 = Request(url) 60 | item1 = {} 61 | item2 = Item() 62 | output = [request_1, request_2, item1, item2] 63 | processed_output = list(hs_spider_middleware.process_spider_output(response_0, output, spider)) 64 | 65 | assert processed_output[0] is request_1 66 | assert request_1.meta[HS_PARENT_ID_KEY] == 0 67 | assert processed_output[1] is request_2 68 | assert request_2.meta[HS_PARENT_ID_KEY] == 0 69 | assert processed_output[2] is item1 70 | assert processed_output[3] is item2 71 | 72 | response_1 = Response(url) 73 | hs_downloader_middleware.process_request(request_1, spider) 74 | hs_downloader_middleware.process_response(request_1, response_1, spider) 75 | assert request_1.meta[HS_REQUEST_ID_KEY] == 1 76 | assert request_1.meta[HS_PARENT_ID_KEY] == 0 77 | 78 | response_2 = Response(url) 79 | hs_downloader_middleware.process_request(request_2, spider) 80 | hs_downloader_middleware.process_response(request_2, response_2, spider) 81 | assert request_2.meta[HS_REQUEST_ID_KEY] == 2 82 | assert request_2.meta[HS_PARENT_ID_KEY] == 0 83 | 84 | 85 | @pytest.mark.skipif(sys.version_info < (3, 7), reason="requires python3.7") 86 | def test_hs_middlewares_dummy_response(hs_downloader_middleware, hs_spider_middleware): 87 | from dataclasses import dataclass 88 | 89 | @dataclass(unsafe_hash=True) 90 | class DummyResponse(Response): 91 | __module__: str = "scrapy_poet.api" 
92 | 93 | def __init__(self, url: str, request: Optional[Request] = None): 94 | super().__init__(url=url, request=request) 95 | 96 | spider = Spider('test') 97 | url = 'http://resp-url' 98 | 99 | # cleaning log file 100 | hs_downloader_middleware.pipe_writer.open() 101 | 102 | request = Request(url) 103 | response_1 = DummyResponse(url, request) 104 | response_2 = Response(url) 105 | hs_downloader_middleware.process_request(request, spider) 106 | hs_downloader_middleware.process_response(request, response_1, spider) 107 | 108 | with open(hs_downloader_middleware.pipe_writer.path, 'r') as tmp_file: 109 | assert tmp_file.readline() == "" 110 | assert request.meta == {} 111 | 112 | hs_downloader_middleware.process_response(request, response_2, spider) 113 | with open(hs_downloader_middleware.pipe_writer.path, 'r') as tmp_file: 114 | assert tmp_file.readline().startswith('REQ') 115 | 116 | assert request.meta[HS_REQUEST_ID_KEY] == 0 117 | assert request.meta[HS_PARENT_ID_KEY] is None 118 | 119 | 120 | @pytest.mark.skipif(sys.version_info < (3, 7), reason="requires python3.7") 121 | def test_hs_middlewares_retry(hs_downloader_middleware, hs_spider_middleware): 122 | from dataclasses import dataclass 123 | 124 | @dataclass(unsafe_hash=True) 125 | class DummyResponse(Response): 126 | __module__: str = "scrapy_poet.api" 127 | 128 | def __init__(self, url: str, request: Optional[Request] = None): 129 | super().__init__(url=url, request=request) 130 | 131 | spider = Spider('test') 132 | url = 'http://resp-url' 133 | request_0 = Request(url) 134 | response_0 = Response(url) 135 | 136 | hs_downloader_middleware.process_request(request_0, spider) 137 | 138 | assert HS_REQUEST_ID_KEY not in request_0.meta 139 | assert HS_PARENT_ID_KEY not in request_0.meta 140 | assert len(hs_spider_middleware._seen_requests) == 0 141 | assert len(hs_downloader_middleware._seen_requests) == 0 142 | 143 | hs_downloader_middleware.process_response(request_0, response_0, spider) 144 | 145 | assert request_0.meta[HS_REQUEST_ID_KEY] == 0 146 | assert request_0.meta[HS_PARENT_ID_KEY] is None 147 | assert hs_spider_middleware._seen_requests[request_0] == 0 148 | 149 | request_1 = request_0.copy() 150 | response_1 = Response(url) 151 | assert request_1.meta[HS_REQUEST_ID_KEY] == 0 152 | assert request_1.meta[HS_PARENT_ID_KEY] is None 153 | 154 | hs_downloader_middleware.process_request(request_1, spider) 155 | 156 | assert HS_REQUEST_ID_KEY not in request_1.meta 157 | assert request_1.meta[HS_PARENT_ID_KEY] == 0 158 | 159 | hs_downloader_middleware.process_response(request_1, response_1, spider) 160 | 161 | assert request_1.meta[HS_REQUEST_ID_KEY] == 1 162 | assert request_1.meta[HS_PARENT_ID_KEY] == 0 163 | 164 | request_2 = request_1.copy() 165 | response_2_1 = DummyResponse(url, request_2) 166 | response_2_2 = Response(url) 167 | 168 | hs_downloader_middleware.process_response(request_2, response_2_1, spider) 169 | 170 | assert request_2.meta[HS_REQUEST_ID_KEY] == 1 171 | assert request_2.meta[HS_PARENT_ID_KEY] == 0 172 | 173 | hs_downloader_middleware.process_response(request_2, response_2_2, spider) 174 | 175 | assert request_2.meta[HS_REQUEST_ID_KEY] == 2 176 | assert request_2.meta[HS_PARENT_ID_KEY] == 0 177 | -------------------------------------------------------------------------------- /tests/test_settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import mock 4 | 5 | import pytest 6 | from scrapy import version_info as scrapy_version 7 | from 
scrapy.settings import Settings 8 | from scrapy.utils.python import to_unicode 9 | 10 | from sh_scrapy.settings import EntrypointSettings 11 | from sh_scrapy.settings import _enforce_required_settings 12 | from sh_scrapy.settings import _maybe_load_autoscraping_project 13 | from sh_scrapy.settings import _get_component_base 14 | from sh_scrapy.settings import _get_action_on_missing_addons 15 | from sh_scrapy.settings import _load_addons 16 | from sh_scrapy.settings import _populate_settings_base 17 | from sh_scrapy.settings import _load_default_settings 18 | from sh_scrapy.settings import _update_old_classpaths 19 | from sh_scrapy.settings import populate_settings 20 | 21 | 22 | TEST_ADDON = { 23 | 'addon_id': 'test_addon', 24 | 'name': 'Fake test addon', 25 | 'description': 'Some description', 26 | 'settings': ('TEST_SETTING_A', 'TEST_SETTING_B'), 27 | 'default_settings': {}, 28 | 'type': 'SPIDER_MIDDLEWARES', 29 | 'order': 10, 30 | 'path': 'scrapy.utils.misc.load_object', 31 | 'builtin': False, 32 | 'needs_aws': False, 33 | } 34 | 35 | 36 | def test_update_settings_void_dictionaries(): 37 | test = EntrypointSettings() 38 | test.setdict({}, 10) 39 | assert len(test.attributes) == 0 40 | 41 | 42 | def test_update_settings_base_test(): 43 | test = EntrypointSettings() 44 | test.setdict({'a': 'b'}, 10) 45 | assert test['a'] == 'b' 46 | 47 | 48 | def test_update_settings_base_test2(): 49 | test = EntrypointSettings() 50 | test.setdict({'a': 'b', 'c': 'd'}, 10) 51 | assert len(test.attributes) == 2 52 | 53 | 54 | def test_update_settings_dont_fail_on_non_string(): 55 | test = EntrypointSettings() 56 | test.setdict({'a': 3}, 10) 57 | assert test['a'] == 3 58 | 59 | 60 | def test_update_settings_update_existing_value(): 61 | test = EntrypointSettings() 62 | test.setdict({'a': 'b', 'c': 'd'}, priority=10) 63 | test.setdict({'c': 'e', 'f': 'g'}, 10) 64 | assert len(test.attributes) == 3 65 | assert test['a'] == 'b' 66 | assert test['c'] == 'e' 67 | assert test['f'] == 'g' 68 | 69 | 70 | def test_update_settings_per_key_priorities_old_behavior(): 71 | test = EntrypointSettings() 72 | test.set('ITEM_PIPELINES', {'path.one': 100}) 73 | test.set('ITEM_PIPELINES', {'path.two': 200}) 74 | assert test['ITEM_PIPELINES'] == {'path.two': 200} 75 | 76 | 77 | @pytest.mark.skipif(scrapy_version < (1, 1), reason="requires Scrapy>=1.1") 78 | def test_update_settings_per_key_priorities_new_behaviour(): 79 | from scrapy.settings import BaseSettings 80 | test = EntrypointSettings() 81 | test.set('ITEM_PIPELINES', BaseSettings()) 82 | test['ITEM_PIPELINES'].update({'test.path1': 100}) 83 | test['ITEM_PIPELINES'].update({'test.path2': 200}) 84 | assert dict(test['ITEM_PIPELINES']) == { 85 | 'test.path1': 100, 'test.path2': 200} 86 | 87 | 88 | @pytest.mark.skipif(sys.version_info[0] == 3, reason="requires python2") 89 | def test_update_settings_check_unicode_in_py2_key(): 90 | # a dict entry is duplicated as unicode doesn't match native str value 91 | test = EntrypointSettings() 92 | test.setdict({'\xf1e\xf1e\xf1e': 'test'}, 10) 93 | assert test['\xf1e\xf1e\xf1e'] == 'test' 94 | assert test[to_unicode('\xf1e\xf1e\xf1e')] == 'test' 95 | 96 | 97 | @pytest.mark.skipif(sys.version_info[0] == 3, reason="requires python2") 98 | def test_update_settings_check_unicode_in_py2_key_value(): 99 | # a dict entry is duplicated as unicode doesn't match native str value 100 | test = EntrypointSettings() 101 | test.setdict({'\xf1e\xf1e\xf1e': '\xf1e\xf1e'}, 10) 102 | assert test['\xf1e\xf1e\xf1e'] == '\xf1e\xf1e' 103 | native_key = 
to_unicode('\xf1e\xf1e\xf1e') 104 | assert test[native_key] == to_unicode('\xf1e\xf1e') 105 | 106 | 107 | @pytest.mark.skipif(sys.version_info < (3,), reason="requires python3") 108 | def test_update_settings_check_unicode_in_py3(): 109 | test = EntrypointSettings() 110 | test.setdict({'\xf1e\xf1e\xf1e': 'test'}, 10) 111 | assert test['\xf1e\xf1e\xf1e'] == 'test' 112 | 113 | 114 | def test_maybe_load_autoscraping_project_no_spider_type_env(): 115 | result = {} 116 | _maybe_load_autoscraping_project(result) 117 | assert result == {} 118 | 119 | 120 | @mock.patch.dict(os.environ, {'SHUB_SPIDER_TYPE': 'custom'}) 121 | def test_maybe_load_autoscraping_project_custom_type(): 122 | result = {} 123 | _maybe_load_autoscraping_project(result) 124 | assert result == {} 125 | 126 | 127 | @mock.patch.dict(os.environ, {'SHUB_SPIDER_TYPE': 'auto'}) 128 | def test_maybe_load_autoscraping_project_ok(): 129 | result = EntrypointSettings() 130 | result.setdict({'SPIDER_MANAGER_CLASS': 'test.class'}) 131 | _maybe_load_autoscraping_project(result) 132 | assert result['ITEM_PIPELINES'] == { 133 | 'slybot.dupefilter.DupeFilterPipeline': 0} 134 | assert result['PROJECT_ZIPFILE'] == 'project-slybot.zip' 135 | assert result['SLYCLOSE_SPIDER_ENABLED'] 136 | assert result['SLYDUPEFILTER_ENABLED'] 137 | assert result['SPIDER_MANAGER_CLASS'] == 'test.class' 138 | 139 | 140 | def test_get_component_base(): 141 | assert _get_component_base({}, 'TEST') == 'TEST' 142 | assert _get_component_base({'SOME_SETTING': 'VAL'}, 'TEST') == 'TEST' 143 | assert _get_component_base({'TEST_BASE': 'VAL'}, 'TEST') == 'TEST_BASE' 144 | 145 | 146 | def test_get_action_on_missing_addons_default(): 147 | o = EntrypointSettings() 148 | assert _get_action_on_missing_addons(o) == 'warn' 149 | 150 | 151 | def test_get_action_on_missing_addons_base(): 152 | o = EntrypointSettings() 153 | o.setdict({'ON_MISSING_ADDONS': 'fail'}) 154 | assert _get_action_on_missing_addons(o) == 'fail' 155 | 156 | 157 | def test_get_action_on_missing_addons_warn_if_wrong_value(): 158 | o = EntrypointSettings() 159 | o.setdict({'ON_MISSING_ADDONS': 'wrong'}) 160 | assert _get_action_on_missing_addons(o) == 'warn' 161 | 162 | 163 | def test_load_addons_void(): 164 | addons = [] 165 | settings, o = EntrypointSettings(), EntrypointSettings() 166 | _load_addons(addons, settings, o) 167 | assert addons == [] 168 | assert settings.attributes == o.attributes == {} 169 | 170 | 171 | def test_load_addons_basic_usage(): 172 | addons = [TEST_ADDON] 173 | settings = EntrypointSettings() 174 | settings.setdict({'SPIDER_MIDDLEWARES': {}}) 175 | o = EntrypointSettings() 176 | _load_addons(addons, settings, o) 177 | assert settings['SPIDER_MIDDLEWARES'] == {TEST_ADDON['path']: 10} 178 | assert o['SPIDER_MIDDLEWARES'] == {TEST_ADDON['path']: 10} 179 | 180 | 181 | def test_load_addons_basic_with_defaults(): 182 | addons = [TEST_ADDON.copy()] 183 | addons[0]['default_settings'] = {'TEST_SETTING_A': 'TEST'} 184 | settings = {'SPIDER_MIDDLEWARES_BASE': { 185 | 'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50, 186 | 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500}} 187 | o = EntrypointSettings() 188 | o.setdict({'ON_MISSING_ADDONS': 'warn'}) 189 | _load_addons(addons, settings, o) 190 | assert settings == {'SPIDER_MIDDLEWARES_BASE': { 191 | TEST_ADDON['path']: 10, 192 | 'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50, 193 | 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500 194 | }} 195 | assert len(o.attributes) == 3 196 | assert 
o['TEST_SETTING_A'] == 'TEST' 197 | assert o['ON_MISSING_ADDONS'] == 'warn' 198 | assert len(o['SPIDER_MIDDLEWARES_BASE']) == 3 199 | 200 | 201 | def test_load_addons_hworker_fail_on_import(): 202 | addons = [TEST_ADDON.copy()] 203 | addons[0]['path'] = 'hworker.some.module' 204 | settings = EntrypointSettings() 205 | settings.setdict({'SPIDER_MIDDLEWARES': {}}) 206 | o = EntrypointSettings() 207 | o.setdict({'ON_MISSING_ADDONS': 'fail'}) 208 | with pytest.raises(ImportError): 209 | _load_addons(addons, settings, o) 210 | 211 | 212 | def test_load_addons_hworker_error_on_import(): 213 | addons = [TEST_ADDON.copy()] 214 | addons[0]['path'] = 'hworker.some.module' 215 | settings = {'SPIDER_MIDDLEWARES': {}} 216 | o = EntrypointSettings() 217 | o.setdict({'ON_MISSING_ADDONS': 'error'}) 218 | _load_addons(addons, settings, o) 219 | assert len(o.attributes) == 1 220 | assert o['ON_MISSING_ADDONS'] == 'error' 221 | assert settings == {'SPIDER_MIDDLEWARES': {}} 222 | 223 | 224 | def test_load_addons_hworker_warning_on_import(): 225 | addons = [TEST_ADDON.copy()] 226 | addons[0]['path'] = 'hworker.some.module' 227 | settings = {'SPIDER_MIDDLEWARES': {}} 228 | o = EntrypointSettings() 229 | o.setdict({'ON_MISSING_ADDONS': 'warn'}) 230 | _load_addons(addons, settings, o) 231 | assert len(o.attributes) == 1 232 | assert o['ON_MISSING_ADDONS'] == 'warn' 233 | assert settings == {'SPIDER_MIDDLEWARES': {}} 234 | 235 | 236 | @mock.patch.dict('sh_scrapy.settings.REPLACE_ADDONS_PATHS', 237 | {TEST_ADDON['path']: 'scrapy.utils.misc.arg_to_iter'}) 238 | def test_load_addons_hworker_import_replace(): 239 | addons = [TEST_ADDON] 240 | settings = {'SPIDER_MIDDLEWARES': {}} 241 | o = EntrypointSettings() 242 | _load_addons(addons, settings, o) 243 | assert len(o.attributes) == 1 244 | assert o['SPIDER_MIDDLEWARES'] == {'scrapy.utils.misc.arg_to_iter': 10} 245 | 246 | 247 | def test_populate_settings_dont_fail(): 248 | result = _populate_settings_base({}, lambda x: x) 249 | assert isinstance(result, Settings) 250 | 251 | 252 | def test_populate_settings_with_default(): 253 | def default_test(s): 254 | s.set('TEST_SETTING_A', 'test') 255 | result = _populate_settings_base({}, default_test) 256 | assert result 257 | assert result['TEST_SETTING_A'] == 'test' 258 | 259 | 260 | def test_populate_settings_addons(): 261 | addon = TEST_ADDON.copy() 262 | addon['default_settings'] = {'TEST_SETTING_A': 'by_addon'} 263 | msg = {'enabled_addons': [addon]} 264 | result = _populate_settings_base(msg, lambda x: x) 265 | assert result 266 | assert result['TEST_SETTING_A'] == 'by_addon' 267 | 268 | 269 | def test_populate_settings_override_settings(): 270 | msg = {} 271 | for section in ['project_settings', 272 | 'organization_settings', 273 | 'job_settings']: 274 | msg[section] = {'TEST_SETTING_A': 'from_' + section} 275 | result = _populate_settings_base(msg, lambda x: x) 276 | assert result 277 | assert result['TEST_SETTING_A'] == 'from_' + section 278 | 279 | 280 | def test_populate_settings_with_spider(): 281 | msg = {'project_settings': {'JOBDIR': 'by_project'}, 282 | 'spider_settings': {'TEST_SETTING_A': 'test'}} 283 | result = _populate_settings_base(msg, lambda x: x, spider=True) 284 | assert result 285 | assert result['TEST_SETTING_A'] == 'test' 286 | assert result['JOBDIR'].split('/')[-1].startswith('jobdata-') 287 | assert not result.get('PROJECT_ZIPFILE') 288 | 289 | 290 | def test_populate_settings_with_spider_override(): 291 | msg = {'job_settings': {'JOBDIR': 'by_job'}} 292 | result = _populate_settings_base(msg, 
lambda x: x, spider=True) 293 | assert result 294 | assert result['JOBDIR'] == 'by_job' 295 | 296 | 297 | @mock.patch.dict(os.environ, {'SHUB_SPIDER_TYPE': 'portia'}) 298 | def test_populate_settings_with_spider_autoscraping(): 299 | result = _populate_settings_base({}, lambda x: x, spider=True) 300 | assert result 301 | assert result['PROJECT_ZIPFILE'] == 'project-slybot.zip' 302 | 303 | 304 | @mock.patch('sh_scrapy.settings.get_project_settings') 305 | def test_populate_settings_keep_user_priorities(get_settings_mock): 306 | get_settings_mock.return_value = Settings({ 307 | 'EXTENSIONS_BASE': { 308 | 'sh_scrapy.extension.HubstorageExtension': None, 309 | 'scrapy.spidermiddlewares.depth.DepthMiddleware': 10}, 310 | 'SPIDER_MIDDLEWARES_BASE': {'scrapy.utils.misc.load_object': 1}}) 311 | addon = TEST_ADDON.copy() 312 | api_settings = { 313 | 'project_settings': { 314 | 'EXTENSIONS_BASE': {'sh_scrapy.extension.HubstorageExtension': 300, 315 | 'scrapy.contrib.throttle.AutoThrottle': 5}}, 316 | 'enabled_addons': [addon]} 317 | result = _populate_settings_base(api_settings, lambda x: x, spider=True) 318 | assert result.getdict('SPIDER_MIDDLEWARES_BASE')[ 319 | 'scrapy.utils.misc.load_object'] == 1 320 | assert result.getdict('EXTENSIONS_BASE')[ 321 | 'sh_scrapy.extension.HubstorageExtension'] is None 322 | autothrottles = [k for k in result.getdict('EXTENSIONS_BASE') 323 | if 'AutoThrottle' in k] 324 | assert result.getdict('EXTENSIONS_BASE')[autothrottles[0]] == 5 325 | 326 | 327 | def test_populate_settings_unique_update_dict(): 328 | monitoring_dict = {u'SPIDER_OPENED': {u'failed_actions': []}} 329 | msg = {'spider_settings': {'DASH_MONITORING': monitoring_dict}} 330 | result = _populate_settings_base(msg, lambda x: x, spider=True) 331 | assert result['DASH_MONITORING'] == monitoring_dict 332 | 333 | 334 | @mock.patch('sh_scrapy.settings.get_project_settings') 335 | def test_populate_settings_keep_user_priorities_oldpath(get_settings_mock): 336 | get_settings_mock.return_value = Settings({ 337 | 'EXTENSIONS_BASE': {'scrapy.contrib.throttle.AutoThrottle': 0}}) 338 | api_settings = { 339 | 'project_settings': { 340 | 'EXTENSIONS_BASE': {'scrapy.contrib.throttle.AutoThrottle': 5}}} 341 | result = _populate_settings_base(api_settings, lambda x: x, spider=True) 342 | autothrottles = [k for k in result.getdict('EXTENSIONS_BASE') 343 | if 'AutoThrottle' in k] 344 | assert len(autothrottles) == 1 345 | assert result.getdict('EXTENSIONS_BASE')[autothrottles[0]] == 0 346 | 347 | 348 | def test_load_default_settings(): 349 | result = Settings({'EXTENSIONS_BASE': { 350 | 'sh_scrapy.extension.HubstorageExtension': 50}, 351 | 'SPIDER_MIDDLEWARES_BASE': {}}) 352 | _load_default_settings(result) 353 | extensions = result['EXTENSIONS_BASE'] 354 | assert extensions['scrapy.extensions.debug.StackTraceDump'] == 0 355 | assert extensions['sh_scrapy.extension.HubstorageExtension'] == 100 356 | assert 'slybot.closespider.SlybotCloseSpider' not in extensions 357 | spider_middlewares = result['SPIDER_MIDDLEWARES_BASE'] 358 | assert 'sh_scrapy.middlewares.HubstorageSpiderMiddleware' in spider_middlewares 359 | downloader_middlewares = result['DOWNLOADER_MIDDLEWARES_BASE'] 360 | assert 'sh_scrapy.middlewares.HubstorageDownloaderMiddleware' in downloader_middlewares 361 | assert result['MEMUSAGE_LIMIT_MB'] == 950 362 | 363 | 364 | @mock.patch.dict(os.environ, {'SHUB_JOB_MEMORY_LIMIT': '200'}) 365 | def test_load_default_settings_mem_limit(): 366 | result = Settings({'EXTENSIONS_BASE': {}, 367 | 
'SPIDER_MIDDLEWARES_BASE': {}}) 368 | _load_default_settings(result) 369 | assert result['MEMUSAGE_LIMIT_MB'] == 200 370 | 371 | 372 | def test_enforce_required_settings_default(): 373 | settings = Settings({}) 374 | _enforce_required_settings(settings) 375 | assert settings['LOG_STDOUT'] is False 376 | 377 | 378 | def test_enforce_required_settings_rewrite(): 379 | settings = Settings({'LOG_STDOUT': True}) 380 | _enforce_required_settings(settings) 381 | assert settings['LOG_STDOUT'] is False 382 | 383 | 384 | def test_populate_settings_dont_fail_no_spider(): 385 | result = populate_settings({}) 386 | assert isinstance(result, Settings) 387 | # check one of the settings provided by default by sh_scrapy 388 | assert result['TELNETCONSOLE_HOST'] == '0.0.0.0' 389 | 390 | 391 | def test_populate_settings_dont_fail_with_spider(): 392 | result = populate_settings({}, True) 393 | assert isinstance(result, Settings) 394 | # check one of the settings provided by default by sh_scrapy 395 | assert result['TELNETCONSOLE_HOST'] == '0.0.0.0' 396 | 397 | 398 | def test_populate_settings_check_required(): 399 | result = populate_settings({'LOG_STDOUT': True}) 400 | assert isinstance(result, Settings) 401 | # check that some settings fallback to required values 402 | assert result['LOG_STDOUT'] is False 403 | 404 | 405 | def test_update_old_classpaths_not_string(): 406 | 407 | class CustomObject(object): 408 | pass 409 | 410 | test_value = {'scrapy.exporter.CustomExporter': 1, 411 | 123: 2, CustomObject: 3} 412 | test_settings = Settings({'SOME_SETTING': test_value}) 413 | _update_old_classpaths(test_settings) 414 | expected = test_settings['SOME_SETTING'] 415 | assert len(expected) == 3 416 | assert 123 in expected 417 | assert CustomObject in expected 418 | assert 'scrapy.exporter.CustomExporter' in expected 419 | -------------------------------------------------------------------------------- /tests/test_stats.py: -------------------------------------------------------------------------------- 1 | import mock 2 | import pytest 3 | 4 | from scrapy.spiders import Spider 5 | from scrapy.utils.test import get_crawler 6 | 7 | from sh_scrapy import stats 8 | 9 | 10 | @pytest.fixture 11 | def collector(monkeypatch): 12 | monkeypatch.setattr('sh_scrapy.stats.pipe_writer', mock.Mock()) 13 | crawler = get_crawler(Spider) 14 | return stats.HubStorageStatsCollector(crawler) 15 | 16 | 17 | def test_collector_class_vars(collector): 18 | assert collector.INTERVAL == 30 19 | 20 | 21 | def test_collector_upload_stats(collector): 22 | stats = {'item_scraped_count': 10, 'scheduler/enqueued': 20} 23 | collector.set_stats(stats.copy()) 24 | collector._upload_stats() 25 | assert collector.pipe_writer.write_stats.call_count == 1 26 | collector.pipe_writer.write_stats.assert_called_with(stats.copy()) 27 | 28 | 29 | @mock.patch('twisted.internet.task.LoopingCall') 30 | def test_collector_open_spider(lcall, collector): 31 | collector.open_spider('spider') 32 | lcall.assert_called_with(collector._upload_stats) 33 | lcall.return_value.start.assert_called_with(collector.INTERVAL, now=True) 34 | dcall = lcall.return_value.start.return_value 35 | dcall.addErrback.assert_called_with( 36 | collector._setup_looping_call, now=False) 37 | 38 | 39 | def test_collector_close_spider(collector): 40 | collector._samplestask = mock.Mock() 41 | collector._samplestask.running = True 42 | stats = {'item_scraped_count': 10} 43 | collector.set_stats(stats.copy()) 44 | collector.close_spider('spider', 'reason') 45 | assert collector._samplestask.stop.called 
46 | collector.pipe_writer.write_stats.assert_called_with(stats.copy()) 47 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | from mock import patch 3 | 4 | from pytest import raises 5 | from scrapy.settings import Settings 6 | 7 | from sh_scrapy.utils import get_project_settings 8 | 9 | 10 | def test_get_project_settings_class(): 11 | settings = get_project_settings() 12 | assert isinstance(settings, Settings) 13 | 14 | 15 | def test_get_project_settings_default(): 16 | settings = get_project_settings() 17 | assert settings['TELNETCONSOLE_HOST'] == '0.0.0.0' 18 | 19 | 20 | @patch.dict( 21 | environ, 22 | { 23 | 'SHUB_SETTINGS': '{"project_settings": {"SETTING_TEST": "VAL"}}', 24 | } 25 | ) 26 | def test_get_project_settings_setting(): 27 | settings = get_project_settings() 28 | assert settings['SETTING_TEST'] == 'VAL' 29 | 30 | 31 | @patch.dict( 32 | environ, 33 | { 34 | 'SHUB_SETTINGS': '{"project_settings": {"SETTING....', 35 | } 36 | ) 37 | def test_get_project_settings_bad_setting(): 38 | with raises(ValueError): 39 | get_project_settings() 40 | -------------------------------------------------------------------------------- /tests/test_writer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import os 5 | import threading 6 | from queue import Queue 7 | 8 | import pytest 9 | 10 | from sh_scrapy.writer import _PipeWriter 11 | 12 | 13 | @pytest.fixture 14 | def fifo(tmpdir): 15 | path = os.path.join(str(tmpdir.mkdir('fifo')), 'scrapinghub') 16 | os.mkfifo(path) 17 | return path 18 | 19 | 20 | @pytest.fixture 21 | def queue(): 22 | return Queue() 23 | 24 | 25 | @pytest.fixture 26 | def reader(fifo, queue): 27 | def read_from_fifo(): 28 | with open(fifo) as f: 29 | for line in iter(f.readline, ''): 30 | queue.put(line) 31 | 32 | reader_thread = threading.Thread(target=read_from_fifo) 33 | reader_thread.start() 34 | try: 35 | yield reader_thread 36 | finally: 37 | reader_thread.join(timeout=1) 38 | 39 | 40 | @pytest.fixture 41 | def writer(fifo, reader): 42 | w = _PipeWriter(fifo) 43 | w.open() 44 | try: 45 | yield w 46 | finally: 47 | w.close() 48 | 49 | 50 | def test_close(writer): 51 | assert writer._pipe.closed is False 52 | writer.close() 53 | assert writer._pipe.closed is True 54 | 55 | 56 | def _parse_data_line(msg): 57 | assert msg.endswith('\n') 58 | cmd, _, payload = msg.strip().partition(' ') 59 | return cmd, json.loads(payload) 60 | 61 | 62 | def test_write_item(writer, queue): 63 | writer.write_item({'foo': 'bar'}) 64 | line = queue.get(timeout=1) 65 | assert queue.empty() 66 | cmd, payload = _parse_data_line(line) 67 | assert cmd == 'ITM' 68 | assert payload == {'foo': 'bar'} 69 | 70 | 71 | def test_write_request(writer, queue): 72 | writer.write_request( 73 | url='http://example.com/', 74 | status=200, 75 | method='GET', 76 | rs=1024, 77 | duration=102, 78 | parent=None, 79 | fp='fingerprint', 80 | ) 81 | line = queue.get(timeout=1) 82 | assert queue.empty() 83 | cmd, payload = _parse_data_line(line) 84 | assert cmd == 'REQ' 85 | assert isinstance(payload.pop('time'), int) 86 | assert payload == { 87 | 'url': 'http://example.com/', 88 | 'status': 200, 89 | 'method': 'GET', 90 | 'rs': 1024, 91 | 'duration': 102, 92 | 'parent': None, 93 | 'fp': 'fingerprint', 94 | } 95 | 96 | 97 | def test_write_log(writer, 
queue): 98 | writer.write_log( 99 | level=logging.INFO, 100 | message='text', 101 | ) 102 | line = queue.get(timeout=1) 103 | assert queue.empty() 104 | cmd, payload = _parse_data_line(line) 105 | assert cmd == 'LOG' 106 | assert isinstance(payload.pop('time'), int) 107 | assert payload == { 108 | 'message': 'text', 109 | 'level': logging.INFO 110 | } 111 | 112 | 113 | def test_write_stats(writer, queue): 114 | stats = {'item_scraped_count': 10, 'scheduler/enqueued': 20} 115 | writer.write_stats(stats.copy()) 116 | line = queue.get(timeout=1) 117 | assert queue.empty() 118 | cmd, payload = _parse_data_line(line) 119 | assert cmd == 'STA' 120 | assert isinstance(payload.pop('time'), int) 121 | assert payload == { 122 | 'stats': stats.copy() 123 | } 124 | 125 | 126 | def test_set_outcome(writer, queue): 127 | outcome = 'custom_outcome' 128 | writer.set_outcome(outcome) 129 | line = queue.get(timeout=1) 130 | assert queue.empty() 131 | cmd, payload = _parse_data_line(line) 132 | assert cmd == 'FIN' 133 | assert payload == { 134 | 'outcome': outcome 135 | } 136 | 137 | 138 | def test_writer_raises_runtime_error_if_not_configured(): 139 | error_msg = "Pipe writer is misconfigured, named pipe path is not set" 140 | w = _PipeWriter('') 141 | with pytest.raises(RuntimeError) as exc_info: 142 | w.write_log(10, 'message') 143 | assert exc_info.value.args[0] == error_msg 144 | with pytest.raises(RuntimeError) as exc_info: 145 | w.close() 146 | assert exc_info.value.args[0] == error_msg 147 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | from pathlib import Path 5 | from typing import Tuple, Optional, Union 6 | 7 | 8 | def call_command(cwd: Union[str, os.PathLike], *args: str) -> Tuple[str, str]: 9 | result = subprocess.run( 10 | args, 11 | cwd=str(cwd), 12 | stdout=subprocess.PIPE, 13 | stderr=subprocess.PIPE, 14 | universal_newlines=True, 15 | ) 16 | assert result.returncode == 0, result.stderr 17 | return result.stdout, result.stderr 18 | 19 | 20 | def call_scrapy_command(cwd: Union[str, os.PathLike], *args: str) -> Tuple[str, str]: 21 | args = (sys.executable, "-m", "scrapy.cmdline") + args 22 | return call_command(cwd, *args) 23 | 24 | 25 | def create_project(topdir: Path, spider_text: Optional[str] = None) -> Path: 26 | project_name = "foo" 27 | cwd = topdir 28 | call_scrapy_command(str(cwd), "startproject", project_name) 29 | cwd /= project_name 30 | (cwd / project_name / "spiders" / "spider.py").write_text(spider_text or """ 31 | from scrapy import Spider 32 | 33 | class MySpider(Spider): 34 | name = "myspider" 35 | """) 36 | return cwd 37 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # tox.ini 2 | [tox] 3 | envlist = py36-scrapy16, py 4 | requires = 5 | # https://github.com/pypa/virtualenv/issues/2550 6 | virtualenv<=20.21.1 7 | 8 | [testenv] 9 | deps = 10 | pytest 11 | pytest-cov 12 | mock 13 | hubstorage 14 | packaging 15 | py36-scrapy16: Scrapy==1.6 16 | scrapy-spider-metadata>=0.1.1; python_version >= "3.8" 17 | pydantic>=2; python_version >= "3.8" 18 | 19 | commands = 20 | pytest --verbose --cov=sh_scrapy --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: sh_scrapy tests} 21 | 
--------------------------------------------------------------------------------