├── tests ├── __init__.py ├── test_links.py ├── test_querycleaner.py ├── test_splitvariants.py ├── test_constraints.py ├── test_processors.py ├── test_magicfields.py ├── test_hubproxy.py ├── test_hcf.py ├── test_crawlera.py └── test_deltafetch.py ├── scrapylib ├── __init__.py ├── pipelines.py ├── constraints │ ├── pipeline.py │ └── __init__.py ├── links.py ├── hubproxy.py ├── processors │ ├── date.py │ └── __init__.py ├── splitvariants.py ├── redisqueue.py ├── proxy.py ├── guid.py ├── querycleaner.py ├── spidertrace.py ├── deltafetch.py ├── magicfields.py ├── crawlera.py └── hcf.py ├── requirements.txt ├── .bumpversion.cfg ├── .gitignore ├── tox.ini ├── setup.py ├── .travis.yml └── README.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapylib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | six 2 | boto 3 | hubstorage>=0.23 4 | python-dateutil 5 | scrapinghub 6 | Scrapy>=1.1 7 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.7.0 3 | commit = True 4 | tag = True 5 | tag_name = {new_version} 6 | 7 | [bumpversion:file:setup.py] 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # temp files 2 | **swp 3 | **pyc 4 | **~ 5 | 6 | # setuptools/distutils files 7 | scrapylib\.egg-info 8 | build/ 9 | dist/ 10 | \.idea/ 11 | \.tox/ 12 | -------------------------------------------------------------------------------- /scrapylib/pipelines.py: -------------------------------------------------------------------------------- 1 | 2 | class SpiderFieldPipeline(object): 3 | def process_item(self, item, spider): 4 | item['spider'] = spider.name 5 | return item 6 | -------------------------------------------------------------------------------- /scrapylib/constraints/pipeline.py: -------------------------------------------------------------------------------- 1 | from scrapy.exceptions import DropItem 2 | 3 | class ConstraintsPipeline(object): 4 | 5 | def process_item(self, item, spider): 6 | try: 7 | for c in item.constraints: 8 | c(item) 9 | except AssertionError as e: 10 | raise DropItem(str(e)) 11 | return item 12 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 
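# A single environment can be selected too, e.g. "tox -e py27", and arguments
# after "--" are forwarded to nosetests through the [] placeholder below, e.g.
# "tox -e py27 -- tests/test_links.py" (illustrative invocations only).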
5 | 6 | [tox] 7 | envlist = py27, pypy, py33 8 | 9 | [testenv] 10 | setenv = 11 | BERKELEYDB_DIR = /usr 12 | deps = 13 | -rrequirements.txt 14 | mock 15 | nose 16 | bsddb3 17 | commands = nosetests --with-doctest [] 18 | -------------------------------------------------------------------------------- /scrapylib/links.py: -------------------------------------------------------------------------------- 1 | from scrapy.http import Request 2 | 3 | def follow_links(link_extractor, response, callback): 4 | """Returns a generator of requests with given `callback` 5 | of links extractor from `response`. 6 | 7 | Parameters: 8 | link_extractor -- LinkExtractor to use 9 | response -- Response to extract links from 10 | callback -- callback to apply to each new requests 11 | 12 | """ 13 | for link in link_extractor.extract_links(response): 14 | yield Request(link.url, callback=callback) 15 | -------------------------------------------------------------------------------- /scrapylib/hubproxy.py: -------------------------------------------------------------------------------- 1 | from .crawlera import CrawleraMiddleware 2 | 3 | 4 | class HubProxyMiddleware(CrawleraMiddleware): 5 | 6 | def __init__(self, *args, **kwargs): 7 | import warnings 8 | from scrapy.exceptions import ScrapyDeprecationWarning 9 | warnings.warn('scrapylib.hubproxy.HubProxyMiddleware is deprecated, ' 10 | 'use scrapylib.crawlera.CrawleraMiddleware instead.', 11 | category=ScrapyDeprecationWarning, stacklevel=1) 12 | super(HubProxyMiddleware, self).__init__(*args, **kwargs) 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='scrapylib', 5 | version='1.7.0', 6 | license='BSD', 7 | description='Scrapy helper functions and processors', 8 | author='Scrapinghub', 9 | author_email='info@scrapinghub.com', 10 | url='http://github.com/scrapinghub/scrapylib', 11 | packages=['scrapylib', 'scrapylib.constraints', 'scrapylib.processors'], 12 | platforms=['Any'], 13 | classifiers=[ 14 | 'Development Status :: 7 - Inactive', 15 | 'License :: OSI Approved :: BSD License', 16 | 'Operating System :: OS Independent', 17 | 'Programming Language :: Python' 18 | ], 19 | install_requires=['Scrapy>=1.0.0'] 20 | ) 21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3.5 3 | sudo: false 4 | env: 5 | matrix: 6 | - TOXENV=py27 7 | - TOXENV=py33 8 | - TOXENV=py35 9 | addons: 10 | apt: 11 | packages: 12 | - language-pack-fr 13 | - libdb-dev 14 | 15 | install: pip install -U tox 16 | script: tox 17 | 18 | deploy: 19 | provider: pypi 20 | user: scrapinghub 21 | distributions: sdist bdist_wheel 22 | password: 23 | secure: iKVlMlKSr+LOuCCMMOqL65aYjNRy3k1Zb4d7NRN0JpWS5DGau8G8cEhJ1dY4uyc/DNKVJmd939OiLBsUqqCmz09+ozen/YrRNjEZS5lOwBNfhpiCESkbOjcInV1PQgx2XfuHGp8O/9vxtXjjH9WE9CabQ+8Zg5/rMMvXizT4/O4= 24 | on: 25 | tags: true 26 | all_branches: true 27 | repo: scrapinghub/scrapylib 28 | condition: $TOXENV = py27 29 | -------------------------------------------------------------------------------- /scrapylib/processors/date.py: -------------------------------------------------------------------------------- 1 | from dateutil.parser import parse 2 | from scrapy.loader.processors import Compose 3 | from scrapy import log 4 | from 
scrapylib.processors import default_output_processor 5 | 6 | def parse_datetime(value): 7 | try: 8 | d = parse(value) 9 | except ValueError: 10 | log.msg('Unable to parse %s' % value, level=log.WARNING) 11 | return value 12 | else: 13 | return d.isoformat() 14 | 15 | def parse_date(value): 16 | try: 17 | d = parse(value) 18 | except ValueError: 19 | log.msg('Unable to parse %s' % value, level=log.WARNING) 20 | return value 21 | else: 22 | return d.strftime("%Y-%m-%d") 23 | 24 | default_out_parse_datetime = Compose(default_output_processor, parse_datetime) 25 | default_out_parse_date = Compose(default_output_processor, parse_date) 26 | -------------------------------------------------------------------------------- /tests/test_links.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from scrapylib.links import follow_links 4 | from scrapy.http import Request 5 | 6 | 7 | class LinkMock(object): 8 | def __init__(self, url): 9 | self.url = url 10 | 11 | 12 | class LinkExtractorMock(object): 13 | 14 | def extract_links(self, response): 15 | return [LinkMock(url=x) for x in response.split('|')] 16 | 17 | 18 | def some_callback(): 19 | pass 20 | 21 | 22 | class TestLinks(unittest.TestCase): 23 | 24 | def test_follow_links(self): 25 | r = list(follow_links(LinkExtractorMock(), 'http://link1|http://link2|http://link3', callback=some_callback)) 26 | assert all(isinstance(x, Request) for x in r) 27 | assert all(x.callback is some_callback for x in r) 28 | self.assertEqual([x.url for x in r], ['http://link1', 'http://link2', 'http://link3']) 29 | -------------------------------------------------------------------------------- /scrapylib/splitvariants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Splits each product with variants into different single products. 
3 | For autoscraping products adaptation 4 | """ 5 | 6 | from copy import deepcopy 7 | from scrapy.item import DictItem 8 | from scrapy.exceptions import NotConfigured 9 | 10 | class SplitVariantsMiddleware(object): 11 | 12 | @classmethod 13 | def from_crawler(cls, crawler): 14 | if not crawler.settings.getbool("SPLITVARIANTS_ENABLED"): 15 | raise NotConfigured 16 | return cls() 17 | 18 | def process_spider_output(self, response, result, spider): 19 | for r in result: 20 | if isinstance(r, DictItem) and "variants" in r: 21 | variants = r.pop("variants") 22 | for variant in variants: 23 | new_product = deepcopy(r) 24 | new_product.update(variant) 25 | yield new_product 26 | else: 27 | yield r 28 | -------------------------------------------------------------------------------- /scrapylib/redisqueue.py: -------------------------------------------------------------------------------- 1 | try: 2 | import cPickle as pickle 3 | except ImportError: 4 | import pickle 5 | 6 | from scrapy.exceptions import NotConfigured 7 | from scrapy import signals 8 | 9 | 10 | class RedisQueue(object): 11 | 12 | def __init__(self, crawler): 13 | try: 14 | from redis import Redis 15 | except ImportError: 16 | raise NotConfigured 17 | 18 | settings = crawler.settings 19 | 20 | # get settings 21 | queue = settings.get('REDIS_QUEUE') 22 | if queue is None: 23 | raise NotConfigured 24 | 25 | host = settings.get('REDIS_HOST', 'localhost') 26 | port = settings.getint('REDIS_PORT', 6379) 27 | db = settings.getint('REDIS_DB', 0) 28 | password = settings.get('REDIS_PASSWORD') 29 | 30 | self.redis = Redis(host=host, port=port, db=db, password=password) 31 | self.queue = queue 32 | self.project = settings['BOT_NAME'] 33 | 34 | crawler.signals.connect(self.spider_closed, signal=signals.spider_closed) 35 | 36 | @classmethod 37 | def from_crawler(cls, crawler): 38 | return cls(crawler) 39 | 40 | def spider_closed(self, spider, reason): 41 | msg = {'project': self.project, 'spider': spider.name, 'reason': reason} 42 | self.redis.rpush(self.queue, pickle.dumps(msg)) 43 | -------------------------------------------------------------------------------- /scrapylib/proxy.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from six.moves.urllib.parse import unquote, urlunparse 3 | try: 4 | from urllib2 import _parse_proxy 5 | except ImportError: 6 | from urllib.request import _parse_proxy 7 | 8 | 9 | class SelectiveProxyMiddleware(object): 10 | """A middleware to enable http proxy to selected spiders only. 11 | 12 | Settings: 13 | HTTP_PROXY -- proxy uri. 
e.g.: http://user:pass@proxy.host:port 14 | PROXY_SPIDERS -- all requests from these spiders will be routed 15 | through the proxy 16 | """ 17 | 18 | def __init__(self, settings): 19 | self.proxy = self.parse_proxy(settings.get('HTTP_PROXY'), 'http') 20 | self.proxy_spiders = set(settings.getlist('PROXY_SPIDERS', [])) 21 | 22 | @classmethod 23 | def from_crawler(cls, crawler): 24 | return cls(crawler.settings) 25 | 26 | def parse_proxy(self, url, orig_type): 27 | proxy_type, user, password, hostport = _parse_proxy(url) 28 | proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', '')) 29 | 30 | if user and password: 31 | user_pass = '%s:%s' % (unquote(user), unquote(password)) 32 | creds = base64.b64encode(user_pass).strip() 33 | else: 34 | creds = None 35 | 36 | return creds, proxy_url 37 | 38 | def process_request(self, request, spider): 39 | if spider.name in self.proxy_spiders: 40 | creds, proxy = self.proxy 41 | request.meta['proxy'] = proxy 42 | if creds: 43 | request.headers['Proxy-Authorization'] = 'Basic ' + creds 44 | -------------------------------------------------------------------------------- /tests/test_querycleaner.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from scrapy.http import Request, Response 4 | from scrapy.spiders import Spider 5 | from scrapy.utils.test import get_crawler 6 | from scrapylib.querycleaner import QueryCleanerMiddleware 7 | from scrapy.exceptions import NotConfigured 8 | 9 | 10 | class QueryCleanerTestCase(TestCase): 11 | 12 | mwcls = QueryCleanerMiddleware 13 | 14 | def setUp(self): 15 | self.spider = Spider('foo') 16 | 17 | def test_not_loaded(self): 18 | crawler = get_crawler(settings_dict={}) 19 | self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler) 20 | 21 | def test_filter_keep(self): 22 | crawler = get_crawler(settings_dict={"QUERYCLEANER_KEEP": "qxp"}) 23 | mw = self.mwcls.from_crawler(crawler) 24 | response = Response(url="http://www.example.com/qxg1231") 25 | request = Request(url="http://www.example.com/product/?qxp=12&qxg=1231") 26 | new_request = list(mw.process_spider_output(response, [request], self.spider))[0] 27 | self.assertEqual(new_request.url, "http://www.example.com/product/?qxp=12") 28 | self.assertNotEqual(request, new_request) 29 | 30 | def test_filter_remove(self): 31 | crawler = get_crawler(settings_dict={"QUERYCLEANER_REMOVE": "qxg"}) 32 | mw = self.mwcls.from_crawler(crawler) 33 | response = Response(url="http://www.example.com/qxg1231") 34 | request = Request(url="http://www.example.com/product/?qxp=12&qxg=1231") 35 | new_request = list(mw.process_spider_output(response, [request], self.spider))[0] 36 | self.assertEqual(new_request.url, "http://www.example.com/product/?qxp=12") 37 | self.assertNotEqual(request, new_request) 38 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | scrapylib 3 | ========= 4 | 5 | Overview 6 | ======== 7 | 8 | **This library is deprecated and unmaintained.** 9 | 10 | Some of its components were moved to their own packages: 11 | 12 | +--------------------------+------------------------------------------------+ 13 | | Old location | New location | 14 | +==========================+================================================+ 15 | | scrapylib.crawlera | `scrapy-crawlera`_ | 16 | 
+--------------------------+------------------------------------------------+ 17 | | scrapylib.deltafetch | `scrapy-deltafetch`_ | 18 | +--------------------------+------------------------------------------------+ 19 | | scrapylib.hcf | `scrapy-hcf`_ | 20 | +--------------------------+------------------------------------------------+ 21 | | scrapylib.magicfields | `scrapy-magicfields`_ | 22 | +--------------------------+------------------------------------------------+ 23 | | scrapylib.querycleaner | `scrapy-querycleaner`_ | 24 | +--------------------------+------------------------------------------------+ 25 | | scrapylib.splitvariants | `scrapy-splitvariants`_ | 26 | +--------------------------+------------------------------------------------+ 27 | 28 | .. _scrapy-crawlera: https://github.com/scrapy-plugins/scrapy-crawlera 29 | .. _scrapy-deltafetch: https://github.com/scrapy-plugins/scrapy-deltafetch 30 | .. _scrapy-hcf: https://github.com/scrapy-plugins/scrapy-hcf 31 | .. _scrapy-magicfields: https://github.com/scrapy-plugins/scrapy-magicfields 32 | .. _scrapy-querycleaner: https://github.com/scrapy-plugins/scrapy-querycleaner 33 | .. _scrapy-splitvariants: https://github.com/scrapy-plugins/scrapy-splitvariants 34 | -------------------------------------------------------------------------------- /tests/test_splitvariants.py: -------------------------------------------------------------------------------- 1 | """ Tests to cover splitvariants middleware """ 2 | from unittest import TestCase 3 | 4 | from scrapy.spiders import Spider 5 | from scrapy.item import DictItem, Field 6 | from scrapy.http import HtmlResponse 7 | from scrapy.utils.test import get_crawler 8 | 9 | from scrapylib.splitvariants import SplitVariantsMiddleware 10 | 11 | 12 | class TestItem(DictItem): 13 | """ 14 | Item used in test spider 15 | """ 16 | fields = { 17 | 'id': Field(), 18 | 'name': Field(), 19 | 'size': Field(), 20 | 'price': Field(), 21 | 'variants': Field() 22 | } 23 | 24 | 25 | class SplitVariantsTest(TestCase): 26 | """ Split variants middleware test cases """ 27 | def setUp(self): 28 | self.spider = Spider('myspider', 29 | start_urls=["http://example.com"]) 30 | self.response = HtmlResponse(body=b"", 31 | url="http://www.example.com") 32 | 33 | def test_variants_splitted(self): 34 | """ 35 | Checks if item with variants is split as expected 36 | """ 37 | settings = {"SPLITVARIANTS_ENABLED": True} 38 | crawler = get_crawler(settings_dict=settings) 39 | mware = SplitVariantsMiddleware.from_crawler(crawler) 40 | 41 | # Define item with variants 42 | item = {"id": 12, 43 | "name": "Big chair", 44 | "variants": [{"size": "XL", "price": 200}, 45 | {"size": "L", "price": 220}]} 46 | result = [TestItem(item)] 47 | 48 | # Define how split items should look 49 | expected = [ 50 | {"id": 12, "name": "Big chair", "size": 'XL', 'price': 200}, 51 | {"id": 12, "name": "Big chair", "size": 'L', 'price': 220}] 52 | 53 | # Calling middleware for given result 54 | result = mware.process_spider_output(self.response, result, 55 | self.spider) 56 | self.assertEquals(list(result), expected) 57 | -------------------------------------------------------------------------------- /scrapylib/guid.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from scrapy import signals 4 | from scrapy.exceptions import DropItem 5 | try: 6 | from scrapy.utils.python import to_bytes 7 | except ImportError: 8 | from scrapy.utils.python import unicode_to_str as to_bytes 9 | 10 | 11 | def 
hash_values(*values): 12 | """Hash a series of non-None values. 13 | 14 | For example: 15 | >>> hash_values('some', 'values', 'to', 'hash') 16 | '1d7b7a17aeb0e5f9a6814289d12d3253' 17 | """ 18 | hash = hashlib.md5() 19 | for value in values: 20 | if value is None: 21 | message = "hash_values was passed None at argument index %d" % list(values).index(None) 22 | raise ValueError(message) 23 | hash.update(to_bytes('%s' % value)) 24 | return hash.hexdigest() 25 | 26 | 27 | class GUIDPipeline(object): 28 | 29 | item_fields = {} 30 | 31 | def __init__(self): 32 | self.guids = {} 33 | 34 | @classmethod 35 | def from_crawler(cls, crawler): 36 | o = cls() 37 | crawler.signals.connect(o.spider_opened, signals.spider_opened) 38 | crawler.signals.connect(o.spider_closed, signals.spider_closed) 39 | return o 40 | 41 | def spider_opened(self, spider): 42 | self.guids[spider] = set() 43 | 44 | def spider_closed(self, spider): 45 | del self.guids[spider] 46 | 47 | def process_item(self, item, spider): 48 | if type(item) in self.item_fields: 49 | item['guid'] = guid = self.generate_guid(item, spider) 50 | if guid is None: 51 | raise DropItem("Missing guid fields on: %s" % item) 52 | if guid in self.guids[spider]: 53 | raise DropItem("Duplicate item found: %s" % item) 54 | else: 55 | self.guids[spider].add(guid) 56 | return item 57 | 58 | def generate_guid(self, item, spider): 59 | values = [] 60 | for field in self.item_fields[type(item)]: 61 | value = item.get(field) 62 | if value is None: 63 | return 64 | values.append(value.encode('utf-8')) 65 | values.insert(0, spider.name) 66 | return hash_values(*values) 67 | -------------------------------------------------------------------------------- /scrapylib/processors/__init__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import locale as localelib 3 | import re 4 | import time 5 | from six.moves.urllib.parse import urljoin 6 | 7 | 8 | from scrapy.loader.processors import MapCompose, TakeFirst 9 | from scrapy.utils.markup import (remove_tags, replace_escape_chars, 10 | unquote_markup) 11 | 12 | 13 | _clean_spaces_re = re.compile("\s+", re.U) 14 | 15 | 16 | def clean_spaces(value): 17 | return _clean_spaces_re.sub(' ', value) 18 | 19 | 20 | def make_absolute_url(val, loader_context): 21 | base_url = loader_context.get('base_url') 22 | if base_url is None: 23 | response = loader_context.get('response') 24 | if response is None: 25 | raise AttributeError('You must provide a base_url or a response ' 26 | 'to the loader context') 27 | base_url = response.url 28 | return urljoin(base_url, val) 29 | 30 | 31 | def remove_query_params(value): 32 | # some urls don't have ? but have & 33 | return value.split('?')[0].split('&')[0] 34 | 35 | 36 | _br_re = re.compile('', re.IGNORECASE) 37 | def replace_br(value): 38 | return _br_re.sub(' ', value) 39 | 40 | 41 | def replace_escape(value): 42 | return replace_escape_chars(value, replace_by=u' ') 43 | 44 | 45 | def split(value): 46 | return [v.strip() for v in value.split(',')] 47 | 48 | 49 | def strip(value): 50 | return value.strip() 51 | 52 | 53 | def to_datetime(value, format, locale=None): 54 | """Returns a datetime parsed from value with the specified format 55 | and locale. 56 | 57 | If no year is specified in the parsing format it is taken from the 58 | current date. 
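For example (mirroring a case from tests/test_processors.py):

    >>> to_datetime('March 4, 2011 20:00', '%B %d, %Y %H:%S')
    datetime.datetime(2011, 3, 4, 20, 0)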
59 | """ 60 | if locale: 61 | old_locale = localelib.getlocale(localelib.LC_TIME) 62 | localelib.setlocale(localelib.LC_TIME, locale) 63 | 64 | time_s = time.strptime(value, format) 65 | dt = datetime.datetime(*time_s[0:5]) 66 | # 1900 is the default year from strptime, means no year parsed 67 | if dt.year == 1900: 68 | dt = dt.replace(year=datetime.datetime.utcnow().year) 69 | 70 | if locale: 71 | localelib.setlocale(localelib.LC_TIME, old_locale) 72 | 73 | return dt 74 | 75 | 76 | def to_date(value, format, locale=None): 77 | return to_datetime(value, format, locale).date() 78 | 79 | 80 | def to_time(value, format): 81 | time_s = time.strptime(value, format) 82 | return datetime.time(time_s[3], time_s[4]) 83 | 84 | 85 | # defaults 86 | 87 | default_input_processor = MapCompose(replace_br, remove_tags, unquote_markup, 88 | replace_escape, strip, clean_spaces) 89 | 90 | default_output_processor = TakeFirst() 91 | -------------------------------------------------------------------------------- /scrapylib/querycleaner.py: -------------------------------------------------------------------------------- 1 | """Get parameter cleaner for AS. 2 | 3 | Add removed/kept pattern (regex) with 4 | 5 | QUERYCLEANER_REMOVE 6 | QUERYCLEANER_KEEP 7 | 8 | Remove patterns has precedence. 9 | """ 10 | import re 11 | from six.moves.urllib.parse import quote 12 | from six import string_types 13 | 14 | from scrapy.utils.httpobj import urlparse_cached 15 | from scrapy.http import Request 16 | from scrapy.exceptions import NotConfigured 17 | 18 | from w3lib.url import _safe_chars 19 | 20 | def _parse_query_string(query): 21 | """Used for replacing cgi.parse_qsl. 22 | The cgi version returns the same pair for query 'key' 23 | and query 'key=', so reconstruction 24 | maps to the same string. But some sites does not handle both versions 25 | in the same way. 
26 | This version returns (key, None) in the first case, and (key, '') in the 27 | second one, so correct reconstruction can be performed.""" 28 | 29 | params = query.split("&") 30 | keyvals = [] 31 | for param in params: 32 | kv = param.split("=") + [None] 33 | keyvals.append((kv[0], kv[1])) 34 | return keyvals 35 | 36 | def _filter_query(query, remove_re=None, keep_re=None): 37 | """ 38 | Filters query parameters in a query string according to key patterns 39 | >>> _filter_query('as=3&bs=8&cs=9') 40 | 'as=3&bs=8&cs=9' 41 | >>> _filter_query('as=3&bs=8&cs=9', None, re.compile("as|bs")) 42 | 'as=3&bs=8' 43 | >>> _filter_query('as=3&bs=8&cs=9', re.compile("as|bs")) 44 | 'cs=9' 45 | >>> _filter_query('as=3&bs=8&cs=9', re.compile("as|bs"), re.compile("as|cs")) 46 | 'cs=9' 47 | """ 48 | keyvals = _parse_query_string(query) 49 | qargs = [] 50 | for k, v in keyvals: 51 | if remove_re is not None and remove_re.search(k): 52 | continue 53 | if keep_re is None or keep_re.search(k): 54 | qarg = quote(k, _safe_chars) 55 | if isinstance(v, string_types): 56 | qarg = qarg + '=' + quote(v, _safe_chars) 57 | qargs.append(qarg.replace("%20", "+")) 58 | return '&'.join(qargs) 59 | 60 | class QueryCleanerMiddleware(object): 61 | def __init__(self, settings): 62 | remove = settings.get("QUERYCLEANER_REMOVE") 63 | keep = settings.get("QUERYCLEANER_KEEP") 64 | if not (remove or keep): 65 | raise NotConfigured 66 | self.remove = re.compile(remove) if remove else None 67 | self.keep = re.compile(keep) if keep else None 68 | 69 | @classmethod 70 | def from_crawler(cls, crawler): 71 | return cls(crawler.settings) 72 | 73 | def process_spider_output(self, response, result, spider): 74 | for res in result: 75 | if isinstance(res, Request): 76 | parsed = urlparse_cached(res) 77 | if parsed.query: 78 | parsed = parsed._replace(query=_filter_query(parsed.query, self.remove, self.keep)) 79 | res = res.replace(url=parsed.geturl()) 80 | yield res 81 | 82 | -------------------------------------------------------------------------------- /scrapylib/spidertrace.py: -------------------------------------------------------------------------------- 1 | """ 2 | Spider Trace 3 | 4 | This SpiderMiddleware logs a trace of requests and items extracted for a 5 | spider 6 | """ 7 | import os 8 | from os.path import basename 9 | from tempfile import mkstemp 10 | from gzip import GzipFile 11 | import time 12 | import boto 13 | import json 14 | from boto.s3.key import Key 15 | from scrapy import signals, log 16 | from scrapy.exceptions import NotConfigured 17 | from scrapy.http import Request 18 | from scrapy.utils.request import request_fingerprint 19 | 20 | 21 | class SpiderTraceMiddleware(object): 22 | """Saves a trace of spider execution and uploads to S3 23 | 24 | The trace records: 25 | (timestamp, http response, results extracted from spider) 26 | """ 27 | REQUEST_ATTRS = ('url', 'method', 'body', 'headers', 'cookies', 'meta') 28 | RESPONSE_ATTRS = ('url', 'status', 'headers', 'body', 'request', 'flags') 29 | 30 | def __init__(self, crawler): 31 | self.bucket = crawler.settings.get("SPIDERTRACE_BUCKET") 32 | if not self.bucket: 33 | raise NotConfigured 34 | crawler.signals.connect(self.open_spider, signals.spider_opened) 35 | crawler.signals.connect(self.close_spider, signals.spider_closed) 36 | self.outputs = {} 37 | 38 | @classmethod 39 | def from_crawler(cls, crawler): 40 | return cls(crawler) 41 | 42 | def process_spider_output(self, response, result, spider): 43 | f = self.outputs[spider] 44 | fp = 
request_fingerprint(response.request) 45 | tracetime = time.time() 46 | data = self._objtodict(self.RESPONSE_ATTRS, response) 47 | data['request'] = self._objtodict(self.REQUEST_ATTRS, response.request) 48 | self._write(f, fp, tracetime, 'response', data) 49 | 50 | for item in result: 51 | if isinstance(item, Request): 52 | data = self._objtodict(self.REQUEST_ATTRS, item) 53 | data['fp'] = request_fingerprint(item) 54 | self._write(f, fp, tracetime, 'request', data) 55 | else: 56 | self._write(f, fp, tracetime, 'item', dict(item)) 57 | yield item 58 | 59 | @staticmethod 60 | def _write(f, fp, tracetime, otype, data): 61 | f.write('%s\t%s\t%s\t%s\n' % (tracetime, fp, otype, json.dumps(data))) 62 | 63 | @staticmethod 64 | def _objtodict(attrs, obj): 65 | data = [(a, getattr(obj, a)) for a in attrs] 66 | return dict(x for x in data if x[1]) 67 | 68 | def open_spider(self, spider): 69 | _, fname = mkstemp(prefix=spider.name + '-', suffix='.trace.gz') 70 | self.outputs[spider] = GzipFile(fname, 'wb') 71 | 72 | def close_spider(self, spider): 73 | f = self.outputs.pop(spider) 74 | f.close() 75 | c = boto.connect_s3() 76 | fname = basename(f.name) 77 | key = Key(c.get_bucket(self.bucket), fname) 78 | log.msg("uploading trace to s3://%s/%s" % (key.bucket.name, fname)) 79 | key.set_contents_from_filename(f.name) 80 | os.remove(f.name) 81 | -------------------------------------------------------------------------------- /tests/test_constraints.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import six 3 | 4 | from scrapylib.constraints import RequiredFields, NonEmptyFields, IsType, IsNumber, IsPrice, MaxLen, MinLen 5 | 6 | 7 | class RequiredFieldsTest(unittest.TestCase): 8 | 9 | def setUp(self): 10 | self.item = {'str': 'bar', 'list': ['one'], 'bool': False, 'none': None} 11 | 12 | def test_basic(self): 13 | RequiredFields('str')(self.item) 14 | RequiredFields('str', 'list', 'bool', 'none')(self.item) 15 | 16 | def test_fail(self): 17 | self.assertRaises(AssertionError, RequiredFields('list', 'xxx'), self.item) 18 | 19 | 20 | class NonEmptyFieldsTest(unittest.TestCase): 21 | 22 | def setUp(self): 23 | self.item = {'str': 'foo', 'list': [0], 'empty_str': '', 'empty_list': []} 24 | 25 | def test_basic(self): 26 | NonEmptyFields('str')(self.item) 27 | NonEmptyFields('str', 'list')(self.item) 28 | 29 | def test_fail(self): 30 | self.assertRaises(AssertionError, NonEmptyFields('list', 'xxx'), self.item) 31 | self.assertRaises(AssertionError, NonEmptyFields('empty_str'), self.item) 32 | self.assertRaises(AssertionError, NonEmptyFields('empty_list'), self.item) 33 | 34 | 35 | class IsTypeTest(unittest.TestCase): 36 | 37 | def setUp(self): 38 | self.item = {'str': 'bar', 'list': ['one']} 39 | 40 | def test_ok(self): 41 | IsType(six.string_types, 'str')(self.item) 42 | IsType(list, 'list')(self.item) 43 | IsType(list, 'missing')(self.item) 44 | 45 | def test_fail(self): 46 | for t in six.string_types: 47 | self.assertRaises(AssertionError, IsType(t, 'list'), self.item) 48 | self.assertRaises(AssertionError, IsType(list, 'str'), self.item) 49 | 50 | 51 | class IsNumberTest(unittest.TestCase): 52 | 53 | def setUp(self): 54 | self.item = {'name': 'foo', 'age': '23'} 55 | 56 | def test_ok(self): 57 | IsNumber('age')(self.item) 58 | IsNumber('xxx')(self.item) 59 | 60 | def test_fail(self): 61 | self.assertRaises(AssertionError, IsNumber('name'), self.item) 62 | 63 | 64 | class IsPriceTest(unittest.TestCase): 65 | 66 | def setUp(self): 67 | self.item = 
{'name': 'foo', 'price': '1,223.23 '} 68 | 69 | def test_basic(self): 70 | IsPrice('price')(self.item) 71 | IsPrice('xxx')(self.item) 72 | 73 | def test_fail(self): 74 | self.assertRaises(AssertionError, IsPrice('name'), self.item) 75 | 76 | 77 | class MaxLenTest(unittest.TestCase): 78 | 79 | def setUp(self): 80 | self.item = {'name': 'foo', 'other': 'very long content'} 81 | 82 | def test_ok(self): 83 | MaxLen(8, 'name')(self.item) 84 | MaxLen(8, 'xxx')(self.item) 85 | 86 | def test_fail(self): 87 | self.assertRaises(AssertionError, MaxLen(8, 'other'), self.item) 88 | 89 | 90 | class MinLenTest(MaxLenTest): 91 | 92 | def test_ok(self): 93 | MinLen(8, 'other')(self.item) 94 | MinLen(8, 'xxx')(self.item) 95 | 96 | def test_fail(self): 97 | self.assertRaises(AssertionError, MinLen(8, 'name'), self.item) 98 | -------------------------------------------------------------------------------- /tests/test_processors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import datetime 3 | import locale 4 | import unittest 5 | 6 | from scrapylib.processors import to_datetime, to_date, default_input_processor 7 | 8 | 9 | def locale_exists(): 10 | current_locale = locale.getlocale(locale.LC_TIME) 11 | try: 12 | locale.setlocale(locale.LC_TIME, 'fr_FR.UTF-8') 13 | except Exception: 14 | return False 15 | else: 16 | locale.setlocale(locale.LC_TIME, current_locale) 17 | return True 18 | 19 | 20 | class TestProcessors(unittest.TestCase): 21 | 22 | def test_to_datetime(self): 23 | self.assertEquals(to_datetime('March 4, 2011 20:00', '%B %d, %Y %H:%S'), 24 | datetime.datetime(2011, 3, 4, 20, 0)) 25 | 26 | # test no year in parse format 27 | test_date = to_datetime('March 4, 20:00', '%B %d, %H:%S') 28 | self.assertEquals(test_date.year, datetime.datetime.utcnow().year) 29 | 30 | # test parse only date 31 | self.assertEquals(to_datetime('March 4, 2011', '%B %d, %Y'), 32 | datetime.datetime(2011, 3, 4)) 33 | 34 | @unittest.skipUnless(locale_exists(), "locale does not exist") 35 | def test_localized_to_datetime(self): 36 | current_locale = locale.getlocale(locale.LC_TIME) 37 | 38 | self.assertEquals( 39 | to_datetime('11 janvier 2011', '%d %B %Y', locale='fr_FR.UTF-8'), 40 | datetime.datetime(2011, 1, 11) 41 | ) 42 | 43 | self.assertEquals(current_locale, locale.getlocale(locale.LC_TIME)) 44 | 45 | def test_to_date(self): 46 | self.assertEquals(to_date('March 4, 2011', '%B %d, %Y'), 47 | datetime.date(2011, 3, 4)) 48 | 49 | # test no year in parse format 50 | test_date = to_date('March 4', '%B %d') 51 | self.assertEquals(test_date.year, datetime.datetime.utcnow().year) 52 | 53 | @unittest.skipUnless(locale_exists(), "locale does not exist") 54 | def test_localized_to_date(self): 55 | current_locale = locale.getlocale(locale.LC_TIME) 56 | 57 | self.assertEquals( 58 | to_date('11 janvier 2011', '%d %B %Y', locale='fr_FR.UTF-8'), 59 | datetime.date(2011, 1, 11) 60 | ) 61 | 62 | self.assertEquals(current_locale, locale.getlocale(locale.LC_TIME)) 63 | 64 | def test_default_input_processor(self): 65 | self.assertEquals(default_input_processor( 66 | """up to 54%"""), 72 | [u'up to 54%']) 73 | 74 | self.assertEquals(default_input_processor( 75 | """

<< ...The Sunnywale, Calif.-based... >>

"""), 76 | [u'<< ...The Sunnywale, Calif.-based... >>']) 77 | 78 | self.assertEquals(default_input_processor( 79 | """newline
must be replaced before tags and only then quotes like <br>"""), 80 | [u'newline must be replaced before tags and only then quotes like
']) 81 | 82 | if __name__ == '__main__': 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /scrapylib/constraints/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Item Constrains 3 | --------------- 4 | 5 | This module provides several classes that can be used as conditions to check 6 | certain item constraints. Conditions are just callables that receive a dict and 7 | *may* raise an AssertionError if the condition is not met. 8 | 9 | Item constraints can be checked automatically (at scraping time) to drop items 10 | that fail to meet the constraints. In order to do that, add the constraints 11 | pipeline to your ITEM_PIPELINES: 12 | 13 | ITEM_PIPELINES = ['scrapylib.constraints.pipeline.ConstraintsPipeline'] 14 | 15 | And define the constraints attribute in your item: 16 | 17 | class Product(Item): 18 | name = Field() 19 | price = Field() 20 | colors = Field() 21 | 22 | constraints = [ 23 | RequiredFields('name', 'price'), 24 | IsPrice('price'), 25 | IsList('colors'), 26 | MinLen(10, 'name'), 27 | ] 28 | 29 | """ 30 | 31 | import re 32 | from functools import partial 33 | from six import string_types, text_type 34 | 35 | 36 | class RequiredFields(object): 37 | """Assert that the specified fields are populated""" 38 | 39 | def __init__(self, *fields): 40 | self.fields = fields 41 | 42 | def __call__(self, item): 43 | for f in self.fields: 44 | assert f in item.keys(), "missing field: %s" % f 45 | 46 | class NonEmptyFields(object): 47 | """Assert that the specified fields are populated and non-empty""" 48 | 49 | def __init__(self, *fields): 50 | self.fields = fields 51 | 52 | def __call__(self, item): 53 | for f in self.fields: 54 | assert f in item.keys(), "missing field: %s" % f 55 | v = item[f] 56 | try: 57 | assert len(v) > 0, "empty field: %s" % f 58 | except TypeError: 59 | pass 60 | 61 | class IsType(object): 62 | """Assert that the specified fields are of the given type""" 63 | 64 | def __init__(self, type, *fields): 65 | self.type = type 66 | self.fields = fields 67 | 68 | def __call__(self, item): 69 | for f in self.fields: 70 | if f in item: 71 | v = item.get(f) 72 | assert isinstance(v, self.type), "field %r is not a %s: %r" % \ 73 | (f, self.type.__name__, v) 74 | 75 | IsString = partial(IsType, string_types) 76 | IsUnicode = partial(IsType, text_type) 77 | IsList = partial(IsType, list) 78 | IsDict = partial(IsType, dict) 79 | 80 | class IsNumber(object): 81 | """Assert that the specified fields are string and contain only numbers""" 82 | 83 | def __init__(self, *fields): 84 | self.fields = fields 85 | 86 | def __call__(self, item): 87 | for f in self.fields: 88 | v = item.get(f) 89 | if v is None: 90 | continue 91 | assert isinstance(v, string_types), "field %r is not a string: %r" % (f, v) 92 | assert v.strip().isdigit(), "field %r contains non-numeric chars: %r" % (f, v) 93 | 94 | class IsPrice(object): 95 | """Assert that the specified fields are string and look like a price""" 96 | 97 | def __init__(self, *fields): 98 | self.fields = fields 99 | self.price_re = re.compile('^[0-9\., ]+$') 100 | 101 | def __call__(self, item): 102 | for f in self.fields: 103 | v = item.get(f) 104 | if v: 105 | assert isinstance(v, string_types), "field %r is not a string: %r" % (f, v) 106 | assert self.price_re.search(v), "field %r is not a price: %r" % (f, v) 107 | 108 | class MaxLen(object): 109 | """Assert that the length of specified fields do not exceed the given 110 | 
size""" 111 | 112 | def __init__(self, size, *fields): 113 | self.size = size 114 | self.fields = fields 115 | 116 | def __call__(self, item): 117 | for f in self.fields: 118 | v = item.get(f) 119 | if v: 120 | self._proper_len(f, v) 121 | 122 | def _proper_len(self, f, v): 123 | assert len(v) <= self.size, "field %r length exceeds %d: %r" % (f, self.size, v) 124 | 125 | class MinLen(MaxLen): 126 | """Assert that the length of specified fields are larger (or equal) than 127 | the given size""" 128 | 129 | def _proper_len(self, f, v): 130 | assert len(v) >= self.size, "field %r length below %d: %r" % (f, self.size, v) 131 | -------------------------------------------------------------------------------- /scrapylib/deltafetch.py: -------------------------------------------------------------------------------- 1 | import os, time 2 | 3 | from scrapy.http import Request 4 | from scrapy.item import BaseItem 5 | from scrapy.utils.request import request_fingerprint 6 | from scrapy.utils.project import data_path 7 | from scrapy.utils.python import to_bytes 8 | from scrapy.exceptions import NotConfigured 9 | from scrapy import log, signals 10 | 11 | 12 | class DeltaFetch(object): 13 | """This is a spider middleware to ignore requests to pages containing items 14 | seen in previous crawls of the same spider, thus producing a "delta crawl" 15 | containing only new items. 16 | 17 | This also speeds up the crawl, by reducing the number of requests that need 18 | to be crawled, and processed (typically, item requests are the most cpu 19 | intensive). 20 | 21 | Supported settings: 22 | 23 | * DELTAFETCH_ENABLED - to enable (or disable) this extension 24 | * DELTAFETCH_DIR - directory where to store state 25 | * DELTAFETCH_RESET - reset the state, clearing out all seen requests 26 | 27 | Supported spider arguments: 28 | 29 | * deltafetch_reset - same effect as DELTAFETCH_RESET setting 30 | 31 | Supported request meta keys: 32 | 33 | * deltafetch_key - used to define the lookup key for that request. by 34 | default it's the fingerprint, but it can be changed to contain an item 35 | id, for example. This requires support from the spider, but makes the 36 | extension more efficient for sites that many URLs for the same item. 
37 | 38 | """ 39 | 40 | def __init__(self, dir, reset=False, stats=None): 41 | dbmodule = None 42 | try: 43 | dbmodule = __import__('bsddb3').db 44 | except ImportError: 45 | try: 46 | dbmodule = __import__('bsddb').db 47 | except ImportError: 48 | pass 49 | if not dbmodule: 50 | raise NotConfigured('bssdb or bsddb3 is required') 51 | self.dbmodule = dbmodule 52 | self.dir = dir 53 | self.reset = reset 54 | self.stats = stats 55 | 56 | @classmethod 57 | def from_crawler(cls, crawler): 58 | s = crawler.settings 59 | if not s.getbool('DELTAFETCH_ENABLED'): 60 | raise NotConfigured 61 | dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch')) 62 | reset = s.getbool('DELTAFETCH_RESET') 63 | o = cls(dir, reset, crawler.stats) 64 | crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) 65 | crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) 66 | return o 67 | 68 | def spider_opened(self, spider): 69 | if not os.path.exists(self.dir): 70 | os.makedirs(self.dir) 71 | dbpath = os.path.join(self.dir, '%s.db' % spider.name) 72 | reset = self.reset or getattr(spider, 'deltafetch_reset', False) 73 | flag = self.dbmodule.DB_TRUNCATE if reset else self.dbmodule.DB_CREATE 74 | try: 75 | self.db = self.dbmodule.DB() 76 | self.db.open(filename=dbpath, 77 | dbtype=self.dbmodule.DB_HASH, 78 | flags=flag) 79 | except Exception: 80 | spider.log("Failed to open DeltaFetch database at %s, " 81 | "trying to recreate it" % dbpath) 82 | if os.path.exists(dbpath): 83 | os.remove(dbpath) 84 | self.db = self.dbmodule.DB() 85 | self.db.open(filename=dbpath, 86 | dbtype=self.dbmodule.DB_HASH, 87 | flags=self.dbmodule.DB_CREATE) 88 | 89 | def spider_closed(self, spider): 90 | self.db.close() 91 | 92 | def process_spider_output(self, response, result, spider): 93 | for r in result: 94 | if isinstance(r, Request): 95 | key = self._get_key(r) 96 | if self.db.has_key(key): 97 | spider.log("Ignoring already visited: %s" % r, level=log.INFO) 98 | if self.stats: 99 | self.stats.inc_value('deltafetch/skipped', spider=spider) 100 | continue 101 | elif isinstance(r, BaseItem): 102 | key = self._get_key(response.request) 103 | self.db[key] = str(time.time()).encode('iso8859-1') 104 | if self.stats: 105 | self.stats.inc_value('deltafetch/stored', spider=spider) 106 | yield r 107 | 108 | def _get_key(self, request): 109 | key = request.meta.get('deltafetch_key') or request_fingerprint(request) 110 | # request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string 111 | return to_bytes(key) 112 | -------------------------------------------------------------------------------- /tests/test_magicfields.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import re, os 3 | from unittest import TestCase 4 | 5 | from scrapy.spiders import Spider 6 | from scrapy.utils.test import get_crawler 7 | from scrapy.item import DictItem, Field 8 | from scrapy.http import HtmlResponse 9 | 10 | from scrapylib.magicfields import _format, MagicFieldsMiddleware 11 | 12 | 13 | class TestItem(DictItem): 14 | fields = { 15 | 'url': Field(), 16 | 'nom': Field(), 17 | 'prix': Field(), 18 | 'spider': Field(), 19 | 'sku': Field(), 20 | } 21 | 22 | 23 | class MagicFieldsTest(TestCase): 24 | 25 | def setUp(self): 26 | self.environ = os.environ.copy() 27 | self.spider = Spider('myspider', arg1='val1', start_urls = ["http://example.com"]) 28 | 29 | def _log(x): 30 | print(x) 31 | 32 | self.spider.log = _log 33 | self.response = 
HtmlResponse(body=b"", url="http://www.example.com/product/8798732") 34 | self.item = TestItem({'nom': 'myitem', 'prix': "56.70 euros", "url": "http://www.example.com/product.html?item_no=345"}) 35 | 36 | def tearDown(self): 37 | os.environ = self.environ 38 | 39 | def assertRegexpMatches(self, text, regexp): 40 | """not present in python below 2.7""" 41 | return self.assertNotEqual(re.match(regexp, text), None) 42 | 43 | def test_hello(self): 44 | self.assertEqual(_format("hello world!", self.spider, self.response, self.item, {}), 'hello world!') 45 | 46 | def test_spidername_time(self): 47 | formatted = _format("Spider: $spider:name. Item scraped at $time", self.spider, self.response, self.item, {}) 48 | self.assertRegexpMatches(formatted, 'Spider: myspider. Item scraped at \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$') 49 | 50 | def test_unixtime(self): 51 | formatted = _format("Item scraped at $unixtime", self.spider, self.response, self.item, {}) 52 | self.assertRegexpMatches(formatted, 'Item scraped at \d+\.\d+$') 53 | 54 | def test_isotime(self): 55 | formatted = _format("$isotime", self.spider, self.response, self.item, {}) 56 | self.assertRegexpMatches(formatted, '\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}$') 57 | 58 | def test_jobid(self): 59 | os.environ["SCRAPY_JOB"] = 'aa788' 60 | formatted = _format("job id '$jobid' for spider $spider:name", self.spider, self.response, self.item, {}) 61 | self.assertEqual(formatted, "job id 'aa788' for spider myspider") 62 | 63 | def test_spiderarg(self): 64 | formatted = _format("Argument arg1: $spider:arg1", self.spider, self.response, self.item, {}) 65 | self.assertEqual(formatted, 'Argument arg1: val1') 66 | 67 | def test_spiderattr(self): 68 | formatted = _format("$spider:start_urls", self.spider, self.response, self.item, {}) 69 | self.assertEqual(formatted, "['http://example.com']") 70 | 71 | def test_settings(self): 72 | formatted = _format("$setting:MY_SETTING", self.spider, self.response, self.item, {"$setting": {"MY_SETTING": True}}) 73 | self.assertEqual(formatted, 'True') 74 | 75 | def test_notexisting(self): 76 | """Not existing entities are not substituted""" 77 | formatted = _format("Item scraped at $myentity", self.spider, self.response, self.item, {}) 78 | self.assertEqual(formatted, 'Item scraped at $myentity') 79 | 80 | def test_noargs(self): 81 | """If entity does not accept arguments, don't substitute""" 82 | formatted = _format("Scraped on day $unixtime:arg", self.spider, self.response, self.item, {}) 83 | self.assertEqual(formatted, "Scraped on day $unixtime:arg") 84 | 85 | def test_noargs2(self): 86 | """If entity does not have enough arguments, don't substitute""" 87 | formatted = _format("$spider", self.spider, self.response, self.item, {}) 88 | self.assertEqual(formatted, "$spider") 89 | 90 | def test_invalidattr(self): 91 | formatted = _format("Argument arg2: $spider:arg2", self.spider, self.response, self.item, {}) 92 | self.assertEqual(formatted, "Argument arg2: $spider:arg2") 93 | 94 | def test_environment(self): 95 | os.environ["TEST_ENV"] = "testval" 96 | formatted = _format("$env:TEST_ENV", self.spider, self.response, self.item, {}) 97 | self.assertEqual(formatted, "testval") 98 | 99 | def test_response(self): 100 | formatted = _format("$response:url", self.spider, self.response, self.item, {}) 101 | self.assertEqual(formatted, self.response.url) 102 | 103 | def test_fields_copy(self): 104 | formatted = _format("$field:nom", self.spider, self.response, self.item, {}) 105 | self.assertEqual(formatted, 'myitem') 106 | 
107 | def test_regex(self): 108 | formatted = _format("$field:url,r'item_no=(\d+)'", self.spider, self.response, self.item, {}) 109 | self.assertEqual(formatted, '345') 110 | 111 | def test_mware(self): 112 | settings = {"MAGIC_FIELDS": {"spider": "$spider:name"}} 113 | crawler = get_crawler(settings_dict=settings) 114 | mware = MagicFieldsMiddleware.from_crawler(crawler) 115 | result = list(mware.process_spider_output(self.response, [self.item], self.spider))[0] 116 | expected = { 117 | 'nom': 'myitem', 118 | 'prix': '56.70 euros', 119 | 'spider': 'myspider', 120 | 'url': 'http://www.example.com/product.html?item_no=345' 121 | } 122 | self.assertEqual(result, expected) 123 | 124 | def test_mware_override(self): 125 | settings = { 126 | "MAGIC_FIELDS": {"spider": "$spider:name"}, 127 | "MAGIC_FIELDS_OVERRIDE": {"sku": "$field:nom"} 128 | } 129 | crawler = get_crawler(settings_dict=settings) 130 | mware = MagicFieldsMiddleware.from_crawler(crawler) 131 | result = list(mware.process_spider_output(self.response, [self.item], self.spider))[0] 132 | expected = { 133 | 'nom': 'myitem', 134 | 'prix': '56.70 euros', 135 | 'spider': 'myspider', 136 | 'url': 'http://www.example.com/product.html?item_no=345', 137 | 'sku': 'myitem', 138 | } 139 | self.assertEqual(result, expected) 140 | -------------------------------------------------------------------------------- /scrapylib/magicfields.py: -------------------------------------------------------------------------------- 1 | """ 2 | Allow to add extra fields to items, based on the configuration setting MAGIC_FIELDS and MAGIC_FIELDS_OVERRIDE. 3 | Both settings are a dict. The keys are the destination field names, their values, a string which admits magic variables, 4 | identified by a starting '$', which will be substituted by a corresponding value. Some magic also accept arguments, and are specified 5 | after the magic name, using a ':' as separator. 6 | 7 | You can set project global magics with MAGIC_FIELDS, and tune them for a specific spider using MAGIC_FIELDS_OVERRIDE. 8 | 9 | In case there is more than one argument, they must come separated by ','. So, the generic magic format is 10 | 11 | $[:arg1,arg2,...] 12 | 13 | Current magic variables are: 14 | - $time 15 | The UTC timestamp at which the item was scraped, in format '%Y-%m-%d %H:%M:%S'. 16 | - $unixtime 17 | The unixtime (number of seconds since the Epoch, i.e. time.time()) at which the item was scraped. 18 | - $isotime 19 | The UTC timestamp at which the item was scraped, with format '%Y-%m-%dT%H:%M:%S". 20 | - $spider 21 | Must be followed by an argument, which is the name of an attribute of the spider (like an argument passed to it). 22 | - $env 23 | The value of an environment variable. It admits as argument the name of the variable. 24 | - $jobid 25 | The job id (shortcut for $env:SCRAPY_JOB) 26 | - $jobtime 27 | The UTC timestamp at which the job started, in format '%Y-%m-%d %H:%M:%S'. 28 | - $response 29 | Access to some response properties. 30 | $response:url 31 | The url from where the item was extracted from. 32 | $response:status 33 | Response http status. 34 | $response:headers 35 | Response http headers. 36 | - $setting 37 | Access the given Scrapy setting. It accepts one argument: the name of the setting. 38 | - $field 39 | Allows to copy the value of one field to another. Its argument is the source field. Effects are unpredicable if you use as source a field that is filled 40 | using magic fields. 
41 | 42 | Examples: 43 | 44 | The following configuration will add two fields to each scraped item: 'timestamp', which will be filled with the string 'item scraped at ', 45 | and 'spider', which will contain the spider name: 46 | 47 | MAGIC_FIELDS = {"timestamp": "item scraped at $time", "spider": "$spider:name"} 48 | 49 | The following configuration will copy the url to the field sku: 50 | 51 | MAGIC_FIELDS = {"sku": "$field:url"} 52 | 53 | Magics admits also regular expression argument which allow to extract and assign only part of the value generated by the magic. You have to specify 54 | it using the r'' notation. Suppose that the urls of your items are like 'http://www.example.com/product.html?item_no=345' and you want to assign to the sku field 55 | only the item number. The following example, similar to the previous one but with a second regular expression argument, will do the task: 56 | 57 | MAGIC_FIELDS = {"sku": "$field:url,r'item_no=(\d+)'"} 58 | 59 | """ 60 | 61 | import re, time, datetime, os 62 | 63 | from scrapy.exceptions import NotConfigured 64 | from scrapy.item import BaseItem 65 | 66 | def _time(): 67 | return datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') 68 | 69 | def _isotime(): 70 | return datetime.datetime.utcnow().isoformat() 71 | 72 | _REGEXES = {} 73 | _REGEX_ERRORS = {} 74 | def _extract_regex_group(regex, txt): 75 | compiled = _REGEXES.get(regex) 76 | errmessage = _REGEX_ERRORS.get(regex) 77 | if compiled is None and errmessage is None: 78 | try: 79 | compiled = re.compile(regex) 80 | _REGEXES[regex] = compiled 81 | except Exception as e: 82 | errmessage = e.message 83 | _REGEX_ERRORS[regex] = errmessage 84 | if errmessage: 85 | raise ValueError(errmessage) 86 | m = compiled.search(txt) 87 | if m: 88 | return "".join(m.groups()) or None 89 | 90 | _ENTITY_FUNCTION_MAP = { 91 | '$time': _time, 92 | '$unixtime': time.time, 93 | '$isotime': _isotime, 94 | } 95 | 96 | _ENTITIES_RE = re.compile("(\$[a-z]+)(:\w+)?(?:,r\'(.+)\')?") 97 | def _first_arg(args): 98 | if args: 99 | return args.pop(0) 100 | 101 | def _format(fmt, spider, response, item, fixed_values): 102 | out = fmt 103 | for m in _ENTITIES_RE.finditer(fmt): 104 | val = None 105 | entity, args, regex = m.groups() 106 | args = list(filter(None, (args or ':')[1:].split(','))) 107 | if entity == "$jobid": 108 | val = os.environ.get('SCRAPY_JOB', '') 109 | elif entity == "$spider": 110 | attr = _first_arg(args) 111 | if not attr or not hasattr(spider, attr): 112 | spider.log("Error at '%s': spider does not have attribute" % m.group()) 113 | else: 114 | val = str(getattr(spider, attr)) 115 | elif entity == "$response": 116 | attr = _first_arg(args) 117 | if not attr or not hasattr(response, attr): 118 | spider.log("Error at '%s': response does not have attribute" % m.group()) 119 | else: 120 | val = str(getattr(response, attr)) 121 | elif entity == "$field": 122 | attr = _first_arg(args) 123 | if attr in item: 124 | val = str(item[attr]) 125 | elif entity in fixed_values: 126 | attr = _first_arg(args) 127 | val = fixed_values[entity] 128 | if entity == "$setting" and attr: 129 | val = str(val[attr]) 130 | elif entity == "$env" and args: 131 | attr = _first_arg(args) 132 | if attr: 133 | val = os.environ.get(attr, '') 134 | else: 135 | function = _ENTITY_FUNCTION_MAP.get(entity) 136 | if function is not None: 137 | try: 138 | val = str(function(*args)) 139 | except: 140 | spider.log("Error at '%s': invalid argument for function" % m.group()) 141 | if val is not None: 142 | out = out.replace(m.group(), 
val, 1) 143 | if regex: 144 | try: 145 | out = _extract_regex_group(regex, out) 146 | except ValueError as e: 147 | spider.log("Error at '%s': %s" % (m.group(), e.message)) 148 | 149 | return out 150 | 151 | class MagicFieldsMiddleware(object): 152 | 153 | @classmethod 154 | def from_crawler(cls, crawler): 155 | mfields = crawler.settings.getdict("MAGIC_FIELDS").copy() 156 | mfields.update(crawler.settings.getdict("MAGIC_FIELDS_OVERRIDE")) 157 | if not mfields: 158 | raise NotConfigured 159 | return cls(mfields, crawler.settings) 160 | 161 | def __init__(self, mfields, settings): 162 | self.mfields = mfields 163 | self.fixed_values = { 164 | "$jobtime": _time(), 165 | "$setting": settings, 166 | } 167 | 168 | def process_spider_output(self, response, result, spider): 169 | for _res in result: 170 | if isinstance(_res, BaseItem): 171 | for field, fmt in self.mfields.items(): 172 | _res.setdefault(field, _format(fmt, spider, response, _res, self.fixed_values)) 173 | yield _res 174 | 175 | -------------------------------------------------------------------------------- /scrapylib/crawlera.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import warnings 3 | import os 4 | 5 | from w3lib.http import basic_auth_header 6 | from scrapy import log, signals 7 | from scrapy.exceptions import ScrapyDeprecationWarning 8 | from twisted.internet.error import ConnectionRefusedError 9 | 10 | 11 | class CrawleraMiddleware(object): 12 | 13 | url = 'http://paygo.crawlera.com:8010' 14 | maxbans = 400 15 | ban_code = 503 16 | download_timeout = 1800 17 | # Handle crawlera server failures 18 | connection_refused_delay = 90 19 | preserve_delay = False 20 | 21 | _settings = [ 22 | ('user', str), 23 | ('pass', str), 24 | ('url', str), 25 | ('maxbans', int), 26 | ('download_timeout', int), 27 | ('preserve_delay', bool), 28 | ] 29 | 30 | def __init__(self, crawler): 31 | warnings.warn( 32 | 'This version of CrawleraMiddleware is deprecated, ' 33 | 'please use the version found in the scrapy-crawlera ' 34 | 'package instead.') 35 | self.crawler = crawler 36 | self.job_id = os.environ.get('SCRAPY_JOB') 37 | self._bans = defaultdict(int) 38 | self._saved_delays = defaultdict(lambda: None) 39 | 40 | @classmethod 41 | def from_crawler(cls, crawler): 42 | o = cls(crawler) 43 | crawler.signals.connect(o.open_spider, signals.spider_opened) 44 | return o 45 | 46 | def open_spider(self, spider): 47 | self.enabled = self.is_enabled(spider) 48 | if not self.enabled: 49 | return 50 | 51 | for k, type_ in self._settings: 52 | setattr(self, k, self._get_setting_value(spider, k, type_)) 53 | if '?noconnect' not in self.url: 54 | self.url += '?noconnect' 55 | 56 | self._proxyauth = self.get_proxyauth(spider) 57 | log.msg("Using crawlera at %s (user: %s)" % (self.url, self.user), 58 | spider=spider) 59 | 60 | if not self.preserve_delay: 61 | # Setting spider download delay to 0 to get maximum crawl rate 62 | spider.download_delay = 0 63 | log.msg("Setting spider download delay to 0. 
It's default " 64 | "CrawleraMiddleware behavior, to preserve original delay" 65 | " set CRAWLERA_PRESERVE_DELAY = True in settings.", 66 | spider=spider) 67 | 68 | def _settings_get(self, type_, *a, **kw): 69 | if type_ is int: 70 | return self.crawler.settings.getint(*a, **kw) 71 | elif type_ is bool: 72 | return self.crawler.settings.getbool(*a, **kw) 73 | elif type_ is list: 74 | return self.crawler.settings.getlist(*a, **kw) 75 | elif type_ is dict: 76 | return self.crawler.settings.getdict(*a, **kw) 77 | else: 78 | return self.crawler.settings.get(*a, **kw) 79 | 80 | def _get_setting_value(self, spider, k, type_): 81 | if hasattr(spider, 'hubproxy_' + k): 82 | warnings.warn('hubproxy_%s attribute is deprecated, ' 83 | 'use crawlera_%s instead.' % (k, k), 84 | category=ScrapyDeprecationWarning, stacklevel=1) 85 | 86 | if self.crawler.settings.get('HUBPROXY_%s' % k.upper()) is not None: 87 | warnings.warn('HUBPROXY_%s setting is deprecated, ' 88 | 'use CRAWLERA_%s instead.' % (k.upper(), k.upper()), 89 | category=ScrapyDeprecationWarning, stacklevel=1) 90 | 91 | o = getattr(self, k, None) 92 | s = self._settings_get(type_, 'CRAWLERA_' + k.upper(), 93 | self._settings_get(type_, 'HUBPROXY_' + k.upper(), o)) 94 | return getattr(spider, 'crawlera_' + k, 95 | getattr(spider, 'hubproxy_' + k, s)) 96 | 97 | def is_enabled(self, spider): 98 | """Hook to enable middleware by custom rules.""" 99 | if hasattr(spider, 'use_hubproxy'): 100 | warnings.warn('use_hubproxy attribute is deprecated, ' 101 | 'use crawlera_enabled instead.', 102 | category=ScrapyDeprecationWarning, stacklevel=1) 103 | 104 | if self.crawler.settings.get('HUBPROXY_ENABLED') is not None: 105 | warnings.warn('HUBPROXY_ENABLED setting is deprecated, ' 106 | 'use CRAWLERA_ENABLED instead.', 107 | category=ScrapyDeprecationWarning, stacklevel=1) 108 | return ( 109 | getattr(spider, 'crawlera_enabled', False) or 110 | getattr(spider, 'use_hubproxy', False) or 111 | self.crawler.settings.getbool("CRAWLERA_ENABLED") or 112 | self.crawler.settings.getbool("HUBPROXY_ENABLED") 113 | ) 114 | 115 | def get_proxyauth(self, spider): 116 | """Hook to compute Proxy-Authorization header by custom rules.""" 117 | return basic_auth_header(self.user, getattr(self, 'pass')) 118 | 119 | def process_request(self, request, spider): 120 | if self._is_enabled_for_request(request): 121 | request.meta['proxy'] = self.url 122 | request.meta['download_timeout'] = self.download_timeout 123 | request.headers['Proxy-Authorization'] = self._proxyauth 124 | if self.job_id: 125 | request.headers['X-Crawlera-Jobid'] = self.job_id 126 | 127 | def process_response(self, request, response, spider): 128 | if not self._is_enabled_for_request(request): 129 | return response 130 | key = self._get_slot_key(request) 131 | self._restore_original_delay(request) 132 | if response.status == self.ban_code: 133 | self._bans[key] += 1 134 | if self._bans[key] > self.maxbans: 135 | self.crawler.engine.close_spider(spider, 'banned') 136 | else: 137 | after = response.headers.get('retry-after') 138 | if after: 139 | self._set_custom_delay(request, float(after)) 140 | else: 141 | self._bans[key] = 0 142 | return response 143 | 144 | def process_exception(self, request, exception, spider): 145 | if not self._is_enabled_for_request(request): 146 | return 147 | if isinstance(exception, ConnectionRefusedError): 148 | # Handle crawlera downtime 149 | self._set_custom_delay(request, self.connection_refused_delay) 150 | 151 | def _is_enabled_for_request(self, request): 152 | return 
self.enabled and 'dont_proxy' not in request.meta 153 | 154 | def _get_slot_key(self, request): 155 | return request.meta.get('download_slot') 156 | 157 | def _get_slot(self, request): 158 | key = self._get_slot_key(request) 159 | return key, self.crawler.engine.downloader.slots.get(key) 160 | 161 | def _set_custom_delay(self, request, delay): 162 | """Set custom delay for slot and save original one.""" 163 | key, slot = self._get_slot(request) 164 | if not slot: 165 | return 166 | if self._saved_delays[key] is None: 167 | self._saved_delays[key] = slot.delay 168 | slot.delay = delay 169 | 170 | def _restore_original_delay(self, request): 171 | """Restore original delay for slot if it was changed.""" 172 | key, slot = self._get_slot(request) 173 | if not slot: 174 | return 175 | if self._saved_delays[key] is not None: 176 | slot.delay, self._saved_delays[key] = self._saved_delays[key], None 177 | -------------------------------------------------------------------------------- /tests/test_hubproxy.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from six.moves import xrange 3 | 4 | from w3lib.http import basic_auth_header 5 | from scrapy.http import Request, Response 6 | from scrapy.spiders import Spider 7 | from scrapy.utils.test import get_crawler 8 | from scrapylib.hubproxy import HubProxyMiddleware 9 | 10 | 11 | class HubProxyMiddlewareTestCase(TestCase): 12 | 13 | mwcls = HubProxyMiddleware 14 | 15 | def setUp(self): 16 | self.spider = Spider('foo') 17 | self.settings = {'HUBPROXY_USER': 'user', 'HUBPROXY_PASS': 'pass'} 18 | 19 | def _mock_crawler(self, settings=None): 20 | class MockedDownloader(object): 21 | slots = {} 22 | 23 | class MockedEngine(object): 24 | downloader = MockedDownloader() 25 | fake_spider_closed_result = None 26 | def close_spider(self, spider, reason): 27 | self.fake_spider_closed_result = (spider, reason) 28 | 29 | crawler = get_crawler(settings_dict=settings) 30 | crawler.engine = MockedEngine() 31 | return crawler 32 | 33 | def _assert_disabled(self, spider, settings=None): 34 | crawler = self._mock_crawler(settings) 35 | mw = self.mwcls.from_crawler(crawler) 36 | mw.open_spider(spider) 37 | req = Request('http://www.scrapytest.org') 38 | out = mw.process_request(req, spider) 39 | self.assertEqual(out, None) 40 | self.assertEqual(req.meta.get('proxy'), None) 41 | self.assertEqual(req.meta.get('download_timeout'), None) 42 | self.assertEqual(req.headers.get('Proxy-Authorization'), None) 43 | res = Response(req.url) 44 | assert mw.process_response(req, res, spider) is res 45 | res = Response(req.url, status=mw.ban_code) 46 | assert mw.process_response(req, res, spider) is res 47 | 48 | def _assert_enabled(self, spider, 49 | settings=None, 50 | proxyurl='http://paygo.crawlera.com:8010?noconnect', 51 | proxyauth=basic_auth_header('user', 'pass'), 52 | bancode=503, 53 | maxbans=400, 54 | download_timeout=1800, 55 | ): 56 | crawler = self._mock_crawler(settings) 57 | mw = self.mwcls.from_crawler(crawler) 58 | mw.open_spider(spider) 59 | req = Request('http://www.scrapytest.org') 60 | assert mw.process_request(req, spider) is None 61 | self.assertEqual(req.meta.get('proxy'), proxyurl) 62 | self.assertEqual(req.meta.get('download_timeout'), download_timeout) 63 | self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth) 64 | res = Response(req.url) 65 | assert mw.process_response(req, res, spider) is res 66 | 67 | # disabled if 'dont_proxy' is set 68 | req = 
Request('http://www.scrapytest.org') 69 | req.meta['dont_proxy'] = True 70 | assert mw.process_request(req, spider) is None 71 | self.assertEqual(req.meta.get('proxy'), None) 72 | self.assertEqual(req.meta.get('download_timeout'), None) 73 | self.assertEqual(req.headers.get('Proxy-Authorization'), None) 74 | res = Response(req.url) 75 | assert mw.process_response(req, res, spider) is res 76 | del req.meta['dont_proxy'] 77 | 78 | if maxbans > 0: 79 | # assert ban count is reseted after a succesful response 80 | res = Response('http://ban.me', status=bancode) 81 | assert mw.process_response(req, res, spider) is res 82 | self.assertEqual(crawler.engine.fake_spider_closed_result, None) 83 | res = Response('http://unban.me') 84 | assert mw.process_response(req, res, spider) is res 85 | self.assertEqual(crawler.engine.fake_spider_closed_result, None) 86 | self.assertEqual(mw._bans[None], 0) 87 | 88 | # check for not banning before maxbans for bancode 89 | for x in xrange(maxbans + 1): 90 | self.assertEqual(crawler.engine.fake_spider_closed_result, None) 91 | res = Response('http://ban.me/%d' % x, status=bancode) 92 | assert mw.process_response(req, res, spider) is res 93 | 94 | # max bans reached and close_spider called 95 | self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, 'banned')) 96 | 97 | def test_disabled_by_lack_of_hubproxy_settings(self): 98 | self._assert_disabled(self.spider, settings={}) 99 | 100 | def test_spider_use_hubproxy(self): 101 | self.assertFalse(hasattr(self.spider, 'use_hubproxy')) 102 | self._assert_disabled(self.spider, self.settings) 103 | self.spider.use_hubproxy = True 104 | self._assert_enabled(self.spider, self.settings) 105 | self.spider.use_hubproxy = False 106 | self._assert_disabled(self.spider, self.settings) 107 | 108 | def test_enabled(self): 109 | self._assert_disabled(self.spider, self.settings) 110 | self.settings['HUBPROXY_ENABLED'] = True 111 | self._assert_enabled(self.spider, self.settings) 112 | 113 | def test_userpass(self): 114 | self.spider.use_hubproxy = True 115 | self.settings['HUBPROXY_USER'] = user = 'other' 116 | self.settings['HUBPROXY_PASS'] = pass_ = 'secret' 117 | proxyauth = basic_auth_header(user, pass_) 118 | self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth) 119 | 120 | self.spider.hubproxy_user = user = 'notfromsettings' 121 | self.spider.hubproxy_pass = pass_ = 'anothersecret' 122 | proxyauth = basic_auth_header(user, pass_) 123 | self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth) 124 | 125 | def test_proxyurl(self): 126 | self.spider.use_hubproxy = True 127 | self.settings['HUBPROXY_URL'] = 'http://localhost:8010' 128 | self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8010?noconnect') 129 | 130 | def test_maxbans(self): 131 | self.spider.use_hubproxy = True 132 | self.settings['HUBPROXY_MAXBANS'] = maxbans = 0 133 | self._assert_enabled(self.spider, self.settings, maxbans=maxbans) 134 | self.settings['HUBPROXY_MAXBANS'] = maxbans = 100 135 | self._assert_enabled(self.spider, self.settings, maxbans=maxbans) 136 | 137 | def test_download_timeout(self): 138 | self.spider.use_hubproxy = True 139 | self.settings['HUBPROXY_DOWNLOAD_TIMEOUT'] = 60 140 | self._assert_enabled(self.spider, self.settings, download_timeout=60) 141 | self.spider.hubproxy_download_timeout = 120 142 | self._assert_enabled(self.spider, self.settings, download_timeout=120) 143 | 144 | def test_hooks(self): 145 | class _ECLS(self.mwcls): 146 | def is_enabled(self, spider): 147 | 
wascalled.append('is_enabled') 148 | return enabled 149 | def get_proxyauth(self, spider): 150 | wascalled.append('get_proxyauth') 151 | return proxyauth 152 | 153 | wascalled = [] 154 | self.mwcls = _ECLS 155 | 156 | # test is_enabled returns False 157 | enabled = False 158 | self.spider.use_hubproxy = True 159 | self._assert_disabled(self.spider, self.settings) 160 | self.assertEqual(wascalled, ['is_enabled']) 161 | 162 | wascalled[:] = [] # reset 163 | enabled = True 164 | self.spider.use_hubproxy = False 165 | proxyauth = b'Basic Foo' 166 | self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth) 167 | self.assertEqual(wascalled, ['is_enabled', 'get_proxyauth']) 168 | -------------------------------------------------------------------------------- /tests/test_hcf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hashlib 3 | import unittest 4 | 5 | from scrapy.http import Request, Response 6 | from scrapy.spiders import Spider 7 | from scrapy.utils.test import get_crawler 8 | from scrapylib.hcf import HcfMiddleware 9 | from scrapy.exceptions import NotConfigured 10 | from hubstorage import HubstorageClient 11 | 12 | HS_ENDPOINT = os.getenv('HS_ENDPOINT', 'http://localhost:8003') 13 | HS_AUTH = os.getenv('HS_AUTH') 14 | 15 | 16 | @unittest.skipUnless(HS_AUTH, 'No valid hubstorage credentials set') 17 | class HcfTestCase(unittest.TestCase): 18 | 19 | hcf_cls = HcfMiddleware 20 | 21 | projectid = '2222222' 22 | spidername = 'hs-test-spider' 23 | frontier = 'test' 24 | slot = '0' 25 | number_of_slots = 1 26 | 27 | @classmethod 28 | def setUpClass(cls): 29 | cls.endpoint = HS_ENDPOINT 30 | cls.auth = HS_AUTH 31 | cls.hsclient = HubstorageClient(auth=cls.auth, endpoint=cls.endpoint) 32 | cls.project = cls.hsclient.get_project(cls.projectid) 33 | cls.fclient = cls.project.frontier 34 | 35 | @classmethod 36 | def tearDownClass(cls): 37 | cls.project.frontier.close() 38 | cls.hsclient.close() 39 | 40 | def setUp(self): 41 | class TestSpider(Spider): 42 | name = self.spidername 43 | start_urls = [ 44 | 'http://www.example.com/' 45 | ] 46 | 47 | self.spider = TestSpider() 48 | self.hcf_settings = {'HS_ENDPOINT': self.endpoint, 49 | 'HS_AUTH': self.auth, 50 | 'HS_PROJECTID': self.projectid, 51 | 'HS_FRONTIER': self.frontier, 52 | 'HS_CONSUME_FROM_SLOT': self.slot, 53 | 'HS_NUMBER_OF_SLOTS': self.number_of_slots} 54 | self._delete_slot() 55 | 56 | def tearDown(self): 57 | self._delete_slot() 58 | 59 | def _delete_slot(self): 60 | self.fclient.delete_slot(self.frontier, self.slot) 61 | 62 | def _build_response(self, url, meta=None): 63 | return Response(url, request=Request(url="http://www.example.com/parent.html", meta=meta)) 64 | 65 | def _get_crawler(self, settings=None): 66 | crawler = get_crawler(settings_dict=settings) 67 | # simulate crawler engine 68 | class Engine(): 69 | def __init__(self): 70 | self.requests = [] 71 | def schedule(self, request, spider): 72 | self.requests.append(request) 73 | crawler.engine = Engine() 74 | 75 | return crawler 76 | 77 | def test_not_loaded(self): 78 | crawler = self._get_crawler({}) 79 | self.assertRaises(NotConfigured, self.hcf_cls.from_crawler, crawler) 80 | 81 | def test_start_requests(self): 82 | crawler = self._get_crawler(self.hcf_settings) 83 | hcf = self.hcf_cls.from_crawler(crawler) 84 | 85 | # first time should be empty 86 | start_urls = self.spider.start_urls 87 | new_urls = list(hcf.process_start_requests(start_urls, self.spider)) 88 | self.assertEqual(new_urls, 
['http://www.example.com/']) 89 | 90 | # now try to store some URLs in the hcf and retrieve them 91 | fps = [{'fp': 'http://www.example.com/index.html'}, 92 | {'fp': 'http://www.example.com/index2.html'}] 93 | self.fclient.add(self.frontier, self.slot, fps) 94 | self.fclient.flush() 95 | new_urls = [r.url for r in hcf.process_start_requests(start_urls, self.spider)] 96 | expected_urls = [r['fp'] for r in fps] 97 | self.assertEqual(new_urls, expected_urls) 98 | self.assertEqual(len(hcf.batch_ids), 1) 99 | 100 | def test_spider_output(self): 101 | crawler = self._get_crawler(self.hcf_settings) 102 | hcf = self.hcf_cls.from_crawler(crawler) 103 | 104 | # process new GET request 105 | response = self._build_response("http://www.example.com/qxg1231") 106 | request = Request(url="http://www.example.com/product/?qxp=12&qxg=1231", meta={'use_hcf': True}) 107 | outputs = list(hcf.process_spider_output(response, [request], self.spider)) 108 | self.assertEqual(outputs, []) 109 | expected_links = {'0': set(['http://www.example.com/product/?qxp=12&qxg=1231'])} 110 | self.assertEqual(dict(hcf.new_links), expected_links) 111 | 112 | # process new POST request (don't add it to the hcf) 113 | response = self._build_response("http://www.example.com/qxg456") 114 | request = Request(url="http://www.example.com/product/?qxp=456", method='POST') 115 | outputs = list(hcf.process_spider_output(response, [request], self.spider)) 116 | self.assertEqual(outputs, [request]) 117 | expected_links = {'0': set(['http://www.example.com/product/?qxp=12&qxg=1231'])} 118 | self.assertEqual(dict(hcf.new_links), expected_links) 119 | 120 | # process new GET request (without the use_hcf meta key) 121 | response = self._build_response("http://www.example.com/qxg1231") 122 | request = Request(url="http://www.example.com/product/?qxp=789") 123 | outputs = list(hcf.process_spider_output(response, [request], self.spider)) 124 | self.assertEqual(outputs, [request]) 125 | expected_links = {'0': set(['http://www.example.com/product/?qxp=12&qxg=1231'])} 126 | self.assertEqual(dict(hcf.new_links), expected_links) 127 | 128 | # Simulate close spider 129 | hcf.close_spider(self.spider, 'finished') 130 | 131 | def test_close_spider(self): 132 | crawler = self._get_crawler(self.hcf_settings) 133 | hcf = self.hcf_cls.from_crawler(crawler) 134 | 135 | # Save 2 batches in the HCF 136 | fps = [{'fp': 'http://www.example.com/index_%s.html' % i} for i in range(0, 200)] 137 | self.fclient.add(self.frontier, self.slot, fps) 138 | self.fclient.flush() 139 | 140 | # Read the first batch 141 | start_urls = self.spider.start_urls 142 | new_urls = [r.url for r in hcf.process_start_requests(start_urls, self.spider)] 143 | expected_urls = [r['fp'] for r in fps] 144 | self.assertEqual(new_urls, expected_urls) 145 | 146 | # Simulate extracting some new urls 147 | response = self._build_response("http://www.example.com/parent.html") 148 | new_fps = ["http://www.example.com/child_%s.html" % i for i in range(0, 50)] 149 | for fp in new_fps: 150 | request = Request(url=fp, meta={'use_hcf': True}) 151 | list(hcf.process_spider_output(response, [request], self.spider)) 152 | self.assertEqual(len(hcf.new_links[self.slot]), 50) 153 | 154 | # Simulate emptying the scheduler 155 | crawler.engine.requests = [] 156 | 157 | # Simulate close spider 158 | hcf.close_spider(self.spider, 'finished') 159 | self.assertEqual(len(hcf.new_links[self.slot]), 0) 160 | self.assertEqual(len(hcf.batch_ids), 0) 161 | 162 | # HCF must be have 1 new batch 163 | batches = [b for b in 
self.fclient.read(self.frontier, self.slot)] 164 | self.assertEqual(len(batches), 1) 165 | 166 | def test_hcf_params(self): 167 | crawler = self._get_crawler(self.hcf_settings) 168 | hcf = self.hcf_cls.from_crawler(crawler) 169 | 170 | # Simulate extracting some new urls and adding them to the HCF 171 | response = self._build_response("http://www.example.com/parent.html") 172 | new_fps = ["http://www.example.com/child_%s.html" % i for i in range(0, 5)] 173 | new_requests = [] 174 | for fp in new_fps: 175 | hcf_params = {'qdata': {'a': '1', 'b': '2', 'c': '3'}, 176 | 'fdata': {'x': '1', 'y': '2', 'z': '3'}, 177 | 'p': 1} 178 | request = Request(url=fp, meta={'use_hcf': True, "hcf_params": hcf_params}) 179 | new_requests.append(request) 180 | list(hcf.process_spider_output(response, [request], self.spider)) 181 | expected = set(['http://www.example.com/child_4.html', 182 | 'http://www.example.com/child_1.html', 183 | 'http://www.example.com/child_0.html', 184 | 'http://www.example.com/child_3.html', 185 | 'http://www.example.com/child_2.html']) 186 | self.assertEqual(hcf.new_links[self.slot], expected) 187 | 188 | # Simulate close spider 189 | hcf.close_spider(self.spider, 'finished') 190 | 191 | # Similate running another spider 192 | start_urls = self.spider.start_urls 193 | stored_requests = list(hcf.process_start_requests(start_urls, self.spider)) 194 | for a, b in zip(new_requests, stored_requests): 195 | self.assertEqual(a.url, b.url) 196 | self.assertEqual(a.meta.get('qdata'), b.meta.get('qdata')) 197 | 198 | # Simulate emptying the scheduler 199 | crawler.engine.requests = [] 200 | 201 | # Simulate close spider 202 | hcf.close_spider(self.spider, 'finished') 203 | 204 | def test_spider_output_override_slot(self): 205 | crawler = self._get_crawler(self.hcf_settings) 206 | hcf = self.hcf_cls.from_crawler(crawler) 207 | 208 | def get_slot_callback(request): 209 | md5 = hashlib.md5() 210 | md5.update(request.url) 211 | digest = md5.hexdigest() 212 | return str(int(digest, 16) % 5) 213 | self.spider.slot_callback = get_slot_callback 214 | 215 | # process new GET request 216 | response = self._build_response("http://www.example.com/qxg1231") 217 | request = Request(url="http://www.example.com/product/?qxp=12&qxg=1231", 218 | meta={'use_hcf': True}) 219 | outputs = list(hcf.process_spider_output(response, [request], self.spider)) 220 | self.assertEqual(outputs, []) 221 | expected_links = {'4': set(['http://www.example.com/product/?qxp=12&qxg=1231'])} 222 | self.assertEqual(dict(hcf.new_links), expected_links) 223 | 224 | # Simulate close spider 225 | hcf.close_spider(self.spider, 'finished') 226 | -------------------------------------------------------------------------------- /scrapylib/hcf.py: -------------------------------------------------------------------------------- 1 | """ 2 | HCF Middleware 3 | 4 | This SpiderMiddleware uses the HCF backend from hubstorage to retrieve the new 5 | urls to crawl and store back the links extracted. 6 | 7 | To activate this middleware it needs to be added to the SPIDER_MIDDLEWARES 8 | list, i.e: 9 | 10 | SPIDER_MIDDLEWARES = { 11 | 'scrapylib.hcf.HcfMiddleware': 543, 12 | } 13 | 14 | And the next settings need to be defined: 15 | 16 | HS_AUTH - API key 17 | HS_PROJECTID - Project ID in the dash (not needed if the spider is ran on dash) 18 | HS_FRONTIER - Frontier name. 19 | HS_CONSUME_FROM_SLOT - Slot from where the spider will read new URLs. 
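    For example, a minimal configuration (the values below are illustrative
    placeholders, not real credentials or project ids) could be:

        HS_AUTH = '<your API key>'
        HS_PROJECTID = '123456'
        HS_FRONTIER = 'products'
        HS_CONSUME_FROM_SLOT = '0'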
20 | 21 | Note that HS_FRONTIER and HS_CONSUME_FROM_SLOT can be overridden from inside a spider using 22 | the spider attributes: "hs_frontier" and "hs_consume_from_slot" respectively. 23 | 24 | The following optional settings can be defined: 25 | 26 | HS_ENDPOINT - URL of the API endpoint, e.g.: http://localhost:8003. 27 | The default value is provided by the python-hubstorage 28 | package. 29 | 30 | HS_MAX_LINKS - Number of links to be read from the HCF; the default is 1000. 31 | 32 | HS_START_JOB_ENABLED - Whether to start a new job when the spider 33 | finishes. The default is False. 34 | 35 | HS_START_JOB_ON_REASON - A list of closing reasons; if the spider ends 36 | with any of these reasons, a new job will be started 37 | for the same slot. The default is ['finished']. 38 | 39 | HS_NUMBER_OF_SLOTS - The number of slots that the middleware will 40 | use to store the new links. The default is 8. 41 | 42 | The following keys can be defined in a Request's meta in order to control the behavior 43 | of the HCF middleware: 44 | 45 | use_hcf - If set to True the request will be stored in the HCF. 46 | hcf_params - Dictionary of parameters to be stored in the HCF with the request 47 | fingerprint: 48 | 49 | qdata data to be stored along with the fingerprint in the request queue 50 | fdata data to be stored along with the fingerprint in the fingerprint set 51 | p priority - lower priority numbers are returned first. The default is 0. 52 | 53 | The value of the 'qdata' parameter can be retrieved later using 54 | ``response.meta['hcf_params']['qdata']``. 55 | 56 | The spider can override the default slot assignment function by setting a 57 | spider slot_callback method with the following signature: 58 | 59 | def slot_callback(request): 60 | ... 
61 | return slot 62 | 63 | """ 64 | import os 65 | import hashlib 66 | import logging 67 | from collections import defaultdict 68 | from datetime import datetime 69 | from scrapinghub import Connection 70 | from scrapy import signals, log 71 | from scrapy.exceptions import NotConfigured 72 | from scrapy.http import Request 73 | from hubstorage import HubstorageClient 74 | 75 | DEFAULT_MAX_LINKS = 1000 76 | DEFAULT_HS_NUMBER_OF_SLOTS = 8 77 | 78 | 79 | class HcfMiddleware(object): 80 | 81 | def __init__(self, crawler): 82 | settings = crawler.settings 83 | self.hs_endpoint = settings.get("HS_ENDPOINT") 84 | self.hs_auth = self._get_config(settings, "HS_AUTH") 85 | self.hs_projectid = self._get_config(settings, "HS_PROJECTID", os.environ.get('SCRAPY_PROJECT_ID')) 86 | self.hs_frontier = self._get_config(settings, "HS_FRONTIER") 87 | self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT") 88 | self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS) 89 | self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS) 90 | self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False) 91 | self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON", ['finished']) 92 | 93 | conn = Connection(self.hs_auth) 94 | self.panel_project = conn[self.hs_projectid] 95 | 96 | self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint) 97 | self.project = self.hsclient.get_project(self.hs_projectid) 98 | self.fclient = self.project.frontier 99 | 100 | self.new_links = defaultdict(set) 101 | self.batch_ids = [] 102 | 103 | crawler.signals.connect(self.close_spider, signals.spider_closed) 104 | 105 | # Make sure the logger for hubstorage.batchuploader is configured 106 | logging.basicConfig() 107 | 108 | def _get_config(self, settings, key, default=None): 109 | value = settings.get(key, default) 110 | if not value: 111 | raise NotConfigured('%s not found' % key) 112 | return value 113 | 114 | def _msg(self, msg, level=log.INFO): 115 | log.msg('(HCF) %s' % msg, level) 116 | 117 | def start_job(self, spider): 118 | self._msg("Starting new job for: %s" % spider.name) 119 | jobid = self.panel_project.schedule( 120 | spider.name, 121 | hs_consume_from_slot=self.hs_consume_from_slot, 122 | dummy=datetime.now() 123 | ) 124 | self._msg("New job started: %s" % jobid) 125 | return jobid 126 | 127 | @classmethod 128 | def from_crawler(cls, crawler): 129 | return cls(crawler) 130 | 131 | def process_start_requests(self, start_requests, spider): 132 | 133 | self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier) 134 | self._msg('Using HS_FRONTIER=%s' % self.hs_frontier) 135 | 136 | self.hs_consume_from_slot = getattr(spider, 'hs_consume_from_slot', self.hs_consume_from_slot) 137 | self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot) 138 | 139 | self.has_new_requests = False 140 | for req in self._get_new_requests(): 141 | self.has_new_requests = True 142 | yield req 143 | 144 | # if there are no links in the hcf, use the start_requests 145 | # unless this is not the first job. 
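        # ('dummy' is the extra spider argument that start_job() passes when this
        # middleware schedules a follow-up job, so its presence marks every job
        # after the first one.)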
146 | if not self.has_new_requests and not getattr(spider, 'dummy', None): 147 | self._msg('Using start_requests') 148 | for r in start_requests: 149 | yield r 150 | 151 | def process_spider_output(self, response, result, spider): 152 | slot_callback = getattr(spider, 'slot_callback', self._get_slot) 153 | for item in result: 154 | if isinstance(item, Request): 155 | request = item 156 | if request.meta.get('use_hcf', False): 157 | if request.method == 'GET': # XXX: Only GET support for now. 158 | slot = slot_callback(request) 159 | if not request.url in self.new_links[slot]: 160 | hcf_params = request.meta.get('hcf_params') 161 | fp = {'fp': request.url} 162 | if hcf_params: 163 | fp.update(hcf_params) 164 | # Save the new links as soon as possible using 165 | # the batch uploader 166 | self.fclient.add(self.hs_frontier, slot, [fp]) 167 | self.new_links[slot].add(request.url) 168 | else: 169 | self._msg("'use_hcf' meta key is not supported for non GET requests (%s)" % request.url, 170 | log.ERROR) 171 | yield request 172 | else: 173 | yield request 174 | else: 175 | yield item 176 | 177 | def close_spider(self, spider, reason): 178 | # Only store the results if the spider finished normally, if it 179 | # didn't finished properly there is not way to know whether all the url batches 180 | # were processed and it is better not to delete them from the frontier 181 | # (so they will be picked by another process). 182 | if reason == 'finished': 183 | self._save_new_links_count() 184 | self._delete_processed_ids() 185 | 186 | # Close the frontier client in order to make sure that all the new links 187 | # are stored. 188 | self.fclient.close() 189 | self.hsclient.close() 190 | 191 | # If the reason is defined in the hs_start_job_on_reason list then start 192 | # a new job right after this spider is finished. 193 | if self.hs_start_job_enabled and reason in self.hs_start_job_on_reason: 194 | 195 | # Start the new job if this job had requests from the HCF or it 196 | # was the first job. 
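            # (A follow-up job that consumed nothing new does not schedule another
            # one, which prevents an endless chain of empty jobs once the frontier
            # has been drained.)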
197 | if self.has_new_requests or not getattr(spider, 'dummy', None): 198 | self.start_job(spider) 199 | 200 | def _get_new_requests(self): 201 | """ Get a new batch of links from the HCF.""" 202 | num_batches = 0 203 | num_links = 0 204 | for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_consume_from_slot), 1): 205 | for fingerprint, data in batch['requests']: 206 | num_links += 1 207 | yield Request(url=fingerprint, meta={'hcf_params': {'qdata': data}}) 208 | self.batch_ids.append(batch['id']) 209 | if num_links >= self.hs_max_links: 210 | break 211 | self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_consume_from_slot)) 212 | self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_consume_from_slot)) 213 | 214 | def _save_new_links_count(self): 215 | """ Save the new extracted links into the HCF.""" 216 | for slot, new_links in self.new_links.items(): 217 | self._msg('Stored %d new links in slot(%s)' % (len(new_links), slot)) 218 | self.new_links = defaultdict(set) 219 | 220 | def _delete_processed_ids(self): 221 | """ Delete in the HCF the ids of the processed batches.""" 222 | self.fclient.delete(self.hs_frontier, self.hs_consume_from_slot, self.batch_ids) 223 | self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids), 224 | self.hs_consume_from_slot)) 225 | self.batch_ids = [] 226 | 227 | def _get_slot(self, request): 228 | """ Determine to which slot should be saved the request.""" 229 | md5 = hashlib.md5() 230 | md5.update(request.url) 231 | digest = md5.hexdigest() 232 | return str(int(digest, 16) % self.hs_number_of_slots) 233 | -------------------------------------------------------------------------------- /tests/test_crawlera.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from w3lib.http import basic_auth_header 4 | from scrapy.http import Request, Response 5 | from scrapy.spiders import Spider 6 | from scrapy.utils.test import get_crawler 7 | from twisted.internet.error import ConnectionRefusedError 8 | from six.moves import xrange 9 | 10 | from scrapylib.crawlera import CrawleraMiddleware 11 | import os 12 | 13 | 14 | class MockedSlot(object): 15 | 16 | def __init__(self, delay=0.0): 17 | self.delay = delay 18 | 19 | 20 | class CrawleraMiddlewareTestCase(TestCase): 21 | 22 | mwcls = CrawleraMiddleware 23 | bancode = 503 24 | 25 | def setUp(self): 26 | self.spider = Spider('foo') 27 | self.settings = {'CRAWLERA_USER': 'user', 'CRAWLERA_PASS': 'pass'} 28 | 29 | def _mock_crawler(self, settings=None): 30 | 31 | class MockedDownloader(object): 32 | slots = {} 33 | 34 | class MockedEngine(object): 35 | downloader = MockedDownloader() 36 | fake_spider_closed_result = None 37 | 38 | def close_spider(self, spider, reason): 39 | self.fake_spider_closed_result = (spider, reason) 40 | 41 | crawler = get_crawler(settings_dict=settings) 42 | crawler.engine = MockedEngine() 43 | return crawler 44 | 45 | def _assert_disabled(self, spider, settings=None): 46 | crawler = self._mock_crawler(settings) 47 | mw = self.mwcls.from_crawler(crawler) 48 | mw.open_spider(spider) 49 | req = Request('http://www.scrapytest.org') 50 | out = mw.process_request(req, spider) 51 | self.assertEqual(out, None) 52 | self.assertEqual(req.meta.get('proxy'), None) 53 | self.assertEqual(req.meta.get('download_timeout'), None) 54 | self.assertEqual(req.headers.get('Proxy-Authorization'), None) 55 | res = Response(req.url) 56 | assert mw.process_response(req, 
res, spider) is res 57 | res = Response(req.url, status=mw.ban_code) 58 | assert mw.process_response(req, res, spider) is res 59 | 60 | def _assert_enabled(self, spider, 61 | settings=None, 62 | proxyurl='http://paygo.crawlera.com:8010?noconnect', 63 | proxyauth=basic_auth_header('user', 'pass'), 64 | maxbans=400, 65 | download_timeout=1800): 66 | crawler = self._mock_crawler(settings) 67 | mw = self.mwcls.from_crawler(crawler) 68 | mw.open_spider(spider) 69 | req = Request('http://www.scrapytest.org') 70 | assert mw.process_request(req, spider) is None 71 | self.assertEqual(req.meta.get('proxy'), proxyurl) 72 | self.assertEqual(req.meta.get('download_timeout'), download_timeout) 73 | self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth) 74 | res = Response(req.url) 75 | assert mw.process_response(req, res, spider) is res 76 | 77 | # disabled if 'dont_proxy' is set 78 | req = Request('http://www.scrapytest.org') 79 | req.meta['dont_proxy'] = True 80 | assert mw.process_request(req, spider) is None 81 | self.assertEqual(req.meta.get('proxy'), None) 82 | self.assertEqual(req.meta.get('download_timeout'), None) 83 | self.assertEqual(req.headers.get('Proxy-Authorization'), None) 84 | res = Response(req.url) 85 | assert mw.process_response(req, res, spider) is res 86 | del req.meta['dont_proxy'] 87 | 88 | if maxbans > 0: 89 | # assert ban count is reseted after a succesful response 90 | res = Response('http://ban.me', status=self.bancode) 91 | assert mw.process_response(req, res, spider) is res 92 | self.assertEqual(crawler.engine.fake_spider_closed_result, None) 93 | res = Response('http://unban.me') 94 | assert mw.process_response(req, res, spider) is res 95 | self.assertEqual(crawler.engine.fake_spider_closed_result, None) 96 | self.assertEqual(mw._bans[None], 0) 97 | 98 | # check for not banning before maxbans for bancode 99 | for x in xrange(maxbans + 1): 100 | self.assertEqual(crawler.engine.fake_spider_closed_result, None) 101 | res = Response('http://ban.me/%d' % x, status=self.bancode) 102 | assert mw.process_response(req, res, spider) is res 103 | 104 | # max bans reached and close_spider called 105 | self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, 'banned')) 106 | 107 | def test_disabled_by_lack_of_crawlera_settings(self): 108 | self._assert_disabled(self.spider, settings={}) 109 | 110 | def test_spider_crawlera_enabled(self): 111 | self.assertFalse(hasattr(self.spider, 'crawlera_enabled')) 112 | self._assert_disabled(self.spider, self.settings) 113 | self.spider.crawlera_enabled = True 114 | self._assert_enabled(self.spider, self.settings) 115 | self.spider.crawlera_enabled = False 116 | self._assert_disabled(self.spider, self.settings) 117 | 118 | def test_enabled(self): 119 | self._assert_disabled(self.spider, self.settings) 120 | self.settings['CRAWLERA_ENABLED'] = True 121 | self._assert_enabled(self.spider, self.settings) 122 | 123 | def test_userpass(self): 124 | self.spider.crawlera_enabled = True 125 | self.settings['CRAWLERA_USER'] = user = 'other' 126 | self.settings['CRAWLERA_PASS'] = pass_ = 'secret' 127 | proxyauth = basic_auth_header(user, pass_) 128 | self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth) 129 | 130 | self.spider.crawlera_user = user = 'notfromsettings' 131 | self.spider.crawlera_pass = pass_ = 'anothersecret' 132 | proxyauth = basic_auth_header(user, pass_) 133 | self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth) 134 | 135 | def test_proxyurl(self): 136 | self.spider.crawlera_enabled 
= True 137 | self.settings['CRAWLERA_URL'] = 'http://localhost:8010' 138 | self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8010?noconnect') 139 | 140 | def test_proxyurl_including_noconnect(self): 141 | self.spider.crawlera_enabled = True 142 | self.settings['CRAWLERA_URL'] = 'http://localhost:8010?noconnect' 143 | self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8010?noconnect') 144 | 145 | def test_maxbans(self): 146 | self.spider.crawlera_enabled = True 147 | self.settings['CRAWLERA_MAXBANS'] = maxbans = 0 148 | self._assert_enabled(self.spider, self.settings, maxbans=maxbans) 149 | self.settings['CRAWLERA_MAXBANS'] = maxbans = 100 150 | self._assert_enabled(self.spider, self.settings, maxbans=maxbans) 151 | # Assert setting is coerced into correct type 152 | self.settings['CRAWLERA_MAXBANS'] = '123' 153 | self._assert_enabled(self.spider, self.settings, maxbans=123) 154 | self.spider.crawlera_maxbans = 99 155 | self._assert_enabled(self.spider, self.settings, maxbans=99) 156 | 157 | def test_download_timeout(self): 158 | self.spider.crawlera_enabled = True 159 | self.settings['CRAWLERA_DOWNLOAD_TIMEOUT'] = 60 160 | self._assert_enabled(self.spider, self.settings, download_timeout=60) 161 | # Assert setting is coerced into correct type 162 | self.settings['CRAWLERA_DOWNLOAD_TIMEOUT'] = '42' 163 | self._assert_enabled(self.spider, self.settings, download_timeout=42) 164 | self.spider.crawlera_download_timeout = 120 165 | self._assert_enabled(self.spider, self.settings, download_timeout=120) 166 | 167 | def test_hooks(self): 168 | class _ECLS(self.mwcls): 169 | def is_enabled(self, spider): 170 | wascalled.append('is_enabled') 171 | return enabled 172 | 173 | def get_proxyauth(self, spider): 174 | wascalled.append('get_proxyauth') 175 | return proxyauth 176 | 177 | wascalled = [] 178 | self.mwcls = _ECLS 179 | 180 | # test is_enabled returns False 181 | enabled = False 182 | self.spider.crawlera_enabled = True 183 | self._assert_disabled(self.spider, self.settings) 184 | self.assertEqual(wascalled, ['is_enabled']) 185 | 186 | wascalled[:] = [] # reset 187 | enabled = True 188 | self.spider.crawlera_enabled = False 189 | proxyauth = b'Basic Foo' 190 | self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth) 191 | self.assertEqual(wascalled, ['is_enabled', 'get_proxyauth']) 192 | 193 | def test_delay_adjustment(self): 194 | delay = 0.5 195 | slot_key = 'www.scrapytest.org' 196 | url = 'http://www.scrapytest.org' 197 | ban_url = 'http://ban.me' 198 | 199 | self.spider.crawlera_enabled = True 200 | 201 | crawler = self._mock_crawler(self.settings) 202 | # ignore spider delay by default 203 | self.spider.download_delay = delay 204 | mw = self.mwcls.from_crawler(crawler) 205 | mw.open_spider(self.spider) 206 | self.assertEqual(self.spider.download_delay, 0) 207 | 208 | # preserve original delay 209 | self.spider.download_delay = delay 210 | self.spider.crawlera_preserve_delay = True 211 | mw = self.mwcls.from_crawler(crawler) 212 | mw.open_spider(self.spider) 213 | self.assertEqual(self.spider.download_delay, delay) 214 | 215 | slot = MockedSlot(self.spider.download_delay) 216 | crawler.engine.downloader.slots[slot_key] = slot 217 | 218 | # ban 219 | req = Request(url, meta={'download_slot': slot_key}) 220 | res = Response(ban_url, status=self.bancode, request=req) 221 | mw.process_response(req, res, self.spider) 222 | self.assertEqual(slot.delay, delay) 223 | self.assertEqual(self.spider.download_delay, delay) 224 | 225 | 
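        # a ban response that carries a Retry-After header should make the
        # middleware apply the server-provided delay to the slot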
retry_after = 1.5 226 | headers = {'retry-after': str(retry_after)} 227 | res = Response( 228 | ban_url, status=self.bancode, headers=headers, request=req) 229 | mw.process_response(req, res, self.spider) 230 | self.assertEqual(slot.delay, retry_after) 231 | self.assertEqual(self.spider.download_delay, delay) 232 | 233 | res = Response(url, request=req) 234 | mw.process_response(req, res, self.spider) 235 | self.assertEqual(slot.delay, delay) 236 | self.assertEqual(self.spider.download_delay, delay) 237 | 238 | # server failures 239 | mw.process_exception(req, ConnectionRefusedError(), self.spider) 240 | self.assertEqual(slot.delay, mw.connection_refused_delay) 241 | self.assertEqual(self.spider.download_delay, delay) 242 | 243 | res = Response(ban_url, request=req) 244 | mw.process_response(req, res, self.spider) 245 | self.assertEqual(slot.delay, delay) 246 | self.assertEqual(self.spider.download_delay, delay) 247 | 248 | mw.process_exception(req, ConnectionRefusedError(), self.spider) 249 | self.assertEqual(slot.delay, mw.connection_refused_delay) 250 | self.assertEqual(self.spider.download_delay, delay) 251 | 252 | res = Response(ban_url, status=self.bancode, request=req) 253 | mw.process_response(req, res, self.spider) 254 | self.assertEqual(slot.delay, delay) 255 | self.assertEqual(self.spider.download_delay, delay) 256 | 257 | def test_jobid_header(self): 258 | # test without the environment variable 'SCRAPY_JOB' 259 | self.spider.crawlera_enabled = True 260 | crawler = self._mock_crawler(self.settings) 261 | mw = self.mwcls.from_crawler(crawler) 262 | mw.open_spider(self.spider) 263 | req = Request('http://www.scrapytest.org') 264 | self.assertEqual(mw.process_request(req, self.spider), None) 265 | self.assertEqual(req.headers.get('X-Crawlera-Jobid'), None) 266 | 267 | # test with the environment variable 'SCRAPY_JOB' 268 | os.environ['SCRAPY_JOB'] = '2816' 269 | self.spider.crawlera_enabled = True 270 | crawler1 = self._mock_crawler(self.settings) 271 | mw1 = self.mwcls.from_crawler(crawler) 272 | mw1.open_spider(self.spider) 273 | req1 = Request('http://www.scrapytest.org') 274 | self.assertEqual(mw1.process_request(req1, self.spider), None) 275 | self.assertEqual(req1.headers.get('X-Crawlera-Jobid'), b'2816') 276 | -------------------------------------------------------------------------------- /tests/test_deltafetch.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, skipIf 2 | 3 | import os 4 | import mock 5 | import tempfile 6 | from scrapy import Request 7 | from scrapy.item import BaseItem 8 | from scrapy.spiders import Spider 9 | from scrapy.settings import Settings 10 | from scrapy.exceptions import NotConfigured 11 | from scrapy.utils.request import request_fingerprint 12 | from scrapy.utils.python import to_bytes 13 | from scrapylib.deltafetch import DeltaFetch 14 | from scrapy.statscollectors import StatsCollector 15 | from scrapy.utils.test import get_crawler 16 | 17 | dbmodule = None 18 | try: 19 | dbmodule = __import__('bsddb3') 20 | except ImportError: 21 | try: 22 | dbmodule = __import__('bsddb') 23 | except ImportError: 24 | pass 25 | 26 | 27 | @skipIf(not dbmodule, "bsddb3/bsddb is not found on the system") 28 | class DeltaFetchTestCase(TestCase): 29 | 30 | mwcls = DeltaFetch 31 | 32 | def setUp(self): 33 | self.spider = Spider('df_tests') 34 | self.temp_dir = tempfile.gettempdir() 35 | self.db_path = os.path.join(self.temp_dir, 'df_tests.db') 36 | crawler = get_crawler(Spider) 37 | self.stats = 
StatsCollector(crawler) 38 | 39 | def test_init(self): 40 | # path format is any, the folder is not created 41 | instance = self.mwcls('/any/dir', True, stats=self.stats) 42 | assert isinstance(instance, self.mwcls) 43 | self.assertEqual(instance.dir, '/any/dir') 44 | self.assertEqual(self.stats.get_stats(), {}) 45 | self.assertEqual(instance.reset, True) 46 | 47 | def test_init_from_crawler(self): 48 | crawler = mock.Mock() 49 | # void settings 50 | crawler.settings = Settings({}) 51 | self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler) 52 | with mock.patch('scrapy.utils.project.project_data_dir') as data_dir: 53 | data_dir.return_value = self.temp_dir 54 | 55 | # simple project_data_dir mock with based settings 56 | crawler.settings = Settings({'DELTAFETCH_ENABLED': True}) 57 | instance = self.mwcls.from_crawler(crawler) 58 | assert isinstance(instance, self.mwcls) 59 | self.assertEqual( 60 | instance.dir, os.path.join(self.temp_dir, 'deltafetch')) 61 | self.assertEqual(instance.reset, False) 62 | 63 | # project_data_dir mock with advanced settings 64 | crawler.settings = Settings({'DELTAFETCH_ENABLED': True, 65 | 'DELTAFETCH_DIR': 'other', 66 | 'DELTAFETCH_RESET': True}) 67 | instance = self.mwcls.from_crawler(crawler) 68 | assert isinstance(instance, self.mwcls) 69 | self.assertEqual( 70 | instance.dir, os.path.join(self.temp_dir, 'other')) 71 | self.assertEqual(instance.reset, True) 72 | 73 | def test_spider_opened_new(self): 74 | if os.path.exists(self.db_path): 75 | os.remove(self.db_path) 76 | mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) 77 | assert not hasattr(self.mwcls, 'db') 78 | mw.spider_opened(self.spider) 79 | assert os.path.isdir(self.temp_dir) 80 | assert os.path.exists(self.db_path) 81 | assert hasattr(mw, 'db') 82 | assert isinstance(mw.db, type(dbmodule.db.DB())) 83 | assert mw.db.items() == [] 84 | assert mw.db.get_type() == dbmodule.db.DB_HASH 85 | assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE 86 | 87 | def test_spider_opened_existing(self): 88 | self._create_test_db() 89 | mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) 90 | assert not hasattr(self.mwcls, 'db') 91 | mw.spider_opened(self.spider) 92 | assert hasattr(mw, 'db') 93 | assert isinstance(mw.db, type(dbmodule.db.DB())) 94 | assert mw.db.items() == [(b'test_key_1', b'test_v_1'), 95 | (b'test_key_2', b'test_v_2')] 96 | assert mw.db.get_type() == dbmodule.db.DB_HASH 97 | assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE 98 | 99 | def test_spider_opened_existing_spider_reset(self): 100 | self._create_test_db() 101 | mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) 102 | assert not hasattr(self.mwcls, 'db') 103 | self.spider.deltafetch_reset = True 104 | mw.spider_opened(self.spider) 105 | assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE 106 | 107 | def test_spider_opened_reset_non_existing_db(self): 108 | mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) 109 | assert not hasattr(self.mwcls, 'db') 110 | self.spider.deltafetch_reset = True 111 | mw.spider_opened(self.spider) 112 | assert mw.db.fd() 113 | # there's different logic for different bdb versions: 114 | # it can fail when opening a non-existing db with truncate flag, 115 | # then it should be caught and retried with rm & create flag 116 | assert (mw.db.get_open_flags() == dbmodule.db.DB_CREATE or 117 | mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE) 118 | 119 | def test_spider_opened_recreate(self): 120 | self._create_test_db() 121 | mw = 
self.mwcls(self.temp_dir, reset=True, stats=self.stats) 122 | assert not hasattr(self.mwcls, 'db') 123 | mw.spider_opened(self.spider) 124 | assert hasattr(mw, 'db') 125 | assert isinstance(mw.db, type(dbmodule.db.DB())) 126 | assert mw.db.items() == [] 127 | assert mw.db.get_type() == dbmodule.db.DB_HASH 128 | assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE 129 | 130 | def test_spider_closed(self): 131 | self._create_test_db() 132 | mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) 133 | mw.spider_opened(self.spider) 134 | assert mw.db.fd() 135 | mw.spider_closed(self.spider) 136 | self.assertRaises(dbmodule.db.DBError, mw.db.fd) 137 | 138 | def test_process_spider_output(self): 139 | self._create_test_db() 140 | mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) 141 | mw.spider_opened(self.spider) 142 | response = mock.Mock() 143 | response.request = Request('http://url', 144 | meta={'deltafetch_key': 'key'}) 145 | result = [] 146 | self.assertEqual(list(mw.process_spider_output( 147 | response, result, self.spider)), []) 148 | result = [ 149 | Request('http://url', meta={'deltafetch_key': 'key1'}), 150 | Request('http://url1', meta={'deltafetch_key': 'test_key_1'}) 151 | ] 152 | self.assertEqual(list(mw.process_spider_output( 153 | response, result, self.spider)), [result[0]]) 154 | self.assertEqual(self.stats.get_stats(), {'deltafetch/skipped': 1}) 155 | result = [BaseItem(), "not a base item"] 156 | self.assertEqual(list(mw.process_spider_output( 157 | response, result, self.spider)), result) 158 | self.assertEqual(mw.db.keys(), [b'test_key_1', b'key', b'test_key_2']) 159 | assert mw.db[b'key'] 160 | 161 | def test_process_spider_output_stats(self): 162 | self._create_test_db() 163 | mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) 164 | mw.spider_opened(self.spider) 165 | response = mock.Mock() 166 | response.request = Request('http://url', 167 | meta={'deltafetch_key': 'key'}) 168 | result = [] 169 | self.assertEqual(list(mw.process_spider_output( 170 | response, result, self.spider)), []) 171 | self.assertEqual(self.stats.get_stats(), {}) 172 | result = [ 173 | Request('http://url', meta={'deltafetch_key': 'key'}), 174 | Request('http://url1', meta={'deltafetch_key': 'test_key_1'}) 175 | ] 176 | self.assertEqual(list(mw.process_spider_output( 177 | response, result, self.spider)), [result[0]]) 178 | self.assertEqual(self.stats.get_value('deltafetch/skipped'), 1) 179 | result = [BaseItem(), "not a base item"] 180 | self.assertEqual(list(mw.process_spider_output( 181 | response, result, self.spider)), result) 182 | self.assertEqual(self.stats.get_value('deltafetch/stored'), 1) 183 | 184 | def test_init_from_crawler_legacy(self): 185 | # test with subclass not handling passed stats 186 | class LegacyDeltaFetchSubClass(self.mwcls): 187 | 188 | def __init__(self, dir, reset=False, *args, **kwargs): 189 | super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset) 190 | self.something = True 191 | 192 | crawler = mock.Mock() 193 | # void settings 194 | crawler.settings = Settings({}) 195 | self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler) 196 | 197 | with mock.patch('scrapy.utils.project.project_data_dir') as data_dir: 198 | data_dir.return_value = self.temp_dir 199 | 200 | # simple project_data_dir mock with based settings 201 | crawler.settings = Settings({'DELTAFETCH_ENABLED': True}) 202 | instance = LegacyDeltaFetchSubClass.from_crawler(crawler) 203 | assert isinstance(instance, self.mwcls) 204 | self.assertEqual( 
205 | instance.dir, os.path.join(self.temp_dir, 'deltafetch')) 206 | self.assertEqual(instance.reset, False) 207 | 208 | # project_data_dir mock with advanced settings 209 | crawler.settings = Settings({'DELTAFETCH_ENABLED': True, 210 | 'DELTAFETCH_DIR': 'other', 211 | 'DELTAFETCH_RESET': True}) 212 | instance = LegacyDeltaFetchSubClass.from_crawler(crawler) 213 | assert isinstance(instance, self.mwcls) 214 | self.assertEqual( 215 | instance.dir, os.path.join(self.temp_dir, 'other')) 216 | self.assertEqual(instance.reset, True) 217 | 218 | def test_process_spider_output_stats_legacy(self): 219 | # testing the subclass not handling stats works at runtime 220 | # (i.e. that trying to update stats does not trigger exception) 221 | class LegacyDeltaFetchSubClass(self.mwcls): 222 | 223 | def __init__(self, dir, reset=False, *args, **kwargs): 224 | super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset) 225 | self.something = True 226 | 227 | self._create_test_db() 228 | mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False) 229 | mw.spider_opened(self.spider) 230 | response = mock.Mock() 231 | response.request = Request('http://url', 232 | meta={'deltafetch_key': 'key'}) 233 | result = [] 234 | self.assertEqual(list(mw.process_spider_output( 235 | response, result, self.spider)), []) 236 | self.assertEqual(self.stats.get_stats(), {}) 237 | result = [ 238 | Request('http://url', meta={'deltafetch_key': 'key'}), 239 | Request('http://url1', meta={'deltafetch_key': 'test_key_1'}) 240 | ] 241 | 242 | # stats should not be updated 243 | self.assertEqual(list(mw.process_spider_output( 244 | response, result, self.spider)), [result[0]]) 245 | self.assertEqual(self.stats.get_value('deltafetch/skipped'), None) 246 | 247 | result = [BaseItem(), "not a base item"] 248 | self.assertEqual(list(mw.process_spider_output( 249 | response, result, self.spider)), result) 250 | self.assertEqual(self.stats.get_value('deltafetch/stored'), None) 251 | 252 | def test_get_key(self): 253 | mw = self.mwcls(self.temp_dir, reset=True) 254 | test_req1 = Request('http://url1') 255 | self.assertEqual(mw._get_key(test_req1), 256 | to_bytes(request_fingerprint(test_req1))) 257 | test_req2 = Request('http://url2', meta={'deltafetch_key': 'dfkey1'}) 258 | self.assertEqual(mw._get_key(test_req2), b'dfkey1') 259 | 260 | def _create_test_db(self): 261 | db = dbmodule.db.DB() 262 | # truncate test db if there were failed tests 263 | db.open(self.db_path, dbmodule.db.DB_HASH, 264 | dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE) 265 | db.put(b'test_key_1', b'test_v_1') 266 | db.put(b'test_key_2', b'test_v_2') 267 | db.close() 268 | --------------------------------------------------------------------------------
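A note on usage (not part of the repository): the DeltaFetch tests above drive the middleware through its settings (DELTAFETCH_ENABLED, DELTAFETCH_DIR, DELTAFETCH_RESET) and the 'deltafetch_key' request meta key. A minimal way to wire it into a Scrapy project might look like the sketch below; the middleware priority, spider, item and selector names are illustrative assumptions, not taken from this codebase.

# settings.py (sketch; the priority value 100 is an assumption, any valid order works)
SPIDER_MIDDLEWARES = {
    'scrapylib.deltafetch.DeltaFetch': 100,
}
DELTAFETCH_ENABLED = True
# Optional: DELTAFETCH_DIR = 'other'   # storage dir for the seen-requests db
#           DELTAFETCH_RESET = True    # truncate the db on the next run

# myspider.py (sketch): requests whose deltafetch key was recorded by a previous
# run (because items were scraped from them) are silently skipped on later runs.
import scrapy


class ProductItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()


class ProductsSpider(scrapy.Spider):
    name = 'products'
    start_urls = ['http://www.example.com/']

    def parse(self, response):
        for href in response.css('a.product::attr(href)').extract():
            # 'deltafetch_key' overrides the default key (the request fingerprint)
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_product,
                                 meta={'deltafetch_key': href})

    def parse_product(self, response):
        # the item must be a BaseItem subclass (e.g. scrapy.Item) for this
        # version of the middleware to record the originating request as seen
        item = ProductItem()
        item['url'] = response.url
        item['title'] = response.css('title::text').extract_first()
        yield item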