├── tests
│   ├── __init__.py
│   ├── test_links.py
│   ├── test_querycleaner.py
│   ├── test_splitvariants.py
│   ├── test_constraints.py
│   ├── test_processors.py
│   ├── test_magicfields.py
│   ├── test_hubproxy.py
│   ├── test_hcf.py
│   ├── test_crawlera.py
│   └── test_deltafetch.py
├── scrapylib
│   ├── __init__.py
│   ├── pipelines.py
│   ├── constraints
│   │   ├── pipeline.py
│   │   └── __init__.py
│   ├── links.py
│   ├── hubproxy.py
│   ├── processors
│   │   ├── date.py
│   │   └── __init__.py
│   ├── splitvariants.py
│   ├── redisqueue.py
│   ├── proxy.py
│   ├── guid.py
│   ├── querycleaner.py
│   ├── spidertrace.py
│   ├── deltafetch.py
│   ├── magicfields.py
│   ├── crawlera.py
│   └── hcf.py
├── requirements.txt
├── .bumpversion.cfg
├── .gitignore
├── tox.ini
├── setup.py
├── .travis.yml
└── README.rst
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scrapylib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | six
2 | boto
3 | hubstorage>=0.23
4 | python-dateutil
5 | scrapinghub
6 | Scrapy>=1.1
7 |
--------------------------------------------------------------------------------
/.bumpversion.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 1.7.0
3 | commit = True
4 | tag = True
5 | tag_name = {new_version}
6 |
7 | [bumpversion:file:setup.py]
8 |
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # temp files
2 | **swp
3 | **pyc
4 | **~
5 |
6 | # setuptools/distutils files
7 | scrapylib\.egg-info
8 | build/
9 | dist/
10 | \.idea/
11 | \.tox/
12 |
--------------------------------------------------------------------------------
/scrapylib/pipelines.py:
--------------------------------------------------------------------------------
1 |
2 | class SpiderFieldPipeline(object):
3 | def process_item(self, item, spider):
4 | item['spider'] = spider.name
5 | return item
6 |
--------------------------------------------------------------------------------
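scrapylib.pipelines.SpiderFieldPipeline stamps each item with the name of the spider that produced it. A minimal sketch of enabling it (the pipeline priority is an arbitrary example value, and the item must define a 'spider' field):

    # settings.py -- illustrative snippet
    ITEM_PIPELINES = {
        'scrapylib.pipelines.SpiderFieldPipeline': 100,
    }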
/scrapylib/constraints/pipeline.py:
--------------------------------------------------------------------------------
1 | from scrapy.exceptions import DropItem
2 |
3 | class ConstraintsPipeline(object):
4 |
5 | def process_item(self, item, spider):
6 | try:
7 | for c in item.constraints:
8 | c(item)
9 | except AssertionError as e:
10 | raise DropItem(str(e))
11 | return item
12 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox (http://tox.testrun.org/) is a tool for running tests
2 | # in multiple virtualenvs. This configuration file will run the
3 | # test suite on all supported python versions. To use it, "pip install tox"
4 | # and then run "tox" from this directory.
5 |
6 | [tox]
7 | envlist = py27, pypy, py33
8 |
9 | [testenv]
10 | setenv =
11 | BERKELEYDB_DIR = /usr
12 | deps =
13 | -rrequirements.txt
14 | mock
15 | nose
16 | bsddb3
17 | commands = nosetests --with-doctest []
18 |
--------------------------------------------------------------------------------
/scrapylib/links.py:
--------------------------------------------------------------------------------
1 | from scrapy.http import Request
2 |
3 | def follow_links(link_extractor, response, callback):
4 | """Returns a generator of requests with given `callback`
5 | of links extractor from `response`.
6 |
7 | Parameters:
8 | link_extractor -- LinkExtractor to use
9 | response -- Response to extract links from
10 | callback -- callback to apply to each new requests
11 |
12 | """
13 | for link in link_extractor.extract_links(response):
14 | yield Request(link.url, callback=callback)
15 |
--------------------------------------------------------------------------------
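A sketch of how follow_links might be used from a spider callback, assuming a LinkExtractor limited to pagination links (the CSS selector and spider details are invented for illustration):

    from scrapy import Spider
    from scrapy.linkextractors import LinkExtractor
    from scrapylib.links import follow_links

    class ExampleSpider(Spider):
        name = 'example'
        start_urls = ['http://www.example.com/']
        pagination = LinkExtractor(restrict_css='.pagination')  # hypothetical selector

        def parse(self, response):
            # one Request per extracted link, all routed to parse_page
            for request in follow_links(self.pagination, response, callback=self.parse_page):
                yield request

        def parse_page(self, response):
            pass  # extraction logic would go here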
/scrapylib/hubproxy.py:
--------------------------------------------------------------------------------
1 | from .crawlera import CrawleraMiddleware
2 |
3 |
4 | class HubProxyMiddleware(CrawleraMiddleware):
5 |
6 | def __init__(self, *args, **kwargs):
7 | import warnings
8 | from scrapy.exceptions import ScrapyDeprecationWarning
9 | warnings.warn('scrapylib.hubproxy.HubProxyMiddleware is deprecated, '
10 | 'use scrapylib.crawlera.CrawleraMiddleware instead.',
11 | category=ScrapyDeprecationWarning, stacklevel=1)
12 | super(HubProxyMiddleware, self).__init__(*args, **kwargs)
13 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | name='scrapylib',
5 | version='1.7.0',
6 | license='BSD',
7 | description='Scrapy helper functions and processors',
8 | author='Scrapinghub',
9 | author_email='info@scrapinghub.com',
10 | url='http://github.com/scrapinghub/scrapylib',
11 | packages=['scrapylib', 'scrapylib.constraints', 'scrapylib.processors'],
12 | platforms=['Any'],
13 | classifiers=[
14 | 'Development Status :: 7 - Inactive',
15 | 'License :: OSI Approved :: BSD License',
16 | 'Operating System :: OS Independent',
17 | 'Programming Language :: Python'
18 | ],
19 | install_requires=['Scrapy>=1.0.0']
20 | )
21 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python: 3.5
3 | sudo: false
4 | env:
5 | matrix:
6 | - TOXENV=py27
7 | - TOXENV=py33
8 | - TOXENV=py35
9 | addons:
10 | apt:
11 | packages:
12 | - language-pack-fr
13 | - libdb-dev
14 |
15 | install: pip install -U tox
16 | script: tox
17 |
18 | deploy:
19 | provider: pypi
20 | user: scrapinghub
21 | distributions: sdist bdist_wheel
22 | password:
23 | secure: iKVlMlKSr+LOuCCMMOqL65aYjNRy3k1Zb4d7NRN0JpWS5DGau8G8cEhJ1dY4uyc/DNKVJmd939OiLBsUqqCmz09+ozen/YrRNjEZS5lOwBNfhpiCESkbOjcInV1PQgx2XfuHGp8O/9vxtXjjH9WE9CabQ+8Zg5/rMMvXizT4/O4=
24 | on:
25 | tags: true
26 | all_branches: true
27 | repo: scrapinghub/scrapylib
28 | condition: $TOXENV = py27
29 |
--------------------------------------------------------------------------------
/scrapylib/processors/date.py:
--------------------------------------------------------------------------------
1 | from dateutil.parser import parse
2 | from scrapy.loader.processors import Compose
3 | from scrapy import log
4 | from scrapylib.processors import default_output_processor
5 |
6 | def parse_datetime(value):
7 | try:
8 | d = parse(value)
9 | except ValueError:
10 | log.msg('Unable to parse %s' % value, level=log.WARNING)
11 | return value
12 | else:
13 | return d.isoformat()
14 |
15 | def parse_date(value):
16 | try:
17 | d = parse(value)
18 | except ValueError:
19 | log.msg('Unable to parse %s' % value, level=log.WARNING)
20 | return value
21 | else:
22 | return d.strftime("%Y-%m-%d")
23 |
24 | default_out_parse_datetime = Compose(default_output_processor, parse_datetime)
25 | default_out_parse_date = Compose(default_output_processor, parse_date)
26 |
--------------------------------------------------------------------------------
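Both Compose chains take the first collected value and normalize the resulting string, so they are meant as ItemLoader output processors. A minimal sketch, with an invented item and field name:

    from scrapy.item import Item, Field
    from scrapy.loader import ItemLoader
    from scrapylib.processors.date import default_out_parse_date

    class ArticleItem(Item):      # hypothetical item
        published = Field()

    class ArticleLoader(ItemLoader):
        default_item_class = ArticleItem
        # 'March 4, 2011' -> '2011-03-04'
        published_out = default_out_parse_date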
/tests/test_links.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from scrapylib.links import follow_links
4 | from scrapy.http import Request
5 |
6 |
7 | class LinkMock(object):
8 | def __init__(self, url):
9 | self.url = url
10 |
11 |
12 | class LinkExtractorMock(object):
13 |
14 | def extract_links(self, response):
15 | return [LinkMock(url=x) for x in response.split('|')]
16 |
17 |
18 | def some_callback():
19 | pass
20 |
21 |
22 | class TestLinks(unittest.TestCase):
23 |
24 | def test_follow_links(self):
25 | r = list(follow_links(LinkExtractorMock(), 'http://link1|http://link2|http://link3', callback=some_callback))
26 | assert all(isinstance(x, Request) for x in r)
27 | assert all(x.callback is some_callback for x in r)
28 | self.assertEqual([x.url for x in r], ['http://link1', 'http://link2', 'http://link3'])
29 |
--------------------------------------------------------------------------------
/scrapylib/splitvariants.py:
--------------------------------------------------------------------------------
1 | """
2 | Splits each product with variants into different single products.
3 | For autoscraping products adaptation
4 | """
5 |
6 | from copy import deepcopy
7 | from scrapy.item import DictItem
8 | from scrapy.exceptions import NotConfigured
9 |
10 | class SplitVariantsMiddleware(object):
11 |
12 | @classmethod
13 | def from_crawler(cls, crawler):
14 | if not crawler.settings.getbool("SPLITVARIANTS_ENABLED"):
15 | raise NotConfigured
16 | return cls()
17 |
18 | def process_spider_output(self, response, result, spider):
19 | for r in result:
20 | if isinstance(r, DictItem) and "variants" in r:
21 | variants = r.pop("variants")
22 | for variant in variants:
23 | new_product = deepcopy(r)
24 | new_product.update(variant)
25 | yield new_product
26 | else:
27 | yield r
28 |
--------------------------------------------------------------------------------
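The middleware is opt-in: it must be registered as a spider middleware and SPLITVARIANTS_ENABLED must be set, otherwise from_crawler raises NotConfigured. A sketch (the priority value is arbitrary):

    # settings.py -- illustrative snippet
    SPIDER_MIDDLEWARES = {
        'scrapylib.splitvariants.SplitVariantsMiddleware': 100,
    }
    SPLITVARIANTS_ENABLED = True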
/scrapylib/redisqueue.py:
--------------------------------------------------------------------------------
1 | try:
2 | import cPickle as pickle
3 | except ImportError:
4 | import pickle
5 |
6 | from scrapy.exceptions import NotConfigured
7 | from scrapy import signals
8 |
9 |
10 | class RedisQueue(object):
11 |
12 | def __init__(self, crawler):
13 | try:
14 | from redis import Redis
15 | except ImportError:
16 | raise NotConfigured
17 |
18 | settings = crawler.settings
19 |
20 | # get settings
21 | queue = settings.get('REDIS_QUEUE')
22 | if queue is None:
23 | raise NotConfigured
24 |
25 | host = settings.get('REDIS_HOST', 'localhost')
26 | port = settings.getint('REDIS_PORT', 6379)
27 | db = settings.getint('REDIS_DB', 0)
28 | password = settings.get('REDIS_PASSWORD')
29 |
30 | self.redis = Redis(host=host, port=port, db=db, password=password)
31 | self.queue = queue
32 | self.project = settings['BOT_NAME']
33 |
34 | crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
35 |
36 | @classmethod
37 | def from_crawler(cls, crawler):
38 | return cls(crawler)
39 |
40 | def spider_closed(self, spider, reason):
41 | msg = {'project': self.project, 'spider': spider.name, 'reason': reason}
42 | self.redis.rpush(self.queue, pickle.dumps(msg))
43 |
--------------------------------------------------------------------------------
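RedisQueue pushes one pickled message per finished spider onto a Redis list. A sketch of the settings that drive it and of a consumer popping the message on the other end (queue name and connection details are example values):

    # settings.py -- illustrative values
    EXTENSIONS = {'scrapylib.redisqueue.RedisQueue': 0}
    REDIS_QUEUE = 'finished-spiders'
    REDIS_HOST = 'localhost'
    REDIS_PORT = 6379

    # elsewhere: consume and unpickle a finished-spider notification
    import pickle
    from redis import Redis

    _, raw = Redis().blpop('finished-spiders')
    print(pickle.loads(raw))   # {'project': ..., 'spider': ..., 'reason': ...}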
/scrapylib/proxy.py:
--------------------------------------------------------------------------------
1 | import base64
2 | from six.moves.urllib.parse import unquote, urlunparse
3 | try:
4 | from urllib2 import _parse_proxy
5 | except ImportError:
6 | from urllib.request import _parse_proxy
7 |
8 |
9 | class SelectiveProxyMiddleware(object):
10 | """A middleware to enable http proxy to selected spiders only.
11 |
12 | Settings:
13 | HTTP_PROXY -- proxy uri. e.g.: http://user:pass@proxy.host:port
14 | PROXY_SPIDERS -- all requests from these spiders will be routed
15 | through the proxy
16 | """
17 |
18 | def __init__(self, settings):
19 | self.proxy = self.parse_proxy(settings.get('HTTP_PROXY'), 'http')
20 | self.proxy_spiders = set(settings.getlist('PROXY_SPIDERS', []))
21 |
22 | @classmethod
23 | def from_crawler(cls, crawler):
24 | return cls(crawler.settings)
25 |
26 | def parse_proxy(self, url, orig_type):
27 | proxy_type, user, password, hostport = _parse_proxy(url)
28 | proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
29 |
30 | if user and password:
31 | user_pass = '%s:%s' % (unquote(user), unquote(password))
32 | creds = base64.b64encode(user_pass).strip()
33 | else:
34 | creds = None
35 |
36 | return creds, proxy_url
37 |
38 | def process_request(self, request, spider):
39 | if spider.name in self.proxy_spiders:
40 | creds, proxy = self.proxy
41 | request.meta['proxy'] = proxy
42 | if creds:
43 | request.headers['Proxy-Authorization'] = 'Basic ' + creds
44 |
--------------------------------------------------------------------------------
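SelectiveProxyMiddleware only rewrites requests coming from spiders named in PROXY_SPIDERS; everything else bypasses the proxy. A configuration sketch (proxy URI and spider name are placeholders; the priority just needs to place it before Scrapy's built-in HttpProxyMiddleware at 750):

    # settings.py -- illustrative snippet
    DOWNLOADER_MIDDLEWARES = {
        'scrapylib.proxy.SelectiveProxyMiddleware': 740,
    }
    HTTP_PROXY = 'http://user:pass@proxy.example.com:8080'
    PROXY_SPIDERS = ['example_spider']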
/tests/test_querycleaner.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from scrapy.http import Request, Response
4 | from scrapy.spiders import Spider
5 | from scrapy.utils.test import get_crawler
6 | from scrapylib.querycleaner import QueryCleanerMiddleware
7 | from scrapy.exceptions import NotConfigured
8 |
9 |
10 | class QueryCleanerTestCase(TestCase):
11 |
12 | mwcls = QueryCleanerMiddleware
13 |
14 | def setUp(self):
15 | self.spider = Spider('foo')
16 |
17 | def test_not_loaded(self):
18 | crawler = get_crawler(settings_dict={})
19 | self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)
20 |
21 | def test_filter_keep(self):
22 | crawler = get_crawler(settings_dict={"QUERYCLEANER_KEEP": "qxp"})
23 | mw = self.mwcls.from_crawler(crawler)
24 | response = Response(url="http://www.example.com/qxg1231")
25 | request = Request(url="http://www.example.com/product/?qxp=12&qxg=1231")
26 | new_request = list(mw.process_spider_output(response, [request], self.spider))[0]
27 | self.assertEqual(new_request.url, "http://www.example.com/product/?qxp=12")
28 | self.assertNotEqual(request, new_request)
29 |
30 | def test_filter_remove(self):
31 | crawler = get_crawler(settings_dict={"QUERYCLEANER_REMOVE": "qxg"})
32 | mw = self.mwcls.from_crawler(crawler)
33 | response = Response(url="http://www.example.com/qxg1231")
34 | request = Request(url="http://www.example.com/product/?qxp=12&qxg=1231")
35 | new_request = list(mw.process_spider_output(response, [request], self.spider))[0]
36 | self.assertEqual(new_request.url, "http://www.example.com/product/?qxp=12")
37 | self.assertNotEqual(request, new_request)
38 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | =========
2 | scrapylib
3 | =========
4 |
5 | Overview
6 | ========
7 |
8 | **This library is deprecated and unmaintained.**
9 |
10 | Some of its components were moved to their own packages:
11 |
12 | +--------------------------+------------------------------------------------+
13 | | Old location | New location |
14 | +==========================+================================================+
15 | | scrapylib.crawlera | `scrapy-crawlera`_ |
16 | +--------------------------+------------------------------------------------+
17 | | scrapylib.deltafetch | `scrapy-deltafetch`_ |
18 | +--------------------------+------------------------------------------------+
19 | | scrapylib.hcf | `scrapy-hcf`_ |
20 | +--------------------------+------------------------------------------------+
21 | | scrapylib.magicfields | `scrapy-magicfields`_ |
22 | +--------------------------+------------------------------------------------+
23 | | scrapylib.querycleaner | `scrapy-querycleaner`_ |
24 | +--------------------------+------------------------------------------------+
25 | | scrapylib.splitvariants | `scrapy-splitvariants`_ |
26 | +--------------------------+------------------------------------------------+
27 |
28 | .. _scrapy-crawlera: https://github.com/scrapy-plugins/scrapy-crawlera
29 | .. _scrapy-deltafetch: https://github.com/scrapy-plugins/scrapy-deltafetch
30 | .. _scrapy-hcf: https://github.com/scrapy-plugins/scrapy-hcf
31 | .. _scrapy-magicfields: https://github.com/scrapy-plugins/scrapy-magicfields
32 | .. _scrapy-querycleaner: https://github.com/scrapy-plugins/scrapy-querycleaner
33 | .. _scrapy-splitvariants: https://github.com/scrapy-plugins/scrapy-splitvariants
34 |
--------------------------------------------------------------------------------
/tests/test_splitvariants.py:
--------------------------------------------------------------------------------
1 | """ Tests to cover splitvariants middleware """
2 | from unittest import TestCase
3 |
4 | from scrapy.spiders import Spider
5 | from scrapy.item import DictItem, Field
6 | from scrapy.http import HtmlResponse
7 | from scrapy.utils.test import get_crawler
8 |
9 | from scrapylib.splitvariants import SplitVariantsMiddleware
10 |
11 |
12 | class TestItem(DictItem):
13 | """
14 | Item used in test spider
15 | """
16 | fields = {
17 | 'id': Field(),
18 | 'name': Field(),
19 | 'size': Field(),
20 | 'price': Field(),
21 | 'variants': Field()
22 | }
23 |
24 |
25 | class SplitVariantsTest(TestCase):
26 | """ Split variants middleware test cases """
27 | def setUp(self):
28 | self.spider = Spider('myspider',
29 | start_urls=["http://example.com"])
30 | self.response = HtmlResponse(body=b"",
31 | url="http://www.example.com")
32 |
33 | def test_variants_splitted(self):
34 | """
35 | Checks if item with variants is split as expected
36 | """
37 | settings = {"SPLITVARIANTS_ENABLED": True}
38 | crawler = get_crawler(settings_dict=settings)
39 | mware = SplitVariantsMiddleware.from_crawler(crawler)
40 |
41 | # Define item with variants
42 | item = {"id": 12,
43 | "name": "Big chair",
44 | "variants": [{"size": "XL", "price": 200},
45 | {"size": "L", "price": 220}]}
46 | result = [TestItem(item)]
47 |
48 | # Define how split items should look
49 | expected = [
50 | {"id": 12, "name": "Big chair", "size": 'XL', 'price': 200},
51 | {"id": 12, "name": "Big chair", "size": 'L', 'price': 220}]
52 |
53 | # Calling middleware for given result
54 | result = mware.process_spider_output(self.response, result,
55 | self.spider)
56 | self.assertEquals(list(result), expected)
57 |
--------------------------------------------------------------------------------
/scrapylib/guid.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 |
3 | from scrapy import signals
4 | from scrapy.exceptions import DropItem
5 | try:
6 | from scrapy.utils.python import to_bytes
7 | except ImportError:
8 | from scrapy.utils.python import unicode_to_str as to_bytes
9 |
10 |
11 | def hash_values(*values):
12 | """Hash a series of non-None values.
13 |
14 | For example:
15 | >>> hash_values('some', 'values', 'to', 'hash')
16 | '1d7b7a17aeb0e5f9a6814289d12d3253'
17 | """
18 | hash = hashlib.md5()
19 | for value in values:
20 | if value is None:
21 | message = "hash_values was passed None at argument index %d" % list(values).index(None)
22 | raise ValueError(message)
23 | hash.update(to_bytes('%s' % value))
24 | return hash.hexdigest()
25 |
26 |
27 | class GUIDPipeline(object):
28 |
29 | item_fields = {}
30 |
31 | def __init__(self):
32 | self.guids = {}
33 |
34 | @classmethod
35 | def from_crawler(cls, crawler):
36 | o = cls()
37 | crawler.signals.connect(o.spider_opened, signals.spider_opened)
38 | crawler.signals.connect(o.spider_closed, signals.spider_closed)
39 | return o
40 |
41 | def spider_opened(self, spider):
42 | self.guids[spider] = set()
43 |
44 | def spider_closed(self, spider):
45 | del self.guids[spider]
46 |
47 | def process_item(self, item, spider):
48 | if type(item) in self.item_fields:
49 | item['guid'] = guid = self.generate_guid(item, spider)
50 | if guid is None:
51 | raise DropItem("Missing guid fields on: %s" % item)
52 | if guid in self.guids[spider]:
53 | raise DropItem("Duplicate item found: %s" % item)
54 | else:
55 | self.guids[spider].add(guid)
56 | return item
57 |
58 | def generate_guid(self, item, spider):
59 | values = []
60 | for field in self.item_fields[type(item)]:
61 | value = item.get(field)
62 | if value is None:
63 | return
64 | values.append(value.encode('utf-8'))
65 | values.insert(0, spider.name)
66 | return hash_values(*values)
67 |
--------------------------------------------------------------------------------
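GUIDPipeline is meant to be subclassed: item_fields maps an item class to the fields whose values (together with the spider name) are hashed into the guid, and items hashing to an already seen guid are dropped. A sketch with an invented Product item:

    from scrapy.item import Item, Field
    from scrapylib.guid import GUIDPipeline

    class Product(Item):          # hypothetical item; needs a 'guid' field
        name = Field()
        url = Field()
        guid = Field()

    class ProductGUIDPipeline(GUIDPipeline):
        # guid = md5(spider.name + name + url); missing fields drop the item
        item_fields = {Product: ['name', 'url']}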
/scrapylib/processors/__init__.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import locale as localelib
3 | import re
4 | import time
5 | from six.moves.urllib.parse import urljoin
6 |
7 |
8 | from scrapy.loader.processors import MapCompose, TakeFirst
9 | from scrapy.utils.markup import (remove_tags, replace_escape_chars,
10 | unquote_markup)
11 |
12 |
13 | _clean_spaces_re = re.compile("\s+", re.U)
14 |
15 |
16 | def clean_spaces(value):
17 | return _clean_spaces_re.sub(' ', value)
18 |
19 |
20 | def make_absolute_url(val, loader_context):
21 | base_url = loader_context.get('base_url')
22 | if base_url is None:
23 | response = loader_context.get('response')
24 | if response is None:
25 | raise AttributeError('You must provide a base_url or a response '
26 | 'to the loader context')
27 | base_url = response.url
28 | return urljoin(base_url, val)
29 |
30 |
31 | def remove_query_params(value):
32 | # some urls don't have ? but have &
33 | return value.split('?')[0].split('&')[0]
34 |
35 |
36 | _br_re = re.compile('<br\s*/?>', re.IGNORECASE)
37 | def replace_br(value):
38 | return _br_re.sub(' ', value)
39 |
40 |
41 | def replace_escape(value):
42 | return replace_escape_chars(value, replace_by=u' ')
43 |
44 |
45 | def split(value):
46 | return [v.strip() for v in value.split(',')]
47 |
48 |
49 | def strip(value):
50 | return value.strip()
51 |
52 |
53 | def to_datetime(value, format, locale=None):
54 | """Returns a datetime parsed from value with the specified format
55 | and locale.
56 |
57 | If no year is specified in the parsing format it is taken from the
58 | current date.
59 | """
60 | if locale:
61 | old_locale = localelib.getlocale(localelib.LC_TIME)
62 | localelib.setlocale(localelib.LC_TIME, locale)
63 |
64 | time_s = time.strptime(value, format)
65 | dt = datetime.datetime(*time_s[0:5])
66 | # 1900 is the default year from strptime, means no year parsed
67 | if dt.year == 1900:
68 | dt = dt.replace(year=datetime.datetime.utcnow().year)
69 |
70 | if locale:
71 | localelib.setlocale(localelib.LC_TIME, old_locale)
72 |
73 | return dt
74 |
75 |
76 | def to_date(value, format, locale=None):
77 | return to_datetime(value, format, locale).date()
78 |
79 |
80 | def to_time(value, format):
81 | time_s = time.strptime(value, format)
82 | return datetime.time(time_s[3], time_s[4])
83 |
84 |
85 | # defaults
86 |
87 | default_input_processor = MapCompose(replace_br, remove_tags, unquote_markup,
88 | replace_escape, strip, clean_spaces)
89 |
90 | default_output_processor = TakeFirst()
91 |
--------------------------------------------------------------------------------
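make_absolute_url reads base_url (or a response) from the loader context, so it fits naturally in an ItemLoader constructed with response=...; the remaining helpers are plain callables for MapCompose chains. A sketch with invented item and field names:

    from scrapy.item import Item, Field
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose
    from scrapylib import processors

    class PageItem(Item):         # hypothetical item
        title = Field()
        link = Field()

    class PageLoader(ItemLoader):
        default_item_class = PageItem
        default_input_processor = processors.default_input_processor
        default_output_processor = processors.default_output_processor
        # relative hrefs are resolved against loader_context['response'].url
        link_in = MapCompose(processors.make_absolute_url)

    # in a callback: loader = PageLoader(response=response)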
/scrapylib/querycleaner.py:
--------------------------------------------------------------------------------
1 | """Get parameter cleaner for AS.
2 |
3 | Add removed/kept pattern (regex) with
4 |
5 | QUERYCLEANER_REMOVE
6 | QUERYCLEANER_KEEP
7 |
8 | Remove patterns has precedence.
9 | """
10 | import re
11 | from six.moves.urllib.parse import quote
12 | from six import string_types
13 |
14 | from scrapy.utils.httpobj import urlparse_cached
15 | from scrapy.http import Request
16 | from scrapy.exceptions import NotConfigured
17 |
18 | from w3lib.url import _safe_chars
19 |
20 | def _parse_query_string(query):
21 | """Used for replacing cgi.parse_qsl.
22 | The cgi version returns the same pair for query 'key'
23 | and query 'key=', so reconstruction
24 | maps to the same string. But some sites do not handle both versions
25 | in the same way.
26 | This version returns (key, None) in the first case, and (key, '') in the
27 | second one, so correct reconstruction can be performed."""
28 |
29 | params = query.split("&")
30 | keyvals = []
31 | for param in params:
32 | kv = param.split("=") + [None]
33 | keyvals.append((kv[0], kv[1]))
34 | return keyvals
35 |
36 | def _filter_query(query, remove_re=None, keep_re=None):
37 | """
38 | Filters query parameters in a query string according to key patterns
39 | >>> _filter_query('as=3&bs=8&cs=9')
40 | 'as=3&bs=8&cs=9'
41 | >>> _filter_query('as=3&bs=8&cs=9', None, re.compile("as|bs"))
42 | 'as=3&bs=8'
43 | >>> _filter_query('as=3&bs=8&cs=9', re.compile("as|bs"))
44 | 'cs=9'
45 | >>> _filter_query('as=3&bs=8&cs=9', re.compile("as|bs"), re.compile("as|cs"))
46 | 'cs=9'
47 | """
48 | keyvals = _parse_query_string(query)
49 | qargs = []
50 | for k, v in keyvals:
51 | if remove_re is not None and remove_re.search(k):
52 | continue
53 | if keep_re is None or keep_re.search(k):
54 | qarg = quote(k, _safe_chars)
55 | if isinstance(v, string_types):
56 | qarg = qarg + '=' + quote(v, _safe_chars)
57 | qargs.append(qarg.replace("%20", "+"))
58 | return '&'.join(qargs)
59 |
60 | class QueryCleanerMiddleware(object):
61 | def __init__(self, settings):
62 | remove = settings.get("QUERYCLEANER_REMOVE")
63 | keep = settings.get("QUERYCLEANER_KEEP")
64 | if not (remove or keep):
65 | raise NotConfigured
66 | self.remove = re.compile(remove) if remove else None
67 | self.keep = re.compile(keep) if keep else None
68 |
69 | @classmethod
70 | def from_crawler(cls, crawler):
71 | return cls(crawler.settings)
72 |
73 | def process_spider_output(self, response, result, spider):
74 | for res in result:
75 | if isinstance(res, Request):
76 | parsed = urlparse_cached(res)
77 | if parsed.query:
78 | parsed = parsed._replace(query=_filter_query(parsed.query, self.remove, self.keep))
79 | res = res.replace(url=parsed.geturl())
80 | yield res
81 |
82 |
--------------------------------------------------------------------------------
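A configuration sketch: register the middleware and keep only whitelisted query parameters (the patterns are example values; QUERYCLEANER_REMOVE works the same way and wins when both match):

    # settings.py -- illustrative snippet
    SPIDER_MIDDLEWARES = {
        'scrapylib.querycleaner.QueryCleanerMiddleware': 100,
    }
    QUERYCLEANER_KEEP = 'page|sort'      # drop every other query parameter
    # QUERYCLEANER_REMOVE = 'sessionid'  # or: strip only matching keys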
/scrapylib/spidertrace.py:
--------------------------------------------------------------------------------
1 | """
2 | Spider Trace
3 |
4 | This SpiderMiddleware logs a trace of requests and items extracted for a
5 | spider
6 | """
7 | import os
8 | from os.path import basename
9 | from tempfile import mkstemp
10 | from gzip import GzipFile
11 | import time
12 | import boto
13 | import json
14 | from boto.s3.key import Key
15 | from scrapy import signals, log
16 | from scrapy.exceptions import NotConfigured
17 | from scrapy.http import Request
18 | from scrapy.utils.request import request_fingerprint
19 |
20 |
21 | class SpiderTraceMiddleware(object):
22 | """Saves a trace of spider execution and uploads to S3
23 |
24 | The trace records:
25 | (timestamp, http response, results extracted from spider)
26 | """
27 | REQUEST_ATTRS = ('url', 'method', 'body', 'headers', 'cookies', 'meta')
28 | RESPONSE_ATTRS = ('url', 'status', 'headers', 'body', 'request', 'flags')
29 |
30 | def __init__(self, crawler):
31 | self.bucket = crawler.settings.get("SPIDERTRACE_BUCKET")
32 | if not self.bucket:
33 | raise NotConfigured
34 | crawler.signals.connect(self.open_spider, signals.spider_opened)
35 | crawler.signals.connect(self.close_spider, signals.spider_closed)
36 | self.outputs = {}
37 |
38 | @classmethod
39 | def from_crawler(cls, crawler):
40 | return cls(crawler)
41 |
42 | def process_spider_output(self, response, result, spider):
43 | f = self.outputs[spider]
44 | fp = request_fingerprint(response.request)
45 | tracetime = time.time()
46 | data = self._objtodict(self.RESPONSE_ATTRS, response)
47 | data['request'] = self._objtodict(self.REQUEST_ATTRS, response.request)
48 | self._write(f, fp, tracetime, 'response', data)
49 |
50 | for item in result:
51 | if isinstance(item, Request):
52 | data = self._objtodict(self.REQUEST_ATTRS, item)
53 | data['fp'] = request_fingerprint(item)
54 | self._write(f, fp, tracetime, 'request', data)
55 | else:
56 | self._write(f, fp, tracetime, 'item', dict(item))
57 | yield item
58 |
59 | @staticmethod
60 | def _write(f, fp, tracetime, otype, data):
61 | f.write('%s\t%s\t%s\t%s\n' % (tracetime, fp, otype, json.dumps(data)))
62 |
63 | @staticmethod
64 | def _objtodict(attrs, obj):
65 | data = [(a, getattr(obj, a)) for a in attrs]
66 | return dict(x for x in data if x[1])
67 |
68 | def open_spider(self, spider):
69 | _, fname = mkstemp(prefix=spider.name + '-', suffix='.trace.gz')
70 | self.outputs[spider] = GzipFile(fname, 'wb')
71 |
72 | def close_spider(self, spider):
73 | f = self.outputs.pop(spider)
74 | f.close()
75 | c = boto.connect_s3()
76 | fname = basename(f.name)
77 | key = Key(c.get_bucket(self.bucket), fname)
78 | log.msg("uploading trace to s3://%s/%s" % (key.bucket.name, fname))
79 | key.set_contents_from_filename(f.name)
80 | os.remove(f.name)
81 |
--------------------------------------------------------------------------------
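SpiderTraceMiddleware only needs a destination bucket; the S3 credentials are whatever boto picks up from its usual environment/config lookup. A minimal sketch (bucket name and priority are placeholders):

    # settings.py -- illustrative snippet
    SPIDER_MIDDLEWARES = {
        'scrapylib.spidertrace.SpiderTraceMiddleware': 100,
    }
    SPIDERTRACE_BUCKET = 'my-trace-bucket'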
/tests/test_constraints.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import six
3 |
4 | from scrapylib.constraints import RequiredFields, NonEmptyFields, IsType, IsNumber, IsPrice, MaxLen, MinLen
5 |
6 |
7 | class RequiredFieldsTest(unittest.TestCase):
8 |
9 | def setUp(self):
10 | self.item = {'str': 'bar', 'list': ['one'], 'bool': False, 'none': None}
11 |
12 | def test_basic(self):
13 | RequiredFields('str')(self.item)
14 | RequiredFields('str', 'list', 'bool', 'none')(self.item)
15 |
16 | def test_fail(self):
17 | self.assertRaises(AssertionError, RequiredFields('list', 'xxx'), self.item)
18 |
19 |
20 | class NonEmptyFieldsTest(unittest.TestCase):
21 |
22 | def setUp(self):
23 | self.item = {'str': 'foo', 'list': [0], 'empty_str': '', 'empty_list': []}
24 |
25 | def test_basic(self):
26 | NonEmptyFields('str')(self.item)
27 | NonEmptyFields('str', 'list')(self.item)
28 |
29 | def test_fail(self):
30 | self.assertRaises(AssertionError, NonEmptyFields('list', 'xxx'), self.item)
31 | self.assertRaises(AssertionError, NonEmptyFields('empty_str'), self.item)
32 | self.assertRaises(AssertionError, NonEmptyFields('empty_list'), self.item)
33 |
34 |
35 | class IsTypeTest(unittest.TestCase):
36 |
37 | def setUp(self):
38 | self.item = {'str': 'bar', 'list': ['one']}
39 |
40 | def test_ok(self):
41 | IsType(six.string_types, 'str')(self.item)
42 | IsType(list, 'list')(self.item)
43 | IsType(list, 'missing')(self.item)
44 |
45 | def test_fail(self):
46 | for t in six.string_types:
47 | self.assertRaises(AssertionError, IsType(t, 'list'), self.item)
48 | self.assertRaises(AssertionError, IsType(list, 'str'), self.item)
49 |
50 |
51 | class IsNumberTest(unittest.TestCase):
52 |
53 | def setUp(self):
54 | self.item = {'name': 'foo', 'age': '23'}
55 |
56 | def test_ok(self):
57 | IsNumber('age')(self.item)
58 | IsNumber('xxx')(self.item)
59 |
60 | def test_fail(self):
61 | self.assertRaises(AssertionError, IsNumber('name'), self.item)
62 |
63 |
64 | class IsPriceTest(unittest.TestCase):
65 |
66 | def setUp(self):
67 | self.item = {'name': 'foo', 'price': '1,223.23 '}
68 |
69 | def test_basic(self):
70 | IsPrice('price')(self.item)
71 | IsPrice('xxx')(self.item)
72 |
73 | def test_fail(self):
74 | self.assertRaises(AssertionError, IsPrice('name'), self.item)
75 |
76 |
77 | class MaxLenTest(unittest.TestCase):
78 |
79 | def setUp(self):
80 | self.item = {'name': 'foo', 'other': 'very long content'}
81 |
82 | def test_ok(self):
83 | MaxLen(8, 'name')(self.item)
84 | MaxLen(8, 'xxx')(self.item)
85 |
86 | def test_fail(self):
87 | self.assertRaises(AssertionError, MaxLen(8, 'other'), self.item)
88 |
89 |
90 | class MinLenTest(MaxLenTest):
91 |
92 | def test_ok(self):
93 | MinLen(8, 'other')(self.item)
94 | MinLen(8, 'xxx')(self.item)
95 |
96 | def test_fail(self):
97 | self.assertRaises(AssertionError, MinLen(8, 'name'), self.item)
98 |
--------------------------------------------------------------------------------
/tests/test_processors.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import datetime
3 | import locale
4 | import unittest
5 |
6 | from scrapylib.processors import to_datetime, to_date, default_input_processor
7 |
8 |
9 | def locale_exists():
10 | current_locale = locale.getlocale(locale.LC_TIME)
11 | try:
12 | locale.setlocale(locale.LC_TIME, 'fr_FR.UTF-8')
13 | except Exception:
14 | return False
15 | else:
16 | locale.setlocale(locale.LC_TIME, current_locale)
17 | return True
18 |
19 |
20 | class TestProcessors(unittest.TestCase):
21 |
22 | def test_to_datetime(self):
23 | self.assertEquals(to_datetime('March 4, 2011 20:00', '%B %d, %Y %H:%S'),
24 | datetime.datetime(2011, 3, 4, 20, 0))
25 |
26 | # test no year in parse format
27 | test_date = to_datetime('March 4, 20:00', '%B %d, %H:%S')
28 | self.assertEquals(test_date.year, datetime.datetime.utcnow().year)
29 |
30 | # test parse only date
31 | self.assertEquals(to_datetime('March 4, 2011', '%B %d, %Y'),
32 | datetime.datetime(2011, 3, 4))
33 |
34 | @unittest.skipUnless(locale_exists(), "locale does not exist")
35 | def test_localized_to_datetime(self):
36 | current_locale = locale.getlocale(locale.LC_TIME)
37 |
38 | self.assertEquals(
39 | to_datetime('11 janvier 2011', '%d %B %Y', locale='fr_FR.UTF-8'),
40 | datetime.datetime(2011, 1, 11)
41 | )
42 |
43 | self.assertEquals(current_locale, locale.getlocale(locale.LC_TIME))
44 |
45 | def test_to_date(self):
46 | self.assertEquals(to_date('March 4, 2011', '%B %d, %Y'),
47 | datetime.date(2011, 3, 4))
48 |
49 | # test no year in parse format
50 | test_date = to_date('March 4', '%B %d')
51 | self.assertEquals(test_date.year, datetime.datetime.utcnow().year)
52 |
53 | @unittest.skipUnless(locale_exists(), "locale does not exist")
54 | def test_localized_to_date(self):
55 | current_locale = locale.getlocale(locale.LC_TIME)
56 |
57 | self.assertEquals(
58 | to_date('11 janvier 2011', '%d %B %Y', locale='fr_FR.UTF-8'),
59 | datetime.date(2011, 1, 11)
60 | )
61 |
62 | self.assertEquals(current_locale, locale.getlocale(locale.LC_TIME))
63 |
64 | def test_default_input_processor(self):
65 | self.assertEquals(default_input_processor(
66 | """up to 54%"""),
72 | [u'up to 54%'])
73 |
74 | self.assertEquals(default_input_processor(
75 | """
<< ...The Sunnywale, Calif.-based... >>
"""),
76 | [u'<< ...The Sunnywale, Calif.-based... >>'])
77 |
78 | self.assertEquals(default_input_processor(
79 | """newline
must be replaced before tags and only then quotes like <br>"""),
80 | [u'newline must be replaced before tags and only then quotes like
'])
81 |
82 | if __name__ == '__main__':
83 | unittest.main()
84 |
--------------------------------------------------------------------------------
/scrapylib/constraints/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Item Constraints
3 | ----------------
4 |
5 | This module provides several classes that can be used as conditions to check
6 | certain item constraints. Conditions are just callables that receive a dict and
7 | *may* raise an AssertionError if the condition is not met.
8 |
9 | Item constraints can be checked automatically (at scraping time) to drop items
10 | that fail to meet the constraints. In order to do that, add the constraints
11 | pipeline to your ITEM_PIPELINES:
12 |
13 | ITEM_PIPELINES = ['scrapylib.constraints.pipeline.ConstraintsPipeline']
14 |
15 | And define the constraints attribute in your item:
16 |
17 | class Product(Item):
18 | name = Field()
19 | price = Field()
20 | colors = Field()
21 |
22 | constraints = [
23 | RequiredFields('name', 'price'),
24 | IsPrice('price'),
25 | IsList('colors'),
26 | MinLen(10, 'name'),
27 | ]
28 |
29 | """
30 |
31 | import re
32 | from functools import partial
33 | from six import string_types, text_type
34 |
35 |
36 | class RequiredFields(object):
37 | """Assert that the specified fields are populated"""
38 |
39 | def __init__(self, *fields):
40 | self.fields = fields
41 |
42 | def __call__(self, item):
43 | for f in self.fields:
44 | assert f in item.keys(), "missing field: %s" % f
45 |
46 | class NonEmptyFields(object):
47 | """Assert that the specified fields are populated and non-empty"""
48 |
49 | def __init__(self, *fields):
50 | self.fields = fields
51 |
52 | def __call__(self, item):
53 | for f in self.fields:
54 | assert f in item.keys(), "missing field: %s" % f
55 | v = item[f]
56 | try:
57 | assert len(v) > 0, "empty field: %s" % f
58 | except TypeError:
59 | pass
60 |
61 | class IsType(object):
62 | """Assert that the specified fields are of the given type"""
63 |
64 | def __init__(self, type, *fields):
65 | self.type = type
66 | self.fields = fields
67 |
68 | def __call__(self, item):
69 | for f in self.fields:
70 | if f in item:
71 | v = item.get(f)
72 | assert isinstance(v, self.type), "field %r is not a %s: %r" % \
73 | (f, self.type.__name__, v)
74 |
75 | IsString = partial(IsType, string_types)
76 | IsUnicode = partial(IsType, text_type)
77 | IsList = partial(IsType, list)
78 | IsDict = partial(IsType, dict)
79 |
80 | class IsNumber(object):
81 | """Assert that the specified fields are string and contain only numbers"""
82 |
83 | def __init__(self, *fields):
84 | self.fields = fields
85 |
86 | def __call__(self, item):
87 | for f in self.fields:
88 | v = item.get(f)
89 | if v is None:
90 | continue
91 | assert isinstance(v, string_types), "field %r is not a string: %r" % (f, v)
92 | assert v.strip().isdigit(), "field %r contains non-numeric chars: %r" % (f, v)
93 |
94 | class IsPrice(object):
95 | """Assert that the specified fields are string and look like a price"""
96 |
97 | def __init__(self, *fields):
98 | self.fields = fields
99 | self.price_re = re.compile('^[0-9\., ]+$')
100 |
101 | def __call__(self, item):
102 | for f in self.fields:
103 | v = item.get(f)
104 | if v:
105 | assert isinstance(v, string_types), "field %r is not a string: %r" % (f, v)
106 | assert self.price_re.search(v), "field %r is not a price: %r" % (f, v)
107 |
108 | class MaxLen(object):
109 | """Assert that the length of specified fields do not exceed the given
110 | size"""
111 |
112 | def __init__(self, size, *fields):
113 | self.size = size
114 | self.fields = fields
115 |
116 | def __call__(self, item):
117 | for f in self.fields:
118 | v = item.get(f)
119 | if v:
120 | self._proper_len(f, v)
121 |
122 | def _proper_len(self, f, v):
123 | assert len(v) <= self.size, "field %r length exceeds %d: %r" % (f, self.size, v)
124 |
125 | class MinLen(MaxLen):
126 | """Assert that the length of specified fields are larger (or equal) than
127 | the given size"""
128 |
129 | def _proper_len(self, f, v):
130 | assert len(v) >= self.size, "field %r length below %d: %r" % (f, self.size, v)
131 |
--------------------------------------------------------------------------------
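Constraints are plain callables over dict-like items, so they can also be exercised directly; a failing check raises AssertionError, which ConstraintsPipeline converts into DropItem. A short sketch:

    from scrapylib.constraints import RequiredFields, IsPrice

    item = {'name': 'Big chair', 'price': '199.99'}
    RequiredFields('name', 'price')(item)   # passes silently
    IsPrice('price')(item)                  # passes silently
    try:
        RequiredFields('colors')(item)
    except AssertionError as e:
        print(e)                            # missing field: colors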
/scrapylib/deltafetch.py:
--------------------------------------------------------------------------------
1 | import os, time
2 |
3 | from scrapy.http import Request
4 | from scrapy.item import BaseItem
5 | from scrapy.utils.request import request_fingerprint
6 | from scrapy.utils.project import data_path
7 | from scrapy.utils.python import to_bytes
8 | from scrapy.exceptions import NotConfigured
9 | from scrapy import log, signals
10 |
11 |
12 | class DeltaFetch(object):
13 | """This is a spider middleware to ignore requests to pages containing items
14 | seen in previous crawls of the same spider, thus producing a "delta crawl"
15 | containing only new items.
16 |
17 | This also speeds up the crawl by reducing the number of requests that need
18 | to be crawled and processed (typically, item requests are the most CPU
19 | intensive).
20 |
21 | Supported settings:
22 |
23 | * DELTAFETCH_ENABLED - to enable (or disable) this extension
24 | * DELTAFETCH_DIR - directory where to store state
25 | * DELTAFETCH_RESET - reset the state, clearing out all seen requests
26 |
27 | Supported spider arguments:
28 |
29 | * deltafetch_reset - same effect as DELTAFETCH_RESET setting
30 |
31 | Supported request meta keys:
32 |
33 | * deltafetch_key - used to define the lookup key for that request. By
34 | default it's the request fingerprint, but it can be changed to contain an
35 | item id, for example. This requires support from the spider, but makes the
36 | extension more efficient for sites that use many URLs for the same item.
37 |
38 | """
39 |
40 | def __init__(self, dir, reset=False, stats=None):
41 | dbmodule = None
42 | try:
43 | dbmodule = __import__('bsddb3').db
44 | except ImportError:
45 | try:
46 | dbmodule = __import__('bsddb').db
47 | except ImportError:
48 | pass
49 | if not dbmodule:
50 | raise NotConfigured('bsddb or bsddb3 is required')
51 | self.dbmodule = dbmodule
52 | self.dir = dir
53 | self.reset = reset
54 | self.stats = stats
55 |
56 | @classmethod
57 | def from_crawler(cls, crawler):
58 | s = crawler.settings
59 | if not s.getbool('DELTAFETCH_ENABLED'):
60 | raise NotConfigured
61 | dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
62 | reset = s.getbool('DELTAFETCH_RESET')
63 | o = cls(dir, reset, crawler.stats)
64 | crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
65 | crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
66 | return o
67 |
68 | def spider_opened(self, spider):
69 | if not os.path.exists(self.dir):
70 | os.makedirs(self.dir)
71 | dbpath = os.path.join(self.dir, '%s.db' % spider.name)
72 | reset = self.reset or getattr(spider, 'deltafetch_reset', False)
73 | flag = self.dbmodule.DB_TRUNCATE if reset else self.dbmodule.DB_CREATE
74 | try:
75 | self.db = self.dbmodule.DB()
76 | self.db.open(filename=dbpath,
77 | dbtype=self.dbmodule.DB_HASH,
78 | flags=flag)
79 | except Exception:
80 | spider.log("Failed to open DeltaFetch database at %s, "
81 | "trying to recreate it" % dbpath)
82 | if os.path.exists(dbpath):
83 | os.remove(dbpath)
84 | self.db = self.dbmodule.DB()
85 | self.db.open(filename=dbpath,
86 | dbtype=self.dbmodule.DB_HASH,
87 | flags=self.dbmodule.DB_CREATE)
88 |
89 | def spider_closed(self, spider):
90 | self.db.close()
91 |
92 | def process_spider_output(self, response, result, spider):
93 | for r in result:
94 | if isinstance(r, Request):
95 | key = self._get_key(r)
96 | if self.db.has_key(key):
97 | spider.log("Ignoring already visited: %s" % r, level=log.INFO)
98 | if self.stats:
99 | self.stats.inc_value('deltafetch/skipped', spider=spider)
100 | continue
101 | elif isinstance(r, BaseItem):
102 | key = self._get_key(response.request)
103 | self.db[key] = str(time.time()).encode('iso8859-1')
104 | if self.stats:
105 | self.stats.inc_value('deltafetch/stored', spider=spider)
106 | yield r
107 |
108 | def _get_key(self, request):
109 | key = request.meta.get('deltafetch_key') or request_fingerprint(request)
110 | # request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string
111 | return to_bytes(key)
112 |
--------------------------------------------------------------------------------
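A configuration sketch for DeltaFetch, together with the deltafetch_key request meta key described in the docstring; the URL layout and id extraction below are invented for illustration:

    # settings.py -- illustrative snippet
    SPIDER_MIDDLEWARES = {'scrapylib.deltafetch.DeltaFetch': 100}
    DELTAFETCH_ENABLED = True
    # DELTAFETCH_RESET = True   # forget previously seen requests

    # spider side: key item requests by a stable item id instead of the
    # request fingerprint
    from scrapy import Spider
    from scrapy.http import Request

    class ProductSpider(Spider):
        name = 'products'
        start_urls = ['http://www.example.com/catalogue']

        def parse(self, response):
            for href in response.css('a.product::attr(href)').extract():
                item_id = href.rstrip('/').rsplit('/', 1)[-1]
                yield Request(response.urljoin(href), callback=self.parse_product,
                              meta={'deltafetch_key': item_id})

        def parse_product(self, response):
            pass  # build and yield the item here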
/tests/test_magicfields.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import re, os
3 | from unittest import TestCase
4 |
5 | from scrapy.spiders import Spider
6 | from scrapy.utils.test import get_crawler
7 | from scrapy.item import DictItem, Field
8 | from scrapy.http import HtmlResponse
9 |
10 | from scrapylib.magicfields import _format, MagicFieldsMiddleware
11 |
12 |
13 | class TestItem(DictItem):
14 | fields = {
15 | 'url': Field(),
16 | 'nom': Field(),
17 | 'prix': Field(),
18 | 'spider': Field(),
19 | 'sku': Field(),
20 | }
21 |
22 |
23 | class MagicFieldsTest(TestCase):
24 |
25 | def setUp(self):
26 | self.environ = os.environ.copy()
27 | self.spider = Spider('myspider', arg1='val1', start_urls = ["http://example.com"])
28 |
29 | def _log(x):
30 | print(x)
31 |
32 | self.spider.log = _log
33 | self.response = HtmlResponse(body=b"", url="http://www.example.com/product/8798732")
34 | self.item = TestItem({'nom': 'myitem', 'prix': "56.70 euros", "url": "http://www.example.com/product.html?item_no=345"})
35 |
36 | def tearDown(self):
37 | os.environ = self.environ
38 |
39 | def assertRegexpMatches(self, text, regexp):
40 | """not present in python below 2.7"""
41 | return self.assertNotEqual(re.match(regexp, text), None)
42 |
43 | def test_hello(self):
44 | self.assertEqual(_format("hello world!", self.spider, self.response, self.item, {}), 'hello world!')
45 |
46 | def test_spidername_time(self):
47 | formatted = _format("Spider: $spider:name. Item scraped at $time", self.spider, self.response, self.item, {})
48 | self.assertRegexpMatches(formatted, 'Spider: myspider. Item scraped at \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$')
49 |
50 | def test_unixtime(self):
51 | formatted = _format("Item scraped at $unixtime", self.spider, self.response, self.item, {})
52 | self.assertRegexpMatches(formatted, 'Item scraped at \d+\.\d+$')
53 |
54 | def test_isotime(self):
55 | formatted = _format("$isotime", self.spider, self.response, self.item, {})
56 | self.assertRegexpMatches(formatted, '\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}$')
57 |
58 | def test_jobid(self):
59 | os.environ["SCRAPY_JOB"] = 'aa788'
60 | formatted = _format("job id '$jobid' for spider $spider:name", self.spider, self.response, self.item, {})
61 | self.assertEqual(formatted, "job id 'aa788' for spider myspider")
62 |
63 | def test_spiderarg(self):
64 | formatted = _format("Argument arg1: $spider:arg1", self.spider, self.response, self.item, {})
65 | self.assertEqual(formatted, 'Argument arg1: val1')
66 |
67 | def test_spiderattr(self):
68 | formatted = _format("$spider:start_urls", self.spider, self.response, self.item, {})
69 | self.assertEqual(formatted, "['http://example.com']")
70 |
71 | def test_settings(self):
72 | formatted = _format("$setting:MY_SETTING", self.spider, self.response, self.item, {"$setting": {"MY_SETTING": True}})
73 | self.assertEqual(formatted, 'True')
74 |
75 | def test_notexisting(self):
76 | """Not existing entities are not substituted"""
77 | formatted = _format("Item scraped at $myentity", self.spider, self.response, self.item, {})
78 | self.assertEqual(formatted, 'Item scraped at $myentity')
79 |
80 | def test_noargs(self):
81 | """If entity does not accept arguments, don't substitute"""
82 | formatted = _format("Scraped on day $unixtime:arg", self.spider, self.response, self.item, {})
83 | self.assertEqual(formatted, "Scraped on day $unixtime:arg")
84 |
85 | def test_noargs2(self):
86 | """If entity does not have enough arguments, don't substitute"""
87 | formatted = _format("$spider", self.spider, self.response, self.item, {})
88 | self.assertEqual(formatted, "$spider")
89 |
90 | def test_invalidattr(self):
91 | formatted = _format("Argument arg2: $spider:arg2", self.spider, self.response, self.item, {})
92 | self.assertEqual(formatted, "Argument arg2: $spider:arg2")
93 |
94 | def test_environment(self):
95 | os.environ["TEST_ENV"] = "testval"
96 | formatted = _format("$env:TEST_ENV", self.spider, self.response, self.item, {})
97 | self.assertEqual(formatted, "testval")
98 |
99 | def test_response(self):
100 | formatted = _format("$response:url", self.spider, self.response, self.item, {})
101 | self.assertEqual(formatted, self.response.url)
102 |
103 | def test_fields_copy(self):
104 | formatted = _format("$field:nom", self.spider, self.response, self.item, {})
105 | self.assertEqual(formatted, 'myitem')
106 |
107 | def test_regex(self):
108 | formatted = _format("$field:url,r'item_no=(\d+)'", self.spider, self.response, self.item, {})
109 | self.assertEqual(formatted, '345')
110 |
111 | def test_mware(self):
112 | settings = {"MAGIC_FIELDS": {"spider": "$spider:name"}}
113 | crawler = get_crawler(settings_dict=settings)
114 | mware = MagicFieldsMiddleware.from_crawler(crawler)
115 | result = list(mware.process_spider_output(self.response, [self.item], self.spider))[0]
116 | expected = {
117 | 'nom': 'myitem',
118 | 'prix': '56.70 euros',
119 | 'spider': 'myspider',
120 | 'url': 'http://www.example.com/product.html?item_no=345'
121 | }
122 | self.assertEqual(result, expected)
123 |
124 | def test_mware_override(self):
125 | settings = {
126 | "MAGIC_FIELDS": {"spider": "$spider:name"},
127 | "MAGIC_FIELDS_OVERRIDE": {"sku": "$field:nom"}
128 | }
129 | crawler = get_crawler(settings_dict=settings)
130 | mware = MagicFieldsMiddleware.from_crawler(crawler)
131 | result = list(mware.process_spider_output(self.response, [self.item], self.spider))[0]
132 | expected = {
133 | 'nom': 'myitem',
134 | 'prix': '56.70 euros',
135 | 'spider': 'myspider',
136 | 'url': 'http://www.example.com/product.html?item_no=345',
137 | 'sku': 'myitem',
138 | }
139 | self.assertEqual(result, expected)
140 |
--------------------------------------------------------------------------------
/scrapylib/magicfields.py:
--------------------------------------------------------------------------------
1 | """
2 | Allows adding extra fields to items, based on the settings MAGIC_FIELDS and MAGIC_FIELDS_OVERRIDE.
3 | Both settings are dicts. The keys are the destination field names; the values are strings which admit magic variables,
4 | identified by a leading '$', which will be substituted by a corresponding value. Some magics also accept arguments, specified
5 | after the magic name, using a ':' as separator.
6 |
7 | You can set project global magics with MAGIC_FIELDS, and tune them for a specific spider using MAGIC_FIELDS_OVERRIDE.
8 |
9 | In case there is more than one argument, they must be separated by ','. So, the generic magic format is
10 |
11 | $<magic name>[:arg1,arg2,...]
12 |
13 | Current magic variables are:
14 | - $time
15 | The UTC timestamp at which the item was scraped, in format '%Y-%m-%d %H:%M:%S'.
16 | - $unixtime
17 | The unixtime (number of seconds since the Epoch, i.e. time.time()) at which the item was scraped.
18 | - $isotime
19 | The UTC timestamp at which the item was scraped, in format '%Y-%m-%dT%H:%M:%S'.
20 | - $spider
21 | Must be followed by an argument, which is the name of an attribute of the spider (like an argument passed to it).
22 | - $env
23 | The value of an environment variable. It admits as argument the name of the variable.
24 | - $jobid
25 | The job id (shortcut for $env:SCRAPY_JOB)
26 | - $jobtime
27 | The UTC timestamp at which the job started, in format '%Y-%m-%d %H:%M:%S'.
28 | - $response
29 | Access to some response properties.
30 | $response:url
31 | The url the item was extracted from.
32 | $response:status
33 | Response http status.
34 | $response:headers
35 | Response http headers.
36 | - $setting
37 | Access the given Scrapy setting. It accepts one argument: the name of the setting.
38 | - $field
39 | Allows copying the value of one field to another. Its argument is the source field. Effects are unpredictable if you use as source a field that is itself filled
40 | using magic fields.
41 |
42 | Examples:
43 |
44 | The following configuration will add two fields to each scraped item: 'timestamp', which will be filled with the string 'item scraped at ' followed by
45 | the scrape time, and 'spider', which will contain the spider name:
46 |
47 | MAGIC_FIELDS = {"timestamp": "item scraped at $time", "spider": "$spider:name"}
48 |
49 | The following configuration will copy the url to the field sku:
50 |
51 | MAGIC_FIELDS = {"sku": "$field:url"}
52 |
53 | Magics also admit a regular expression argument which allows extracting and assigning only part of the value generated by the magic. You have to specify
54 | it using the r'' notation. Suppose that the urls of your items are like 'http://www.example.com/product.html?item_no=345' and you want to assign to the sku field
55 | only the item number. The following example, similar to the previous one but with a second regular expression argument, will do the task:
56 |
57 | MAGIC_FIELDS = {"sku": "$field:url,r'item_no=(\d+)'"}
58 |
59 | """
60 |
61 | import re, time, datetime, os
62 |
63 | from scrapy.exceptions import NotConfigured
64 | from scrapy.item import BaseItem
65 |
66 | def _time():
67 | return datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
68 |
69 | def _isotime():
70 | return datetime.datetime.utcnow().isoformat()
71 |
72 | _REGEXES = {}
73 | _REGEX_ERRORS = {}
74 | def _extract_regex_group(regex, txt):
75 | compiled = _REGEXES.get(regex)
76 | errmessage = _REGEX_ERRORS.get(regex)
77 | if compiled is None and errmessage is None:
78 | try:
79 | compiled = re.compile(regex)
80 | _REGEXES[regex] = compiled
81 | except Exception as e:
82 | errmessage = str(e)  # exceptions have no .message on Python 3
83 | _REGEX_ERRORS[regex] = errmessage
84 | if errmessage:
85 | raise ValueError(errmessage)
86 | m = compiled.search(txt)
87 | if m:
88 | return "".join(m.groups()) or None
89 |
90 | _ENTITY_FUNCTION_MAP = {
91 | '$time': _time,
92 | '$unixtime': time.time,
93 | '$isotime': _isotime,
94 | }
95 |
96 | _ENTITIES_RE = re.compile("(\$[a-z]+)(:\w+)?(?:,r\'(.+)\')?")
97 | def _first_arg(args):
98 | if args:
99 | return args.pop(0)
100 |
101 | def _format(fmt, spider, response, item, fixed_values):
102 | out = fmt
103 | for m in _ENTITIES_RE.finditer(fmt):
104 | val = None
105 | entity, args, regex = m.groups()
106 | args = list(filter(None, (args or ':')[1:].split(',')))
107 | if entity == "$jobid":
108 | val = os.environ.get('SCRAPY_JOB', '')
109 | elif entity == "$spider":
110 | attr = _first_arg(args)
111 | if not attr or not hasattr(spider, attr):
112 | spider.log("Error at '%s': spider does not have attribute" % m.group())
113 | else:
114 | val = str(getattr(spider, attr))
115 | elif entity == "$response":
116 | attr = _first_arg(args)
117 | if not attr or not hasattr(response, attr):
118 | spider.log("Error at '%s': response does not have attribute" % m.group())
119 | else:
120 | val = str(getattr(response, attr))
121 | elif entity == "$field":
122 | attr = _first_arg(args)
123 | if attr in item:
124 | val = str(item[attr])
125 | elif entity in fixed_values:
126 | attr = _first_arg(args)
127 | val = fixed_values[entity]
128 | if entity == "$setting" and attr:
129 | val = str(val[attr])
130 | elif entity == "$env" and args:
131 | attr = _first_arg(args)
132 | if attr:
133 | val = os.environ.get(attr, '')
134 | else:
135 | function = _ENTITY_FUNCTION_MAP.get(entity)
136 | if function is not None:
137 | try:
138 | val = str(function(*args))
139 | except:
140 | spider.log("Error at '%s': invalid argument for function" % m.group())
141 | if val is not None:
142 | out = out.replace(m.group(), val, 1)
143 | if regex:
144 | try:
145 | out = _extract_regex_group(regex, out)
146 | except ValueError as e:
147 | spider.log("Error at '%s': %s" % (m.group(), e.message))
148 |
149 | return out
150 |
151 | class MagicFieldsMiddleware(object):
152 |
153 | @classmethod
154 | def from_crawler(cls, crawler):
155 | mfields = crawler.settings.getdict("MAGIC_FIELDS").copy()
156 | mfields.update(crawler.settings.getdict("MAGIC_FIELDS_OVERRIDE"))
157 | if not mfields:
158 | raise NotConfigured
159 | return cls(mfields, crawler.settings)
160 |
161 | def __init__(self, mfields, settings):
162 | self.mfields = mfields
163 | self.fixed_values = {
164 | "$jobtime": _time(),
165 | "$setting": settings,
166 | }
167 |
168 | def process_spider_output(self, response, result, spider):
169 | for _res in result:
170 | if isinstance(_res, BaseItem):
171 | for field, fmt in self.mfields.items():
172 | _res.setdefault(field, _format(fmt, spider, response, _res, self.fixed_values))
173 | yield _res
174 |
175 |
--------------------------------------------------------------------------------
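Putting the docstring examples together: register the middleware and add a timestamp, the spider name, and a sku extracted from the url field (the priority value is arbitrary):

    # settings.py -- illustrative snippet
    SPIDER_MIDDLEWARES = {
        'scrapylib.magicfields.MagicFieldsMiddleware': 100,
    }
    MAGIC_FIELDS = {
        "timestamp": "item scraped at $time",
        "spider": "$spider:name",
        "sku": "$field:url,r'item_no=(\d+)'",
    }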
/scrapylib/crawlera.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | import warnings
3 | import os
4 |
5 | from w3lib.http import basic_auth_header
6 | from scrapy import log, signals
7 | from scrapy.exceptions import ScrapyDeprecationWarning
8 | from twisted.internet.error import ConnectionRefusedError
9 |
10 |
11 | class CrawleraMiddleware(object):
12 |
13 | url = 'http://paygo.crawlera.com:8010'
14 | maxbans = 400
15 | ban_code = 503
16 | download_timeout = 1800
17 | # Handle crawlera server failures
18 | connection_refused_delay = 90
19 | preserve_delay = False
20 |
21 | _settings = [
22 | ('user', str),
23 | ('pass', str),
24 | ('url', str),
25 | ('maxbans', int),
26 | ('download_timeout', int),
27 | ('preserve_delay', bool),
28 | ]
29 |
30 | def __init__(self, crawler):
31 | warnings.warn(
32 | 'This version of CrawleraMiddleware is deprecated, '
33 | 'please use the version found in the scrapy-crawlera '
34 | 'package instead.')
35 | self.crawler = crawler
36 | self.job_id = os.environ.get('SCRAPY_JOB')
37 | self._bans = defaultdict(int)
38 | self._saved_delays = defaultdict(lambda: None)
39 |
40 | @classmethod
41 | def from_crawler(cls, crawler):
42 | o = cls(crawler)
43 | crawler.signals.connect(o.open_spider, signals.spider_opened)
44 | return o
45 |
46 | def open_spider(self, spider):
47 | self.enabled = self.is_enabled(spider)
48 | if not self.enabled:
49 | return
50 |
51 | for k, type_ in self._settings:
52 | setattr(self, k, self._get_setting_value(spider, k, type_))
53 | if '?noconnect' not in self.url:
54 | self.url += '?noconnect'
55 |
56 | self._proxyauth = self.get_proxyauth(spider)
57 | log.msg("Using crawlera at %s (user: %s)" % (self.url, self.user),
58 | spider=spider)
59 |
60 | if not self.preserve_delay:
61 | # Setting spider download delay to 0 to get maximum crawl rate
62 | spider.download_delay = 0
63 |             log.msg("Setting spider download delay to 0. This is the default "
64 |                     "CrawleraMiddleware behavior; to preserve the original delay, "
65 |                     "set CRAWLERA_PRESERVE_DELAY = True in settings.",
66 | spider=spider)
67 |
68 | def _settings_get(self, type_, *a, **kw):
69 | if type_ is int:
70 | return self.crawler.settings.getint(*a, **kw)
71 | elif type_ is bool:
72 | return self.crawler.settings.getbool(*a, **kw)
73 | elif type_ is list:
74 | return self.crawler.settings.getlist(*a, **kw)
75 | elif type_ is dict:
76 | return self.crawler.settings.getdict(*a, **kw)
77 | else:
78 | return self.crawler.settings.get(*a, **kw)
79 |
80 | def _get_setting_value(self, spider, k, type_):
81 | if hasattr(spider, 'hubproxy_' + k):
82 | warnings.warn('hubproxy_%s attribute is deprecated, '
83 | 'use crawlera_%s instead.' % (k, k),
84 | category=ScrapyDeprecationWarning, stacklevel=1)
85 |
86 | if self.crawler.settings.get('HUBPROXY_%s' % k.upper()) is not None:
87 | warnings.warn('HUBPROXY_%s setting is deprecated, '
88 | 'use CRAWLERA_%s instead.' % (k.upper(), k.upper()),
89 | category=ScrapyDeprecationWarning, stacklevel=1)
90 |
91 | o = getattr(self, k, None)
92 | s = self._settings_get(type_, 'CRAWLERA_' + k.upper(),
93 | self._settings_get(type_, 'HUBPROXY_' + k.upper(), o))
94 | return getattr(spider, 'crawlera_' + k,
95 | getattr(spider, 'hubproxy_' + k, s))
96 |
97 | def is_enabled(self, spider):
98 | """Hook to enable middleware by custom rules."""
99 | if hasattr(spider, 'use_hubproxy'):
100 | warnings.warn('use_hubproxy attribute is deprecated, '
101 | 'use crawlera_enabled instead.',
102 | category=ScrapyDeprecationWarning, stacklevel=1)
103 |
104 | if self.crawler.settings.get('HUBPROXY_ENABLED') is not None:
105 | warnings.warn('HUBPROXY_ENABLED setting is deprecated, '
106 | 'use CRAWLERA_ENABLED instead.',
107 | category=ScrapyDeprecationWarning, stacklevel=1)
108 | return (
109 | getattr(spider, 'crawlera_enabled', False) or
110 | getattr(spider, 'use_hubproxy', False) or
111 | self.crawler.settings.getbool("CRAWLERA_ENABLED") or
112 | self.crawler.settings.getbool("HUBPROXY_ENABLED")
113 | )
114 |
115 | def get_proxyauth(self, spider):
116 | """Hook to compute Proxy-Authorization header by custom rules."""
117 | return basic_auth_header(self.user, getattr(self, 'pass'))
118 |
119 | def process_request(self, request, spider):
120 | if self._is_enabled_for_request(request):
121 | request.meta['proxy'] = self.url
122 | request.meta['download_timeout'] = self.download_timeout
123 | request.headers['Proxy-Authorization'] = self._proxyauth
124 | if self.job_id:
125 | request.headers['X-Crawlera-Jobid'] = self.job_id
126 |
127 | def process_response(self, request, response, spider):
128 | if not self._is_enabled_for_request(request):
129 | return response
130 | key = self._get_slot_key(request)
131 | self._restore_original_delay(request)
132 | if response.status == self.ban_code:
133 | self._bans[key] += 1
134 | if self._bans[key] > self.maxbans:
135 | self.crawler.engine.close_spider(spider, 'banned')
136 | else:
137 | after = response.headers.get('retry-after')
138 | if after:
139 | self._set_custom_delay(request, float(after))
140 | else:
141 | self._bans[key] = 0
142 | return response
143 |
144 | def process_exception(self, request, exception, spider):
145 | if not self._is_enabled_for_request(request):
146 | return
147 | if isinstance(exception, ConnectionRefusedError):
148 | # Handle crawlera downtime
149 | self._set_custom_delay(request, self.connection_refused_delay)
150 |
151 | def _is_enabled_for_request(self, request):
152 | return self.enabled and 'dont_proxy' not in request.meta
153 |
154 | def _get_slot_key(self, request):
155 | return request.meta.get('download_slot')
156 |
157 | def _get_slot(self, request):
158 | key = self._get_slot_key(request)
159 | return key, self.crawler.engine.downloader.slots.get(key)
160 |
161 | def _set_custom_delay(self, request, delay):
162 | """Set custom delay for slot and save original one."""
163 | key, slot = self._get_slot(request)
164 | if not slot:
165 | return
166 | if self._saved_delays[key] is None:
167 | self._saved_delays[key] = slot.delay
168 | slot.delay = delay
169 |
170 | def _restore_original_delay(self, request):
171 | """Restore original delay for slot if it was changed."""
172 | key, slot = self._get_slot(request)
173 | if not slot:
174 | return
175 | if self._saved_delays[key] is not None:
176 | slot.delay, self._saved_delays[key] = self._saved_delays[key], None
177 |
--------------------------------------------------------------------------------
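
Note: the class above is enabled through the downloader middleware chain plus the CRAWLERA_*
settings it reads in open_spider. A minimal sketch follows; the priority value is an assumption,
and the class itself warns that the scrapy-crawlera package should be used instead.

    # settings.py -- hypothetical example, not part of the repository
    DOWNLOADER_MIDDLEWARES = {
        'scrapylib.crawlera.CrawleraMiddleware': 600,   # priority is an assumption
    }
    CRAWLERA_ENABLED = True
    CRAWLERA_USER = 'user'
    CRAWLERA_PASS = 'pass'
    # Optional overrides; the defaults are the class attributes shown above:
    # CRAWLERA_URL = 'http://paygo.crawlera.com:8010'
    # CRAWLERA_MAXBANS = 400
    # CRAWLERA_DOWNLOAD_TIMEOUT = 1800
    # CRAWLERA_PRESERVE_DELAY = True    # keep the spider's own download_delay

A single spider can opt in with a crawlera_enabled = True attribute instead of the global setting,
and an individual request can bypass the proxy with request.meta['dont_proxy'] = True, as
is_enabled and _is_enabled_for_request show.
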
/tests/test_hubproxy.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from six.moves import xrange
3 |
4 | from w3lib.http import basic_auth_header
5 | from scrapy.http import Request, Response
6 | from scrapy.spiders import Spider
7 | from scrapy.utils.test import get_crawler
8 | from scrapylib.hubproxy import HubProxyMiddleware
9 |
10 |
11 | class HubProxyMiddlewareTestCase(TestCase):
12 |
13 | mwcls = HubProxyMiddleware
14 |
15 | def setUp(self):
16 | self.spider = Spider('foo')
17 | self.settings = {'HUBPROXY_USER': 'user', 'HUBPROXY_PASS': 'pass'}
18 |
19 | def _mock_crawler(self, settings=None):
20 | class MockedDownloader(object):
21 | slots = {}
22 |
23 | class MockedEngine(object):
24 | downloader = MockedDownloader()
25 | fake_spider_closed_result = None
26 | def close_spider(self, spider, reason):
27 | self.fake_spider_closed_result = (spider, reason)
28 |
29 | crawler = get_crawler(settings_dict=settings)
30 | crawler.engine = MockedEngine()
31 | return crawler
32 |
33 | def _assert_disabled(self, spider, settings=None):
34 | crawler = self._mock_crawler(settings)
35 | mw = self.mwcls.from_crawler(crawler)
36 | mw.open_spider(spider)
37 | req = Request('http://www.scrapytest.org')
38 | out = mw.process_request(req, spider)
39 | self.assertEqual(out, None)
40 | self.assertEqual(req.meta.get('proxy'), None)
41 | self.assertEqual(req.meta.get('download_timeout'), None)
42 | self.assertEqual(req.headers.get('Proxy-Authorization'), None)
43 | res = Response(req.url)
44 | assert mw.process_response(req, res, spider) is res
45 | res = Response(req.url, status=mw.ban_code)
46 | assert mw.process_response(req, res, spider) is res
47 |
48 | def _assert_enabled(self, spider,
49 | settings=None,
50 | proxyurl='http://paygo.crawlera.com:8010?noconnect',
51 | proxyauth=basic_auth_header('user', 'pass'),
52 | bancode=503,
53 | maxbans=400,
54 | download_timeout=1800,
55 | ):
56 | crawler = self._mock_crawler(settings)
57 | mw = self.mwcls.from_crawler(crawler)
58 | mw.open_spider(spider)
59 | req = Request('http://www.scrapytest.org')
60 | assert mw.process_request(req, spider) is None
61 | self.assertEqual(req.meta.get('proxy'), proxyurl)
62 | self.assertEqual(req.meta.get('download_timeout'), download_timeout)
63 | self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)
64 | res = Response(req.url)
65 | assert mw.process_response(req, res, spider) is res
66 |
67 | # disabled if 'dont_proxy' is set
68 | req = Request('http://www.scrapytest.org')
69 | req.meta['dont_proxy'] = True
70 | assert mw.process_request(req, spider) is None
71 | self.assertEqual(req.meta.get('proxy'), None)
72 | self.assertEqual(req.meta.get('download_timeout'), None)
73 | self.assertEqual(req.headers.get('Proxy-Authorization'), None)
74 | res = Response(req.url)
75 | assert mw.process_response(req, res, spider) is res
76 | del req.meta['dont_proxy']
77 |
78 | if maxbans > 0:
79 |             # assert ban count is reset after a successful response
80 | res = Response('http://ban.me', status=bancode)
81 | assert mw.process_response(req, res, spider) is res
82 | self.assertEqual(crawler.engine.fake_spider_closed_result, None)
83 | res = Response('http://unban.me')
84 | assert mw.process_response(req, res, spider) is res
85 | self.assertEqual(crawler.engine.fake_spider_closed_result, None)
86 | self.assertEqual(mw._bans[None], 0)
87 |
88 | # check for not banning before maxbans for bancode
89 | for x in xrange(maxbans + 1):
90 | self.assertEqual(crawler.engine.fake_spider_closed_result, None)
91 | res = Response('http://ban.me/%d' % x, status=bancode)
92 | assert mw.process_response(req, res, spider) is res
93 |
94 | # max bans reached and close_spider called
95 | self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, 'banned'))
96 |
97 | def test_disabled_by_lack_of_hubproxy_settings(self):
98 | self._assert_disabled(self.spider, settings={})
99 |
100 | def test_spider_use_hubproxy(self):
101 | self.assertFalse(hasattr(self.spider, 'use_hubproxy'))
102 | self._assert_disabled(self.spider, self.settings)
103 | self.spider.use_hubproxy = True
104 | self._assert_enabled(self.spider, self.settings)
105 | self.spider.use_hubproxy = False
106 | self._assert_disabled(self.spider, self.settings)
107 |
108 | def test_enabled(self):
109 | self._assert_disabled(self.spider, self.settings)
110 | self.settings['HUBPROXY_ENABLED'] = True
111 | self._assert_enabled(self.spider, self.settings)
112 |
113 | def test_userpass(self):
114 | self.spider.use_hubproxy = True
115 | self.settings['HUBPROXY_USER'] = user = 'other'
116 | self.settings['HUBPROXY_PASS'] = pass_ = 'secret'
117 | proxyauth = basic_auth_header(user, pass_)
118 | self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
119 |
120 | self.spider.hubproxy_user = user = 'notfromsettings'
121 | self.spider.hubproxy_pass = pass_ = 'anothersecret'
122 | proxyauth = basic_auth_header(user, pass_)
123 | self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
124 |
125 | def test_proxyurl(self):
126 | self.spider.use_hubproxy = True
127 | self.settings['HUBPROXY_URL'] = 'http://localhost:8010'
128 | self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8010?noconnect')
129 |
130 | def test_maxbans(self):
131 | self.spider.use_hubproxy = True
132 | self.settings['HUBPROXY_MAXBANS'] = maxbans = 0
133 | self._assert_enabled(self.spider, self.settings, maxbans=maxbans)
134 | self.settings['HUBPROXY_MAXBANS'] = maxbans = 100
135 | self._assert_enabled(self.spider, self.settings, maxbans=maxbans)
136 |
137 | def test_download_timeout(self):
138 | self.spider.use_hubproxy = True
139 | self.settings['HUBPROXY_DOWNLOAD_TIMEOUT'] = 60
140 | self._assert_enabled(self.spider, self.settings, download_timeout=60)
141 | self.spider.hubproxy_download_timeout = 120
142 | self._assert_enabled(self.spider, self.settings, download_timeout=120)
143 |
144 | def test_hooks(self):
145 | class _ECLS(self.mwcls):
146 | def is_enabled(self, spider):
147 | wascalled.append('is_enabled')
148 | return enabled
149 | def get_proxyauth(self, spider):
150 | wascalled.append('get_proxyauth')
151 | return proxyauth
152 |
153 | wascalled = []
154 | self.mwcls = _ECLS
155 |
156 | # test is_enabled returns False
157 | enabled = False
158 | self.spider.use_hubproxy = True
159 | self._assert_disabled(self.spider, self.settings)
160 | self.assertEqual(wascalled, ['is_enabled'])
161 |
162 | wascalled[:] = [] # reset
163 | enabled = True
164 | self.spider.use_hubproxy = False
165 | proxyauth = b'Basic Foo'
166 | self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
167 | self.assertEqual(wascalled, ['is_enabled', 'get_proxyauth'])
168 |
--------------------------------------------------------------------------------
/tests/test_hcf.py:
--------------------------------------------------------------------------------
1 | import os
2 | import hashlib
3 | import unittest
4 |
5 | from scrapy.http import Request, Response
6 | from scrapy.spiders import Spider
7 | from scrapy.utils.test import get_crawler
8 | from scrapylib.hcf import HcfMiddleware
9 | from scrapy.exceptions import NotConfigured
10 | from hubstorage import HubstorageClient
11 |
12 | HS_ENDPOINT = os.getenv('HS_ENDPOINT', 'http://localhost:8003')
13 | HS_AUTH = os.getenv('HS_AUTH')
14 |
15 |
16 | @unittest.skipUnless(HS_AUTH, 'No valid hubstorage credentials set')
17 | class HcfTestCase(unittest.TestCase):
18 |
19 | hcf_cls = HcfMiddleware
20 |
21 | projectid = '2222222'
22 | spidername = 'hs-test-spider'
23 | frontier = 'test'
24 | slot = '0'
25 | number_of_slots = 1
26 |
27 | @classmethod
28 | def setUpClass(cls):
29 | cls.endpoint = HS_ENDPOINT
30 | cls.auth = HS_AUTH
31 | cls.hsclient = HubstorageClient(auth=cls.auth, endpoint=cls.endpoint)
32 | cls.project = cls.hsclient.get_project(cls.projectid)
33 | cls.fclient = cls.project.frontier
34 |
35 | @classmethod
36 | def tearDownClass(cls):
37 | cls.project.frontier.close()
38 | cls.hsclient.close()
39 |
40 | def setUp(self):
41 | class TestSpider(Spider):
42 | name = self.spidername
43 | start_urls = [
44 | 'http://www.example.com/'
45 | ]
46 |
47 | self.spider = TestSpider()
48 | self.hcf_settings = {'HS_ENDPOINT': self.endpoint,
49 | 'HS_AUTH': self.auth,
50 | 'HS_PROJECTID': self.projectid,
51 | 'HS_FRONTIER': self.frontier,
52 | 'HS_CONSUME_FROM_SLOT': self.slot,
53 | 'HS_NUMBER_OF_SLOTS': self.number_of_slots}
54 | self._delete_slot()
55 |
56 | def tearDown(self):
57 | self._delete_slot()
58 |
59 | def _delete_slot(self):
60 | self.fclient.delete_slot(self.frontier, self.slot)
61 |
62 | def _build_response(self, url, meta=None):
63 | return Response(url, request=Request(url="http://www.example.com/parent.html", meta=meta))
64 |
65 | def _get_crawler(self, settings=None):
66 | crawler = get_crawler(settings_dict=settings)
67 | # simulate crawler engine
68 | class Engine():
69 | def __init__(self):
70 | self.requests = []
71 | def schedule(self, request, spider):
72 | self.requests.append(request)
73 | crawler.engine = Engine()
74 |
75 | return crawler
76 |
77 | def test_not_loaded(self):
78 | crawler = self._get_crawler({})
79 | self.assertRaises(NotConfigured, self.hcf_cls.from_crawler, crawler)
80 |
81 | def test_start_requests(self):
82 | crawler = self._get_crawler(self.hcf_settings)
83 | hcf = self.hcf_cls.from_crawler(crawler)
84 |
85 |         # the frontier is empty the first time, so the start_urls pass through
86 | start_urls = self.spider.start_urls
87 | new_urls = list(hcf.process_start_requests(start_urls, self.spider))
88 | self.assertEqual(new_urls, ['http://www.example.com/'])
89 |
90 | # now try to store some URLs in the hcf and retrieve them
91 | fps = [{'fp': 'http://www.example.com/index.html'},
92 | {'fp': 'http://www.example.com/index2.html'}]
93 | self.fclient.add(self.frontier, self.slot, fps)
94 | self.fclient.flush()
95 | new_urls = [r.url for r in hcf.process_start_requests(start_urls, self.spider)]
96 | expected_urls = [r['fp'] for r in fps]
97 | self.assertEqual(new_urls, expected_urls)
98 | self.assertEqual(len(hcf.batch_ids), 1)
99 |
100 | def test_spider_output(self):
101 | crawler = self._get_crawler(self.hcf_settings)
102 | hcf = self.hcf_cls.from_crawler(crawler)
103 |
104 | # process new GET request
105 | response = self._build_response("http://www.example.com/qxg1231")
106 | request = Request(url="http://www.example.com/product/?qxp=12&qxg=1231", meta={'use_hcf': True})
107 | outputs = list(hcf.process_spider_output(response, [request], self.spider))
108 | self.assertEqual(outputs, [])
109 | expected_links = {'0': set(['http://www.example.com/product/?qxp=12&qxg=1231'])}
110 | self.assertEqual(dict(hcf.new_links), expected_links)
111 |
112 | # process new POST request (don't add it to the hcf)
113 | response = self._build_response("http://www.example.com/qxg456")
114 | request = Request(url="http://www.example.com/product/?qxp=456", method='POST')
115 | outputs = list(hcf.process_spider_output(response, [request], self.spider))
116 | self.assertEqual(outputs, [request])
117 | expected_links = {'0': set(['http://www.example.com/product/?qxp=12&qxg=1231'])}
118 | self.assertEqual(dict(hcf.new_links), expected_links)
119 |
120 | # process new GET request (without the use_hcf meta key)
121 | response = self._build_response("http://www.example.com/qxg1231")
122 | request = Request(url="http://www.example.com/product/?qxp=789")
123 | outputs = list(hcf.process_spider_output(response, [request], self.spider))
124 | self.assertEqual(outputs, [request])
125 | expected_links = {'0': set(['http://www.example.com/product/?qxp=12&qxg=1231'])}
126 | self.assertEqual(dict(hcf.new_links), expected_links)
127 |
128 | # Simulate close spider
129 | hcf.close_spider(self.spider, 'finished')
130 |
131 | def test_close_spider(self):
132 | crawler = self._get_crawler(self.hcf_settings)
133 | hcf = self.hcf_cls.from_crawler(crawler)
134 |
135 | # Save 2 batches in the HCF
136 | fps = [{'fp': 'http://www.example.com/index_%s.html' % i} for i in range(0, 200)]
137 | self.fclient.add(self.frontier, self.slot, fps)
138 | self.fclient.flush()
139 |
140 | # Read the first batch
141 | start_urls = self.spider.start_urls
142 | new_urls = [r.url for r in hcf.process_start_requests(start_urls, self.spider)]
143 | expected_urls = [r['fp'] for r in fps]
144 | self.assertEqual(new_urls, expected_urls)
145 |
146 | # Simulate extracting some new urls
147 | response = self._build_response("http://www.example.com/parent.html")
148 | new_fps = ["http://www.example.com/child_%s.html" % i for i in range(0, 50)]
149 | for fp in new_fps:
150 | request = Request(url=fp, meta={'use_hcf': True})
151 | list(hcf.process_spider_output(response, [request], self.spider))
152 | self.assertEqual(len(hcf.new_links[self.slot]), 50)
153 |
154 | # Simulate emptying the scheduler
155 | crawler.engine.requests = []
156 |
157 | # Simulate close spider
158 | hcf.close_spider(self.spider, 'finished')
159 | self.assertEqual(len(hcf.new_links[self.slot]), 0)
160 | self.assertEqual(len(hcf.batch_ids), 0)
161 |
162 |         # HCF must have 1 new batch
163 | batches = [b for b in self.fclient.read(self.frontier, self.slot)]
164 | self.assertEqual(len(batches), 1)
165 |
166 | def test_hcf_params(self):
167 | crawler = self._get_crawler(self.hcf_settings)
168 | hcf = self.hcf_cls.from_crawler(crawler)
169 |
170 | # Simulate extracting some new urls and adding them to the HCF
171 | response = self._build_response("http://www.example.com/parent.html")
172 | new_fps = ["http://www.example.com/child_%s.html" % i for i in range(0, 5)]
173 | new_requests = []
174 | for fp in new_fps:
175 | hcf_params = {'qdata': {'a': '1', 'b': '2', 'c': '3'},
176 | 'fdata': {'x': '1', 'y': '2', 'z': '3'},
177 | 'p': 1}
178 | request = Request(url=fp, meta={'use_hcf': True, "hcf_params": hcf_params})
179 | new_requests.append(request)
180 | list(hcf.process_spider_output(response, [request], self.spider))
181 | expected = set(['http://www.example.com/child_4.html',
182 | 'http://www.example.com/child_1.html',
183 | 'http://www.example.com/child_0.html',
184 | 'http://www.example.com/child_3.html',
185 | 'http://www.example.com/child_2.html'])
186 | self.assertEqual(hcf.new_links[self.slot], expected)
187 |
188 | # Simulate close spider
189 | hcf.close_spider(self.spider, 'finished')
190 |
191 |         # Simulate running another spider
192 | start_urls = self.spider.start_urls
193 | stored_requests = list(hcf.process_start_requests(start_urls, self.spider))
194 | for a, b in zip(new_requests, stored_requests):
195 | self.assertEqual(a.url, b.url)
196 | self.assertEqual(a.meta.get('qdata'), b.meta.get('qdata'))
197 |
198 | # Simulate emptying the scheduler
199 | crawler.engine.requests = []
200 |
201 | # Simulate close spider
202 | hcf.close_spider(self.spider, 'finished')
203 |
204 | def test_spider_output_override_slot(self):
205 | crawler = self._get_crawler(self.hcf_settings)
206 | hcf = self.hcf_cls.from_crawler(crawler)
207 |
208 | def get_slot_callback(request):
209 | md5 = hashlib.md5()
210 |             md5.update(request.url.encode('utf-8'))
211 | digest = md5.hexdigest()
212 | return str(int(digest, 16) % 5)
213 | self.spider.slot_callback = get_slot_callback
214 |
215 | # process new GET request
216 | response = self._build_response("http://www.example.com/qxg1231")
217 | request = Request(url="http://www.example.com/product/?qxp=12&qxg=1231",
218 | meta={'use_hcf': True})
219 | outputs = list(hcf.process_spider_output(response, [request], self.spider))
220 | self.assertEqual(outputs, [])
221 | expected_links = {'4': set(['http://www.example.com/product/?qxp=12&qxg=1231'])}
222 | self.assertEqual(dict(hcf.new_links), expected_links)
223 |
224 | # Simulate close spider
225 | hcf.close_spider(self.spider, 'finished')
226 |
--------------------------------------------------------------------------------
/scrapylib/hcf.py:
--------------------------------------------------------------------------------
1 | """
2 | HCF Middleware
3 |
4 | This SpiderMiddleware uses the HCF backend from hubstorage to retrieve the new
5 | urls to crawl and store back the links extracted.
6 |
7 | To activate this middleware it needs to be added to the SPIDER_MIDDLEWARES
8 | setting, e.g.:
9 |
10 | SPIDER_MIDDLEWARES = {
11 | 'scrapylib.hcf.HcfMiddleware': 543,
12 | }
13 |
14 | The following settings need to be defined:
15 |
16 | HS_AUTH - API key
17 | HS_PROJECTID - Project ID in the dash (not needed if the spider is run on dash)
18 | HS_FRONTIER - Frontier name.
19 | HS_CONSUME_FROM_SLOT - Slot from where the spider will read new URLs.
20 |
21 | Note that HS_FRONTIER and HS_CONSUME_FROM_SLOT can be overridden from inside a spider using
22 | the spider attributes: "hs_frontier" and "hs_consume_from_slot" respectively.
23 |
24 | The following optional settings can also be defined:
25 |
26 | HS_ENDPOINT - URL of the API endpoint, e.g. http://localhost:8003.
27 | The default value is provided by the python-hubstorage
28 | package.
29 |
30 | HS_MAX_LINKS - Number of links to be read from the HCF. The default is 1000.
31 |
32 | HS_START_JOB_ENABLED - Whether to start a new job when the spider
33 |                        finishes. The default is False.
34 |
35 | HS_START_JOB_ON_REASON - A list of closing reasons; if the spider ends
36 |                          with any of these reasons a new job will be started
37 |                          for the same slot. The default is ['finished'].
38 |
39 | HS_NUMBER_OF_SLOTS - This is the number of slots that the middleware will
40 | use to store the new links. The default is 8.
41 |
42 | The following keys can be set in a Request's meta in order to control the behavior
43 | of the HCF middleware:
44 |
45 | use_hcf - If set to True the request will be stored in the HCF.
46 | hcf_params - Dictionary of parameters to be stored in the HCF with the request
47 | fingerprint
48 |
49 | qdata data to be stored along with the fingerprint in the request queue
50 | fdata data to be stored along with the fingerprint in the fingerprint set
51 | p Priority - lower priority numbers are returned first. The default is 0
52 |
53 | The value of the 'qdata' parameter can be retrieved later using
54 | ``response.meta['hcf_params']['qdata']``.
55 |
56 | The spider can override the default slot assignment function by setting the
57 | spider slot_callback method to a function with the following signature:
58 |
59 | def slot_callback(request):
60 | ...
61 | return slot
62 |
63 | """
64 | import os
65 | import hashlib
66 | import logging
67 | from collections import defaultdict
68 | from datetime import datetime
69 | from scrapinghub import Connection
70 | from scrapy import signals, log
71 | from scrapy.exceptions import NotConfigured
72 | from scrapy.http import Request
73 | from hubstorage import HubstorageClient
74 |
75 | DEFAULT_MAX_LINKS = 1000
76 | DEFAULT_HS_NUMBER_OF_SLOTS = 8
77 |
78 |
79 | class HcfMiddleware(object):
80 |
81 | def __init__(self, crawler):
82 | settings = crawler.settings
83 | self.hs_endpoint = settings.get("HS_ENDPOINT")
84 | self.hs_auth = self._get_config(settings, "HS_AUTH")
85 | self.hs_projectid = self._get_config(settings, "HS_PROJECTID", os.environ.get('SCRAPY_PROJECT_ID'))
86 | self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
87 | self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
88 | self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS)
89 | self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
90 | self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
91 | self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON", ['finished'])
92 |
93 | conn = Connection(self.hs_auth)
94 | self.panel_project = conn[self.hs_projectid]
95 |
96 | self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
97 | self.project = self.hsclient.get_project(self.hs_projectid)
98 | self.fclient = self.project.frontier
99 |
100 | self.new_links = defaultdict(set)
101 | self.batch_ids = []
102 |
103 | crawler.signals.connect(self.close_spider, signals.spider_closed)
104 |
105 | # Make sure the logger for hubstorage.batchuploader is configured
106 | logging.basicConfig()
107 |
108 | def _get_config(self, settings, key, default=None):
109 | value = settings.get(key, default)
110 | if not value:
111 | raise NotConfigured('%s not found' % key)
112 | return value
113 |
114 | def _msg(self, msg, level=log.INFO):
115 | log.msg('(HCF) %s' % msg, level)
116 |
117 | def start_job(self, spider):
118 | self._msg("Starting new job for: %s" % spider.name)
119 | jobid = self.panel_project.schedule(
120 | spider.name,
121 | hs_consume_from_slot=self.hs_consume_from_slot,
122 | dummy=datetime.now()
123 | )
124 | self._msg("New job started: %s" % jobid)
125 | return jobid
126 |
127 | @classmethod
128 | def from_crawler(cls, crawler):
129 | return cls(crawler)
130 |
131 | def process_start_requests(self, start_requests, spider):
132 |
133 | self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier)
134 | self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)
135 |
136 | self.hs_consume_from_slot = getattr(spider, 'hs_consume_from_slot', self.hs_consume_from_slot)
137 | self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot)
138 |
139 | self.has_new_requests = False
140 | for req in self._get_new_requests():
141 | self.has_new_requests = True
142 | yield req
143 |
144 |         # if there are no links in the hcf, use the start_requests,
145 |         # but only on the first job.
146 | if not self.has_new_requests and not getattr(spider, 'dummy', None):
147 | self._msg('Using start_requests')
148 | for r in start_requests:
149 | yield r
150 |
151 | def process_spider_output(self, response, result, spider):
152 | slot_callback = getattr(spider, 'slot_callback', self._get_slot)
153 | for item in result:
154 | if isinstance(item, Request):
155 | request = item
156 | if request.meta.get('use_hcf', False):
157 | if request.method == 'GET': # XXX: Only GET support for now.
158 | slot = slot_callback(request)
159 |                         if request.url not in self.new_links[slot]:
160 | hcf_params = request.meta.get('hcf_params')
161 | fp = {'fp': request.url}
162 | if hcf_params:
163 | fp.update(hcf_params)
164 | # Save the new links as soon as possible using
165 | # the batch uploader
166 | self.fclient.add(self.hs_frontier, slot, [fp])
167 | self.new_links[slot].add(request.url)
168 | else:
169 | self._msg("'use_hcf' meta key is not supported for non GET requests (%s)" % request.url,
170 | log.ERROR)
171 | yield request
172 | else:
173 | yield request
174 | else:
175 | yield item
176 |
177 | def close_spider(self, spider, reason):
178 |         # Only store the results if the spider finished normally. If it
179 |         # didn't finish properly there is no way to know whether all the url batches
180 |         # were processed, so it is better not to delete them from the frontier
181 |         # (they will be picked up by another process).
182 | if reason == 'finished':
183 | self._save_new_links_count()
184 | self._delete_processed_ids()
185 |
186 | # Close the frontier client in order to make sure that all the new links
187 | # are stored.
188 | self.fclient.close()
189 | self.hsclient.close()
190 |
191 | # If the reason is defined in the hs_start_job_on_reason list then start
192 | # a new job right after this spider is finished.
193 | if self.hs_start_job_enabled and reason in self.hs_start_job_on_reason:
194 |
195 | # Start the new job if this job had requests from the HCF or it
196 | # was the first job.
197 | if self.has_new_requests or not getattr(spider, 'dummy', None):
198 | self.start_job(spider)
199 |
200 | def _get_new_requests(self):
201 | """ Get a new batch of links from the HCF."""
202 | num_batches = 0
203 | num_links = 0
204 | for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_consume_from_slot), 1):
205 | for fingerprint, data in batch['requests']:
206 | num_links += 1
207 | yield Request(url=fingerprint, meta={'hcf_params': {'qdata': data}})
208 | self.batch_ids.append(batch['id'])
209 | if num_links >= self.hs_max_links:
210 | break
211 | self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_consume_from_slot))
212 | self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_consume_from_slot))
213 |
214 | def _save_new_links_count(self):
215 | """ Save the new extracted links into the HCF."""
216 | for slot, new_links in self.new_links.items():
217 | self._msg('Stored %d new links in slot(%s)' % (len(new_links), slot))
218 | self.new_links = defaultdict(set)
219 |
220 | def _delete_processed_ids(self):
221 | """ Delete in the HCF the ids of the processed batches."""
222 | self.fclient.delete(self.hs_frontier, self.hs_consume_from_slot, self.batch_ids)
223 | self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids),
224 | self.hs_consume_from_slot))
225 | self.batch_ids = []
226 |
227 | def _get_slot(self, request):
228 |         """ Determine the slot in which the request should be saved."""
229 |         md5 = hashlib.md5()
230 |         md5.update(request.url.encode('utf-8'))
231 | digest = md5.hexdigest()
232 | return str(int(digest, 16) % self.hs_number_of_slots)
233 |
--------------------------------------------------------------------------------
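
Note: combining the module docstring with process_spider_output above, a spider that feeds
extracted links back into the frontier could look roughly like this. The spider name, URLs and
setting values are placeholders; the middleware priority (543) comes from the docstring.

    # settings.py -- hypothetical example, not part of the repository
    SPIDER_MIDDLEWARES = {
        'scrapylib.hcf.HcfMiddleware': 543,
    }
    HS_AUTH = '<your API key>'
    HS_PROJECTID = '1111111'
    HS_FRONTIER = 'test'
    HS_CONSUME_FROM_SLOT = '0'

    # myspider.py -- hypothetical example
    from scrapy import Spider, Request

    class MySpider(Spider):
        name = 'hcf-example'
        start_urls = ['http://www.example.com/']

        def parse(self, response):
            for href in response.css('a::attr(href)').extract():
                # GET requests marked with 'use_hcf' are stored in the frontier
                # instead of being scheduled locally; 'qdata' comes back later
                # as response.meta['hcf_params']['qdata'].
                yield Request(response.urljoin(href),
                              meta={'use_hcf': True,
                                    'hcf_params': {'qdata': {'depth': '1'}}})

A spider-level slot_callback(request) method, as described in the docstring, can replace the
md5-based _get_slot assignment when links should be partitioned differently across slots.
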
/tests/test_crawlera.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from w3lib.http import basic_auth_header
4 | from scrapy.http import Request, Response
5 | from scrapy.spiders import Spider
6 | from scrapy.utils.test import get_crawler
7 | from twisted.internet.error import ConnectionRefusedError
8 | from six.moves import xrange
9 |
10 | from scrapylib.crawlera import CrawleraMiddleware
11 | import os
12 |
13 |
14 | class MockedSlot(object):
15 |
16 | def __init__(self, delay=0.0):
17 | self.delay = delay
18 |
19 |
20 | class CrawleraMiddlewareTestCase(TestCase):
21 |
22 | mwcls = CrawleraMiddleware
23 | bancode = 503
24 |
25 | def setUp(self):
26 | self.spider = Spider('foo')
27 | self.settings = {'CRAWLERA_USER': 'user', 'CRAWLERA_PASS': 'pass'}
28 |
29 | def _mock_crawler(self, settings=None):
30 |
31 | class MockedDownloader(object):
32 | slots = {}
33 |
34 | class MockedEngine(object):
35 | downloader = MockedDownloader()
36 | fake_spider_closed_result = None
37 |
38 | def close_spider(self, spider, reason):
39 | self.fake_spider_closed_result = (spider, reason)
40 |
41 | crawler = get_crawler(settings_dict=settings)
42 | crawler.engine = MockedEngine()
43 | return crawler
44 |
45 | def _assert_disabled(self, spider, settings=None):
46 | crawler = self._mock_crawler(settings)
47 | mw = self.mwcls.from_crawler(crawler)
48 | mw.open_spider(spider)
49 | req = Request('http://www.scrapytest.org')
50 | out = mw.process_request(req, spider)
51 | self.assertEqual(out, None)
52 | self.assertEqual(req.meta.get('proxy'), None)
53 | self.assertEqual(req.meta.get('download_timeout'), None)
54 | self.assertEqual(req.headers.get('Proxy-Authorization'), None)
55 | res = Response(req.url)
56 | assert mw.process_response(req, res, spider) is res
57 | res = Response(req.url, status=mw.ban_code)
58 | assert mw.process_response(req, res, spider) is res
59 |
60 | def _assert_enabled(self, spider,
61 | settings=None,
62 | proxyurl='http://paygo.crawlera.com:8010?noconnect',
63 | proxyauth=basic_auth_header('user', 'pass'),
64 | maxbans=400,
65 | download_timeout=1800):
66 | crawler = self._mock_crawler(settings)
67 | mw = self.mwcls.from_crawler(crawler)
68 | mw.open_spider(spider)
69 | req = Request('http://www.scrapytest.org')
70 | assert mw.process_request(req, spider) is None
71 | self.assertEqual(req.meta.get('proxy'), proxyurl)
72 | self.assertEqual(req.meta.get('download_timeout'), download_timeout)
73 | self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)
74 | res = Response(req.url)
75 | assert mw.process_response(req, res, spider) is res
76 |
77 | # disabled if 'dont_proxy' is set
78 | req = Request('http://www.scrapytest.org')
79 | req.meta['dont_proxy'] = True
80 | assert mw.process_request(req, spider) is None
81 | self.assertEqual(req.meta.get('proxy'), None)
82 | self.assertEqual(req.meta.get('download_timeout'), None)
83 | self.assertEqual(req.headers.get('Proxy-Authorization'), None)
84 | res = Response(req.url)
85 | assert mw.process_response(req, res, spider) is res
86 | del req.meta['dont_proxy']
87 |
88 | if maxbans > 0:
89 |             # assert ban count is reset after a successful response
90 | res = Response('http://ban.me', status=self.bancode)
91 | assert mw.process_response(req, res, spider) is res
92 | self.assertEqual(crawler.engine.fake_spider_closed_result, None)
93 | res = Response('http://unban.me')
94 | assert mw.process_response(req, res, spider) is res
95 | self.assertEqual(crawler.engine.fake_spider_closed_result, None)
96 | self.assertEqual(mw._bans[None], 0)
97 |
98 | # check for not banning before maxbans for bancode
99 | for x in xrange(maxbans + 1):
100 | self.assertEqual(crawler.engine.fake_spider_closed_result, None)
101 | res = Response('http://ban.me/%d' % x, status=self.bancode)
102 | assert mw.process_response(req, res, spider) is res
103 |
104 | # max bans reached and close_spider called
105 | self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, 'banned'))
106 |
107 | def test_disabled_by_lack_of_crawlera_settings(self):
108 | self._assert_disabled(self.spider, settings={})
109 |
110 | def test_spider_crawlera_enabled(self):
111 | self.assertFalse(hasattr(self.spider, 'crawlera_enabled'))
112 | self._assert_disabled(self.spider, self.settings)
113 | self.spider.crawlera_enabled = True
114 | self._assert_enabled(self.spider, self.settings)
115 | self.spider.crawlera_enabled = False
116 | self._assert_disabled(self.spider, self.settings)
117 |
118 | def test_enabled(self):
119 | self._assert_disabled(self.spider, self.settings)
120 | self.settings['CRAWLERA_ENABLED'] = True
121 | self._assert_enabled(self.spider, self.settings)
122 |
123 | def test_userpass(self):
124 | self.spider.crawlera_enabled = True
125 | self.settings['CRAWLERA_USER'] = user = 'other'
126 | self.settings['CRAWLERA_PASS'] = pass_ = 'secret'
127 | proxyauth = basic_auth_header(user, pass_)
128 | self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
129 |
130 | self.spider.crawlera_user = user = 'notfromsettings'
131 | self.spider.crawlera_pass = pass_ = 'anothersecret'
132 | proxyauth = basic_auth_header(user, pass_)
133 | self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
134 |
135 | def test_proxyurl(self):
136 | self.spider.crawlera_enabled = True
137 | self.settings['CRAWLERA_URL'] = 'http://localhost:8010'
138 | self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8010?noconnect')
139 |
140 | def test_proxyurl_including_noconnect(self):
141 | self.spider.crawlera_enabled = True
142 | self.settings['CRAWLERA_URL'] = 'http://localhost:8010?noconnect'
143 | self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8010?noconnect')
144 |
145 | def test_maxbans(self):
146 | self.spider.crawlera_enabled = True
147 | self.settings['CRAWLERA_MAXBANS'] = maxbans = 0
148 | self._assert_enabled(self.spider, self.settings, maxbans=maxbans)
149 | self.settings['CRAWLERA_MAXBANS'] = maxbans = 100
150 | self._assert_enabled(self.spider, self.settings, maxbans=maxbans)
151 | # Assert setting is coerced into correct type
152 | self.settings['CRAWLERA_MAXBANS'] = '123'
153 | self._assert_enabled(self.spider, self.settings, maxbans=123)
154 | self.spider.crawlera_maxbans = 99
155 | self._assert_enabled(self.spider, self.settings, maxbans=99)
156 |
157 | def test_download_timeout(self):
158 | self.spider.crawlera_enabled = True
159 | self.settings['CRAWLERA_DOWNLOAD_TIMEOUT'] = 60
160 | self._assert_enabled(self.spider, self.settings, download_timeout=60)
161 | # Assert setting is coerced into correct type
162 | self.settings['CRAWLERA_DOWNLOAD_TIMEOUT'] = '42'
163 | self._assert_enabled(self.spider, self.settings, download_timeout=42)
164 | self.spider.crawlera_download_timeout = 120
165 | self._assert_enabled(self.spider, self.settings, download_timeout=120)
166 |
167 | def test_hooks(self):
168 | class _ECLS(self.mwcls):
169 | def is_enabled(self, spider):
170 | wascalled.append('is_enabled')
171 | return enabled
172 |
173 | def get_proxyauth(self, spider):
174 | wascalled.append('get_proxyauth')
175 | return proxyauth
176 |
177 | wascalled = []
178 | self.mwcls = _ECLS
179 |
180 | # test is_enabled returns False
181 | enabled = False
182 | self.spider.crawlera_enabled = True
183 | self._assert_disabled(self.spider, self.settings)
184 | self.assertEqual(wascalled, ['is_enabled'])
185 |
186 | wascalled[:] = [] # reset
187 | enabled = True
188 | self.spider.crawlera_enabled = False
189 | proxyauth = b'Basic Foo'
190 | self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
191 | self.assertEqual(wascalled, ['is_enabled', 'get_proxyauth'])
192 |
193 | def test_delay_adjustment(self):
194 | delay = 0.5
195 | slot_key = 'www.scrapytest.org'
196 | url = 'http://www.scrapytest.org'
197 | ban_url = 'http://ban.me'
198 |
199 | self.spider.crawlera_enabled = True
200 |
201 | crawler = self._mock_crawler(self.settings)
202 | # ignore spider delay by default
203 | self.spider.download_delay = delay
204 | mw = self.mwcls.from_crawler(crawler)
205 | mw.open_spider(self.spider)
206 | self.assertEqual(self.spider.download_delay, 0)
207 |
208 | # preserve original delay
209 | self.spider.download_delay = delay
210 | self.spider.crawlera_preserve_delay = True
211 | mw = self.mwcls.from_crawler(crawler)
212 | mw.open_spider(self.spider)
213 | self.assertEqual(self.spider.download_delay, delay)
214 |
215 | slot = MockedSlot(self.spider.download_delay)
216 | crawler.engine.downloader.slots[slot_key] = slot
217 |
218 | # ban
219 | req = Request(url, meta={'download_slot': slot_key})
220 | res = Response(ban_url, status=self.bancode, request=req)
221 | mw.process_response(req, res, self.spider)
222 | self.assertEqual(slot.delay, delay)
223 | self.assertEqual(self.spider.download_delay, delay)
224 |
225 | retry_after = 1.5
226 | headers = {'retry-after': str(retry_after)}
227 | res = Response(
228 | ban_url, status=self.bancode, headers=headers, request=req)
229 | mw.process_response(req, res, self.spider)
230 | self.assertEqual(slot.delay, retry_after)
231 | self.assertEqual(self.spider.download_delay, delay)
232 |
233 | res = Response(url, request=req)
234 | mw.process_response(req, res, self.spider)
235 | self.assertEqual(slot.delay, delay)
236 | self.assertEqual(self.spider.download_delay, delay)
237 |
238 | # server failures
239 | mw.process_exception(req, ConnectionRefusedError(), self.spider)
240 | self.assertEqual(slot.delay, mw.connection_refused_delay)
241 | self.assertEqual(self.spider.download_delay, delay)
242 |
243 | res = Response(ban_url, request=req)
244 | mw.process_response(req, res, self.spider)
245 | self.assertEqual(slot.delay, delay)
246 | self.assertEqual(self.spider.download_delay, delay)
247 |
248 | mw.process_exception(req, ConnectionRefusedError(), self.spider)
249 | self.assertEqual(slot.delay, mw.connection_refused_delay)
250 | self.assertEqual(self.spider.download_delay, delay)
251 |
252 | res = Response(ban_url, status=self.bancode, request=req)
253 | mw.process_response(req, res, self.spider)
254 | self.assertEqual(slot.delay, delay)
255 | self.assertEqual(self.spider.download_delay, delay)
256 |
257 | def test_jobid_header(self):
258 | # test without the environment variable 'SCRAPY_JOB'
259 | self.spider.crawlera_enabled = True
260 | crawler = self._mock_crawler(self.settings)
261 | mw = self.mwcls.from_crawler(crawler)
262 | mw.open_spider(self.spider)
263 | req = Request('http://www.scrapytest.org')
264 | self.assertEqual(mw.process_request(req, self.spider), None)
265 | self.assertEqual(req.headers.get('X-Crawlera-Jobid'), None)
266 |
267 | # test with the environment variable 'SCRAPY_JOB'
268 | os.environ['SCRAPY_JOB'] = '2816'
269 | self.spider.crawlera_enabled = True
270 | crawler1 = self._mock_crawler(self.settings)
271 |         mw1 = self.mwcls.from_crawler(crawler1)
272 | mw1.open_spider(self.spider)
273 | req1 = Request('http://www.scrapytest.org')
274 | self.assertEqual(mw1.process_request(req1, self.spider), None)
275 | self.assertEqual(req1.headers.get('X-Crawlera-Jobid'), b'2816')
276 |
--------------------------------------------------------------------------------
/tests/test_deltafetch.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase, skipIf
2 |
3 | import os
4 | import mock
5 | import tempfile
6 | from scrapy import Request
7 | from scrapy.item import BaseItem
8 | from scrapy.spiders import Spider
9 | from scrapy.settings import Settings
10 | from scrapy.exceptions import NotConfigured
11 | from scrapy.utils.request import request_fingerprint
12 | from scrapy.utils.python import to_bytes
13 | from scrapylib.deltafetch import DeltaFetch
14 | from scrapy.statscollectors import StatsCollector
15 | from scrapy.utils.test import get_crawler
16 |
17 | dbmodule = None
18 | try:
19 | dbmodule = __import__('bsddb3')
20 | except ImportError:
21 | try:
22 | dbmodule = __import__('bsddb')
23 | except ImportError:
24 | pass
25 |
26 |
27 | @skipIf(not dbmodule, "bsddb3/bsddb is not found on the system")
28 | class DeltaFetchTestCase(TestCase):
29 |
30 | mwcls = DeltaFetch
31 |
32 | def setUp(self):
33 | self.spider = Spider('df_tests')
34 | self.temp_dir = tempfile.gettempdir()
35 | self.db_path = os.path.join(self.temp_dir, 'df_tests.db')
36 | crawler = get_crawler(Spider)
37 | self.stats = StatsCollector(crawler)
38 |
39 | def test_init(self):
40 |         # the path can be any directory; the folder is not created here
41 | instance = self.mwcls('/any/dir', True, stats=self.stats)
42 | assert isinstance(instance, self.mwcls)
43 | self.assertEqual(instance.dir, '/any/dir')
44 | self.assertEqual(self.stats.get_stats(), {})
45 | self.assertEqual(instance.reset, True)
46 |
47 | def test_init_from_crawler(self):
48 | crawler = mock.Mock()
49 | # void settings
50 | crawler.settings = Settings({})
51 | self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)
52 | with mock.patch('scrapy.utils.project.project_data_dir') as data_dir:
53 | data_dir.return_value = self.temp_dir
54 |
55 | # simple project_data_dir mock with based settings
56 | crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
57 | instance = self.mwcls.from_crawler(crawler)
58 | assert isinstance(instance, self.mwcls)
59 | self.assertEqual(
60 | instance.dir, os.path.join(self.temp_dir, 'deltafetch'))
61 | self.assertEqual(instance.reset, False)
62 |
63 | # project_data_dir mock with advanced settings
64 | crawler.settings = Settings({'DELTAFETCH_ENABLED': True,
65 | 'DELTAFETCH_DIR': 'other',
66 | 'DELTAFETCH_RESET': True})
67 | instance = self.mwcls.from_crawler(crawler)
68 | assert isinstance(instance, self.mwcls)
69 | self.assertEqual(
70 | instance.dir, os.path.join(self.temp_dir, 'other'))
71 | self.assertEqual(instance.reset, True)
72 |
73 | def test_spider_opened_new(self):
74 | if os.path.exists(self.db_path):
75 | os.remove(self.db_path)
76 | mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
77 | assert not hasattr(self.mwcls, 'db')
78 | mw.spider_opened(self.spider)
79 | assert os.path.isdir(self.temp_dir)
80 | assert os.path.exists(self.db_path)
81 | assert hasattr(mw, 'db')
82 | assert isinstance(mw.db, type(dbmodule.db.DB()))
83 | assert mw.db.items() == []
84 | assert mw.db.get_type() == dbmodule.db.DB_HASH
85 | assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
86 |
87 | def test_spider_opened_existing(self):
88 | self._create_test_db()
89 | mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
90 | assert not hasattr(self.mwcls, 'db')
91 | mw.spider_opened(self.spider)
92 | assert hasattr(mw, 'db')
93 | assert isinstance(mw.db, type(dbmodule.db.DB()))
94 | assert mw.db.items() == [(b'test_key_1', b'test_v_1'),
95 | (b'test_key_2', b'test_v_2')]
96 | assert mw.db.get_type() == dbmodule.db.DB_HASH
97 | assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
98 |
99 | def test_spider_opened_existing_spider_reset(self):
100 | self._create_test_db()
101 | mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
102 | assert not hasattr(self.mwcls, 'db')
103 | self.spider.deltafetch_reset = True
104 | mw.spider_opened(self.spider)
105 | assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE
106 |
107 | def test_spider_opened_reset_non_existing_db(self):
108 | mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
109 | assert not hasattr(self.mwcls, 'db')
110 | self.spider.deltafetch_reset = True
111 | mw.spider_opened(self.spider)
112 | assert mw.db.fd()
113 | # there's different logic for different bdb versions:
114 | # it can fail when opening a non-existing db with truncate flag,
115 | # then it should be caught and retried with rm & create flag
116 | assert (mw.db.get_open_flags() == dbmodule.db.DB_CREATE or
117 | mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE)
118 |
119 | def test_spider_opened_recreate(self):
120 | self._create_test_db()
121 | mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
122 | assert not hasattr(self.mwcls, 'db')
123 | mw.spider_opened(self.spider)
124 | assert hasattr(mw, 'db')
125 | assert isinstance(mw.db, type(dbmodule.db.DB()))
126 | assert mw.db.items() == []
127 | assert mw.db.get_type() == dbmodule.db.DB_HASH
128 | assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE
129 |
130 | def test_spider_closed(self):
131 | self._create_test_db()
132 | mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
133 | mw.spider_opened(self.spider)
134 | assert mw.db.fd()
135 | mw.spider_closed(self.spider)
136 | self.assertRaises(dbmodule.db.DBError, mw.db.fd)
137 |
138 | def test_process_spider_output(self):
139 | self._create_test_db()
140 | mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
141 | mw.spider_opened(self.spider)
142 | response = mock.Mock()
143 | response.request = Request('http://url',
144 | meta={'deltafetch_key': 'key'})
145 | result = []
146 | self.assertEqual(list(mw.process_spider_output(
147 | response, result, self.spider)), [])
148 | result = [
149 | Request('http://url', meta={'deltafetch_key': 'key1'}),
150 | Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
151 | ]
152 | self.assertEqual(list(mw.process_spider_output(
153 | response, result, self.spider)), [result[0]])
154 | self.assertEqual(self.stats.get_stats(), {'deltafetch/skipped': 1})
155 | result = [BaseItem(), "not a base item"]
156 | self.assertEqual(list(mw.process_spider_output(
157 | response, result, self.spider)), result)
158 | self.assertEqual(mw.db.keys(), [b'test_key_1', b'key', b'test_key_2'])
159 | assert mw.db[b'key']
160 |
161 | def test_process_spider_output_stats(self):
162 | self._create_test_db()
163 | mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
164 | mw.spider_opened(self.spider)
165 | response = mock.Mock()
166 | response.request = Request('http://url',
167 | meta={'deltafetch_key': 'key'})
168 | result = []
169 | self.assertEqual(list(mw.process_spider_output(
170 | response, result, self.spider)), [])
171 | self.assertEqual(self.stats.get_stats(), {})
172 | result = [
173 | Request('http://url', meta={'deltafetch_key': 'key'}),
174 | Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
175 | ]
176 | self.assertEqual(list(mw.process_spider_output(
177 | response, result, self.spider)), [result[0]])
178 | self.assertEqual(self.stats.get_value('deltafetch/skipped'), 1)
179 | result = [BaseItem(), "not a base item"]
180 | self.assertEqual(list(mw.process_spider_output(
181 | response, result, self.spider)), result)
182 | self.assertEqual(self.stats.get_value('deltafetch/stored'), 1)
183 |
184 | def test_init_from_crawler_legacy(self):
185 | # test with subclass not handling passed stats
186 | class LegacyDeltaFetchSubClass(self.mwcls):
187 |
188 | def __init__(self, dir, reset=False, *args, **kwargs):
189 | super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset)
190 | self.something = True
191 |
192 | crawler = mock.Mock()
193 | # void settings
194 | crawler.settings = Settings({})
195 | self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)
196 |
197 | with mock.patch('scrapy.utils.project.project_data_dir') as data_dir:
198 | data_dir.return_value = self.temp_dir
199 |
200 | # simple project_data_dir mock with based settings
201 | crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
202 | instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
203 | assert isinstance(instance, self.mwcls)
204 | self.assertEqual(
205 | instance.dir, os.path.join(self.temp_dir, 'deltafetch'))
206 | self.assertEqual(instance.reset, False)
207 |
208 | # project_data_dir mock with advanced settings
209 | crawler.settings = Settings({'DELTAFETCH_ENABLED': True,
210 | 'DELTAFETCH_DIR': 'other',
211 | 'DELTAFETCH_RESET': True})
212 | instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
213 | assert isinstance(instance, self.mwcls)
214 | self.assertEqual(
215 | instance.dir, os.path.join(self.temp_dir, 'other'))
216 | self.assertEqual(instance.reset, True)
217 |
218 | def test_process_spider_output_stats_legacy(self):
219 | # testing the subclass not handling stats works at runtime
220 | # (i.e. that trying to update stats does not trigger exception)
221 | class LegacyDeltaFetchSubClass(self.mwcls):
222 |
223 | def __init__(self, dir, reset=False, *args, **kwargs):
224 | super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset)
225 | self.something = True
226 |
227 | self._create_test_db()
228 | mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False)
229 | mw.spider_opened(self.spider)
230 | response = mock.Mock()
231 | response.request = Request('http://url',
232 | meta={'deltafetch_key': 'key'})
233 | result = []
234 | self.assertEqual(list(mw.process_spider_output(
235 | response, result, self.spider)), [])
236 | self.assertEqual(self.stats.get_stats(), {})
237 | result = [
238 | Request('http://url', meta={'deltafetch_key': 'key'}),
239 | Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
240 | ]
241 |
242 | # stats should not be updated
243 | self.assertEqual(list(mw.process_spider_output(
244 | response, result, self.spider)), [result[0]])
245 | self.assertEqual(self.stats.get_value('deltafetch/skipped'), None)
246 |
247 | result = [BaseItem(), "not a base item"]
248 | self.assertEqual(list(mw.process_spider_output(
249 | response, result, self.spider)), result)
250 | self.assertEqual(self.stats.get_value('deltafetch/stored'), None)
251 |
252 | def test_get_key(self):
253 | mw = self.mwcls(self.temp_dir, reset=True)
254 | test_req1 = Request('http://url1')
255 | self.assertEqual(mw._get_key(test_req1),
256 | to_bytes(request_fingerprint(test_req1)))
257 | test_req2 = Request('http://url2', meta={'deltafetch_key': 'dfkey1'})
258 | self.assertEqual(mw._get_key(test_req2), b'dfkey1')
259 |
260 | def _create_test_db(self):
261 | db = dbmodule.db.DB()
262 | # truncate test db if there were failed tests
263 | db.open(self.db_path, dbmodule.db.DB_HASH,
264 | dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE)
265 | db.put(b'test_key_1', b'test_v_1')
266 | db.put(b'test_key_2', b'test_v_2')
267 | db.close()
268 |
--------------------------------------------------------------------------------