├── .gitignore
├── README.md
├── requirements.txt
├── scrapy_redis_bloomfilter
│   ├── __init__.py
│   ├── __version__.py
│   ├── bloomfilter.py
│   ├── defaults.py
│   ├── dupefilter.py
│   └── scheduler.py
├── setup.py
└── tests
    ├── scrapy.cfg
    └── tests
        ├── __init__.py
        ├── items.py
        ├── middlewares.py
        ├── pipelines.py
        ├── settings.py
        └── spiders
            ├── __init__.py
            └── test.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 | .idea
8 | # C extensions
9 | *.so
10 | .DS_Store
11 | # Distribution / packaging
12 | .Python
13 | env/
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | 
51 | # Translations
52 | *.mo
53 | *.pot
54 | 
55 | # Django stuff:
56 | *.log
57 | local_settings.py
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # dotenv
85 | .env
86 | 
87 | # virtualenv
88 | .venv
89 | venv/
90 | ENV/
91 | 
92 | # Spyder project settings
93 | .spyderproject
94 | 
95 | # Rope project settings
96 | .ropeproject
97 | 
98 | .gitignore
99 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy-Redis-BloomFilter
2 | 
3 | This package adds Bloom Filter support to Scrapy-Redis, so request fingerprints are stored in a memory-efficient Bloom filter in Redis instead of a plain Redis set.
4 | 
5 | ## Installation
6 | 
7 | You can install this package with pip:
8 | 
9 | ```
10 | pip install scrapy-redis-bloomfilter
11 | ```
12 | 
13 | Dependency:
14 | 
15 | - Scrapy-Redis >= 0.6.8
16 | 
17 | ## Usage
18 | 
19 | Add these settings to `settings.py`:
20 | 
21 | ```python
22 | # Use this Scheduler if your scrapy_redis version is <= 0.7.1
23 | SCHEDULER = "scrapy_redis_bloomfilter.scheduler.Scheduler"
24 | 
25 | # Ensure all spiders share the same duplicates filter through Redis
26 | DUPEFILTER_CLASS = "scrapy_redis_bloomfilter.dupefilter.RFPDupeFilter"
27 | 
28 | # Redis URL
29 | REDIS_URL = 'redis://localhost:6379'
30 | 
31 | # Number of hash functions to use, defaults to 6
32 | BLOOMFILTER_HASH_NUMBER = 6
33 | 
34 | # Number of Redis memory bits used by the Bloom filter; 30 means 2^30 bits = 128 MB, defaults to 30 (a small value such as 10 is only for quick tests)
35 | BLOOMFILTER_BIT = 10
36 | 
37 | # Persist the request queue and the Bloom filter between runs
38 | SCHEDULER_PERSIST = True
39 | ```
40 | 
41 | ## Test
42 | 
43 | A test spider is included in this project; to run it:
44 | 
45 | ```
46 | git clone https://github.com/Python3WebSpider/ScrapyRedisBloomFilter.git
47 | cd ScrapyRedisBloomFilter/tests
48 | scrapy crawl test
49 | ```
50 | 
51 | Note: change REDIS_URL in settings.py to point at your Redis server before running.
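52 | 
53 | The `BloomFilter` class can also be used on its own, outside the dupefilter. Below is a minimal sketch, assuming a local Redis instance; the key name `test:bloomfilter` and the sizes used here are only illustrative:
54 | 
55 | ```python
56 | import redis
57 | from scrapy_redis_bloomfilter.bloomfilter import BloomFilter
58 | 
59 | # Connect to the same Redis instance the spider uses
60 | server = redis.StrictRedis.from_url('redis://localhost:6379')
61 | 
62 | # bit=20 (2^20 bits) keeps this example small; the package default is 30
63 | bf = BloomFilter(server, key='test:bloomfilter', bit=20, hash_number=6)
64 | 
65 | bf.insert('https://www.baidu.com/s?wd=1')
66 | print(bf.exists('https://www.baidu.com/s?wd=1'))    # truthy: the value was inserted
67 | print(bf.exists('https://www.baidu.com/s?wd=999'))  # usually falsy, though false positives are possible
68 | ```
69 | 
70 | Keep in mind that a Bloom filter can report false positives (a value that was never inserted may be reported as seen); that is the trade-off for its low memory footprint, so size `BLOOMFILTER_BIT` and `BLOOMFILTER_HASH_NUMBER` according to how many requests you expect to crawl.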
71 | 
72 | The test spider looks like this:
73 | 
74 | ```python
75 | from scrapy import Request, Spider
76 | 
77 | class TestSpider(Spider):
78 |     name = 'test'
79 |     base_url = 'https://www.baidu.com/s?wd='
80 | 
81 |     def start_requests(self):
82 |         for i in range(10):
83 |             url = self.base_url + str(i)
84 |             yield Request(url, callback=self.parse)
85 | 
86 |         # The first 10 of these 100 requests duplicate the ones above
87 |         for i in range(100):
88 |             url = self.base_url + str(i)
89 |             yield Request(url, callback=self.parse)
90 | 
91 |     def parse(self, response):
92 |         self.logger.debug('Response of ' + response.url)
93 | ```
94 | 
95 | The crawl stats then look like this:
96 | 
97 | ```python
98 | {'bloomfilter/filtered': 10,  # number of requests filtered out by the Bloom filter
99 | 'downloader/request_bytes': 34021,
100 | 'downloader/request_count': 100,
101 | 'downloader/request_method_count/GET': 100,
102 | 'downloader/response_bytes': 72943,
103 | 'downloader/response_count': 100,
104 | 'downloader/response_status_count/200': 100,
105 | 'finish_reason': 'finished',
106 | 'finish_time': datetime.datetime(2017, 8, 11, 9, 34, 30, 419597),
107 | 'log_count/DEBUG': 202,
108 | 'log_count/INFO': 7,
109 | 'memusage/max': 54153216,
110 | 'memusage/startup': 54153216,
111 | 'response_received_count': 100,
112 | 'scheduler/dequeued/redis': 100,
113 | 'scheduler/enqueued/redis': 100,
114 | 'start_time': datetime.datetime(2017, 8, 11, 9, 34, 26, 495018)}
115 | ```
116 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scrapy-redis>=0.6.8
--------------------------------------------------------------------------------
/scrapy_redis_bloomfilter/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Germey'
2 | __email__ = 'cqc@cuiqingcai.com'
3 | 
4 | from .dupefilter import RFPDupeFilter
5 | from .scheduler import Scheduler
6 | 
--------------------------------------------------------------------------------
/scrapy_redis_bloomfilter/__version__.py:
--------------------------------------------------------------------------------
1 | VERSION = (0, 8, '1')
2 | 
3 | version = __version__ = '.'.join(map(str, VERSION))
4 | 
--------------------------------------------------------------------------------
/scrapy_redis_bloomfilter/bloomfilter.py:
--------------------------------------------------------------------------------
1 | from .defaults import BLOOMFILTER_BIT, BLOOMFILTER_HASH_NUMBER
2 | 
3 | 
4 | class HashMap(object):
5 |     def __init__(self, m, seed):
6 |         self.m = m
7 |         self.seed = seed
8 | 
9 |     def hash(self, value):
10 |         """
11 |         Hash Algorithm
12 |         :param value: Value
13 |         :return: Hash Value
14 |         """
15 |         ret = 0
16 |         for i in range(len(value)):
17 |             ret += self.seed * ret + ord(value[i])
18 |         return (self.m - 1) & ret
19 | 
20 | 
21 | class BloomFilter(object):
22 |     def __init__(self, server, key, bit=BLOOMFILTER_BIT, hash_number=BLOOMFILTER_HASH_NUMBER):
23 |         """
24 |         Initialize BloomFilter
25 |         :param server: Redis Server
26 |         :param key: BloomFilter Key
27 |         :param bit: m = 2 ^ bit
28 |         :param hash_number: the number of hash functions
29 |         """
30 |         # defaults to 1 << 30 = 1,073,741,824 bits = 2^30 bits = 128 MB; holds at most 2^30 / hash_number = 178,956,970 fingerprints
31 |         self.m = 1 << bit
32 |         self.seeds = range(hash_number)
33 |         self.server = server
34 |         self.key = key
35 |         self.maps = [HashMap(self.m, seed) for seed in self.seeds]
36 | 
37 |     def exists(self, value):
38 |         """
39 |         Check whether a value may already be in the filter
40 |         :param value: value to check
41 |         :return: truthy if the value probably exists (false positives are possible), falsy otherwise
42 |         """
43 |         if not value:
44 |             return False
45 |         exist = True
46 |         for hash_map in self.maps:
47 |             offset = hash_map.hash(value)
48 |             exist = exist & self.server.getbit(self.key, offset)
49 |         return exist
50 | 
51 |     def insert(self, value):
52 |         """
53 |         Add a value to the Bloom filter
54 |         :param value: value to add
55 |         :return: None
56 |         """
57 |         for hash_map in self.maps:
58 |             offset = hash_map.hash(value)
59 |             self.server.setbit(self.key, offset, 1)
60 | 
--------------------------------------------------------------------------------
/scrapy_redis_bloomfilter/defaults.py:
--------------------------------------------------------------------------------
1 | from scrapy_redis.defaults import *
2 | 
3 | BLOOMFILTER_HASH_NUMBER = 6
4 | BLOOMFILTER_BIT = 30
5 | DUPEFILTER_DEBUG = False
6 | 
7 | SCHEDULER_DUPEFILTER_KEY = '%(spider)s:bloomfilter'
8 | 
--------------------------------------------------------------------------------
/scrapy_redis_bloomfilter/dupefilter.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 | from .defaults import BLOOMFILTER_HASH_NUMBER, BLOOMFILTER_BIT, DUPEFILTER_DEBUG
4 | from . import defaults
5 | from scrapy_redis.connection import get_redis_from_settings
6 | from .bloomfilter import BloomFilter
7 | from scrapy_redis.dupefilter import RFPDupeFilter as BaseDupeFilter
8 | 
9 | logger = logging.getLogger(__name__)
10 | 
11 | 
12 | class RFPDupeFilter(BaseDupeFilter):
13 |     """Redis-based request duplicates filter.
14 | 
15 |     This class can also be used with Scrapy's default scheduler.
16 | 
17 |     """
18 | 
19 |     logger = logger
20 | 
21 |     def __init__(self, server, key, debug, bit, hash_number):
22 |         """Initialize the duplicates filter.
23 | 
24 |         Parameters
25 |         ----------
26 |         server : redis.StrictRedis
27 |             The redis server instance.
28 |         key : str
29 |             Redis key where to store fingerprints.
30 |         debug : bool
31 |             Whether to log filtered requests.
32 | 
33 |         """
34 |         self.server = server
35 |         self.key = key
36 |         self.debug = debug
37 |         self.bit = bit
38 |         self.hash_number = hash_number
39 |         self.logdupes = True
40 |         self.bf = BloomFilter(server, self.key, bit, hash_number)
41 | 
42 |     @classmethod
43 |     def from_settings(cls, settings):
44 |         """Returns an instance from given settings.
45 | 
46 |         This uses by default the key ``dupefilter:<timestamp>``. When using the
47 |         ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
48 |         it needs to pass the spider name in the key.
49 | 
50 |         Parameters
51 |         ----------
52 |         settings : scrapy.settings.Settings
53 | 
54 |         Returns
55 |         -------
56 |         RFPDupeFilter
57 |             A RFPDupeFilter instance.
58 | 
59 | 
60 |         """
61 |         server = get_redis_from_settings(settings)
62 |         # XXX: This creates a one-time key, which is needed to support using this
63 |         # class as a standalone dupefilter with Scrapy's default scheduler;
64 |         # if Scrapy passed the spider on the open() method this wouldn't be needed.
65 |         # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
66 |         key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
67 |         debug = settings.getbool('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG)
68 |         bit = settings.getint('BLOOMFILTER_BIT', BLOOMFILTER_BIT)
69 |         hash_number = settings.getint('BLOOMFILTER_HASH_NUMBER', BLOOMFILTER_HASH_NUMBER)
70 |         return cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number)
71 | 
72 |     @classmethod
73 |     def from_crawler(cls, crawler):
74 |         """Returns instance from crawler.
75 | 
76 |         Parameters
77 |         ----------
78 |         crawler : scrapy.crawler.Crawler
79 | 
80 |         Returns
81 |         -------
82 |         RFPDupeFilter
83 |             Instance of RFPDupeFilter.
84 | 
85 |         """
86 |         instance = cls.from_settings(crawler.settings)
87 |         return instance
88 | 
89 |     @classmethod
90 |     def from_spider(cls, spider):
91 |         """Returns instance from spider.
92 | 
93 |         Parameters
94 |         ----------
95 |         spider : scrapy.spiders.Spider
96 | 
97 |         Returns
98 |         -------
99 |         RFPDupeFilter
100 |             Instance of RFPDupeFilter.
101 | 
102 |         """
103 |         settings = spider.settings
104 |         server = get_redis_from_settings(settings)
105 |         dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY)
106 |         key = dupefilter_key % {'spider': spider.name}
107 |         debug = settings.getbool('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG)
108 |         bit = settings.getint('BLOOMFILTER_BIT', BLOOMFILTER_BIT)
109 |         hash_number = settings.getint('BLOOMFILTER_HASH_NUMBER', BLOOMFILTER_HASH_NUMBER)
110 |         logger.debug('Bloom filter dupefilter: key=%s, bit=%s, hash_number=%s', key, bit, hash_number)
111 |         instance = cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number)
112 |         return instance
113 | 
114 |     def request_seen(self, request):
115 |         """Returns True if request was already seen.
116 | 
117 |         Parameters
118 |         ----------
119 |         request : scrapy.http.Request
120 | 
121 |         Returns
122 |         -------
123 |         bool
124 | 
125 |         """
126 |         fp = self.request_fingerprint(request)
127 |         # Check the Bloom filter; only insert the fingerprint if it is not (probably) there yet.
128 |         if self.bf.exists(fp):
129 |             return True
130 |         self.bf.insert(fp)
131 |         return False
132 | 
133 |     def log(self, request, spider):
134 |         """Logs given request.
135 | 
136 |         Parameters
137 |         ----------
138 |         request : scrapy.http.Request
139 |         spider : scrapy.spiders.Spider
140 | 
141 |         """
142 |         if self.debug:
143 |             msg = "Filtered duplicate request: %(request)s"
144 |             self.logger.debug(msg, {'request': request}, extra={'spider': spider})
145 |         elif self.logdupes:
146 |             msg = ("Filtered duplicate request %(request)s"
147 |                    " - no more duplicates will be shown"
148 |                    " (see DUPEFILTER_DEBUG to show all duplicates)")
149 |             self.logger.debug(msg, {'request': request}, extra={'spider': spider})
150 |             self.logdupes = False
151 |         spider.crawler.stats.inc_value('bloomfilter/filtered', spider=spider)
152 | 
--------------------------------------------------------------------------------
/scrapy_redis_bloomfilter/scheduler.py:
--------------------------------------------------------------------------------
1 | from scrapy.utils.misc import load_object
2 | from scrapy_redis.scheduler import Scheduler as BaseScheduler
3 | 
4 | 
5 | class Scheduler(BaseScheduler):
6 | 
7 |     def open(self, spider):
8 |         """
9 |         Override the open method because newer scrapy-redis versions no longer use from_spider when initializing the dupefilter (df) object.
10 |         Parameters
11 |         ----------
12 |         spider : scrapy.spiders.Spider
13 | 
14 |         Returns
15 |         -------
16 | 
17 |         """
18 |         self.spider = spider
19 | 
20 |         try:
21 |             self.queue = load_object(self.queue_cls)(
22 |                 server=self.server,
23 |                 spider=spider,
24 |                 key=self.queue_key % {'spider': spider.name},
25 |                 serializer=self.serializer,
26 |             )
27 |         except TypeError as e:
28 |             raise ValueError("Failed to instantiate queue class '%s': %s"
29 |                              % (self.queue_cls, e))
30 | 
31 |         try:
32 |             self.df = load_object(self.dupefilter_cls).from_spider(spider)
33 |         except TypeError as e:
34 |             raise ValueError("Failed to instantiate dupefilter class '%s': %s"
35 |                              % (self.dupefilter_cls, e))
36 | 
37 |         if self.flush_on_start:
38 |             self.flush()
39 |         # notice if there are requests already in the queue to resume the crawl
40 |         if len(self.queue):
41 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from os.path import join, isfile 5 | from os import walk 6 | import io 7 | import os 8 | import sys 9 | from shutil import rmtree 10 | from setuptools import find_packages, setup, Command 11 | 12 | 13 | def read_file(filename): 14 | with open(filename) as fp: 15 | return fp.read().strip() 16 | 17 | 18 | def read_requirements(filename): 19 | return [line.strip() for line in read_file(filename).splitlines() 20 | if not line.startswith('#')] 21 | 22 | 23 | NAME = 'Scrapy-Redis-BloomFilter' 24 | FOLDER = 'scrapy_redis_bloomfilter' 25 | DESCRIPTION = 'Bloom Filter Support for Scrapy-Redis' 26 | URL = 'https://github.com/Python3WebSpider/ScrapyRedisBloomFilter' 27 | EMAIL = 'cqc@cuiqingcai.com' 28 | AUTHOR = 'Germey' 29 | REQUIRES_PYTHON = '>=3.5.0' 30 | VERSION = None 31 | 32 | REQUIRED = read_requirements('requirements.txt') 33 | 34 | here = os.path.abspath(os.path.dirname(__file__)) 35 | 36 | try: 37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 38 | long_description = '\n' + f.read() 39 | except FileNotFoundError: 40 | long_description = DESCRIPTION 41 | 42 | about = {} 43 | if not VERSION: 44 | with open(os.path.join(here, FOLDER, '__version__.py')) as f: 45 | exec(f.read(), about) 46 | else: 47 | about['__version__'] = VERSION 48 | 49 | 50 | def package_files(directories): 51 | paths = [] 52 | for item in directories: 53 | if isfile(item): 54 | paths.append(join('..', item)) 55 | continue 56 | for (path, directories, filenames) in walk(item): 57 | for filename in filenames: 58 | paths.append(join('..', path, filename)) 59 | return paths 60 | 61 | 62 | class UploadCommand(Command): 63 | description = 'Build and publish the package.' 
64 | user_options = [] 65 | 66 | @staticmethod 67 | def status(s): 68 | """Prints things in bold.""" 69 | print('\033[1m{0}\033[0m'.format(s)) 70 | 71 | def initialize_options(self): 72 | pass 73 | 74 | def finalize_options(self): 75 | pass 76 | 77 | def run(self): 78 | try: 79 | self.status('Removing previous builds…') 80 | rmtree(os.path.join(here, 'dist')) 81 | except OSError: 82 | pass 83 | 84 | self.status('Building Source and Wheel (universal) distribution…') 85 | os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)) 86 | 87 | self.status('Uploading the package to PyPI via Twine…') 88 | os.system('twine upload dist/*') 89 | 90 | self.status('Pushing git tags…') 91 | os.system('git tag v{0}'.format(about['__version__'])) 92 | os.system('git push --tags') 93 | 94 | sys.exit() 95 | 96 | 97 | setup( 98 | name=NAME, 99 | version=about['__version__'], 100 | description=DESCRIPTION, 101 | long_description=long_description, 102 | long_description_content_type='text/markdown', 103 | author=AUTHOR, 104 | author_email=EMAIL, 105 | python_requires=REQUIRES_PYTHON, 106 | url=URL, 107 | packages=find_packages(exclude=('tests',)), 108 | install_requires=REQUIRED, 109 | include_package_data=True, 110 | license='MIT', 111 | classifiers=[ 112 | 'License :: OSI Approved :: MIT License', 113 | 'Programming Language :: Python :: 3.5', 114 | 'Programming Language :: Python :: 3.6', 115 | 'Programming Language :: Python :: 3.7', 116 | 'Programming Language :: Python :: 3.8', 117 | 'Programming Language :: Python :: Implementation :: CPython', 118 | 'Programming Language :: Python :: Implementation :: PyPy' 119 | ], 120 | # $ setup.py publish support. 121 | cmdclass={ 122 | 'upload': UploadCommand, 123 | }, 124 | ) 125 | -------------------------------------------------------------------------------- /tests/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tests.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tests 12 | -------------------------------------------------------------------------------- /tests/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Python3WebSpider/ScrapyRedisBloomFilter/51df1529988dad89553c97ca6959644a54699d2b/tests/tests/__init__.py -------------------------------------------------------------------------------- /tests/tests/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TestsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /tests/tests/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TestsSpiderMiddleware(object): 
12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /tests/tests/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TestsPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /tests/tests/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tests project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tests' 13 | 14 | SPIDER_MODULES = ['tests.spiders'] 15 | NEWSPIDER_MODULE = 'tests.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'tests (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | # DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | # } 45 | 46 | # Enable or disable spider middlewares 47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 48 | # SPIDER_MIDDLEWARES = { 49 | # 'tests.middlewares.TestsSpiderMiddleware': 543, 50 | # } 51 | 52 | # Enable or disable downloader middlewares 53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 54 | # DOWNLOADER_MIDDLEWARES = { 55 | # 'tests.middlewares.MyCustomDownloaderMiddleware': 543, 56 | # } 57 | 58 | # Enable or disable extensions 59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 60 | # EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | # } 63 | 64 | # Configure item pipelines 65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 66 | # ITEM_PIPELINES = { 67 | # 'tests.pipelines.TestsPipeline': 300, 68 | # } 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 72 | # AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | # AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | # AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | # AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | # HTTPCACHE_ENABLED = True 86 | # HTTPCACHE_EXPIRATION_SECS = 0 87 | # HTTPCACHE_DIR = 'httpcache' 88 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | 91 | SCHEDULER = "scrapy_redis_bloomfilter.scheduler.Scheduler" 92 | 93 | # Ensure all spiders share same duplicates filter through redis. 
94 | DUPEFILTER_CLASS = "scrapy_redis_bloomfilter.dupefilter.RFPDupeFilter"
95 | 
96 | # Redis URL
97 | REDIS_URL = 'redis://localhost:6379'
98 | 
99 | # Number of hash functions to use, defaults to 6
100 | BLOOMFILTER_HASH_NUMBER = 6
101 | 
102 | # Number of Redis memory bits used by the Bloom filter (2^10 bits here for this small test; defaults to 30)
103 | BLOOMFILTER_BIT = 10
104 | 
105 | # Persist the request queue and the Bloom filter between runs
106 | SCHEDULER_PERSIST = True
107 | 
--------------------------------------------------------------------------------
/tests/tests/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/tests/tests/spiders/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy import Request, Spider
3 | 
4 | 
5 | class TestSpider(Spider):
6 |     name = 'test'
7 | 
8 |     base_url = 'https://www.baidu.com/s?wd='
9 | 
10 |     def start_requests(self):
11 |         for i in range(10):
12 |             url = self.base_url + str(i)
13 |             yield Request(url, callback=self.parse)
14 | 
15 |         for i in range(100):
16 |             url = self.base_url + str(i)
17 |             yield Request(url, callback=self.parse)
18 | 
19 |     def parse(self, response):
20 |         self.logger.debug('Response of ' + response.url)
21 | 
--------------------------------------------------------------------------------