├── .gitignore
├── README.md
├── requirements.txt
├── scrapy_redis_bloomfilter
│   ├── __init__.py
│   ├── __version__.py
│   ├── bloomfilter.py
│   ├── defaults.py
│   ├── dupefilter.py
│   └── scheduler.py
├── setup.py
└── tests
    ├── scrapy.cfg
    └── tests
        ├── __init__.py
        ├── items.py
        ├── middlewares.py
        ├── pipelines.py
        ├── settings.py
        └── spiders
            ├── __init__.py
            └── test.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 | .idea
8 | # C extensions
9 | *.so
10 | .DS_Store
11 | # Distribution / packaging
12 | .Python
13 | env/
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | 
51 | # Translations
52 | *.mo
53 | *.pot
54 | 
55 | # Django stuff:
56 | *.log
57 | local_settings.py
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # dotenv
85 | .env
86 | 
87 | # virtualenv
88 | .venv
89 | venv/
90 | ENV/
91 | 
92 | # Spyder project settings
93 | .spyderproject
94 | 
95 | # Rope project settings
96 | .ropeproject
97 | 
98 | .gitignore
99 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy-Redis-BloomFilter
2 | 
3 | This package adds Bloom Filter support to Scrapy-Redis, so request fingerprints are stored in a memory-efficient Bloom filter in Redis instead of a plain Redis set.
4 | 
5 | ## Installation
6 | 
7 | You can install this package with pip:
8 | 
9 | ```
10 | pip install scrapy-redis-bloomfilter
11 | ```
12 | 
13 | Dependency:
14 | 
15 | - Scrapy-Redis >= 0.6.8
16 | 
17 | ## Usage
18 | 
19 | Add these settings to `settings.py`:
20 | 
21 | ```python
22 | # Use this Scheduler if your scrapy_redis version is <= 0.7.1
23 | SCHEDULER = "scrapy_redis_bloomfilter.scheduler.Scheduler"
24 | 
25 | # Ensure all spiders share the same duplicates filter through Redis
26 | DUPEFILTER_CLASS = "scrapy_redis_bloomfilter.dupefilter.RFPDupeFilter"
27 | 
28 | # Redis URL
29 | REDIS_URL = 'redis://localhost:6379'
30 | 
31 | # Number of hash functions to use, defaults to 6
32 | BLOOMFILTER_HASH_NUMBER = 6
33 | 
34 | # Number of Redis memory bits used by the Bloom filter; 30 means 2^30 bits = 128 MB, defaults to 30 (a small value such as 10 is only for quick tests)
35 | BLOOMFILTER_BIT = 10
36 | 
37 | # Persist the request queue and the Bloom filter between runs
38 | SCHEDULER_PERSIST = True
39 | ```
40 | 
41 | ## Test
42 | 
43 | A test spider is included in this project; to run it:
44 | 
45 | ```
46 | git clone https://github.com/Python3WebSpider/ScrapyRedisBloomFilter.git
47 | cd ScrapyRedisBloomFilter/tests
48 | scrapy crawl test
49 | ```
50 | 
51 | Note: change REDIS_URL in settings.py to point at your Redis server before running.
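52 | 
53 | The `BloomFilter` class can also be used on its own, outside the dupefilter. Below is a minimal sketch, assuming a local Redis instance; the key name `test:bloomfilter` and the sizes used here are only illustrative:
54 | 
55 | ```python
56 | import redis
57 | from scrapy_redis_bloomfilter.bloomfilter import BloomFilter
58 | 
59 | # Connect to the same Redis instance the spider uses
60 | server = redis.StrictRedis.from_url('redis://localhost:6379')
61 | 
62 | # bit=20 (2^20 bits) keeps this example small; the package default is 30
63 | bf = BloomFilter(server, key='test:bloomfilter', bit=20, hash_number=6)
64 | 
65 | bf.insert('https://www.baidu.com/s?wd=1')
66 | print(bf.exists('https://www.baidu.com/s?wd=1'))    # truthy: the value was inserted
67 | print(bf.exists('https://www.baidu.com/s?wd=999'))  # usually falsy, though false positives are possible
68 | ```
69 | 
70 | Keep in mind that a Bloom filter can report false positives (a value that was never inserted may be reported as seen); that is the trade-off for its low memory footprint, so size `BLOOMFILTER_BIT` and `BLOOMFILTER_HASH_NUMBER` according to how many requests you expect to crawl.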
71 | 
72 | The test spider looks like this:
73 | 
74 | ```python
75 | from scrapy import Request, Spider
76 | 
77 | class TestSpider(Spider):
78 |     name = 'test'
79 |     base_url = 'https://www.baidu.com/s?wd='
80 | 
81 |     def start_requests(self):
82 |         for i in range(10):
83 |             url = self.base_url + str(i)
84 |             yield Request(url, callback=self.parse)
85 | 
86 |         # The first 10 of these 100 requests duplicate the ones above
87 |         for i in range(100):
88 |             url = self.base_url + str(i)
89 |             yield Request(url, callback=self.parse)
90 | 
91 |     def parse(self, response):
92 |         self.logger.debug('Response of ' + response.url)
93 | ```
94 | 
95 | The crawl stats then look like this:
96 | 
97 | ```python
98 | {'bloomfilter/filtered': 10,  # number of requests filtered out by the Bloom filter
99 | 'downloader/request_bytes': 34021,
100 | 'downloader/request_count': 100,
101 | 'downloader/request_method_count/GET': 100,
102 | 'downloader/response_bytes': 72943,
103 | 'downloader/response_count': 100,
104 | 'downloader/response_status_count/200': 100,
105 | 'finish_reason': 'finished',
106 | 'finish_time': datetime.datetime(2017, 8, 11, 9, 34, 30, 419597),
107 | 'log_count/DEBUG': 202,
108 | 'log_count/INFO': 7,
109 | 'memusage/max': 54153216,
110 | 'memusage/startup': 54153216,
111 | 'response_received_count': 100,
112 | 'scheduler/dequeued/redis': 100,
113 | 'scheduler/enqueued/redis': 100,
114 | 'start_time': datetime.datetime(2017, 8, 11, 9, 34, 26, 495018)}
115 | ```
116 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scrapy-redis>=0.6.8
--------------------------------------------------------------------------------
/scrapy_redis_bloomfilter/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Germey'
2 | __email__ = 'cqc@cuiqingcai.com'
3 | 
4 | from .dupefilter import RFPDupeFilter
5 | from .scheduler import Scheduler
6 | 
--------------------------------------------------------------------------------
/scrapy_redis_bloomfilter/__version__.py:
--------------------------------------------------------------------------------
1 | VERSION = (0, 8, '1')
2 | 
3 | version = __version__ = '.'.join(map(str, VERSION))
4 | 
--------------------------------------------------------------------------------
/scrapy_redis_bloomfilter/bloomfilter.py:
--------------------------------------------------------------------------------
1 | from .defaults import BLOOMFILTER_BIT, BLOOMFILTER_HASH_NUMBER
2 | 
3 | 
4 | class HashMap(object):
5 |     def __init__(self, m, seed):
6 |         self.m = m
7 |         self.seed = seed
8 | 
9 |     def hash(self, value):
10 |         """
11 |         Hash Algorithm
12 |         :param value: Value
13 |         :return: Hash Value
14 |         """
15 |         ret = 0
16 |         for i in range(len(value)):
17 |             ret += self.seed * ret + ord(value[i])
18 |         return (self.m - 1) & ret
19 | 
20 | 
21 | class BloomFilter(object):
22 |     def __init__(self, server, key, bit=BLOOMFILTER_BIT, hash_number=BLOOMFILTER_HASH_NUMBER):
23 |         """
24 |         Initialize BloomFilter
25 |         :param server: Redis Server
26 |         :param key: BloomFilter Key
27 |         :param bit: m = 2 ^ bit
28 |         :param hash_number: the number of hash functions
29 |         """
30 |         # defaults to 1 << 30 = 1,073,741,824 bits = 2^30 bits = 128 MB; holds at most 2^30 / hash_number = 178,956,970 fingerprints
31 |         self.m = 1 << bit
32 |         self.seeds = range(hash_number)
33 |         self.server = server
34 |         self.key = key
35 |         self.maps = [HashMap(self.m, seed) for seed in self.seeds]
36 | 
37 |     def exists(self, value):
38 |         """
39 |         Check whether a value may already be in the filter
40 |         :param value: value to check
41 |         :return: truthy if the value probably exists (false positives are possible), falsy otherwise
42 |         """
43 |         if not value:
44 |             return False
45 |         exist = True
46 |         for hash_map in self.maps:
47 |             offset = hash_map.hash(value)
48 |             exist = exist & self.server.getbit(self.key, offset)
49 |         return exist
50 | 
51 |     def insert(self, value):
52 |         """
53 |         Add a value to the Bloom filter
54 |         :param value: value to add
55 |         :return: None
56 |         """
57 |         for hash_map in self.maps:
58 |             offset = hash_map.hash(value)
59 |             self.server.setbit(self.key, offset, 1)
60 | 
--------------------------------------------------------------------------------
/scrapy_redis_bloomfilter/defaults.py:
--------------------------------------------------------------------------------
1 | from scrapy_redis.defaults import *
2 | 
3 | BLOOMFILTER_HASH_NUMBER = 6
4 | BLOOMFILTER_BIT = 30
5 | DUPEFILTER_DEBUG = False
6 | 
7 | SCHEDULER_DUPEFILTER_KEY = '%(spider)s:bloomfilter'
8 | 
--------------------------------------------------------------------------------
/scrapy_redis_bloomfilter/dupefilter.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 | from .defaults import BLOOMFILTER_HASH_NUMBER, BLOOMFILTER_BIT, DUPEFILTER_DEBUG
4 | from . import defaults
5 | from scrapy_redis.connection import get_redis_from_settings
6 | from .bloomfilter import BloomFilter
7 | from scrapy_redis.dupefilter import RFPDupeFilter as BaseDupeFilter
8 | 
9 | logger = logging.getLogger(__name__)
10 | 
11 | 
12 | class RFPDupeFilter(BaseDupeFilter):
13 |     """Redis-based request duplicates filter.
14 | 
15 |     This class can also be used with Scrapy's default scheduler.
16 | 
17 |     """
18 | 
19 |     logger = logger
20 | 
21 |     def __init__(self, server, key, debug, bit, hash_number):
22 |         """Initialize the duplicates filter.
23 | 
24 |         Parameters
25 |         ----------
26 |         server : redis.StrictRedis
27 |             The redis server instance.
28 |         key : str
29 |             Redis key where to store fingerprints.
30 |         debug : bool
31 |             Whether to log filtered requests.
32 | 
33 |         """
34 |         self.server = server
35 |         self.key = key
36 |         self.debug = debug
37 |         self.bit = bit
38 |         self.hash_number = hash_number
39 |         self.logdupes = True
40 |         self.bf = BloomFilter(server, self.key, bit, hash_number)
41 | 
42 |     @classmethod
43 |     def from_settings(cls, settings):
44 |         """Returns an instance from given settings.
45 | 
46 |         This uses by default the key ``dupefilter:<timestamp>``. When using the
47 |         ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
48 |         it needs to pass the spider name in the key.
49 | 
50 |         Parameters
51 |         ----------
52 |         settings : scrapy.settings.Settings
53 | 
54 |         Returns
55 |         -------
56 |         RFPDupeFilter
57 |             A RFPDupeFilter instance.
58 | 
59 | 
60 |         """
61 |         server = get_redis_from_settings(settings)
62 |         # XXX: This creates a one-time key, which is needed to support using this
63 |         # class as a standalone dupefilter with Scrapy's default scheduler;
64 |         # if Scrapy passed the spider on the open() method this wouldn't be needed.
65 |         # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
66 |         key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
67 |         debug = settings.getbool('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG)
68 |         bit = settings.getint('BLOOMFILTER_BIT', BLOOMFILTER_BIT)
69 |         hash_number = settings.getint('BLOOMFILTER_HASH_NUMBER', BLOOMFILTER_HASH_NUMBER)
70 |         return cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number)
71 | 
72 |     @classmethod
73 |     def from_crawler(cls, crawler):
74 |         """Returns instance from crawler.
75 | 
76 |         Parameters
77 |         ----------
78 |         crawler : scrapy.crawler.Crawler
79 | 
80 |         Returns
81 |         -------
82 |         RFPDupeFilter
83 |             Instance of RFPDupeFilter.
84 | 
85 |         """
86 |         instance = cls.from_settings(crawler.settings)
87 |         return instance
88 | 
89 |     @classmethod
90 |     def from_spider(cls, spider):
91 |         """Returns instance from spider.
92 | 
93 |         Parameters
94 |         ----------
95 |         spider : scrapy.spiders.Spider
96 | 
97 |         Returns
98 |         -------
99 |         RFPDupeFilter
100 |             Instance of RFPDupeFilter.
101 | 
102 |         """
103 |         settings = spider.settings
104 |         server = get_redis_from_settings(settings)
105 |         dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY)
106 |         key = dupefilter_key % {'spider': spider.name}
107 |         debug = settings.getbool('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG)
108 |         bit = settings.getint('BLOOMFILTER_BIT', BLOOMFILTER_BIT)
109 |         hash_number = settings.getint('BLOOMFILTER_HASH_NUMBER', BLOOMFILTER_HASH_NUMBER)
110 |         logger.debug('Bloom filter dupefilter: key=%s, bit=%s, hash_number=%s', key, bit, hash_number)
111 |         instance = cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number)
112 |         return instance
113 | 
114 |     def request_seen(self, request):
115 |         """Returns True if request was already seen.
116 | 
117 |         Parameters
118 |         ----------
119 |         request : scrapy.http.Request
120 | 
121 |         Returns
122 |         -------
123 |         bool
124 | 
125 |         """
126 |         fp = self.request_fingerprint(request)
127 |         # Check the Bloom filter; only insert the fingerprint if it is not (probably) there yet.
128 |         if self.bf.exists(fp):
129 |             return True
130 |         self.bf.insert(fp)
131 |         return False
132 | 
133 |     def log(self, request, spider):
134 |         """Logs given request.
135 | 
136 |         Parameters
137 |         ----------
138 |         request : scrapy.http.Request
139 |         spider : scrapy.spiders.Spider
140 | 
141 |         """
142 |         if self.debug:
143 |             msg = "Filtered duplicate request: %(request)s"
144 |             self.logger.debug(msg, {'request': request}, extra={'spider': spider})
145 |         elif self.logdupes:
146 |             msg = ("Filtered duplicate request %(request)s"
147 |                    " - no more duplicates will be shown"
148 |                    " (see DUPEFILTER_DEBUG to show all duplicates)")
149 |             self.logger.debug(msg, {'request': request}, extra={'spider': spider})
150 |             self.logdupes = False
151 |         spider.crawler.stats.inc_value('bloomfilter/filtered', spider=spider)
152 | 
--------------------------------------------------------------------------------
/scrapy_redis_bloomfilter/scheduler.py:
--------------------------------------------------------------------------------
1 | from scrapy.utils.misc import load_object
2 | from scrapy_redis.scheduler import Scheduler as BaseScheduler
3 | 
4 | 
5 | class Scheduler(BaseScheduler):
6 | 
7 |     def open(self, spider):
8 |         """
9 |         Override the open method because newer scrapy-redis versions no longer use from_spider when initializing the dupefilter (df) object.
10 |         Parameters
11 |         ----------
12 |         spider : scrapy.spiders.Spider
13 | 
14 |         Returns
15 |         -------
16 | 
17 |         """
18 |         self.spider = spider
19 | 
20 |         try:
21 |             self.queue = load_object(self.queue_cls)(
22 |                 server=self.server,
23 |                 spider=spider,
24 |                 key=self.queue_key % {'spider': spider.name},
25 |                 serializer=self.serializer,
26 |             )
27 |         except TypeError as e:
28 |             raise ValueError("Failed to instantiate queue class '%s': %s"
29 |                              % (self.queue_cls, e))
30 | 
31 |         try:
32 |             self.df = load_object(self.dupefilter_cls).from_spider(spider)
33 |         except TypeError as e:
34 |             raise ValueError("Failed to instantiate dupefilter class '%s': %s"
35 |                              % (self.dupefilter_cls, e))
36 | 
37 |         if self.flush_on_start:
38 |             self.flush()
39 |         # notice if there are requests already in the queue to resume the crawl
40 |         if len(self.queue):
41 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from os.path import join, isfile 5 | from os import walk 6 | import io 7 | import os 8 | import sys 9 | from shutil import rmtree 10 | from setuptools import find_packages, setup, Command 11 | 12 | 13 | def read_file(filename): 14 | with open(filename) as fp: 15 | return fp.read().strip() 16 | 17 | 18 | def read_requirements(filename): 19 | return [line.strip() for line in read_file(filename).splitlines() 20 | if not line.startswith('#')] 21 | 22 | 23 | NAME = 'Scrapy-Redis-BloomFilter' 24 | FOLDER = 'scrapy_redis_bloomfilter' 25 | DESCRIPTION = 'Bloom Filter Support for Scrapy-Redis' 26 | URL = 'https://github.com/Python3WebSpider/ScrapyRedisBloomFilter' 27 | EMAIL = 'cqc@cuiqingcai.com' 28 | AUTHOR = 'Germey' 29 | REQUIRES_PYTHON = '>=3.5.0' 30 | VERSION = None 31 | 32 | REQUIRED = read_requirements('requirements.txt') 33 | 34 | here = os.path.abspath(os.path.dirname(__file__)) 35 | 36 | try: 37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 38 | long_description = '\n' + f.read() 39 | except FileNotFoundError: 40 | long_description = DESCRIPTION 41 | 42 | about = {} 43 | if not VERSION: 44 | with open(os.path.join(here, FOLDER, '__version__.py')) as f: 45 | exec(f.read(), about) 46 | else: 47 | about['__version__'] = VERSION 48 | 49 | 50 | def package_files(directories): 51 | paths = [] 52 | for item in directories: 53 | if isfile(item): 54 | paths.append(join('..', item)) 55 | continue 56 | for (path, directories, filenames) in walk(item): 57 | for filename in filenames: 58 | paths.append(join('..', path, filename)) 59 | return paths 60 | 61 | 62 | class UploadCommand(Command): 63 | description = 'Build and publish the package.' 
64 | user_options = [] 65 | 66 | @staticmethod 67 | def status(s): 68 | """Prints things in bold.""" 69 | print('\033[1m{0}\033[0m'.format(s)) 70 | 71 | def initialize_options(self): 72 | pass 73 | 74 | def finalize_options(self): 75 | pass 76 | 77 | def run(self): 78 | try: 79 | self.status('Removing previous builds…') 80 | rmtree(os.path.join(here, 'dist')) 81 | except OSError: 82 | pass 83 | 84 | self.status('Building Source and Wheel (universal) distribution…') 85 | os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)) 86 | 87 | self.status('Uploading the package to PyPI via Twine…') 88 | os.system('twine upload dist/*') 89 | 90 | self.status('Pushing git tags…') 91 | os.system('git tag v{0}'.format(about['__version__'])) 92 | os.system('git push --tags') 93 | 94 | sys.exit() 95 | 96 | 97 | setup( 98 | name=NAME, 99 | version=about['__version__'], 100 | description=DESCRIPTION, 101 | long_description=long_description, 102 | long_description_content_type='text/markdown', 103 | author=AUTHOR, 104 | author_email=EMAIL, 105 | python_requires=REQUIRES_PYTHON, 106 | url=URL, 107 | packages=find_packages(exclude=('tests',)), 108 | install_requires=REQUIRED, 109 | include_package_data=True, 110 | license='MIT', 111 | classifiers=[ 112 | 'License :: OSI Approved :: MIT License', 113 | 'Programming Language :: Python :: 3.5', 114 | 'Programming Language :: Python :: 3.6', 115 | 'Programming Language :: Python :: 3.7', 116 | 'Programming Language :: Python :: 3.8', 117 | 'Programming Language :: Python :: Implementation :: CPython', 118 | 'Programming Language :: Python :: Implementation :: PyPy' 119 | ], 120 | # $ setup.py publish support. 121 | cmdclass={ 122 | 'upload': UploadCommand, 123 | }, 124 | ) 125 | -------------------------------------------------------------------------------- /tests/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tests.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tests 12 | -------------------------------------------------------------------------------- /tests/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Python3WebSpider/ScrapyRedisBloomFilter/51df1529988dad89553c97ca6959644a54699d2b/tests/tests/__init__.py -------------------------------------------------------------------------------- /tests/tests/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TestsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /tests/tests/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TestsSpiderMiddleware(object): 
12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /tests/tests/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TestsPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /tests/tests/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tests project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tests' 13 | 14 | SPIDER_MODULES = ['tests.spiders'] 15 | NEWSPIDER_MODULE = 'tests.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'tests (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | # DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | # } 45 | 46 | # Enable or disable spider middlewares 47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 48 | # SPIDER_MIDDLEWARES = { 49 | # 'tests.middlewares.TestsSpiderMiddleware': 543, 50 | # } 51 | 52 | # Enable or disable downloader middlewares 53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 54 | # DOWNLOADER_MIDDLEWARES = { 55 | # 'tests.middlewares.MyCustomDownloaderMiddleware': 543, 56 | # } 57 | 58 | # Enable or disable extensions 59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 60 | # EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | # } 63 | 64 | # Configure item pipelines 65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 66 | # ITEM_PIPELINES = { 67 | # 'tests.pipelines.TestsPipeline': 300, 68 | # } 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 72 | # AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | # AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | # AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | # AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | # HTTPCACHE_ENABLED = True 86 | # HTTPCACHE_EXPIRATION_SECS = 0 87 | # HTTPCACHE_DIR = 'httpcache' 88 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | 91 | SCHEDULER = "scrapy_redis_bloomfilter.scheduler.Scheduler" 92 | 93 | # Ensure all spiders share same duplicates filter through redis. 
94 | DUPEFILTER_CLASS = "scrapy_redis_bloomfilter.dupefilter.RFPDupeFilter"
95 | 
96 | # Redis URL
97 | REDIS_URL = 'redis://localhost:6379'
98 | 
99 | # Number of hash functions to use, defaults to 6
100 | BLOOMFILTER_HASH_NUMBER = 6
101 | 
102 | # Number of Redis memory bits used by the Bloom filter (2^10 bits here for this small test; defaults to 30)
103 | BLOOMFILTER_BIT = 10
104 | 
105 | # Persist the request queue and the Bloom filter between runs
106 | SCHEDULER_PERSIST = True
107 | 
--------------------------------------------------------------------------------
/tests/tests/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/tests/tests/spiders/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy import Request, Spider
3 | 
4 | 
5 | class TestSpider(Spider):
6 |     name = 'test'
7 | 
8 |     base_url = 'https://www.baidu.com/s?wd='
9 | 
10 |     def start_requests(self):
11 |         for i in range(10):
12 |             url = self.base_url + str(i)
13 |             yield Request(url, callback=self.parse)
14 | 
15 |         for i in range(100):
16 |             url = self.base_url + str(i)
17 |             yield Request(url, callback=self.parse)
18 | 
19 |     def parse(self, response):
20 |         self.logger.debug('Response of ' + response.url)
21 | 
--------------------------------------------------------------------------------