├── README.md
├── requirements.txt
├── scrapy_bloomfilter
│   ├── __init__.py
│   ├── bloomfilter.py
│   ├── connection.py
│   ├── defaults.py
│   ├── dupefilter.py
│   ├── picklecompat.py
│   ├── pipelines.py
│   ├── queue.py
│   ├── scheduler.py
│   ├── spiders.py
│   └── utils.py
├── setup.py
└── tests
    ├── scrapy.cfg
    └── tests
        ├── __init__.py
        ├── items.py
        ├── middlewares.py
        ├── pipelines.py
        ├── settings.py
        └── spiders
            ├── __init__.py
            └── test.py

/README.md:
--------------------------------------------------------------------------------
1 | ### Usage
2 | #### Installation
3 | ```
4 | git clone https://github.com/ELI95/ScrapyBloomFilter.git
5 | cd ScrapyBloomFilter
6 | python setup.py build
7 | python setup.py install
8 | ```
9 | 
10 | #### Configuration
11 | ```python
12 | SCHEDULER = "scrapy_bloomfilter.scheduler.Scheduler"
13 | 
14 | DUPEFILTER_CLASS = "scrapy_bloomfilter.dupefilter.RedisDupeFilter"
15 | 
16 | REDIS_URL = 'redis://@localhost:6379'
17 | 
18 | BLOOMFILTER_HASH_NUMBER = 6
19 | 
20 | BLOOMFILTER_BIT = 10
21 | 
22 | SCHEDULER_PERSIST = False
23 | ```
24 | 
25 | 
26 | ### Formulas
27 | - n: number of items to store
28 | - p: acceptable false positive rate
29 | - m: length of the bit array
30 | - k: number of hash functions
31 | 
32 | > ![](http://latex.codecogs.com/png.latex?m=-\frac{n\ln{p}}{(\ln{2})^2})
33 | 
34 | > ![](http://latex.codecogs.com/png.latex?k=\frac{m}{n}\ln{2})
35 | 
36 | Here m = 2^BLOOMFILTER_BIT and k = BLOOMFILTER_HASH_NUMBER, so the library default BLOOMFILTER_BIT = 30 reserves 2^30 bits (128 MB) in Redis; for example, n = 1,000,000 and p = 0.01 give m ≈ 9.6 × 10^6 bits and k ≈ 7 (see the sizing sketch at the end of this listing).
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scrapy
2 | redis
3 | six
--------------------------------------------------------------------------------
/scrapy_bloomfilter/__init__.py:
--------------------------------------------------------------------------------
1 | from .connection import (
2 |     get_redis_from_settings,
3 |     get_redis,
4 | )
5 | 
--------------------------------------------------------------------------------
/scrapy_bloomfilter/bloomfilter.py:
--------------------------------------------------------------------------------
1 | from .defaults import BLOOMFILTER_BIT, BLOOMFILTER_HASH_NUMBER
2 | 
3 | 
4 | class HashMap(object):
5 |     def __init__(self, m, seed):
6 |         self.m = m
7 |         self.seed = seed
8 | 
9 |     def hash(self, value):
10 |         temp = 0
11 |         for i in range(len(value)):
12 |             temp += self.seed * temp + ord(value[i])
13 |         return (self.m - 1) & temp
14 | 
15 | 
16 | class BloomFilter(object):
17 |     def __init__(self, server, key, bit=BLOOMFILTER_BIT, hash_number=BLOOMFILTER_HASH_NUMBER):
18 |         self.server = server
19 |         self.key = key
20 |         self.m = 1 << bit
21 |         self.seeds = range(hash_number)
22 |         self.hash_maps = [HashMap(self.m, seed) for seed in self.seeds]
23 | 
24 |     def exists(self, value):
25 |         if not value:
26 |             return False
27 | 
28 |         exist = True
29 |         for hash_map in self.hash_maps:
30 |             offset = hash_map.hash(value)
31 |             exist = exist & self.server.getbit(self.key, offset)
32 |         return exist
33 | 
34 |     def insert(self, value):
35 |         for hash_map in self.hash_maps:
36 |             offset = hash_map.hash(value)
37 |             self.server.setbit(self.key, offset, 1)
38 | 
--------------------------------------------------------------------------------
/scrapy_bloomfilter/connection.py:
--------------------------------------------------------------------------------
1 | import six
2 | 
3 | from scrapy.utils.misc import load_object
4 | 
5 | from . 
import defaults
6 | 
7 | 
8 | SETTINGS_PARAMS_MAP = {
9 |     'REDIS_URL': 'url',
10 |     'REDIS_HOST': 'host',
11 |     'REDIS_PORT': 'port',
12 |     'REDIS_ENCODING': 'encoding',
13 | }
14 | 
15 | 
16 | def get_redis(**kwargs):
17 |     redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS)
18 |     url = kwargs.pop('url', None)
19 |     if url:
20 |         return redis_cls.from_url(url, **kwargs)
21 |     else:
22 |         return redis_cls(**kwargs)
23 | 
24 | 
25 | def get_redis_from_settings(settings):
26 |     params = defaults.REDIS_PARAMS.copy()
27 |     params.update(settings.getdict('REDIS_PARAMS'))
28 | 
29 |     for source, dest in SETTINGS_PARAMS_MAP.items():
30 |         val = settings.get(source)
31 |         if val:
32 |             params[dest] = val
33 | 
34 |     if isinstance(params.get('redis_cls'), six.string_types):
35 |         params['redis_cls'] = load_object(params['redis_cls'])
36 | 
37 |     return get_redis(**params)
38 | 
39 | 
40 | from_settings = get_redis_from_settings
41 | 
--------------------------------------------------------------------------------
/scrapy_bloomfilter/defaults.py:
--------------------------------------------------------------------------------
1 | import redis
2 | 
3 | 
4 | DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
5 | 
6 | PIPELINE_KEY = '%(spider)s:items'
7 | BLOOMFILTER_HASH_NUMBER = 6
8 | BLOOMFILTER_BIT = 30
9 | DUPEFILTER_DEBUG = False
10 | REDIS_CLS = redis.StrictRedis
11 | REDIS_ENCODING = 'utf-8'
12 | 
13 | REDIS_PARAMS = {
14 |     'socket_timeout': 30,
15 |     'socket_connect_timeout': 30,
16 |     'retry_on_timeout': True,
17 |     'encoding': REDIS_ENCODING,
18 | }
19 | 
20 | SCHEDULER_QUEUE_KEY = '%(spider)s:requests'
21 | SCHEDULER_QUEUE_CLASS = 'scrapy_bloomfilter.queue.PriorityQueue'
22 | SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'
23 | SCHEDULER_DUPEFILTER_CLASS = 'scrapy_bloomfilter.dupefilter.RedisDupeFilter'
24 | 
25 | START_URLS_KEY = '%(name)s:start_urls'
26 | START_URLS_AS_SET = False
27 | 
--------------------------------------------------------------------------------
/scrapy_bloomfilter/dupefilter.py:
--------------------------------------------------------------------------------
1 | import time
2 | import logging
3 | 
4 | from scrapy.dupefilters import BaseDupeFilter
5 | from scrapy.utils.request import request_fingerprint
6 | 
7 | from . import defaults
8 | from .connection import get_redis_from_settings
9 | from .bloomfilter import BloomFilter
10 | 
11 | 
12 | logger = logging.getLogger(__name__)
13 | 
14 | 
15 | class RedisDupeFilter(BaseDupeFilter):
16 |     logger = logger
17 | 
18 |     def __init__(self, server, key, debug, bit, hash_number):
19 |         self.server = server
20 |         self.key = key
21 |         self.debug = debug
22 |         self.bit = bit
23 |         self.hash_number = hash_number
24 |         self.logdupes = True
25 |         self.bf = BloomFilter(server, self.key, bit, hash_number)
26 | 
27 |     @classmethod
28 |     def from_settings(cls, settings):
29 |         server = get_redis_from_settings(settings)
30 |         key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
31 |         debug = settings.getbool('DUPEFILTER_DEBUG', defaults.DUPEFILTER_DEBUG)
32 |         bit = settings.getint('BLOOMFILTER_BIT', defaults.BLOOMFILTER_BIT)
33 |         hash_number = settings.getint('BLOOMFILTER_HASH_NUMBER', defaults.BLOOMFILTER_HASH_NUMBER)
34 |         return cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number)
35 | 
36 |     @classmethod
37 |     def from_crawler(cls, crawler):
38 |         """Returns instance from crawler.
39 | 
40 |         Parameters
41 |         ----------
42 |         crawler : scrapy.crawler.Crawler
43 | 
44 |         Returns
45 |         -------
46 |         RedisDupeFilter
47 |             Instance of RedisDupeFilter. 
48 | 49 | """ 50 | instance = cls.from_settings(crawler.settings) 51 | return instance 52 | 53 | def request_seen(self, request): 54 | """Returns True if request was already seen. 55 | 56 | Parameters 57 | ---------- 58 | request : scrapy.http.Request 59 | 60 | Returns 61 | ------- 62 | bool 63 | 64 | """ 65 | fp = self.request_fingerprint(request) 66 | if self.bf.exists(fp): 67 | return True 68 | self.bf.insert(fp) 69 | return False 70 | 71 | def request_fingerprint(self, request): 72 | """Returns a fingerprint for a given request. 73 | 74 | Parameters 75 | ---------- 76 | request : scrapy.http.Request 77 | 78 | Returns 79 | ------- 80 | str 81 | 82 | """ 83 | return request_fingerprint(request) 84 | 85 | def close(self, reason=''): 86 | """Delete data on close. Called by Scrapy's scheduler. 87 | 88 | Parameters 89 | ---------- 90 | reason : str, optional 91 | 92 | """ 93 | self.clear() 94 | 95 | def clear(self): 96 | """Clears fingerprints data.""" 97 | self.server.delete(self.key) 98 | 99 | def log(self, request, spider): 100 | """Logs given request. 101 | 102 | Parameters 103 | ---------- 104 | request : scrapy.http.Request 105 | spider : scrapy.spiders.Spider 106 | 107 | """ 108 | if self.debug: 109 | msg = "Filtered duplicate request: %(request)s" 110 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 111 | elif self.logdupes: 112 | msg = ("Filtered duplicate request %(request)s" 113 | " - no more duplicates will be shown" 114 | " (see DUPEFILTER_DEBUG to show all duplicates)") 115 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 116 | self.logdupes = False 117 | spider.crawler.stats.inc_value('bloomfilter/filtered', spider=spider) 118 | 119 | -------------------------------------------------------------------------------- /scrapy_bloomfilter/picklecompat.py: -------------------------------------------------------------------------------- 1 | try: 2 | import cPickle as pickle 3 | except ImportError: 4 | import pickle 5 | 6 | 7 | def loads(s): 8 | return pickle.loads(s) 9 | 10 | 11 | def dumps(obj): 12 | return pickle.dumps(obj, protocol=-1) 13 | -------------------------------------------------------------------------------- /scrapy_bloomfilter/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.misc import load_object 2 | from scrapy.utils.serialize import ScrapyJSONEncoder 3 | from twisted.internet.threads import deferToThread 4 | 5 | from . import connection, defaults 6 | 7 | 8 | default_serialize = ScrapyJSONEncoder().encode 9 | 10 | 11 | class RedisPipeline(object): 12 | """Pushes serialized item into a redis list/queue 13 | 14 | Settings 15 | -------- 16 | REDIS_ITEMS_KEY : str 17 | Redis key where to store items. 18 | REDIS_ITEMS_SERIALIZER : str 19 | Object path to serializer function. 20 | 21 | """ 22 | 23 | def __init__(self, server, 24 | key=defaults.PIPELINE_KEY, 25 | serialize_func=default_serialize): 26 | """Initialize pipeline. 27 | 28 | Parameters 29 | ---------- 30 | server : StrictRedis 31 | Redis client instance. 32 | key : str 33 | Redis key where to store items. 34 | serialize_func : callable 35 | Items serializer function. 
36 | 37 | """ 38 | self.server = server 39 | self.key = key 40 | self.serialize = serialize_func 41 | 42 | @classmethod 43 | def from_settings(cls, settings): 44 | params = { 45 | 'server': connection.from_settings(settings), 46 | } 47 | if settings.get('REDIS_ITEMS_KEY'): 48 | params['key'] = settings['REDIS_ITEMS_KEY'] 49 | if settings.get('REDIS_ITEMS_SERIALIZER'): 50 | params['serialize_func'] = load_object( 51 | settings['REDIS_ITEMS_SERIALIZER'] 52 | ) 53 | 54 | return cls(**params) 55 | 56 | @classmethod 57 | def from_crawler(cls, crawler): 58 | return cls.from_settings(crawler.settings) 59 | 60 | def process_item(self, item, spider): 61 | return deferToThread(self._process_item, item, spider) 62 | 63 | def _process_item(self, item, spider): 64 | key = self.item_key(item, spider) 65 | data = self.serialize(item) 66 | self.server.rpush(key, data) 67 | return item 68 | 69 | def item_key(self, item, spider): 70 | """Returns redis key based on given spider. 71 | 72 | Override this function to use a different key depending on the item 73 | and/or spider. 74 | 75 | """ 76 | return self.key % {'spider': spider.name} 77 | -------------------------------------------------------------------------------- /scrapy_bloomfilter/queue.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.reqser import request_to_dict, request_from_dict 2 | 3 | from . import picklecompat 4 | 5 | 6 | class Base(object): 7 | """Per-spider base queue class""" 8 | 9 | def __init__(self, server, spider, key, serializer=None): 10 | """Initialize per-spider redis queue. 11 | 12 | Parameters 13 | ---------- 14 | server : StrictRedis 15 | Redis client instance. 16 | spider : Spider 17 | Scrapy spider instance. 18 | key: str 19 | Redis key where to put and get messages. 20 | serializer : object 21 | Serializer object with ``loads`` and ``dumps`` methods. 22 | 23 | """ 24 | if serializer is None: 25 | # Backward compatibility. 26 | # TODO: deprecate pickle. 
27 |             serializer = picklecompat
28 |         if not hasattr(serializer, 'loads'):
29 |             raise TypeError("serializer does not implement 'loads' function: %r"
30 |                             % serializer)
31 |         if not hasattr(serializer, 'dumps'):
32 |             raise TypeError("serializer does not implement 'dumps' function: %r"
33 |                             % serializer)
34 | 
35 |         self.server = server
36 |         self.spider = spider
37 |         self.key = key % {'spider': spider.name}
38 |         self.serializer = serializer
39 | 
40 |     def _encode_request(self, request):
41 |         """Encode a request object"""
42 |         obj = request_to_dict(request, self.spider)
43 |         return self.serializer.dumps(obj)
44 | 
45 |     def _decode_request(self, encoded_request):
46 |         """Decode a request previously encoded"""
47 |         obj = self.serializer.loads(encoded_request)
48 |         return request_from_dict(obj, self.spider)
49 | 
50 |     def __len__(self):
51 |         """Return the length of the queue"""
52 |         raise NotImplementedError
53 | 
54 |     def push(self, request):
55 |         """Push a request"""
56 |         raise NotImplementedError
57 | 
58 |     def pop(self, timeout=0):
59 |         """Pop a request"""
60 |         raise NotImplementedError
61 | 
62 |     def clear(self):
63 |         """Clear queue/stack"""
64 |         self.server.delete(self.key)
65 | 
66 | 
67 | class FifoQueue(Base):
68 |     """Per-spider FIFO queue"""
69 | 
70 |     def __len__(self):
71 |         """Return the length of the queue"""
72 |         return self.server.llen(self.key)
73 | 
74 |     def push(self, request):
75 |         """Push a request"""
76 |         self.server.lpush(self.key, self._encode_request(request))
77 | 
78 |     def pop(self, timeout=0):
79 |         """Pop a request"""
80 |         if timeout > 0:
81 |             data = self.server.brpop(self.key, timeout)
82 |             if isinstance(data, tuple):
83 |                 data = data[1]
84 |         else:
85 |             data = self.server.rpop(self.key)
86 |         if data:
87 |             return self._decode_request(data)
88 | 
89 | 
90 | class LifoQueue(Base):
91 |     """Per-spider LIFO queue."""
92 | 
93 |     def __len__(self):
94 |         """Return the length of the stack"""
95 |         return self.server.llen(self.key)
96 | 
97 |     def push(self, request):
98 |         """Push a request"""
99 |         self.server.lpush(self.key, self._encode_request(request))
100 | 
101 |     def pop(self, timeout=0):
102 |         """Pop a request"""
103 |         if timeout > 0:
104 |             data = self.server.blpop(self.key, timeout)
105 |             if isinstance(data, tuple):
106 |                 data = data[1]
107 |         else:
108 |             data = self.server.lpop(self.key)
109 | 
110 |         if data:
111 |             return self._decode_request(data)
112 | 
113 | 
114 | class PriorityQueue(Base):
115 |     """Per-spider priority queue abstraction using redis' sorted set"""
116 | 
117 |     def __len__(self):
118 |         """Return the length of the queue"""
119 |         return self.server.zcard(self.key)
120 | 
121 |     def push(self, request):
122 |         """Push a request"""
123 |         data = self._encode_request(request)
124 |         score = -request.priority
125 |         # We don't use the zadd method as the order of arguments changes depending on
126 |         # whether the class is Redis or StrictRedis, and the option of using
127 |         # kwargs only accepts strings, not bytes.
128 |         self.server.execute_command('ZADD', self.key, score, data)
129 | 
130 |     def pop(self, timeout=0):
131 |         """
132 |         Pop a request
133 |         timeout is not supported in this queue class
134 |         """
135 |         # use atomic range/remove using multi/exec
136 |         pipe = self.server.pipeline()
137 |         pipe.multi()
138 |         pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0)
139 |         results, count = pipe.execute()
140 |         if results:
141 |             return self._decode_request(results[0])
142 | 
143 | 
144 | # TODO: Deprecate the use of these names. 
145 | SpiderQueue = FifoQueue 146 | SpiderStack = LifoQueue 147 | SpiderPriorityQueue = PriorityQueue 148 | -------------------------------------------------------------------------------- /scrapy_bloomfilter/scheduler.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import six 3 | 4 | from scrapy.utils.misc import load_object 5 | 6 | from . import connection, defaults 7 | 8 | 9 | # TODO: add SCRAPY_JOB support. 10 | class Scheduler(object): 11 | """Redis-based scheduler 12 | 13 | Settings 14 | -------- 15 | SCHEDULER_PERSIST : bool (default: False) 16 | Whether to persist or clear redis queue. 17 | SCHEDULER_FLUSH_ON_START : bool (default: False) 18 | Whether to flush redis queue on start. 19 | SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0) 20 | How many seconds to wait before closing if no message is received. 21 | SCHEDULER_QUEUE_KEY : str 22 | Scheduler redis key. 23 | SCHEDULER_QUEUE_CLASS : str 24 | Scheduler queue class. 25 | SCHEDULER_DUPEFILTER_KEY : str 26 | Scheduler dupefilter redis key. 27 | SCHEDULER_DUPEFILTER_CLASS : str 28 | Scheduler dupefilter class. 29 | SCHEDULER_SERIALIZER : str 30 | Scheduler serializer. 31 | 32 | """ 33 | 34 | def __init__(self, server, 35 | persist=False, 36 | flush_on_start=False, 37 | queue_key=defaults.SCHEDULER_QUEUE_KEY, 38 | queue_cls=defaults.SCHEDULER_QUEUE_CLASS, 39 | dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, 40 | dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, 41 | idle_before_close=0, 42 | serializer=None): 43 | """Initialize scheduler. 44 | 45 | Parameters 46 | ---------- 47 | server : Redis 48 | The redis server instance. 49 | persist : bool 50 | Whether to flush requests when closing. Default is False. 51 | flush_on_start : bool 52 | Whether to flush requests on start. Default is False. 53 | queue_key : str 54 | Requests queue key. 55 | queue_cls : str 56 | Importable path to the queue class. 57 | dupefilter_key : str 58 | Duplicates filter key. 59 | dupefilter_cls : str 60 | Importable path to the dupefilter class. 61 | idle_before_close : int 62 | Timeout before giving up. 63 | 64 | """ 65 | if idle_before_close < 0: 66 | raise TypeError("idle_before_close cannot be negative") 67 | 68 | self.server = server 69 | self.persist = persist 70 | self.flush_on_start = flush_on_start 71 | self.queue_key = queue_key 72 | self.queue_cls = queue_cls 73 | self.dupefilter_cls = dupefilter_cls 74 | self.dupefilter_key = dupefilter_key 75 | self.idle_before_close = idle_before_close 76 | self.serializer = serializer 77 | self.stats = None 78 | 79 | def __len__(self): 80 | return len(self.queue) 81 | 82 | @classmethod 83 | def from_settings(cls, settings): 84 | kwargs = { 85 | 'persist': settings.getbool('SCHEDULER_PERSIST'), 86 | 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'), 87 | 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'), 88 | } 89 | 90 | # If these values are missing, it means we want to use the defaults. 91 | optional = { 92 | # TODO: Use custom prefixes for this settings to note that are 93 | # specific to scrapy-redis. 94 | 'queue_key': 'SCHEDULER_QUEUE_KEY', 95 | 'queue_cls': 'SCHEDULER_QUEUE_CLASS', 96 | 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY', 97 | # We use the default setting name to keep compatibility. 
98 |             'dupefilter_cls': 'DUPEFILTER_CLASS',
99 |             'serializer': 'SCHEDULER_SERIALIZER',
100 |         }
101 |         for name, setting_name in optional.items():
102 |             val = settings.get(setting_name)
103 |             if val:
104 |                 kwargs[name] = val
105 | 
106 |         # Support serializer as a path to a module.
107 |         if isinstance(kwargs.get('serializer'), six.string_types):
108 |             kwargs['serializer'] = importlib.import_module(kwargs['serializer'])
109 | 
110 |         server = connection.from_settings(settings)
111 |         # Ensure the connection is working.
112 |         server.ping()
113 | 
114 |         return cls(server=server, **kwargs)
115 | 
116 |     @classmethod
117 |     def from_crawler(cls, crawler):
118 |         instance = cls.from_settings(crawler.settings)
119 |         # FIXME: for now, stats are only supported from this constructor
120 |         instance.stats = crawler.stats
121 |         return instance
122 | 
123 |     def open(self, spider):
124 |         self.spider = spider
125 | 
126 |         try:
127 |             self.queue = load_object(self.queue_cls)(
128 |                 server=self.server,
129 |                 spider=spider,
130 |                 key=self.queue_key % {'spider': spider.name},
131 |                 serializer=self.serializer,
132 |             )
133 |         except TypeError as e:
134 |             raise ValueError("Failed to instantiate queue class '%s': %s"
135 |                              % (self.queue_cls, e))
136 | 
137 |         try:
138 |             self.df = load_object(self.dupefilter_cls)(
139 |                 server=self.server,
140 |                 key=self.dupefilter_key % {'spider': spider.name},
141 |                 debug=spider.settings.getbool('DUPEFILTER_DEBUG'),
142 |                 bit=spider.settings.getint('BLOOMFILTER_BIT', defaults.BLOOMFILTER_BIT),
143 |                 hash_number=spider.settings.getint('BLOOMFILTER_HASH_NUMBER', defaults.BLOOMFILTER_HASH_NUMBER)
144 |             )
145 |         except TypeError as e:
146 |             raise ValueError("Failed to instantiate dupefilter class '%s': %s"
147 |                              % (self.dupefilter_cls, e))
148 | 
149 |         if self.flush_on_start:
150 |             self.flush()
151 |         # notice if there are requests already in the queue to resume the crawl
152 |         if len(self.queue):
153 |             spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
154 | 
155 |     def close(self, reason):
156 |         if not self.persist:
157 |             self.flush()
158 | 
159 |     def flush(self):
160 |         self.df.clear()
161 |         self.queue.clear()
162 | 
163 |     def enqueue_request(self, request):
164 |         if not request.dont_filter and self.df.request_seen(request):
165 |             self.df.log(request, self.spider)
166 |             return False
167 |         if self.stats:
168 |             self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
169 |         self.queue.push(request)
170 |         return True
171 | 
172 |     def next_request(self):
173 |         block_pop_timeout = self.idle_before_close
174 |         request = self.queue.pop(block_pop_timeout)
175 |         if request and self.stats:
176 |             self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
177 |         return request
178 | 
179 |     def has_pending_requests(self):
180 |         return len(self) > 0
181 | 
--------------------------------------------------------------------------------
/scrapy_bloomfilter/spiders.py:
--------------------------------------------------------------------------------
1 | from scrapy import signals
2 | from scrapy.exceptions import DontCloseSpider
3 | from scrapy.spiders import Spider, CrawlSpider
4 | 
5 | from . import connection, defaults
6 | from .utils import bytes_to_str
7 | 
8 | 
9 | class RedisMixin(object):
10 |     """Mixin class to implement reading urls from a redis queue."""
11 |     redis_key = None
12 |     redis_batch_size = None
13 |     redis_encoding = None
14 | 
15 |     # Redis client placeholder. 
16 | server = None 17 | 18 | def start_requests(self): 19 | """Returns a batch of start requests from redis.""" 20 | return self.next_requests() 21 | 22 | def setup_redis(self, crawler=None): 23 | """Setup redis connection and idle signal. 24 | 25 | This should be called after the spider has set its crawler object. 26 | """ 27 | if self.server is not None: 28 | return 29 | 30 | if crawler is None: 31 | # We allow optional crawler argument to keep backwards 32 | # compatibility. 33 | # XXX: Raise a deprecation warning. 34 | crawler = getattr(self, 'crawler', None) 35 | 36 | if crawler is None: 37 | raise ValueError("crawler is required") 38 | 39 | settings = crawler.settings 40 | 41 | if self.redis_key is None: 42 | self.redis_key = settings.get( 43 | 'REDIS_START_URLS_KEY', defaults.START_URLS_KEY, 44 | ) 45 | 46 | self.redis_key = self.redis_key % {'name': self.name} 47 | 48 | if not self.redis_key.strip(): 49 | raise ValueError("redis_key must not be empty") 50 | 51 | if self.redis_batch_size is None: 52 | # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE). 53 | self.redis_batch_size = settings.getint( 54 | 'REDIS_START_URLS_BATCH_SIZE', 55 | settings.getint('CONCURRENT_REQUESTS'), 56 | ) 57 | 58 | try: 59 | self.redis_batch_size = int(self.redis_batch_size) 60 | except (TypeError, ValueError): 61 | raise ValueError("redis_batch_size must be an integer") 62 | 63 | if self.redis_encoding is None: 64 | self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING) 65 | 66 | self.logger.info("Reading start URLs from redis key '%(redis_key)s' " 67 | "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s", 68 | self.__dict__) 69 | 70 | self.server = connection.from_settings(crawler.settings) 71 | # The idle signal is called when the spider has no requests left, 72 | # that's when we will schedule new requests from redis queue 73 | crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 74 | 75 | def next_requests(self): 76 | """Returns a request to be scheduled or none.""" 77 | use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET) 78 | fetch_one = self.server.spop if use_set else self.server.lpop 79 | # XXX: Do we need to use a timeout here? 80 | found = 0 81 | # TODO: Use redis pipeline execution. 82 | while found < self.redis_batch_size: 83 | data = fetch_one(self.redis_key) 84 | if not data: 85 | # Queue empty. 86 | break 87 | req = self.make_request_from_data(data) 88 | if req: 89 | yield req 90 | found += 1 91 | else: 92 | self.logger.debug("Request not made from data: %r", data) 93 | 94 | if found: 95 | self.logger.debug("Read %s requests from '%s'", found, self.redis_key) 96 | 97 | def make_request_from_data(self, data): 98 | """Returns a Request instance from data coming from Redis. 99 | 100 | By default, ``data`` is an encoded URL. You can override this method to 101 | provide your own message decoding. 102 | 103 | Parameters 104 | ---------- 105 | data : bytes 106 | Message from redis. 107 | 108 | """ 109 | url = bytes_to_str(data, self.redis_encoding) 110 | return self.make_requests_from_url(url) 111 | 112 | def schedule_next_requests(self): 113 | """Schedules a request if available""" 114 | # TODO: While there is capacity, schedule a batch of redis requests. 115 | for req in self.next_requests(): 116 | self.crawler.engine.crawl(req, spider=self) 117 | 118 | def spider_idle(self): 119 | """Schedules a request if available, otherwise waits.""" 120 | # XXX: Handle a sentinel to close the spider. 
121 |         self.schedule_next_requests()
122 |         raise DontCloseSpider
123 | 
124 | 
125 | class RedisSpider(RedisMixin, Spider):
126 |     """Spider that reads urls from redis queue when idle.
127 | 
128 |     Attributes
129 |     ----------
130 |     redis_key : str (default: REDIS_START_URLS_KEY)
131 |         Redis key where to fetch start URLs from.
132 |     redis_batch_size : int (default: CONCURRENT_REQUESTS)
133 |         Number of messages to fetch from redis on each attempt.
134 |     redis_encoding : str (default: REDIS_ENCODING)
135 |         Encoding to use when decoding messages from redis queue.
136 | 
137 |     Settings
138 |     --------
139 |     REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
140 |         Default Redis key where to fetch start URLs from.
141 |     REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
142 |         Default number of messages to fetch from redis on each attempt.
143 |     REDIS_START_URLS_AS_SET : bool (default: False)
144 |         Use SET operations to retrieve messages from the redis queue. If False,
145 |         the messages are retrieved using the LPOP command.
146 |     REDIS_ENCODING : str (default: "utf-8")
147 |         Default encoding to use when decoding messages from redis queue.
148 | 
149 |     """
150 | 
151 |     @classmethod
152 |     def from_crawler(self, crawler, *args, **kwargs):
153 |         obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
154 |         obj.setup_redis(crawler)
155 |         return obj
156 | 
157 | 
158 | class RedisCrawlSpider(RedisMixin, CrawlSpider):
159 |     """Spider that reads urls from redis queue when idle.
160 | 
161 |     Attributes
162 |     ----------
163 |     redis_key : str (default: REDIS_START_URLS_KEY)
164 |         Redis key where to fetch start URLs from.
165 |     redis_batch_size : int (default: CONCURRENT_REQUESTS)
166 |         Number of messages to fetch from redis on each attempt.
167 |     redis_encoding : str (default: REDIS_ENCODING)
168 |         Encoding to use when decoding messages from redis queue.
169 | 
170 |     Settings
171 |     --------
172 |     REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
173 |         Default Redis key where to fetch start URLs from.
174 |     REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
175 |         Default number of messages to fetch from redis on each attempt.
176 |     REDIS_START_URLS_AS_SET : bool (default: False)
177 |         Use SET operations to retrieve messages from the redis queue.
178 |     REDIS_ENCODING : str (default: "utf-8")
179 |         Default encoding to use when decoding messages from redis queue. 
180 | 181 | """ 182 | 183 | @classmethod 184 | def from_crawler(self, crawler, *args, **kwargs): 185 | obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs) 186 | obj.setup_redis(crawler) 187 | return obj 188 | -------------------------------------------------------------------------------- /scrapy_bloomfilter/utils.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | 4 | def bytes_to_str(s, encoding='utf-8'): 5 | """Returns a str if a bytes object is given.""" 6 | if six.PY3 and isinstance(s, bytes): 7 | return s.decode(encoding) 8 | return s 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | from setuptools import setup, find_packages 3 | 4 | 5 | def read_file(filename): 6 | with io.open(filename) as fp: 7 | return fp.read().strip() 8 | 9 | 10 | def read_requirements(filename): 11 | return [line.strip() for line in read_file(filename).splitlines() 12 | if not line.startswith('#')] 13 | 14 | 15 | setup( 16 | name='scrapy-bloomfilter', 17 | version='0.0.0', 18 | description='Scrapy BloomFilter based on redis bitmap', 19 | keywords=['scrapy', 'bloomfilter', 'redis', 'bitmap'], 20 | author='ELI95', 21 | author_email='helloworld.eli@gmail.com', 22 | license='MIT', 23 | install_requires=read_requirements('requirements.txt'), 24 | packages=find_packages(), 25 | ) 26 | -------------------------------------------------------------------------------- /tests/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tests.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tests 12 | -------------------------------------------------------------------------------- /tests/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eli95/ScrapyBloomFilter/a76d1943db3f9c98ec6f1c87c05af14d8a0f4694/tests/tests/__init__.py -------------------------------------------------------------------------------- /tests/tests/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TestsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /tests/tests/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TestsSpiderMiddleware: 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 
15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Request, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class TestsDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /tests/tests/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TestsPipeline: 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /tests/tests/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tests project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://docs.scrapy.org/en/latest/topics/settings.html 9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tests' 13 | 14 | SPIDER_MODULES = ['tests.spiders'] 15 | NEWSPIDER_MODULE = 'tests.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'tests (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'tests.middlewares.TestsSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'tests.middlewares.TestsDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'tests.pipelines.TestsPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension 
(disabled by default) 72 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | 93 | SCHEDULER = "scrapy_bloomfilter.scheduler.Scheduler" 94 | 95 | DUPEFILTER_CLASS = "scrapy_bloomfilter.dupefilter.RedisDupeFilter" 96 | 97 | REDIS_URL = 'redis://@localhost:6379' 98 | 99 | BLOOMFILTER_HASH_NUMBER = 6 100 | 101 | BLOOMFILTER_BIT = 10 102 | 103 | SCHEDULER_PERSIST = False 104 | -------------------------------------------------------------------------------- /tests/tests/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /tests/tests/spiders/test.py: -------------------------------------------------------------------------------- 1 | from scrapy import Request, Spider 2 | 3 | 4 | class TestSpider(Spider): 5 | name = 'test' 6 | 7 | base_url = 'https://www.toutiao.com/search/?keyword=' 8 | 9 | def start_requests(self): 10 | for i in range(10): 11 | url = self.base_url + str(i) 12 | yield Request(url, callback=self.parse) 13 | 14 | for i in range(100): 15 | url = self.base_url + str(i) 16 | yield Request(url, callback=self.parse) 17 | 18 | def parse(self, response): 19 | self.logger.debug('Response of ' + response.url) 20 | --------------------------------------------------------------------------------
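
The sizing formulas in the README map directly onto the settings used throughout this project: `BLOOMFILTER_BIT` fixes the bit-array length as m = 2^BLOOMFILTER_BIT, and `BLOOMFILTER_HASH_NUMBER` is k. The sketch below is not part of the repository; it is a minimal, hypothetical helper (the name `bloomfilter_settings` is invented here) that turns a capacity n and a target false-positive rate p into those two settings, assuming reasonably uniform hash functions.

```python
import math


def bloomfilter_settings(n, p):
    """Suggest BLOOMFILTER_BIT / BLOOMFILTER_HASH_NUMBER for n items and a
    target false-positive rate p (illustrative helper, not part of this package).
    """
    m = -n * math.log(p) / (math.log(2) ** 2)  # optimal bit-array length (README formula)
    k = (m / n) * math.log(2)                  # optimal number of hash functions
    # BloomFilter allocates m = 1 << BLOOMFILTER_BIT bits, so round m up to the
    # next power of two; the achieved false-positive rate is then at or below p.
    return {'BLOOMFILTER_BIT': math.ceil(math.log2(m)),
            'BLOOMFILTER_HASH_NUMBER': round(k)}


# Example: one million request fingerprints with a ~1% false-positive budget.
print(bloomfilter_settings(1_000_000, 0.01))
# -> {'BLOOMFILTER_BIT': 24, 'BLOOMFILTER_HASH_NUMBER': 7}
```

For comparison, the shipped defaults (`BLOOMFILTER_BIT = 30`, `BLOOMFILTER_HASH_NUMBER = 6` in defaults.py) reserve 2^30 bits, i.e. 128 MB of Redis memory.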