├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.rst ├── example-project ├── example │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── dmoz.py │ │ ├── mycrawler_redis.py │ │ └── myspider_redis.py ├── process_items.py └── scrapy.cfg ├── scrapy_redis ├── __init__.py ├── connection.py ├── dupefilter.py ├── pipelines.py ├── queue.py ├── scheduler.py ├── spiders.py └── tests.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.swp 3 | 4 | .ropeproject 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Packages 10 | *.egg 11 | *.egg-info 12 | dist 13 | build 14 | eggs 15 | parts 16 | bin 17 | var 18 | sdist 19 | develop-eggs 20 | .installed.cfg 21 | lib 22 | lib64 23 | __pycache__ 24 | 25 | # Installer logs 26 | pip-log.txt 27 | 28 | # Unit test / coverage reports 29 | .coverage 30 | .tox 31 | nosetests.xml 32 | 33 | # Translations 34 | *.mo 35 | 36 | # Mr Developer 37 | .mr.developer.cfg 38 | .project 39 | .pydevproject 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Rolando Espinoza La fuente 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of scrapy-redis nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Redis-based components for Scrapy 2 | ================================= 3 | 4 | This project attempts to provide Redis-backed components for Scrapy. 5 | 6 | Features: 7 | 8 | * Distributed crawling/scraping 9 | You can start multiple spider instances that share a single redis queue. 
10 |   Best suited for broad multi-domain crawls.
11 | * Distributed post-processing
12 |   Scraped items get pushed into a redis queue, meaning that you can start as
13 |   many post-processing processes as needed, all sharing the same items queue.
14 | 
15 | Requirements:
16 | 
17 | * Scrapy >= 0.14
18 | * redis-py (tested on 2.4.9)
19 | * redis server (tested on 2.4-2.6)
20 | 
21 | Available Scrapy components:
22 | 
23 | * Scheduler
24 | * Duplication Filter
25 | * Item Pipeline
26 | * Base Spider
27 | 
28 | 
29 | Installation
30 | ------------
31 | 
32 | From `pypi`::
33 | 
34 |     $ pip install scrapy-redis
35 | 
36 | From `github`::
37 | 
38 |     $ git clone https://github.com/darkrho/scrapy-redis.git
39 |     $ cd scrapy-redis
40 |     $ python setup.py install
41 | 
42 | 
43 | Usage
44 | -----
45 | 
46 | Enable the components in your `settings.py`:
47 | 
48 | .. code-block:: python
49 | 
50 |     # Enables scheduling, storing the requests queue in redis.
51 |     SCHEDULER = "scrapy_redis.scheduler.Scheduler"
52 | 
53 |     # Don't clean up redis queues; allows pausing and resuming crawls.
54 |     SCHEDULER_PERSIST = True
55 | 
56 |     # Schedule requests using a priority queue. (default)
57 |     SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
58 | 
59 |     # Schedule requests using a queue (FIFO).
60 |     SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'
61 | 
62 |     # Schedule requests using a stack (LIFO).
63 |     SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderStack'
64 | 
65 |     # Max idle time (in seconds) to prevent the spider from being closed during a distributed crawl.
66 |     # This only works if the queue class is SpiderQueue or SpiderStack,
67 |     # and it may also block for the same time when the spider starts for the first time (because the queue is empty).
68 |     SCHEDULER_IDLE_BEFORE_CLOSE = 10
69 | 
70 |     # Store scraped items in redis for post-processing.
71 |     ITEM_PIPELINES = [
72 |         'scrapy_redis.pipelines.RedisPipeline',
73 |     ]
74 | 
75 |     # Specify the host and port to use when connecting to Redis (optional).
76 |     REDIS_HOST = 'localhost'
77 |     REDIS_PORT = 6379
78 | 
79 |     # Specify the full Redis URL for connecting (optional).
80 |     # If set, this takes precedence over the REDIS_HOST and REDIS_PORT settings.
81 |     REDIS_URL = 'redis://user:pass@hostname:9001'
82 | 
83 | .. note::
84 | 
85 |     Version 0.3 changed the requests serialization from `marshal` to `cPickle`;
86 |     therefore, requests persisted with version 0.2 will not work on 0.3.
87 | 
88 | 
89 | Running the example project
90 | ---------------------------
91 | 
92 | This example illustrates how to share a spider's requests queue
93 | across multiple spider instances, which is highly suitable for broad crawls.
94 | 
95 | 1. Set up the scrapy_redis package in your PYTHONPATH
96 | 
97 | 2. Run the crawler for the first time, then stop it::
98 | 
99 |     $ cd example-project
100 |     $ scrapy crawl dmoz
101 |     ... [dmoz] ...
102 |     ^C
103 | 
104 | 3. Run the crawler again to resume the stopped crawl::
105 | 
106 |     $ scrapy crawl dmoz
107 |     ... [dmoz] DEBUG: Resuming crawl (9019 requests scheduled)
108 | 
109 | 4. Start one or more additional scrapy crawlers::
110 | 
111 |     $ scrapy crawl dmoz
112 |     ... [dmoz] DEBUG: Resuming crawl (8712 requests scheduled)
113 | 
114 | 5. Start one or more post-processing workers::
115 | 
116 |     $ python process_items.py
117 |     Processing: Kilani Giftware (http://www.dmoz.org/Computers/Shopping/Gifts/)
118 |     Processing: NinjaGizmos.com (http://www.dmoz.org/Computers/Shopping/Gifts/)
119 |     ...
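
As a sketch of the distributed post-processing idea (this worker is not part of
the project; the script name and output file are arbitrary choices), another
consumer could drain the same `dmoz:items` list that `RedisPipeline` fills and
archive the items as JSON lines:

.. code-block:: python

    # save_items.py -- hypothetical companion to process_items.py
    import json
    import redis


    def main():
        r = redis.Redis()  # same default localhost:6379 connection the example uses
        with open('items.jl', 'a') as out:
            while True:
                # RedisPipeline rpushes JSON-encoded items onto "<spider>:items",
                # so a blocking left-pop consumes them in FIFO order.
                key, data = r.blpop(["dmoz:items"])
                item = json.loads(data)
                out.write(json.dumps(item) + "\n")
                out.flush()


    if __name__ == '__main__':
        main()

Any number of such workers can run alongside `process_items.py`; each popped
item is delivered to exactly one consumer.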
120 | 
121 | 
122 | Feeding a Spider from Redis
123 | ---------------------------
124 | 
125 | The class `scrapy_redis.spiders.RedisSpider` enables a spider to read
126 | urls from redis. The urls in the redis queue are processed one after
127 | another; if the first request yields more requests, the spider will
128 | process those requests before fetching another url from redis.
129 | 
130 | For example, create a file `myspider.py` with the code below:
131 | 
132 | .. code-block:: python
133 | 
134 |     from scrapy_redis.spiders import RedisSpider
135 | 
136 |     class MySpider(RedisSpider):
137 |         name = 'myspider'
138 | 
139 |         def parse(self, response):
140 |             # do stuff
141 |             pass
142 | 
143 | 
144 | Then:
145 | 
146 | 1. run the spider::
147 | 
148 |     scrapy runspider myspider.py
149 | 
150 | 2. push urls to redis::
151 | 
152 |     redis-cli lpush myspider:start_urls http://google.com
153 | 
154 | 
155 | Changelog
156 | ---------
157 | 
158 | 0.5
159 | * Added `REDIS_URL` setting to support specifying a Redis connection string.
160 | * Added `SCHEDULER_IDLE_BEFORE_CLOSE` setting to prevent the spider from closing too
161 |   quickly when the queue is empty. The default value is zero, keeping the previous
162 |   behavior.
163 | 
164 | 0.4
165 | * Added `RedisSpider` and `RedisMixin` classes as building blocks for spiders
166 |   to be fed through a redis queue.
167 | * Added redis queue stats.
168 | * Let the encoder handle the item as it comes instead of converting it to a dict.
169 | 
170 | 0.3
171 | * Added support for different queue classes.
172 | * Changed requests serialization from `marshal` to `cPickle`.
173 | 
174 | 0.2
175 | * Improved backward compatibility.
176 | * Added example project.
177 | 
178 | 0.1
179 | * Initial version.
180 | 
181 | 
182 | .. image:: https://d2weczhvl823v0.cloudfront.net/darkrho/scrapy-redis/trend.png
183 |    :alt: Bitdeli badge
184 |    :target: https://bitdeli.com/free
185 | 
186 | 
--------------------------------------------------------------------------------
/example-project/example/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/younghz/scrapy-redis/e26633bbb16709ca85f78dc744e555d3e346bd7b/example-project/example/__init__.py
--------------------------------------------------------------------------------
/example-project/example/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # http://doc.scrapy.org/topics/items.html
5 | 
6 | from scrapy.item import Item, Field
7 | from scrapy.contrib.loader import ItemLoader
8 | from scrapy.contrib.loader.processor import MapCompose, TakeFirst, Join
9 | 
10 | class ExampleItem(Item):
11 |     name = Field()
12 |     description = Field()
13 |     link = Field()
14 |     crawled = Field()
15 |     spider = Field()
16 |     url = Field()
17 | 
18 | 
19 | class ExampleLoader(ItemLoader):
20 |     default_item_class = ExampleItem
21 |     default_input_processor = MapCompose(lambda s: s.strip())
22 |     default_output_processor = TakeFirst()
23 |     description_out = Join()
24 | 
--------------------------------------------------------------------------------
/example-project/example/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: http://doc.scrapy.org/topics/item-pipeline.html
5 | from datetime import datetime
6 | 
7 | class 
ExamplePipeline(object): 8 | def process_item(self, item, spider): 9 | item["crawled"] = datetime.utcnow() 10 | item["spider"] = spider.name 11 | return item 12 | -------------------------------------------------------------------------------- /example-project/example/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for example project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/topics/settings.html 7 | # 8 | SPIDER_MODULES = ['example.spiders'] 9 | NEWSPIDER_MODULE = 'example.spiders' 10 | 11 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 12 | SCHEDULER_PERSIST = True 13 | #SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue" 14 | #SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue" 15 | #SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack" 16 | 17 | ITEM_PIPELINES = { 18 | 'example.pipelines.ExamplePipeline': 300, 19 | 'scrapy_redis.pipelines.RedisPipeline': 400, 20 | } 21 | -------------------------------------------------------------------------------- /example-project/example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # To create the first spider for your project use this command: 4 | # 5 | # scrapy genspider myspider myspider-domain.com 6 | # 7 | # For more info see: 8 | # http://doc.scrapy.org/topics/spiders.html 9 | -------------------------------------------------------------------------------- /example-project/example/spiders/dmoz.py: -------------------------------------------------------------------------------- 1 | from scrapy.selector import Selector 2 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 3 | from scrapy.contrib.spiders import CrawlSpider, Rule 4 | from example.items import ExampleLoader 5 | 6 | class DmozSpider(CrawlSpider): 7 | name = 'dmoz' 8 | allowed_domains = ['dmoz.org'] 9 | start_urls = ['http://www.dmoz.org/'] 10 | 11 | rules = ( 12 | Rule(SgmlLinkExtractor(restrict_xpaths='//div[@id="catalogs"]')), 13 | Rule(SgmlLinkExtractor(restrict_xpaths='//ul[@class="directory dir-col"]'), 14 | callback='parse_directory', follow=True) 15 | ) 16 | 17 | def parse_directory(self, response): 18 | hxs = Selector(response) 19 | for li in hxs.xpath('//ul[@class="directory-url"]/li'): 20 | el = ExampleLoader(selector=li) 21 | el.add_xpath('name', 'a/text()') 22 | el.add_xpath('description', 'text()') 23 | el.add_xpath('link', 'a/@href') 24 | el.add_value('url', response.url) 25 | yield el.load_item() 26 | -------------------------------------------------------------------------------- /example-project/example/spiders/mycrawler_redis.py: -------------------------------------------------------------------------------- 1 | from scrapy_redis.spiders import RedisMixin 2 | 3 | from scrapy.contrib.spiders import CrawlSpider, Rule 4 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 5 | 6 | from example.items import ExampleLoader 7 | 8 | 9 | class MyCrawler(RedisMixin, CrawlSpider): 10 | """Spider that reads urls from redis queue (myspider:start_urls).""" 11 | name = 'mycrawler_redis' 12 | redis_key = 'mycrawler:start_urls' 13 | 14 | rules = ( 15 | # follow all links 16 | Rule(SgmlLinkExtractor(), callback='parse_page', follow=True), 17 | ) 18 | 19 | def set_crawler(self, crawler): 20 | 
CrawlSpider.set_crawler(self, crawler) 21 | RedisMixin.setup_redis(self) 22 | 23 | def parse_page(self, response): 24 | el = ExampleLoader(response=response) 25 | el.add_xpath('name', '//title[1]/text()') 26 | el.add_value('url', response.url) 27 | return el.load_item() 28 | -------------------------------------------------------------------------------- /example-project/example/spiders/myspider_redis.py: -------------------------------------------------------------------------------- 1 | from scrapy_redis.spiders import RedisSpider 2 | from example.items import ExampleLoader 3 | 4 | 5 | class MySpider(RedisSpider): 6 | """Spider that reads urls from redis queue (myspider:start_urls).""" 7 | name = 'myspider_redis' 8 | redis_key = 'myspider:start_urls' 9 | 10 | def parse(self, response): 11 | el = ExampleLoader(response=response) 12 | el.add_xpath('name', '//title[1]/text()') 13 | el.add_value('url', response.url) 14 | return el.load_item() 15 | -------------------------------------------------------------------------------- /example-project/process_items.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import json 4 | import redis 5 | 6 | 7 | def main(): 8 | r = redis.Redis() 9 | while True: 10 | # process queue as FIFO, change `blpop` to `brpop` to process as LIFO 11 | source, data = r.blpop(["dmoz:items"]) 12 | item = json.loads(data) 13 | try: 14 | print u"Processing: %(name)s <%(link)s>" % item 15 | except KeyError: 16 | print u"Error procesing: %r" % item 17 | 18 | 19 | if __name__ == '__main__': 20 | main() 21 | -------------------------------------------------------------------------------- /example-project/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/topics/scrapyd.html 5 | 6 | [settings] 7 | default = example.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = example 12 | -------------------------------------------------------------------------------- /scrapy_redis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younghz/scrapy-redis/e26633bbb16709ca85f78dc744e555d3e346bd7b/scrapy_redis/__init__.py -------------------------------------------------------------------------------- /scrapy_redis/connection.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | 4 | # Default values. 5 | REDIS_URL = None 6 | REDIS_HOST = 'localhost' 7 | REDIS_PORT = 6379 8 | 9 | 10 | def from_settings(settings): 11 | url = settings.get('REDIS_URL', REDIS_URL) 12 | host = settings.get('REDIS_HOST', REDIS_HOST) 13 | port = settings.get('REDIS_PORT', REDIS_PORT) 14 | 15 | # REDIS_URL takes precedence over host/port specification. 
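    # Note: redis.from_url() also picks up the password and database number
    # embedded in the URL (e.g. 'redis://user:pass@hostname:9001/42'),
    # which the REDIS_HOST/REDIS_PORT pair cannot express.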
16 | if url: 17 | return redis.from_url(url) 18 | else: 19 | return redis.Redis(host=host, port=port) 20 | -------------------------------------------------------------------------------- /scrapy_redis/dupefilter.py: -------------------------------------------------------------------------------- 1 | import time 2 | import connection 3 | 4 | from scrapy.dupefilter import BaseDupeFilter 5 | from scrapy.utils.request import request_fingerprint 6 | 7 | 8 | class RFPDupeFilter(BaseDupeFilter): 9 | """Redis-based request duplication filter""" 10 | 11 | def __init__(self, server, key): 12 | """Initialize duplication filter 13 | 14 | Parameters 15 | ---------- 16 | server : Redis instance 17 | key : str 18 | Where to store fingerprints 19 | """ 20 | self.server = server 21 | self.key = key 22 | 23 | @classmethod 24 | def from_settings(cls, settings): 25 | server = connection.from_settings(settings) 26 | # create one-time key. needed to support to use this 27 | # class as standalone dupefilter with scrapy's default scheduler 28 | # if scrapy passes spider on open() method this wouldn't be needed 29 | key = "dupefilter:%s" % int(time.time()) 30 | return cls(server, key) 31 | 32 | @classmethod 33 | def from_crawler(cls, crawler): 34 | return cls.from_settings(crawler.settings) 35 | 36 | def request_seen(self, request): 37 | fp = request_fingerprint(request) 38 | added = self.server.sadd(self.key, fp) 39 | return not added 40 | 41 | def close(self, reason): 42 | """Delete data on close. Called by scrapy's scheduler""" 43 | self.clear() 44 | 45 | def clear(self): 46 | """Clears fingerprints data""" 47 | self.server.delete(self.key) 48 | -------------------------------------------------------------------------------- /scrapy_redis/pipelines.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import connection 3 | 4 | from twisted.internet.threads import deferToThread 5 | from scrapy.utils.serialize import ScrapyJSONEncoder 6 | 7 | 8 | class RedisPipeline(object): 9 | """Pushes serialized item into a redis list/queue""" 10 | 11 | def __init__(self, server): 12 | self.server = server 13 | self.encoder = ScrapyJSONEncoder() 14 | 15 | @classmethod 16 | def from_settings(cls, settings): 17 | server = connection.from_settings(settings) 18 | return cls(server) 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | return cls.from_settings(crawler.settings) 23 | 24 | def process_item(self, item, spider): 25 | return deferToThread(self._process_item, item, spider) 26 | 27 | def _process_item(self, item, spider): 28 | key = self.item_key(item, spider) 29 | data = self.encoder.encode(item) 30 | self.server.rpush(key, data) 31 | return item 32 | 33 | def item_key(self, item, spider): 34 | """Returns redis key based on given spider""" 35 | return "%s:items" % spider.name 36 | -------------------------------------------------------------------------------- /scrapy_redis/queue.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.reqser import request_to_dict, request_from_dict 2 | 3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | 9 | class Base(object): 10 | """Per-spider queue/stack base class""" 11 | 12 | def __init__(self, server, spider, key): 13 | """Initialize per-spider redis queue. 14 | 15 | Parameters: 16 | server -- redis connection 17 | spider -- spider instance 18 | key -- key for this queue (e.g. 
"%(spider)s:queue") 19 | """ 20 | self.server = server 21 | self.spider = spider 22 | self.key = key % {'spider': spider.name} 23 | 24 | def _encode_request(self, request): 25 | """Encode a request object""" 26 | return pickle.dumps(request_to_dict(request, self.spider), protocol=-1) 27 | 28 | def _decode_request(self, encoded_request): 29 | """Decode an request previously encoded""" 30 | return request_from_dict(pickle.loads(encoded_request), self.spider) 31 | 32 | def __len__(self): 33 | """Return the length of the queue""" 34 | raise NotImplementedError 35 | 36 | def push(self, request): 37 | """Push a request""" 38 | raise NotImplementedError 39 | 40 | def pop(self, timeout=0): 41 | """Pop a request""" 42 | raise NotImplementedError 43 | 44 | def clear(self): 45 | """Clear queue/stack""" 46 | self.server.delete(self.key) 47 | 48 | 49 | class SpiderQueue(Base): 50 | """Per-spider FIFO queue""" 51 | 52 | def __len__(self): 53 | """Return the length of the queue""" 54 | return self.server.llen(self.key) 55 | 56 | def push(self, request): 57 | """Push a request""" 58 | self.server.lpush(self.key, self._encode_request(request)) 59 | 60 | def pop(self, timeout=0): 61 | """Pop a request""" 62 | if timeout > 0: 63 | data = self.server.brpop(self.key, timeout) 64 | if isinstance(data, tuple): 65 | data = data[1] 66 | else: 67 | data = self.server.rpop(self.key) 68 | if data: 69 | return self._decode_request(data) 70 | 71 | 72 | class SpiderPriorityQueue(Base): 73 | """Per-spider priority queue abstraction using redis' sorted set""" 74 | 75 | def __len__(self): 76 | """Return the length of the queue""" 77 | return self.server.zcard(self.key) 78 | 79 | def push(self, request): 80 | """Push a request""" 81 | data = self._encode_request(request) 82 | pairs = {data: -request.priority} 83 | self.server.zadd(self.key, **pairs) 84 | 85 | def pop(self, timeout=0): 86 | """ 87 | Pop a request 88 | timeout not support in this queue class 89 | """ 90 | # use atomic range/remove using multi/exec 91 | pipe = self.server.pipeline() 92 | pipe.multi() 93 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 94 | results, count = pipe.execute() 95 | if results: 96 | return self._decode_request(results[0]) 97 | 98 | 99 | class SpiderStack(Base): 100 | """Per-spider stack""" 101 | 102 | def __len__(self): 103 | """Return the length of the stack""" 104 | return self.server.llen(self.key) 105 | 106 | def push(self, request): 107 | """Push a request""" 108 | self.server.lpush(self.key, self._encode_request(request)) 109 | 110 | def pop(self, timeout=0): 111 | """Pop a request""" 112 | if timeout > 0: 113 | data = self.server.blpop(self.key, timeout) 114 | if isinstance(data, tuple): 115 | data = data[1] 116 | else: 117 | data = self.server.lpop(self.key) 118 | 119 | if data: 120 | return self._decode_request(data) 121 | 122 | 123 | __all__ = ['SpiderQueue', 'SpiderPriorityQueue', 'SpiderStack'] 124 | -------------------------------------------------------------------------------- /scrapy_redis/scheduler.py: -------------------------------------------------------------------------------- 1 | import connection 2 | 3 | from scrapy.utils.misc import load_object 4 | from scrapy_redis.dupefilter import RFPDupeFilter 5 | 6 | 7 | # default values 8 | SCHEDULER_PERSIST = False 9 | QUEUE_KEY = '%(spider)s:requests' 10 | QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue' 11 | DUPEFILTER_KEY = '%(spider)s:dupefilter' 12 | IDLE_BEFORE_CLOSE = 0 13 | 14 | 15 | class Scheduler(object): 16 | """Redis-based scheduler""" 
17 | 18 | def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key, idle_before_close): 19 | """Initialize scheduler. 20 | 21 | Parameters 22 | ---------- 23 | server : Redis instance 24 | persist : bool 25 | queue_key : str 26 | queue_cls : queue class 27 | dupefilter_key : str 28 | idle_before_close : int 29 | """ 30 | self.server = server 31 | self.persist = persist 32 | self.queue_key = queue_key 33 | self.queue_cls = queue_cls 34 | self.dupefilter_key = dupefilter_key 35 | self.idle_before_close = idle_before_close 36 | self.stats = None 37 | 38 | def __len__(self): 39 | return len(self.queue) 40 | 41 | @classmethod 42 | def from_settings(cls, settings): 43 | persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST) 44 | queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY) 45 | queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS)) 46 | dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY) 47 | idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE) 48 | server = connection.from_settings(settings) 49 | return cls(server, persist, queue_key, queue_cls, dupefilter_key, idle_before_close) 50 | 51 | @classmethod 52 | def from_crawler(cls, crawler): 53 | instance = cls.from_settings(crawler.settings) 54 | # FIXME: for now, stats are only supported from this constructor 55 | instance.stats = crawler.stats 56 | return instance 57 | 58 | def open(self, spider): 59 | self.spider = spider 60 | self.queue = self.queue_cls(self.server, spider, self.queue_key) 61 | self.df = RFPDupeFilter(self.server, self.dupefilter_key % {'spider': spider.name}) 62 | if self.idle_before_close < 0: 63 | self.idle_before_close = 0 64 | # notice if there are requests already in the queue to resume the crawl 65 | if len(self.queue): 66 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 67 | 68 | def close(self, reason): 69 | if not self.persist: 70 | self.df.clear() 71 | self.queue.clear() 72 | 73 | def enqueue_request(self, request): 74 | if not request.dont_filter and self.df.request_seen(request): 75 | return 76 | if self.stats: 77 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 78 | self.queue.push(request) 79 | 80 | def next_request(self): 81 | block_pop_timeout = self.idle_before_close 82 | request = self.queue.pop(block_pop_timeout) 83 | if request and self.stats: 84 | self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) 85 | return request 86 | 87 | def has_pending_requests(self): 88 | return len(self) > 0 89 | -------------------------------------------------------------------------------- /scrapy_redis/spiders.py: -------------------------------------------------------------------------------- 1 | import connection 2 | 3 | from scrapy import signals 4 | from scrapy.exceptions import DontCloseSpider 5 | from scrapy.spider import Spider 6 | 7 | 8 | class RedisMixin(object): 9 | """Mixin class to implement reading urls from a redis queue.""" 10 | redis_key = None # use default ':start_urls' 11 | 12 | def setup_redis(self): 13 | """Setup redis connection and idle signal. 14 | 15 | This should be called after the spider has set its crawler object. 
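        (RedisSpider does this automatically from set_crawler(); when mixing
        RedisMixin into another base spider, call setup_redis() yourself once
        the crawler is assigned -- see example-project/example/spiders/mycrawler_redis.py.)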
16 |         """
17 |         if not self.redis_key:
18 |             self.redis_key = '%s:start_urls' % self.name
19 | 
20 |         self.server = connection.from_settings(self.crawler.settings)
21 |         # idle signal is called when the spider has no requests left,
22 |         # that's when we will schedule new requests from the redis queue
23 |         self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
24 |         self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
25 |         self.log("Reading URLs from redis list '%s'" % self.redis_key)
26 | 
27 |     def next_request(self):
28 |         """Returns a request to be scheduled or None."""
29 |         url = self.server.lpop(self.redis_key)
30 |         if url:
31 |             return self.make_requests_from_url(url)
32 | 
33 |     def schedule_next_request(self):
34 |         """Schedules a request if available"""
35 |         req = self.next_request()
36 |         if req:
37 |             self.crawler.engine.crawl(req, spider=self)
38 | 
39 |     def spider_idle(self):
40 |         """Schedules a request if available, otherwise waits."""
41 |         self.schedule_next_request()
42 |         raise DontCloseSpider
43 | 
44 |     def item_scraped(self, *args, **kwargs):
45 |         """Avoids waiting for the spider to go idle before scheduling the next request"""
46 |         self.schedule_next_request()
47 | 
48 | 
49 | class RedisSpider(RedisMixin, Spider):
50 |     """Spider that reads urls from a redis queue when idle."""
51 | 
52 |     def set_crawler(self, crawler):
53 |         super(RedisSpider, self).set_crawler(crawler)
54 |         self.setup_redis()
55 | 
--------------------------------------------------------------------------------
/scrapy_redis/tests.py:
--------------------------------------------------------------------------------
1 | import os
2 | import redis
3 | import connection
4 | 
5 | from scrapy.http import Request
6 | from scrapy.spider import Spider
7 | from unittest import TestCase
8 | 
9 | from .dupefilter import RFPDupeFilter
10 | from .queue import SpiderQueue, SpiderPriorityQueue, SpiderStack
11 | from .scheduler import Scheduler
12 | 
13 | 
14 | # allow test settings from environment
15 | REDIS_HOST = os.environ.get('REDIS_HOST', 'localhost')
16 | REDIS_PORT = int(os.environ.get('REDIS_PORT', 6379))
17 | 
18 | 
19 | class DupeFilterTest(TestCase):
20 | 
21 |     def setUp(self):
22 |         self.server = redis.Redis(REDIS_HOST, REDIS_PORT)
23 |         self.key = 'scrapy_redis:tests:dupefilter:'
24 |         self.df = RFPDupeFilter(self.server, self.key)
25 | 
26 |     def tearDown(self):
27 |         self.server.delete(self.key)
28 | 
29 |     def test_dupe_filter(self):
30 |         req = Request('http://example.com')
31 | 
32 |         self.assertFalse(self.df.request_seen(req))
33 |         self.assertTrue(self.df.request_seen(req))
34 | 
35 |         self.df.close('nothing')
36 | 
37 | 
38 | class QueueTestMixin(object):
39 | 
40 |     queue_cls = None
41 | 
42 |     def setUp(self):
43 |         self.spider = Spider('myspider')
44 |         self.key = 'scrapy_redis:tests:%s:queue' % self.spider.name
45 |         self.server = redis.Redis(REDIS_HOST, REDIS_PORT)
46 |         self.q = self.queue_cls(self.server, Spider('myspider'), self.key)
47 | 
48 |     def tearDown(self):
49 |         self.server.delete(self.key)
50 | 
51 |     def test_clear(self):
52 |         self.assertEqual(len(self.q), 0)
53 | 
54 |         for i in range(10):
55 |             # XXX: can't use the same url for all requests as SpiderPriorityQueue
56 |             # uses redis' set implementation and we would end up with only one
57 |             # request in the set, thus failing the test. It should be noted
58 |             # that when using SpiderPriorityQueue it acts as a request
59 |             # duplication filter whenever the serialized requests are the same.
60 | # This might be unwanted on repetitive requests to the same page 61 | # even with dont_filter=True flag. 62 | req = Request('http://example.com/?page=%s' % i) 63 | self.q.push(req) 64 | self.assertEqual(len(self.q), 10) 65 | 66 | self.q.clear() 67 | self.assertEqual(len(self.q), 0) 68 | 69 | 70 | class SpiderQueueTest(QueueTestMixin, TestCase): 71 | 72 | queue_cls = SpiderQueue 73 | 74 | def test_queue(self): 75 | req1 = Request('http://example.com/page1') 76 | req2 = Request('http://example.com/page2') 77 | 78 | self.q.push(req1) 79 | self.q.push(req2) 80 | 81 | out1 = self.q.pop() 82 | out2 = self.q.pop() 83 | 84 | self.assertEqual(out1.url, req1.url) 85 | self.assertEqual(out2.url, req2.url) 86 | 87 | 88 | class SpiderPriorityQueueTest(QueueTestMixin, TestCase): 89 | 90 | queue_cls = SpiderPriorityQueue 91 | 92 | def test_queue(self): 93 | req1 = Request('http://example.com/page1', priority=100) 94 | req2 = Request('http://example.com/page2', priority=50) 95 | req3 = Request('http://example.com/page2', priority=200) 96 | 97 | self.q.push(req1) 98 | self.q.push(req2) 99 | self.q.push(req3) 100 | 101 | out1 = self.q.pop() 102 | out2 = self.q.pop() 103 | out3 = self.q.pop() 104 | 105 | self.assertEqual(out1.url, req3.url) 106 | self.assertEqual(out2.url, req1.url) 107 | self.assertEqual(out3.url, req2.url) 108 | 109 | 110 | class SpiderStackTest(QueueTestMixin, TestCase): 111 | 112 | queue_cls = SpiderStack 113 | 114 | def test_queue(self): 115 | req1 = Request('http://example.com/page1') 116 | req2 = Request('http://example.com/page2') 117 | 118 | self.q.push(req1) 119 | self.q.push(req2) 120 | 121 | out1 = self.q.pop() 122 | out2 = self.q.pop() 123 | 124 | self.assertEqual(out1.url, req2.url) 125 | self.assertEqual(out2.url, req1.url) 126 | 127 | 128 | class SchedulerTest(TestCase): 129 | 130 | def setUp(self): 131 | self.server = redis.Redis(REDIS_HOST, REDIS_PORT) 132 | self.key_prefix = 'scrapy_redis:tests:' 133 | self.queue_key = self.key_prefix + '%(spider)s:requests' 134 | self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter' 135 | self.idle_before_close = 0 136 | self.scheduler = Scheduler(self.server, False, self.queue_key, 137 | SpiderQueue, self.dupefilter_key, 138 | self.idle_before_close) 139 | 140 | def tearDown(self): 141 | for key in self.server.keys(self.key_prefix): 142 | self.server.delete(key) 143 | 144 | def test_scheduler(self): 145 | # default no persist 146 | self.assertFalse(self.scheduler.persist) 147 | 148 | spider = Spider('myspider') 149 | self.scheduler.open(spider) 150 | self.assertEqual(len(self.scheduler), 0) 151 | 152 | req = Request('http://example.com') 153 | self.scheduler.enqueue_request(req) 154 | self.assertTrue(self.scheduler.has_pending_requests()) 155 | self.assertEqual(len(self.scheduler), 1) 156 | 157 | # dupefilter in action 158 | self.scheduler.enqueue_request(req) 159 | self.assertEqual(len(self.scheduler), 1) 160 | 161 | out = self.scheduler.next_request() 162 | self.assertEqual(out.url, req.url) 163 | 164 | self.assertFalse(self.scheduler.has_pending_requests()) 165 | self.assertEqual(len(self.scheduler), 0) 166 | 167 | self.scheduler.close('finish') 168 | 169 | def test_scheduler_persistent(self): 170 | messages = [] 171 | spider = Spider('myspider') 172 | spider.log = lambda *args, **kwargs: messages.append([args, kwargs]) 173 | 174 | self.scheduler.persist = True 175 | self.scheduler.open(spider) 176 | 177 | self.assertEqual(messages, []) 178 | 179 | self.scheduler.enqueue_request(Request('http://example.com/page1')) 180 
| self.scheduler.enqueue_request(Request('http://example.com/page2')) 181 | 182 | self.assertTrue(self.scheduler.has_pending_requests()) 183 | self.scheduler.close('finish') 184 | 185 | self.scheduler.open(spider) 186 | self.assertEqual(messages, [ 187 | [('Resuming crawl (2 requests scheduled)',), {}], 188 | ]) 189 | self.assertEqual(len(self.scheduler), 2) 190 | 191 | self.scheduler.persist = False 192 | self.scheduler.close('finish') 193 | 194 | self.assertEqual(len(self.scheduler), 0) 195 | 196 | 197 | class ConnectionTest(TestCase): 198 | 199 | def setUp(self): 200 | pass 201 | 202 | def tearDown(self): 203 | pass 204 | 205 | # We can get a connection from just REDIS_URL. 206 | def test_redis_url(self): 207 | settings = dict( 208 | REDIS_URL = 'redis://foo:bar@localhost:9001/42' 209 | ) 210 | 211 | server = connection.from_settings(settings) 212 | connect_args = server.connection_pool.connection_kwargs 213 | 214 | self.assertEqual(connect_args['host'], 'localhost') 215 | self.assertEqual(connect_args['port'], 9001) 216 | self.assertEqual(connect_args['password'], 'bar') 217 | self.assertEqual(connect_args['db'], 42) 218 | 219 | # We can get a connection from REDIS_HOST/REDIS_PORT. 220 | def test_redis_host_port(self): 221 | settings = dict( 222 | REDIS_HOST = 'localhost', 223 | REDIS_PORT = 9001 224 | ) 225 | 226 | server = connection.from_settings(settings) 227 | connect_args = server.connection_pool.connection_kwargs 228 | 229 | self.assertEqual(connect_args['host'], 'localhost') 230 | self.assertEqual(connect_args['port'], 9001) 231 | 232 | # REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT. 233 | def test_redis_url_precedence(self): 234 | settings = dict( 235 | REDIS_HOST = 'baz', 236 | REDIS_PORT = 1337, 237 | REDIS_URL = 'redis://foo:bar@localhost:9001/42' 238 | ) 239 | 240 | server = connection.from_settings(settings) 241 | connect_args = server.connection_pool.connection_kwargs 242 | 243 | self.assertEqual(connect_args['host'], 'localhost') 244 | self.assertEqual(connect_args['port'], 9001) 245 | self.assertEqual(connect_args['password'], 'bar') 246 | self.assertEqual(connect_args['db'], 42) 247 | 248 | # We fallback to REDIS_HOST/REDIS_PORT if REDIS_URL is None. 249 | def test_redis_host_port_fallback(self): 250 | settings = dict( 251 | REDIS_HOST = 'baz', 252 | REDIS_PORT = 1337, 253 | REDIS_URL = None 254 | ) 255 | 256 | server = connection.from_settings(settings) 257 | connect_args = server.connection_pool.connection_kwargs 258 | 259 | self.assertEqual(connect_args['host'], 'baz') 260 | self.assertEqual(connect_args['port'], 1337) 261 | 262 | # We use default values for REDIS_HOST/REDIS_PORT. 
263 | def test_redis_default(self): 264 | settings = dict() 265 | 266 | server = connection.from_settings(settings) 267 | connect_args = server.connection_pool.connection_kwargs 268 | 269 | self.assertEqual(connect_args['host'], 'localhost') 270 | self.assertEqual(connect_args['port'], 6379) 271 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test = nosetests 3 | 4 | [nosetests] 5 | #with-coverage = 1 6 | cover-html = 1 7 | cover-html-dir = coverage-html 8 | cover-package = scrapy_redis 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | LONG_DESC = open(os.path.join(os.path.dirname(__file__), 'README.rst')).read() 6 | 7 | 8 | setup(name='scrapy-redis', 9 | version='0.5.1', 10 | description='Redis-based components for Scrapy', 11 | long_description=LONG_DESC, 12 | author='Rolando Espinoza La fuente', 13 | author_email='darkrho@gmail.com', 14 | url='http://github.com/darkrho/scrapy-redis', 15 | packages=['scrapy_redis'], 16 | license='BSD', 17 | install_requires=['Scrapy>=0.14', 'redis>=2.4'], 18 | classifiers=[ 19 | 'Programming Language :: Python', 20 | 'Development Status :: 4 - Beta', 21 | 'Intended Audience :: Developers', 22 | ], 23 | ) 24 | --------------------------------------------------------------------------------