├── LICENSE.txt ├── README.md ├── README_zh.md ├── setup.py └── src └── scrapy_kafka_redis ├── __init__.py ├── connection.py ├── defaults.py ├── dupefilter.py ├── picklecompat.py ├── queue.py ├── scheduler.py └── spiders.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 
180 | 
181 |    To apply the Apache License to your work, attach the following
182 |    boilerplate notice, with the fields enclosed by brackets "[]"
183 |    replaced with your own identifying information. (Don't include
184 |    the brackets!)  The text should be enclosed in the appropriate
185 |    comment syntax for the file format. We also recommend that a
186 |    file or class name and description of purpose be included on the
187 |    same "printed page" as the copyright notice for easier
188 |    identification within third-party archives.
189 | 
190 | Copyright [yyyy] [name of copyright owner]
191 | 
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 | 
196 |     http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [中文文档](README_zh.md) |
2 | [English](README.md)
3 | # Scrapy-Kafka-Redis
4 | When a large number of requests pile up, [scrapy-redis](https://github.com/rmax/scrapy-redis) still consumes a lot of memory even with the `Bloomfilter` algorithm. This project is modeled on `scrapy-redis`.
5 | ### Features
6 | - Distributed crawling is supported
7 | - Redis backs the deduplication queue; a Bloomfilter is layered on top, which lowers the memory footprint while raising the number of fingerprints that can be deduplicated (see the estimate sketched below)
8 | - Kafka backs the request queue, so it can absorb a large backlog of requests; capacity depends on disk size rather than on RAM
9 | - Because of how Kafka works, priority queues are not supported; only FIFO queues are available
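The memory/accuracy trade-off mentioned above can be made concrete with the standard Bloom-filter estimate. This is an illustrative sketch only: the block size (a 2**31-bit Redis string, roughly 256 MB) and the 7 hash seeds come from `src/scrapy_kafka_redis/dupefilter.py`, while the request counts are arbitrary examples.
```
# Rough false-positive estimate for one BloomFilter block as configured in
# src/scrapy_kafka_redis/dupefilter.py: m = 2**31 bits, k = 7 hash seeds.
import math

def bloom_false_positive(n, m=1 << 31, k=7):
    """Approximate false-positive rate after inserting n request fingerprints."""
    return (1 - math.exp(-k * n / m)) ** k

for n in (10_000_000, 100_000_000, 500_000_000):
    print("%11d requests -> ~%.2e false-positive rate" % (n, bloom_false_positive(n)))
```
Raising `BLOOM_BLOCK_NUM` spreads the fingerprints over more 256 MB blocks, which lowers the per-block load and therefore the false-positive rate, at the cost of more Redis memory.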
10 | 
11 | ### Dependencies
12 | - Python 3.0+
13 | - Redis >= 2.8
14 | - Scrapy >= 1.5
15 | - kafka-python >= 1.4.0
16 | - Kafka <= 1.1.0 (since [kafka-python](https://github.com/dpkp/kafka-python) only supports Kafka up to version 1.1.0)
17 | 
18 | ### How to Use
19 | - `pip install scrapy-kafka-redis`
20 | - Configure `settings.py`
21 | The following settings must be added to `settings.py`:
22 | ```
23 | # Use the Kafka-backed scheduler to store the request queue
24 | SCHEDULER = "scrapy_kafka_redis.scheduler.Scheduler"
25 | 
26 | # Use the BloomFilter as the deduplication filter
27 | DUPEFILTER_CLASS = "scrapy_kafka_redis.dupefilter.BloomFilter"
28 | ```
29 | 
30 | The other settings are optional; their default values are:
31 | ```
32 | # Redis key of the deduplication filter when it is used standalone
33 | DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
34 | 
35 | REDIS_CLS = redis.StrictRedis
36 | REDIS_ENCODING = 'utf-8'
37 | REDIS_URL = 'redis://localhost:6378/1'
38 | 
39 | REDIS_PARAMS = {
40 |     'socket_timeout': 30,
41 |     'socket_connect_timeout': 30,
42 |     'retry_on_timeout': True,
43 |     'encoding': REDIS_ENCODING,
44 | }
45 | 
46 | KAFKA_BOOTSTRAP_SERVERS=['localhost:9092']
47 | # Default TOPIC of the scheduler queue
48 | SCHEDULER_QUEUE_TOPIC = '%(spider)s-requests'
49 | # Scheduler queue class used by default
50 | SCHEDULER_QUEUE_CLASS = 'scrapy_kafka_redis.queue.KafkaQueue'
51 | # Redis key under which the deduplication data is stored
52 | SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'
53 | # Deduplication class used by the scheduler
54 | SCHEDULER_DUPEFILTER_CLASS = 'scrapy_kafka_redis.dupefilter.BloomFilter'
55 | # Number of blocks used by the BloomFilter
56 | BLOOM_BLOCK_NUM = 1
57 | 
58 | # TOPIC used for start urls
59 | START_URLS_TOPIC = '%(name)s-start_urls'
60 | 
61 | KAFKA_BOOTSTRAP_SERVERS = None
62 | # Parameters of the Kafka producer that writes the request queue
63 | KAFKA_REQUEST_PRODUCER_PARAMS = {
64 |     'api_version': (0, 10, 1),
65 |     'value_serializer': dumps
66 | }
67 | # Parameters of the Kafka consumer that reads the request queue
68 | KAFKA_REQUEST_CONSUMER_PARAMS = {
69 |     'api_version': (0, 10, 1),
70 |     'value_deserializer': loads
71 | }
72 | # Parameters of the Kafka consumer that reads the start-urls topic
73 | KAFKA_START_URLS_CONSUMER_PARAMS = {
74 |     'api_version': (0, 10, 1),
75 |     'value_deserializer': lambda m: m.decode('utf-8'),
76 | }
77 | ```
78 | - Use it in your `spiders`
79 | ```
80 | import scrapy
81 | from scrapy_kafka_redis.spiders import KafkaSpider
82 | 
83 | class DemoSpider(KafkaSpider):
84 |     name = "demo"
85 |     def parse(self, response):
86 |         pass
87 | ```
88 | - Create the Kafka `Topic`s
89 | Set the number of partitions of each topic according to how many distributed Scrapy instances you plan to run, for example:
90 | ```
91 | ./bin/kafka-topics.sh --create --zookeeper localhost:2181 --partitions 3 --replication-factor 1 --topic demo-start_urls
92 | 
93 | ./bin/kafka-topics.sh --create --zookeeper localhost:2181 --partitions 3 --replication-factor 1 --topic demo-requests
94 | ```
95 | - Send messages
96 | ```
97 | ./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic demo-start_urls
98 | ```
99 | Creating the topics by hand and choosing the partition counts yourself is recommended; a Python alternative to the console producer is sketched below.
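Besides the console producer, start URLs can also be pushed from Python. This is a minimal sketch assuming the default settings listed above: the start-urls consumer decodes each message as a plain UTF-8 URL, and the `demo` spider reads from the `demo-start_urls` topic; the example URLs are placeholders.
```
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers=['localhost:9092'])
for url in ('https://example.com/a', 'https://example.com/b'):
    # The default KAFKA_START_URLS_CONSUMER_PARAMS deserializer simply calls
    # m.decode('utf-8'), so plain UTF-8 encoded URLs are expected here.
    producer.send('demo-start_urls', value=url.encode('utf-8'))
producer.flush()
producer.close()
```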
100 | 
101 | - Run the distributed Scrapy instances
102 | 
103 | ### References
104 | [scrapy-redis](https://github.com/rmax/scrapy-redis)
105 | [Bloomfilter](https://github.com/LiuXingMing/Scrapy_Redis_Bloomfilter)
106 | 
--------------------------------------------------------------------------------
/README_zh.md:
--------------------------------------------------------------------------------
1 | [中文文档](README_zh.md)
2 | [English](README.md)
3 | # Scrapy-Kafka-Redis
4 | 在有大量请求堆积的情况下,即使用了`Bloomfilter`算法,使用[scrapy-redis](https://github.com/rmax/scrapy-redis)仍然会占用大量内存,本项目参考`scrapy-redis`实现。
5 | ### 特点
6 | - 支持分布式
7 | - 使用Redis作为去重队列
8 | 同时使用Bloomfilter去重算法,降低了内存占用,但是增加了可去重数量
9 | - 使用Kafka作为请求队列
10 | 可支持大量请求堆积,容量和磁盘大小相关,而不是和运行内存相关
11 | - 由于Kafka的特性,不支持优先队列,只支持先进先出队列
12 | 
13 | ### 依赖
14 | - Python 3.0+
15 | - Redis >= 2.8
16 | - Scrapy >= 1.5
17 | - kafka-python >= 1.4.0(由于该类库只支持kafka-1.1.0版本,故本类库也支持最高1.1.0版本)
18 | 
19 | ### 使用
20 | - `pip install scrapy-kafka-redis`
21 | - 配置`settings.py`
22 | 必须要添加在`settings.py`的内容
23 | ```
24 | # 启用Kafka调度存储请求队列
25 | SCHEDULER = "scrapy_kafka_redis.scheduler.Scheduler"
26 | 
27 | # 使用BloomFilter作为去重队列
28 | DUPEFILTER_CLASS = "scrapy_kafka_redis.dupefilter.BloomFilter"
29 | ```
30 | 
31 | 其他可选参数的默认值
32 | ```
33 | # 单独使用情况下,去重队列在redis中存储的key
34 | DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
35 | 
36 | REDIS_CLS = redis.StrictRedis
37 | REDIS_ENCODING = 'utf-8'
38 | REDIS_URL = 'redis://localhost:6378/1'
39 | 
40 | REDIS_PARAMS = {
41 |     'socket_timeout': 30,
42 |     'socket_connect_timeout': 30,
43 |     'retry_on_timeout': True,
44 |     'encoding': REDIS_ENCODING,
45 | }
46 | 
47 | KAFKA_BOOTSTRAP_SERVERS=['localhost:9092']
48 | # 调度队列的默认TOPIC
49 | SCHEDULER_QUEUE_TOPIC = '%(spider)s-requests'
50 | # 默认使用的调度队列
51 | SCHEDULER_QUEUE_CLASS = 'scrapy_kafka_redis.queue.KafkaQueue'
52 | # 去重队列在redis中存储的key名
53 | SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'
54 | # 调度器使用的去重算法
55 | SCHEDULER_DUPEFILTER_CLASS = 'scrapy_kafka_redis.dupefilter.BloomFilter'
56 | # BloomFilter的块个数
57 | BLOOM_BLOCK_NUM = 1
58 | 
59 | # start urls使用的TOPIC
60 | START_URLS_TOPIC = '%(name)s-start_urls'
61 | 
62 | KAFKA_BOOTSTRAP_SERVERS = None
63 | # 构造请求队列的Kafka生产者
64 | KAFKA_REQUEST_PRODUCER_PARAMS = {
65 |     'api_version': (0, 10, 1),
66 |     'value_serializer': dumps
67 | }
68 | # 构造请求队列的Kafka消费者
69 | KAFKA_REQUEST_CONSUMER_PARAMS = {
70 |     'group_id': 'requests',
71 |     'api_version': (0, 10, 1),
72 |     'value_deserializer': loads
73 | }
74 | # 构造开始队列的Kafka消费者
75 | KAFKA_START_URLS_CONSUMER_PARAMS = {
76 |     'group_id': 'start_url',
77 |     'api_version': (0, 10, 1),
78 |     'value_deserializer': lambda m: m.decode('utf-8'),
79 | }
80 | ```
81 | - `spiders` 使用
82 | ```
83 | import scrapy
84 | from scrapy_kafka_redis.spiders import KafkaSpider
85 | 
86 | class DemoSpider(KafkaSpider):
87 |     name = "demo"
88 |     def parse(self, response):
89 |         pass
90 | ```
91 | - 创建`Topic`
92 | 根据需要创建的分布式scrapy实例,设置topic的分区数,比如
93 | ```
94 | ./bin/kafka-topics.sh --create --zookeeper localhost:2181 --partitions 3 --replication-factor 1 --topic demo-start_urls
95 | 
96 | ./bin/kafka-topics.sh --create --zookeeper localhost:2181 --partitions 3 --replication-factor 1 --topic demo-requests
97 | ```
98 | - 发送消息
99 | ```
100 | ./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic demo-start_urls
101 | ```
102 | 建议手动创建Topic并指定分区数
103 | 
104 | - 运行分布式scrapy
105 | 
106 | ### 参考:
107 | [scrapy-redis](https://github.com/rmax/scrapy-redis)
108 | [Bloomfilter](https://github.com/LiuXingMing/Scrapy_Redis_Bloomfilter)
109 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from setuptools import setup 4 | 5 | 6 | setup( 7 | name='scrapy-kafka-redis', 8 | version='0.0.7', 9 | description="Kafka and Redis based components for Scrapy.", 10 | author="tenlee", 11 | author_email='tenlee2012@163.com', 12 | url='https://github.com/tenlee2012/scrapy-kafka-redis', 13 | packages=['scrapy_kafka_redis'], 14 | package_dir={'': 'src'}, 15 | install_requires=['Scrapy>=1.0', 'redis>=2.10', 'kafka-python>=1.4.0'], 16 | include_package_data=True, 17 | license="MIT", 18 | keywords='scrapy-kafka-redis', 19 | classifiers=[ 20 | 'License :: OSI Approved :: MIT License', 21 | 'Programming Language :: Python :: 3', 22 | 'Programming Language :: Python :: 3.5', 23 | 'Programming Language :: Python :: 3.6', 24 | ], 25 | ) 26 | -------------------------------------------------------------------------------- /src/scrapy_kafka_redis/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /src/scrapy_kafka_redis/connection.py: -------------------------------------------------------------------------------- 1 | import six 2 | import scrapy 3 | from kafka import KafkaProducer, KafkaConsumer 4 | 5 | from scrapy.utils.misc import load_object 6 | from scrapy.settings import Settings 7 | 8 | from . import defaults 9 | 10 | 11 | # Shortcut maps 'setting name' -> 'parmater name'. 12 | SETTINGS_PARAMS_MAP = { 13 | 'REDIS_URL': 'url', 14 | 'REDIS_HOST': 'host', 15 | 'REDIS_PORT': 'port', 16 | 'REDIS_ENCODING': 'encoding', 17 | } 18 | 19 | 20 | def get_redis_from_settings(settings): 21 | """Returns a redis client instance from given Scrapy settings object. 22 | 23 | This function uses ``get_client`` to instantiate the client and uses 24 | ``defaults.REDIS_PARAMS`` global as defaults values for the parameters. You 25 | can override them using the ``REDIS_PARAMS`` setting. 26 | 27 | Parameters 28 | ---------- 29 | settings : Settings 30 | A scrapy settings object. See the supported settings below. 31 | 32 | Returns 33 | ------- 34 | server 35 | Redis client instance. 36 | 37 | Other Parameters 38 | ---------------- 39 | REDIS_URL : str, optional 40 | Server connection URL. 41 | REDIS_HOST : str, optional 42 | Server host. 43 | REDIS_PORT : str, optional 44 | Server port. 45 | REDIS_ENCODING : str, optional 46 | Data encoding. 47 | REDIS_PARAMS : dict, optional 48 | Additional client parameters. 49 | 50 | """ 51 | params = defaults.REDIS_PARAMS.copy() 52 | params.update(settings.getdict('REDIS_PARAMS')) 53 | # XXX: Deprecate REDIS_* settings. 54 | for source, dest in SETTINGS_PARAMS_MAP.items(): 55 | val = settings.get(source) 56 | if val: 57 | params[dest] = val 58 | 59 | # Allow ``redis_cls`` to be a path to a class. 
60 | if isinstance(params.get('redis_cls'), six.string_types): 61 | params['redis_cls'] = load_object(params['redis_cls']) 62 | 63 | return get_redis(**params) 64 | 65 | 66 | def get_start_urls_consumer_from_settings(spider_name: str, settings: Settings) -> KafkaConsumer: 67 | bootstrap_servers = settings.get('KAFKA_BOOTSTRAP_SERVERS', defaults.KAFKA_BOOTSTRAP_SERVERS) 68 | kafka_params = settings.get('KAFKA_START_URLS_CONSUMER_PARAMS', defaults.KAFKA_START_URLS_CONSUMER_PARAMS) 69 | kafka_start_group = settings.get('KAFKA_START_GROUP', defaults.KAFKA_START_GROUP) 70 | kafka_params['group_id'] = kafka_start_group % {'spider': spider_name} 71 | consumer = KafkaConsumer(bootstrap_servers=bootstrap_servers, **kafka_params) 72 | return consumer 73 | 74 | 75 | def get_request_consumer_from_settings(spider_name: str, settings: Settings) -> KafkaConsumer: 76 | bootstrap_servers = settings.get('KAFKA_BOOTSTRAP_SERVERS', defaults.KAFKA_BOOTSTRAP_SERVERS) 77 | kafka_params = settings.get('KAFKA_REQUEST_CONSUMER_PARAMS', defaults.KAFKA_REQUEST_CONSUMER_PARAMS) 78 | kafka_request_group = settings.get('KAFKA_REQUEST_GROUP', defaults.KAFKA_REQUEST_GROUP) 79 | kafka_params['group_id'] = kafka_request_group % {'spider': spider_name} 80 | consumer = KafkaConsumer(bootstrap_servers=bootstrap_servers, **kafka_params) 81 | return consumer 82 | 83 | 84 | def get_request_producer_from_settings(settings: Settings) -> KafkaProducer: 85 | bootstrap_servers = settings.get('KAFKA_BOOTSTRAP_SERVERS', defaults.KAFKA_BOOTSTRAP_SERVERS) 86 | kafka_params = settings.get('KAFKA_REQUEST_PRODUCER_PARAMS', defaults.KAFKA_REQUEST_PRODUCER_PARAMS) 87 | producer = KafkaProducer(bootstrap_servers=bootstrap_servers, **kafka_params) 88 | return producer 89 | 90 | 91 | def get_redis(**kwargs): 92 | """Returns a redis client instance. 93 | 94 | Parameters 95 | ---------- 96 | redis_cls : class, optional 97 | Defaults to ``redis.StrictRedis``. 98 | url : str, optional 99 | If given, ``redis_cls.from_url`` is used to instantiate the class. 100 | **kwargs 101 | Extra parameters to be passed to the ``redis_cls`` class. 102 | 103 | Returns 104 | ------- 105 | server 106 | Redis client instance. 107 | 108 | """ 109 | redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS) 110 | url = kwargs.pop('url', None) 111 | if url: 112 | return redis_cls.from_url(url, **kwargs) 113 | else: 114 | return redis_cls(**kwargs) 115 | -------------------------------------------------------------------------------- /src/scrapy_kafka_redis/defaults.py: -------------------------------------------------------------------------------- 1 | import redis 2 | from .picklecompat import loads, dumps 3 | 4 | # For standalone use. 5 | DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' 6 | 7 | PIPELINE_TOPIC = '%(spider)s-items' 8 | 9 | REDIS_CLS = redis.StrictRedis 10 | REDIS_ENCODING = 'utf-8' 11 | # Sane connection defaults. 
12 | REDIS_PARAMS = {
13 |     'socket_timeout': 30,
14 |     'socket_connect_timeout': 30,
15 |     'retry_on_timeout': True,
16 |     'encoding': REDIS_ENCODING,
17 | }
18 | 
19 | SCHEDULER_QUEUE_TOPIC = '%(spider)s-requests'
20 | SCHEDULER_QUEUE_CLASS = 'scrapy_kafka_redis.queue.KafkaQueue'
21 | SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'
22 | SCHEDULER_DUPEFILTER_CLASS = 'scrapy_kafka_redis.dupefilter.BloomFilter'
23 | 
24 | 
25 | START_URLS_TOPIC = '%(name)s-start_urls'
26 | BLOOM_BLOCK_NUM = 1
27 | 
28 | KAFKA_BOOTSTRAP_SERVERS = None
29 | KAFKA_REQUEST_GROUP = '%(spider)s-requests-0'
30 | KAFKA_START_GROUP = '%(spider)s-start-0'
31 | KAFKA_REQUEST_PRODUCER_PARAMS = {
32 |     'api_version': (0, 10, 1),
33 |     'value_serializer': dumps
34 | }
35 | KAFKA_REQUEST_CONSUMER_PARAMS = {
36 |     'api_version': (0, 10, 1),
37 |     'value_deserializer': loads
38 | }
39 | KAFKA_START_URLS_CONSUMER_PARAMS = {
40 |     'api_version': (0, 10, 1),
41 |     'value_deserializer': lambda m: m.decode('utf-8'),
42 | }
43 | 
--------------------------------------------------------------------------------
/src/scrapy_kafka_redis/dupefilter.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 | from hashlib import md5
4 | 
5 | from scrapy.dupefilters import BaseDupeFilter
6 | from scrapy.utils.request import request_fingerprint
7 | 
8 | from . import defaults
9 | from .connection import get_redis_from_settings
10 | 
11 | 
12 | logger = logging.getLogger(__name__)
13 | 
14 | 
15 | class SimpleHash(object):
16 |     def __init__(self, cap, seed):
17 |         self.cap = cap
18 |         self.seed = seed
19 | 
20 |     def hash(self, value):
21 |         ret = 0
22 |         for i in range(len(value)):
23 |             ret += self.seed * ret + ord(value[i])
24 |         return (self.cap - 1) & ret
25 | 
26 | 
27 | class BloomFilter(BaseDupeFilter):
28 |     """Request dupefilter backed by a Redis-based Bloom filter.
29 | 
30 |     Based on 九茶's https://github.com/LiuXingMing/Scrapy_Redis_Bloomfilter
31 |     and on scrapy-redis.
32 |     """
33 |     logger = logger
34 | 
35 |     def __init__(self, server, key, block_num=1, debug=False):
36 |         self.server = server
37 |         self.key = key
38 |         self.debug = debug
39 |         self.logdupes = True
40 | 
41 |         self.bit_size = 1 << 31  # Redis strings are capped at 512 MB; each block uses 2**31 bits (256 MB)
42 |         self.seeds = [5, 7, 11, 13, 31, 37, 61]
43 |         self.block_num = block_num
44 |         self.hashfunc = []
45 |         for seed in self.seeds:
46 |             self.hashfunc.append(SimpleHash(self.bit_size, seed))
47 | 
48 |     @classmethod
49 |     def from_settings(cls, settings):
50 |         server = get_redis_from_settings(settings)
51 |         key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
52 |         debug = settings.getbool('DUPEFILTER_DEBUG')
53 |         block_num = settings.getint('BLOOM_BLOCK_NUM', defaults.BLOOM_BLOCK_NUM)
54 |         return cls(server, key=key, block_num=block_num, debug=debug)
55 | 
56 |     @classmethod
57 |     def from_crawler(cls, crawler):
58 |         return cls.from_settings(crawler.settings)
59 | 
60 |     def request_seen(self, request):
61 |         fp = self.request_fingerprint(request)
62 |         if self.is_contains(fp):
63 |             return True
64 |         else:
65 |             self.insert(fp)
66 |             return False
67 | 
68 |     def is_contains(self, str_input):
69 |         if not str_input:
70 |             return False
71 |         m5 = md5()
72 |         m5.update(str_input.encode())
73 |         str_input = m5.hexdigest()
74 |         ret = True
75 |         name = self.key + str(int(str_input[0:2], 16) % self.block_num)
76 |         for f in self.hashfunc:
77 |             loc = f.hash(str_input)
78 |             ret = ret & self.server.getbit(name, loc)
79 |         return ret
80 | 
81 |     def insert(self, str_input):
82 |         m5 = md5()
83 |         m5.update(str_input.encode())
84 |         str_input = m5.hexdigest()
85 |         name = self.key + str(int(str_input[0:2], 16) % self.block_num)
86 |         for f in self.hashfunc:
87 |             loc = f.hash(str_input)
88 |             self.server.setbit(name, loc, 1)
89 | 
90 |     def request_fingerprint(self, request):
91 |         """Returns a fingerprint for a given request.
92 | 
93 |         Parameters
94 |         ----------
95 |         request : scrapy.http.Request
96 | 
97 |         Returns
98 |         -------
99 |         str
100 | 
101 |         """
102 |         return request_fingerprint(request)
103 | 
104 |     def close(self, reason=''):
105 |         """Delete data on close. Called by Scrapy's scheduler.
106 | 
107 |         Parameters
108 |         ----------
109 |         reason : str, optional
110 | 
111 |         """
112 |         self.clear()
113 | 
114 |     def clear(self):
115 |         """Clears fingerprints data (every BloomFilter block key)."""
116 |         self.server.delete(*[self.key + str(i) for i in range(self.block_num)])
117 | 
118 |     def log(self, request, spider):
119 |         """Logs given request.
120 | 
121 |         Parameters
122 |         ----------
123 |         request : scrapy.http.Request
124 |         spider : scrapy.spiders.Spider
125 | 
126 |         """
127 |         if self.debug:
128 |             msg = "Filtered duplicate request: %(request)s"
129 |             self.logger.debug(msg, {'request': request}, extra={'spider': spider})
130 |         elif self.logdupes:
131 |             msg = ("Filtered duplicate request %(request)s"
132 |                    " - no more duplicates will be shown"
133 |                    " (see DUPEFILTER_DEBUG to show all duplicates)")
134 |             self.logger.debug(msg, {'request': request}, extra={'spider': spider})
135 |             self.logdupes = False
136 | 
--------------------------------------------------------------------------------
/src/scrapy_kafka_redis/picklecompat.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | 
4 | def loads(s):
5 |     return pickle.loads(s)
6 | 
7 | 
8 | def dumps(obj):
9 |     return pickle.dumps(obj, protocol=-1)
10 | 
--------------------------------------------------------------------------------
/src/scrapy_kafka_redis/queue.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | 
3 | from scrapy.utils.reqser import request_to_dict, request_from_dict
4 | from kafka import KafkaProducer, KafkaConsumer
5 | 
6 | 
7 | class Base(object):
8 |     """Per-spider base queue class"""
9 | 
10 |     def __init__(self, producer: KafkaProducer, consumer: KafkaConsumer,
11 |                  spider, topic):
12 |         self.producer = producer
13 |         self.consumer = consumer
14 |         self.consumer.subscribe(topic)
15 |         self.spider = spider
16 |         self.topic = topic
17 | 
18 |     def _encode_request(self, request):
19 |         """Encode a request object"""
20 |         obj = request_to_dict(request, self.spider)
21 |         return obj
22 | 
23 |     def _decode_request(self, obj):
24 |         """Decode a request previously encoded"""
25 |         return request_from_dict(obj, self.spider)
26 | 
27 |     def __len__(self):
28 |         """Return the length of the queue"""
29 |         raise NotImplementedError
30 | 
31 |     def push(self, request):
32 |         """Push a request"""
33 |         raise NotImplementedError
34 | 
35 |     def pop(self, timeout=0):
36 |         """Pop a request"""
37 |         raise NotImplementedError
38 | 
39 |     def close(self):
40 |         self.consumer.close()
41 |         self.producer.close()
42 | 
43 | 
44 | class KafkaQueue(Base):
45 |     """Per-spider FIFO queue"""
46 | 
47 |     def __len__(self):
48 |         """Return the length of the queue"""
49 |         return 0  # Kafka has no cheap length query, so the scheduler always sees an empty queue here
50 | 
51 |     def push(self, request):
52 |         """Push a request"""
53 |         self.producer.send(self.topic, value=self._encode_request(request))
54 |         self.producer.flush()
55 | 
56 |     def pop(self, timeout=0):
57 |         """Pop a request"""
58 |         msgs = self.consumer.poll(timeout_ms=timeout, max_records=1)
59 |         data = None
60 |         partitions = self.consumer.assignment()
61 |         for part in partitions:
62 |             records = msgs.get(part)
63 |             if records:
64 |                 data = records[0].value
65 |                 break
66 | 
67 |         if data:
68 |             return self._decode_request(data)
69 | 
--------------------------------------------------------------------------------
/src/scrapy_kafka_redis/scheduler.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | 
3 | from kafka import KafkaProducer, KafkaConsumer
4 | from scrapy.utils.misc import load_object
5 | 
6 | from . import connection, defaults
7 | 
8 | 
9 | class Scheduler(object):
10 | 
11 |     def __init__(self, redis_server, settings,
12 |                  persist=False,
13 |                  flush_on_start=False,
14 |                  queue_key=defaults.SCHEDULER_QUEUE_TOPIC,
15 |                  queue_cls=defaults.SCHEDULER_QUEUE_CLASS,
16 |                  dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY,
17 |                  dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS,
18 |                  idle_before_close=0):
19 |         self.producer = None
20 |         self.consumer = None
21 |         self.settings = settings
22 |         self.redis_server = redis_server
23 |         self.persist = persist
24 |         self.flush_on_start = flush_on_start
25 |         self.queue_key = queue_key
26 |         self.queue_cls = queue_cls
27 |         self.dupefilter_cls = dupefilter_cls
28 |         self.dupefilter_key = dupefilter_key
29 |         self.idle_before_close = idle_before_close
30 | 
31 |         self.stats = None
32 |         self.queue = None
33 |         self.spider = None
34 |         self.df = None
35 | 
36 |     @classmethod
37 |     def from_settings(cls, settings):
38 |         kwargs = {
39 |             'persist': settings.getbool('SCHEDULER_PERSIST'),
40 |             'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
41 |             'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
42 |         }
43 | 
44 |         # If these values are missing, it means we want to use the defaults.
45 |         optional = {
46 |             'queue_key': 'SCHEDULER_QUEUE_TOPIC',
47 |             'queue_cls': 'SCHEDULER_QUEUE_CLASS',
48 |             'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
49 |         }
50 |         for name, setting_name in optional.items():
51 |             val = settings.get(setting_name)
52 |             if val:
53 |                 kwargs[name] = val
54 | 
55 |         redis_server = connection.get_redis_from_settings(settings)
56 |         # Make sure the Redis connection is alive before starting.
57 |         redis_server.ping()
58 | 
59 |         return cls(redis_server=redis_server, settings=settings, **kwargs)
60 | 
61 |     @classmethod
62 |     def from_crawler(cls, crawler):
63 |         instance = cls.from_settings(crawler.settings)
64 |         instance.stats = crawler.stats
65 |         return instance
66 | 
67 |     def open(self, spider):
68 |         self.spider = spider
69 |         self.producer = connection.get_request_producer_from_settings(self.settings)
70 |         self.consumer = connection.get_request_consumer_from_settings(spider.name, self.settings)
71 | 
72 |         try:
73 |             self.queue = load_object(self.queue_cls)(
74 |                 producer=self.producer,
75 |                 consumer=self.consumer,
76 |                 spider=spider,
77 |                 topic=self.queue_key % {'spider': spider.name},
78 |             )
79 |         except TypeError as e:
80 |             raise ValueError("Failed to instantiate queue class '%s': %s"
81 |                              % (self.queue_cls, e))
82 | 
83 |         try:
84 |             self.df = load_object(self.dupefilter_cls)(
85 |                 server=self.redis_server,
86 |                 key=self.dupefilter_key % {'spider': spider.name},
87 |                 block_num=spider.settings.getint('BLOOM_BLOCK_NUM', defaults.BLOOM_BLOCK_NUM),
88 |                 debug=spider.settings.getbool('DUPEFILTER_DEBUG'),
89 |             )
90 |         except TypeError as e:
91 |             raise ValueError("Failed to instantiate dupefilter class '%s': %s"
92 |                              % (self.dupefilter_cls, e))
93 | 
94 |     def close(self, reason):
95 |         self.queue.close()
96 | 
97 |     def enqueue_request(self, request):
98 |         if not request.dont_filter and self.df.request_seen(request):
99 |             self.df.log(request, self.spider)
100 |             return False
101 |         if self.stats:
102 |             self.stats.inc_value('scheduler/enqueued/kafka', spider=self.spider)
103 |         self.queue.push(request)
104 |         return True
105 | 
106 |     def next_request(self):
107 |         block_pop_timeout = self.idle_before_close
108 |         request = self.queue.pop(block_pop_timeout)
109 |         if request and self.stats:
110 |             self.stats.inc_value('scheduler/dequeued/kafka', spider=self.spider)
111 |         return request
112 | 
113 |     def has_pending_requests(self):
114 |         return len(self.queue) > 0
115 | 
--------------------------------------------------------------------------------
/src/scrapy_kafka_redis/spiders.py:
--------------------------------------------------------------------------------
1 | from scrapy import signals
2 | from scrapy.exceptions import DontCloseSpider
3 | from scrapy.spiders import Spider, CrawlSpider
4 | 
5 | from . import connection, defaults
6 | 
7 | 
8 | class KafkaMixin(object):
9 |     consumer = None
10 |     kafka_topic = None
11 |     kafka_batch_size = None
12 | 
13 |     def start_requests(self):
14 |         """Returns a batch of start requests read from Kafka."""
15 |         return self.next_requests()
16 | 
17 |     def setup_kafka(self, crawler=None):
18 |         """Set up the Kafka consumer and the idle signal.
19 | 
20 |         This should be called after the spider has set its crawler object.
21 |         """
22 |         if self.consumer is not None:
23 |             return
24 | 
25 |         if crawler is None:
26 |             # We allow optional crawler argument to keep backwards
27 |             # compatibility.
28 |             # XXX: Raise a deprecation warning.
29 |             crawler = getattr(self, 'crawler', None)
30 | 
31 |         if crawler is None:
32 |             raise ValueError("crawler is required")
33 | 
34 |         settings = crawler.settings
35 | 
36 |         if self.kafka_topic is None:
37 |             self.kafka_topic = settings.get(
38 |                 'KAFKA_START_URLS_KEY', defaults.START_URLS_TOPIC,
39 |             )
40 | 
41 |         self.kafka_topic = self.kafka_topic % {'name': self.name}
42 | 
43 |         if self.kafka_batch_size is None:
44 |             self.kafka_batch_size = settings.getint('CONCURRENT_REQUESTS')
45 | 
46 |         try:
47 |             self.kafka_batch_size = int(self.kafka_batch_size)
48 |         except (TypeError, ValueError):
49 |             raise ValueError("kafka_batch_size must be an integer")
50 | 
51 |         if not self.kafka_topic.strip():
52 |             raise ValueError("kafka topic must not be empty")
53 | 
54 |         self.logger.info("Reading start URLs from kafka topic '%(kafka_topic)s' "
55 |                          "(batch size: %(kafka_batch_size)s)",
56 |                          self.__dict__)
57 | 
58 |         self.consumer = connection.get_start_urls_consumer_from_settings(self.name, crawler.settings)
59 |         self.consumer.subscribe(self.kafka_topic)
60 |         # The idle signal is called when the spider has no requests left;
61 |         # that's when we schedule new requests from the Kafka start-urls topic.
62 |         crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
63 | 
64 |     def fetch_one(self):
65 |         """Pop one message from the start-urls topic."""
66 |         msgs = self.consumer.poll(max_records=1)
67 |         data = None
68 |         partitions = self.consumer.assignment()
69 |         for part in partitions:
70 |             records = msgs.get(part)
71 |             if records:
72 |                 data = records[0].value
73 |                 break
74 |         return data
75 | 
76 |     def next_requests(self):
77 |         found = 0
78 |         while found < self.kafka_batch_size:
79 |             data = self.fetch_one()
80 |             if not data:
81 |                 # Queue empty.
82 |                 break
83 |             req = self.make_request_from_data(data)
84 |             if req:
85 |                 yield req
86 |                 found += 1
87 |             else:
88 |                 self.logger.debug("Request not made from data: %r", data)
89 | 
90 |         if found:
91 |             self.logger.debug("Read %s requests from '%s'", found, self.kafka_topic)
92 | 
93 |     def make_request_from_data(self, data):
94 |         """Returns a Request instance built from a message read from Kafka.
95 | 
96 |         By default ``data`` is a UTF-8 decoded URL. You can override this
97 |         method to provide your own message decoding.
98 | 
99 |         Parameters
100 |         ----------
101 |         data : str
102 |             Message from the Kafka start-urls topic.
103 | 
104 |         """
105 |         return self.make_requests_from_url(data)
106 | 
107 |     def schedule_next_requests(self):
108 |         """Schedules a request if available"""
109 |         # TODO: While there is capacity, schedule a batch of Kafka requests.
110 |         for req in self.next_requests():
111 |             self.crawler.engine.crawl(req, spider=self)
112 | 
113 |     def spider_idle(self):
114 |         """Schedules a request if available, otherwise waits."""
115 |         # XXX: Handle a sentinel to close the spider.
116 |         self.schedule_next_requests()
117 |         raise DontCloseSpider
118 | 
119 | 
120 | class KafkaSpider(KafkaMixin, Spider):
121 | 
122 |     @classmethod
123 |     def from_crawler(cls, crawler, *args, **kwargs):
124 |         obj = super(KafkaSpider, cls).from_crawler(crawler, *args, **kwargs)
125 |         obj.setup_kafka(crawler)
126 |         return obj
127 | 
128 | 
129 | class RedisCrawlSpider(KafkaMixin, CrawlSpider):
130 | 
131 |     @classmethod
132 |     def from_crawler(cls, crawler, *args, **kwargs):
133 |         obj = super(RedisCrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
134 |         obj.setup_kafka(crawler)
135 |         return obj
136 | 
--------------------------------------------------------------------------------
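`make_request_from_data` in `spiders.py` receives whatever the start-urls deserializer yields (a UTF-8 string by default) and simply turns it into a request for that URL. A spider that publishes richer messages can override it; the sketch below assumes JSON payloads with hypothetical `url` and `meta` fields, which are not part of the library.
```
import json

import scrapy
from scrapy_kafka_redis.spiders import KafkaSpider


class JsonDemoSpider(KafkaSpider):
    name = "demo"

    def make_request_from_data(self, data):
        # `data` is already UTF-8 decoded by the start-urls consumer.
        payload = json.loads(data)
        return scrapy.Request(payload["url"], meta=payload.get("meta", {}))

    def parse(self, response):
        pass
```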