├── MANIFEST.in
├── README.md
├── RabbitSpider
│   ├── __init__.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── download.py
│   │   ├── engine.py
│   │   └── scheduler.py
│   ├── default_settings.py
│   ├── dupefilters
│   │   ├── __init__.py
│   │   └── memoryfilter.py
│   ├── exceptions.py
│   ├── http
│   │   ├── __init__.py
│   │   ├── request.py
│   │   └── response.py
│   ├── items
│   │   ├── __init__.py
│   │   └── item.py
│   ├── middlewares
│   │   ├── __init__.py
│   │   ├── allow_http_code.py
│   │   ├── download_delay.py
│   │   └── retry.py
│   ├── pipelines
│   │   └── __init__.py
│   ├── rabbit_execute.py
│   ├── spider
│   │   └── __init__.py
│   ├── templates
│   │   └── project
│   │       ├── __init__.py
│   │       ├── items.tmpl
│   │       ├── middlewares.tmpl
│   │       ├── pipelines.tmpl
│   │       ├── settings.tmpl
│   │       └── spiders
│   │           ├── __init__.py
│   │           └── src
│   │               └── basic.tmpl
│   └── utils
│       ├── __init__.py
│       ├── cmdline.py
│       ├── control.py
│       ├── event.py
│       ├── log.py
│       ├── subscriber.py
│       └── template.py
└── setup.py

/MANIFEST.in:
--------------------------------------------------------------------------------
recursive-include RabbitSpider/templates *

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
A distributed coroutine spider framework that uses RabbitMQ as its message queue; its structure and usage are similar to Scrapy.

Tasks can be run in batches. Run modes: auto produces first and then consumes (suited to a single machine; the task shuts down automatically when finished), m only produces, w only consumes (keeps listening for tasks). Scheduled batch runs by directory are also supported.

The downloader is a wrapper around curl_cffi and supports changing the HTTP version and the TLS fingerprint.

pip install RabbitSpider==2.7.7

Create a project from the command line:
rabbit create [project name] [directory name] [spider file name]

Scheduled batch run by directory:
cd [project name]
rabbit run [directory name] -p 20 -t "*/10 * * * *"

-p (optional, default 10): how many spiders run concurrently per batch
-t (optional): crontab expression

The spider project template is generated automatically,
e.g. rabbit create shop xxx mama

import asyncio
from RabbitSpider import go
from RabbitSpider import Request
from RabbitSpider.spider import Spider


class MamaSpider(Spider):
    name = '_'.join(__file__.replace('\\', '/').rsplit('/')[-2:]).split('.')[0]
    custom_settings = {}

    async def start_requests(self):
        yield Request(url='https://www.baidu.com')

    async def parse(self, request, response):
        pass


if __name__ == '__main__':
    asyncio.run(go(MamaSpider, 'auto', 1))
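Several spiders can also be launched in one process with batch_go; a minimal sketch, assuming two spider modules created by `rabbit create` (the import paths below are hypothetical placeholders):

import asyncio
from RabbitSpider import batch_go
from spiders.xxx.mama import MamaSpider  # hypothetical module path
from spiders.xxx.papa import PapaSpider  # hypothetical module path

if __name__ == '__main__':
    # run both spiders in 'auto' mode, at most 5 concurrent tasks
    asyncio.run(batch_go([MamaSpider, PapaSpider], task_count=5))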

--------------------------------------------------------------------------------
/RabbitSpider/__init__.py:
--------------------------------------------------------------------------------
import os
import sys
from RabbitSpider.items.item import BaseItem
from RabbitSpider.http.request import Request
from RabbitSpider.http.response import Response
from RabbitSpider.rabbit_execute import go, batch_go


sys.path.append(os.path.abspath(os.path.join(os.path.abspath(sys.argv[0]), '..')))
sys.path.append(os.path.abspath(os.path.join(os.path.abspath(sys.argv[0]), '../..')))
sys.path.append(os.path.abspath(os.path.join(os.path.abspath(sys.argv[0]), '../../..')))
sys.path.append(os.path.abspath(os.path.join(os.path.abspath(sys.argv[0]), '../../../..')))

logo = r"""
    ____            __     __     _  __    _____          _      __
   / __ \ ____ _   / /_   / /_   (_)/ /_  / ___/ ____    (_)____/ / ___   _____
  / /_/ // __ `/  / __ \ / __ \ / // __/  \__ \ / __ \  / // __  // _ \ / ___/
 / _, _// /_/ /  / /_/ // /_/ // // /_   ___/ // /_/ / / // /_/ //  __// /
/_/ |_| \__,_/  /_.___//_.___//_/ \__/  /____// .___/ /_/ \__,_/ \___//_/
                                             /_/
"""

__all__ = ['Request', 'Response', 'BaseItem', 'go', 'batch_go']
__author__ = '一纸'
__email__ = '2395396520@qq.com'
__version__ = '2.7.7'

sys.stdout.write(f'\033[0;35;1m{logo}\033[0m')

--------------------------------------------------------------------------------
/RabbitSpider/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YunTom/RabbitSpider/469429964225148d2386ddcdd3b32b580e0051d3/RabbitSpider/core/__init__.py

--------------------------------------------------------------------------------
/RabbitSpider/core/download.py:
--------------------------------------------------------------------------------
from RabbitSpider import Response
from curl_cffi import CurlHttpVersion
from RabbitSpider.exceptions import RabbitExpect


class CurlDownload(object):
    def __init__(self):
        self.impersonate = 'chrome120'
        self.http_version = CurlHttpVersion.V2TLS

    async def fetch(self, session, request) -> Response:
        if request['method'].upper() == 'GET':
            res = await session.get(request['url'],
                                    params=request.get('params'), cookies=request.get('cookies'),
                                    headers=request.get('headers'), proxy=request.get('proxy'),
                                    allow_redirects=request.get('allow_redirects', True),
                                    http_version=self.http_version,
                                    impersonate=self.impersonate,
                                    timeout=request.get('timeout')
                                    )

        elif request['method'].upper() == 'POST':
            res = await session.post(request['url'],
                                     data=request.get('data'), json=request.get('json'),
                                     cookies=request.get('cookies'), headers=request.get('headers'),
                                     proxy=request.get('proxy'),
                                     http_version=self.http_version,
                                     impersonate=self.impersonate,
                                     allow_redirects=request.get('allow_redirects', True),
                                     timeout=request.get('timeout'))

        else:
            raise RabbitExpect(f"Request method {request['method']} is not defined; add a handler for it!")

        if res:
            return Response(res.status_code, res.headers, res.cookies, res.charset, res.content)
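CurlDownload hardcodes the chrome120 fingerprint and HTTP/2, and MiddlewareManager instantiates it itself, so changing the defaults project-wide means editing (or patching) this class. A standalone sketch of the two knobs it exposes; the fingerprint name must be one your installed curl_cffi version actually ships:

from curl_cffi import CurlHttpVersion
from RabbitSpider.core.download import CurlDownload

downloader = CurlDownload()
downloader.impersonate = 'chrome110'            # any impersonate target curl_cffi supports
downloader.http_version = CurlHttpVersion.V1_1  # force HTTP/1.1 instead of HTTP/2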

--------------------------------------------------------------------------------
/RabbitSpider/core/engine.py:
--------------------------------------------------------------------------------
import pickle
import asyncio
from traceback import print_exc
from asyncio import CancelledError
from aio_pika import IncomingMessage
from RabbitSpider.utils import event
from RabbitSpider.utils.log import Logger
from RabbitSpider import Request, BaseItem
from RabbitSpider.exceptions import RabbitExpect
from RabbitSpider.core.scheduler import Scheduler
from typing import AsyncGenerator, Coroutine, Generator
from RabbitSpider.utils.control import MiddlewareManager, FilterManager, PipelineManager, TaskManager


class Engine(object):

    def __init__(self, settings):
        self.logger = Logger(settings)
        self.mode: str = settings.get('MODE')
        self.scheduler = Scheduler(settings)
        self.filter = FilterManager(settings)
        self.pipeline = PipelineManager(settings)
        self.middlewares = MiddlewareManager(settings)
        self.task_count: int = settings.get('TASK_COUNT')

    async def __aenter__(self):
        await self.scheduler.connect()
        await self.pipeline.open_spider()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.scheduler.close()
        await self.pipeline.close_spider()

    async def routing(self, spider, result):
        async def rule(res):
            if isinstance(res, Request):
                if self.filter.request_seen(res):
                    self.logger.info(f'Producing request: {res.to_dict()}', spider.name)
                    await self.scheduler.producer(queue=spider.name, body=res.to_dict())
            elif isinstance(res, BaseItem):
                await spider.subscriber.notify(event.item_scraped, res)
                await self.pipeline.process_item(res, spider)
            elif res is None:
                pass
            else:
                raise TypeError('Invalid return type from callback!')

        if isinstance(result, AsyncGenerator):
            async for r in result:
                await rule(r)
        elif isinstance(result, Generator):
            for r in result:
                await rule(r)
        elif isinstance(result, Coroutine):
            await rule(await result)
        elif isinstance(result, Request):
            await rule(result)
        elif result is None:
            pass
        else:
            raise TypeError('Invalid return type from callback!')

    async def produce(self, spider):
        await self.scheduler.create_queue(spider.name)
        await self.scheduler.queue_purge(spider.name)
        await self.routing(spider, spider.start_requests())

    async def crawl(self, spider):
        task_manager = TaskManager(self.task_count)
        while True:
            incoming_message: IncomingMessage = await self.scheduler.consumer(spider)
            if incoming_message:
                await task_manager.semaphore.acquire()
                task_manager.create_task(self.deal_resp(spider, incoming_message))
            else:
                if task_manager.all_done():
                    await self.scheduler.delete_queue(spider.name)
                    break

    async def consume(self, spider):
        await self.scheduler.consumer(spider, callback=self.deal_resp,
                                      prefetch=self.task_count)
        await asyncio.Future()

    async def deal_resp(self, spider, incoming_message: IncomingMessage):
        try:
            request = Request(**pickle.loads(incoming_message.body))
            await spider.subscriber.notify(event.request_received, request)
            self.logger.info(f'Consuming request: {request.to_dict()}', spider.name)
            request, response = await self.middlewares.send(spider, request)
            if response:
                await spider.subscriber.notify(event.response_received, response)
                result = getattr(spider, request.callback)(request, response)
                result and await self.routing(spider, result)
            elif request:
                await self.routing(spider, request)
            await incoming_message.ack()
        except Exception:
            print_exc()
            for task in asyncio.all_tasks():
                task.cancel()

    async def start(self, spider):
        self.logger.info(f'Task {spider.name} started')
        await spider.subscriber.notify(event.spider_opened)
        try:
            if self.mode == 'auto':
                await self.produce(spider)
                await self.crawl(spider)
            elif self.mode == 'm':
                await self.produce(spider)
            elif self.mode == 'w':
                await self.consume(spider)
            else:
                raise RabbitExpect('Invalid run mode!')
        except CancelledError as exc:
            self.logger.error(f'Task {spider.name} error: {exc}')
            await spider.subscriber.notify(event.spider_error, exc)
        except Exception as exc:
            self.logger.error(f'Task {spider.name} error: {exc}')
            await spider.subscriber.notify(event.spider_error, exc)
        else:
            await spider.subscriber.notify(event.spider_closed)
            self.logger.info(f'Task {spider.name} finished')
        finally:
            await spider.session.close()

--------------------------------------------------------------------------------
/RabbitSpider/core/scheduler.py:
--------------------------------------------------------------------------------
import pickle
from typing import Callable, Optional
from aio_pika import connect_robust, Message, pool
from aio_pika.exceptions import ChannelNotFoundEntity


class Scheduler(object):
    def __init__(self, settings):
        self.connection = None
        self.channel_pool = None
        self.host = settings.get('RABBIT_HOST')
        self.port = settings.get('RABBIT_PORT')
        self.username = settings.get('RABBIT_USERNAME')
        self.password = settings.get('RABBIT_PASSWORD')
        self.channel_size = settings.get('CHANNEL_SIZE')
        self.virtual_host = settings.get('RABBIT_VIRTUAL_HOST')

    async def connect(self):
        # pass the configured port through; previously RABBIT_PORT was read but never used
        self.connection = await connect_robust(host=self.host, port=self.port, login=self.username,
                                               password=self.password, virtualhost=self.virtual_host,
                                               heartbeat=30, timeout=60)
        self.channel_pool = pool.Pool(self.connection.channel, max_size=self.channel_size)

    async def create_queue(self, queue: str):
        async with self.channel_pool.acquire() as channel:
            await channel.declare_queue(name=queue, durable=True, arguments={"x-max-priority": 10}, timeout=60)

    async def producer(self, queue: str, body: dict):
        ret = pickle.dumps(body)
        async with self.channel_pool.acquire() as channel:
            await channel.default_exchange.publish(
                Message(body=ret, delivery_mode=2, priority=body['retry_times']), routing_key=queue, timeout=60)

    async def consumer(self, spider, callback: Optional[Callable] = None, prefetch: int = 1):
        async with self.channel_pool.acquire() as channel:
            try:
                queue = await channel.declare_queue(name=spider.name, durable=True, passive=True, timeout=60)
            except ChannelNotFoundEntity:
                queue = await channel.declare_queue(name=spider.name, durable=True, arguments={"x-max-priority": 10},
                                                    timeout=60)
            if callback:
                await channel.set_qos(prefetch_count=prefetch)
                await queue.consume(callback=lambda incoming_message: callback(spider, incoming_message), timeout=60)
            else:
                return await queue.get(fail=False, timeout=60)

    async def queue_purge(self, queue: str):
        async with self.channel_pool.acquire() as channel:
            queue = await channel.declare_queue(name=queue, durable=True, passive=True, timeout=60)
            await queue.purge()

    async def delete_queue(self, queue: str):
        async with self.channel_pool.acquire() as channel:
            await channel.queue_delete(queue)

    async def get_message_count(self, queue: str):
        async with self.channel_pool.acquire() as channel:
            queue = await channel.declare_queue(name=queue, durable=True, passive=True, timeout=60)
            return queue.declaration_result.message_count

    async def close(self):
        await self.channel_pool.close()
        await self.connection.close()

--------------------------------------------------------------------------------
/RabbitSpider/default_settings.py:
--------------------------------------------------------------------------------
from os.path import abspath, join, dirname

TEMPLATE_DIR = abspath(join(dirname(__file__), 'templates/project'))

--------------------------------------------------------------------------------
/RabbitSpider/dupefilters/__init__.py:
--------------------------------------------------------------------------------
import json
import hashlib
from io import BytesIO
from urllib.parse import urlencode
from RabbitSpider.http.request import Request
from RabbitSpider.utils.control import SettingManager


class DupeFilter(object):
    def __init__(self, settings):
        self.settings: SettingManager = settings

    def request_fingerprint(self, request: Request):
        if isinstance(request.data, (dict, list, tuple)):
            body = urlencode(request.data).encode('utf-8')
        elif isinstance(request.data, str):
            body = request.data.encode('utf-8')
        elif isinstance(request.data, BytesIO):
            body = request.data.read()
        elif isinstance(request.data, bytes):
            body = request.data
        else:
            body = b""

        if request.json is not None:
            body = json.dumps(request.json, separators=(",", ":")).encode()

        sha1 = hashlib.sha1()
        if isinstance(request.params, (dict, list, tuple)):
            sha1.update(f'{request.url}?{urlencode(request.params)}'.encode('utf-8'))
        else:
            sha1.update(request.url.encode('utf-8'))
        sha1.update(request.method.encode('utf-8'))
        sha1.update(body)
        sha1.update(str(request.retry_times).encode('utf-8'))
        return sha1.hexdigest()

    def request_seen(self, request: Request) -> bool:
        pass

--------------------------------------------------------------------------------
/RabbitSpider/dupefilters/memoryfilter.py:
--------------------------------------------------------------------------------
from RabbitSpider.dupefilters import DupeFilter


class MemoryFilter(DupeFilter):
    def __init__(self, settings):
        super().__init__(settings)
        self.repeat = set()

    def request_seen(self, request):
        # True means the request has not been seen before and should be scheduled
        fingerprint = self.request_fingerprint(request)
        if fingerprint in self.repeat:
            return False
        else:
            self.repeat.add(fingerprint)
            return True
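A quick illustration of the dedupe contract: request_seen returns True for a request that should be scheduled, and retry_times is deliberately part of the fingerprint so retries survive deduplication.

from RabbitSpider import Request
from RabbitSpider.dupefilters.memoryfilter import MemoryFilter
from RabbitSpider.utils.control import SettingManager

f = MemoryFilter(SettingManager())
req = Request(url='https://example.com', params={'page': 1})
print(f.request_seen(req))  # True: first sighting, fingerprint recorded
print(f.request_seen(req))  # False: duplicate, the engine will not enqueue it
req.retry_times += 1
print(f.request_seen(req))  # True: a bumped retry_times yields a new fingerprint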

--------------------------------------------------------------------------------
/RabbitSpider/exceptions.py:
--------------------------------------------------------------------------------
class RabbitExpect(Exception):
    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        return self.msg

--------------------------------------------------------------------------------
/RabbitSpider/http/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/RabbitSpider/http/request.py:
--------------------------------------------------------------------------------
import re
from typing import Callable


class Request(object):
    def __init__(self,
                 url: str,
                 params: dict = None,
                 data: dict | str | bytes | None = None,
                 json: dict = None,
                 method: str = 'get',
                 headers: dict | None = None,
                 cookies: dict | None = None,
                 proxy: str | None = None,
                 timeout: int = 60,
                 allow_redirects: bool = True,
                 callback: str | Callable = 'parse',
                 retry_times: int = 0,
                 meta: dict | None = None
                 ):
        self._url = url
        self.params = params
        self.data = data
        self.json = json
        self.method = method
        self.headers = headers
        self.cookies = cookies
        self.proxy = proxy
        self.timeout = timeout
        self.allow_redirects = allow_redirects
        self._callback = callback
        self.retry_times = retry_times
        self._meta = meta

    @property
    def url(self):
        pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        if not re.match(pattern, self._url):
            raise ValueError(f'Please check that the url is valid: {self._url}')
        return self._url

    @url.setter
    def url(self, value):
        self._url = value

    @property
    def meta(self):
        return self._meta if self._meta else {}

    @property
    def callback(self):
        return self._callback.__name__ if callable(self._callback) else self._callback

    def to_dict(self):
        return {
            'url': self.url,
            'params': self.params,
            'data': self.data,
            'json': self.json,
            'method': self.method,
            'headers': self.headers,
            'cookies': self.cookies,
            'proxy': self.proxy,
            'timeout': self.timeout,
            'allow_redirects': self.allow_redirects,
            'callback': self.callback,
            'meta': self.meta,
            'retry_times': self.retry_times
        }
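For example, a fully specified request; to_dict() is exactly the payload that gets pickled into RabbitMQ and rebuilt on the consumer side (the URL is a placeholder):

from RabbitSpider import Request

req = Request(url='https://example.com/list',
              method='post',
              json={'page': 1},
              callback='parse_list',  # name of a spider coroutine; a bound method works too
              meta={'page': 1},       # carried through the queue to the callback
              timeout=30)
print(req.to_dict())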

--------------------------------------------------------------------------------
/RabbitSpider/http/response.py:
--------------------------------------------------------------------------------
import re
import json
import chardet
import parsel
from w3lib.encoding import http_content_type_encoding, html_to_unicode


class Response:
    def __init__(self, status_code, headers, cookies, charset, content):
        self.content = content
        self.charset = charset
        self.status_code = status_code
        self.headers = {k: v for k, v in headers.items()}
        self.cookies = {k: v for k, v in cookies.items()}
        self.__r = parsel.Selector(self.text)

    @property
    def text(self):
        if not self.content:
            return ''
        if self.charset:
            try:
                text = self.content.decode(self.charset)
            except UnicodeDecodeError:
                try:
                    benc = http_content_type_encoding(self.headers['Content-Type'])
                    if benc:
                        charset = 'charset=%s' % benc
                        text = html_to_unicode(charset, self.content)[1]
                    else:
                        # construct the exception properly (a bare `raise UnicodeDecodeError`
                        # would itself fail) so the outer handler can catch it
                        raise UnicodeDecodeError(self.charset, self.content, 0, len(self.content),
                                                 'no encoding in Content-Type')
                except (UnicodeDecodeError, KeyError):
                    try:
                        char = chardet.detect(self.content)
                        if char and char.get('encoding'):
                            text = self.content.decode(char['encoding'])
                        else:
                            raise UnicodeDecodeError('chardet', self.content, 0, len(self.content),
                                                     'no encoding detected')
                    except UnicodeDecodeError:
                        try:
                            text = self.content.decode('utf-8')
                        except UnicodeDecodeError:
                            try:
                                text = self.content.decode("gb18030")
                            except UnicodeDecodeError:
                                text = self.content.decode('utf-8', "ignore")
        else:
            try:
                text = self.content.decode('utf-8')
            except UnicodeDecodeError:
                try:
                    char = chardet.detect(self.content)
                    if char and char.get('encoding'):
                        text = self.content.decode(char['encoding'])
                    else:
                        raise UnicodeDecodeError('chardet', self.content, 0, len(self.content),
                                                 'no encoding detected')
                except UnicodeDecodeError:
                    try:
                        text = self.content.decode('gb18030')
                    except UnicodeDecodeError:
                        text = self.content.decode('utf-8', "ignore")
        return text

    @property
    def json(self):
        result = re.findall(r'[.*?(]?(\[?{.*}]?)[).*]?', self.text, re.DOTALL)
        if result:
            return json.loads(result[0], strict=False)

    def xpath(self, x):
        return self.__r.xpath(x)

    def css(self, x):
        return self.__r.css(x)
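A small usage sketch of the selector helpers:

from RabbitSpider import Response

resp = Response(200, {}, {}, 'utf-8',
                b'<html><body><a href="/next">more</a></body></html>')
print(resp.xpath('//a/@href').get())  # '/next'
print(resp.css('a::text').get())      # 'more'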

--------------------------------------------------------------------------------
/RabbitSpider/items/__init__.py:
--------------------------------------------------------------------------------
from typing import Dict
from collections import defaultdict


class ItemMeta(type):
    def __new__(mcs, name, bases, attrs):
        field: Dict[str, dict] = defaultdict(dict)
        for key, value in attrs.items():
            if not (callable(value) or key.startswith('_')):
                field[key]['value'] = value
            if key == '__annotations__':
                for k, v in value.items():
                    field[k]['annotation'] = v
        cls_instance = super().__new__(mcs, name, bases, attrs)
        cls_instance.FIELDS = field
        return cls_instance

--------------------------------------------------------------------------------
/RabbitSpider/items/item.py:
--------------------------------------------------------------------------------
from RabbitSpider.items import ItemMeta


class BaseItem(metaclass=ItemMeta):
    def __init__(self):
        self._values = {}
        for k, v in self.FIELDS.items():
            if v.get('value') is not None:
                self._values[k] = v['value']

    def __setitem__(self, key, value):
        if key in self.FIELDS:
            annotation = self.FIELDS[key].get('annotation')
            if annotation:
                if isinstance(value, annotation):
                    self._values[key] = value
                else:
                    raise TypeError(f"{value} is not type {annotation}")
            else:
                self._values[key] = value
        else:
            raise KeyError(f'field {key} undefined')

    def __getitem__(self, item):
        return self._values[item]

    def __contains__(self, item):
        return item in self._values

    def __iter__(self):
        return iter(self._values)

    def __len__(self) -> int:
        return len(self._values)

    def __delitem__(self, v):
        del self._values[v]

    def __setattr__(self, key, value):
        if not key.startswith('_'):
            raise AttributeError(f'use item[{key!r}] = value to assign fields')
        else:
            super().__setattr__(key, value)

    def __getattribute__(self, item):
        field = super().__getattribute__('FIELDS')
        if item in field:
            raise AttributeError(f'use item[{item!r}] to read fields')
        else:
            return super(BaseItem, self).__getattribute__(item)

    def to_dict(self):
        return self._values
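Fields are declared as annotated class attributes, and __setitem__ enforces the annotation; for example:

from RabbitSpider import BaseItem


class ProductItem(BaseItem):
    title: str = None
    price: float = None


item = ProductItem()
item['title'] = 'rabbit'
item['price'] = 9.9
# item['price'] = '9.9' would raise TypeError: str is not float
print(item.to_dict())  # {'title': 'rabbit', 'price': 9.9}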

--------------------------------------------------------------------------------
/RabbitSpider/middlewares/__init__.py:
--------------------------------------------------------------------------------
from RabbitSpider import Request
from RabbitSpider import Response
from RabbitSpider.utils.log import Logger
from RabbitSpider.utils.control import SettingManager


class BaseMiddleware:
    def __init__(self, settings):
        self.logger = Logger(settings)
        self.settings: SettingManager = settings

    async def process_request(self, request, spider) -> None | Request | Response:
        """Request pre-processing"""
        pass

    async def process_response(self, request, response, spider) -> Request | Response:
        """Response pre-processing"""
        pass

    async def process_exception(self, request, exc, spider) -> None | Request | Response:
        """Exception pre-processing"""
        pass

--------------------------------------------------------------------------------
/RabbitSpider/middlewares/allow_http_code.py:
--------------------------------------------------------------------------------
from RabbitSpider.middlewares import BaseMiddleware


class AllowHttpCodeMiddleware(BaseMiddleware):
    def __init__(self, settings):
        super().__init__(settings)
        self.allow_http_code = settings.getlist('ALLOW_HTTP_CODES')

    async def process_response(self, request, response, spider):
        if response.status_code not in self.allow_http_code:
            self.logger.error(f'{request.to_dict()}, disallowed status code: {response.status_code}', spider.name)
            return True

--------------------------------------------------------------------------------
/RabbitSpider/middlewares/download_delay.py:
--------------------------------------------------------------------------------
import asyncio
import random

from RabbitSpider.middlewares import BaseMiddleware


class DownloadDelayMiddleware(BaseMiddleware):
    def __init__(self, settings):
        super().__init__(settings)
        self.download_delay = settings.get('DOWNLOAD_DELAY')

    async def process_request(self, request, spider):
        if self.download_delay:
            delay = random.uniform(self.download_delay[0], self.download_delay[1])
            await asyncio.sleep(delay)

--------------------------------------------------------------------------------
/RabbitSpider/middlewares/retry.py:
--------------------------------------------------------------------------------
from RabbitSpider.middlewares import BaseMiddleware


class RetryMiddleware(BaseMiddleware):
    def __init__(self, settings):
        super().__init__(settings)
        self.retry_http_code = settings.getlist('RETRY_HTTP_CODES')
        self.retry_exceptions = settings.getlist('RETRY_EXCEPTIONS')
        self.max_retry = settings.get('MAX_RETRY')

    async def process_response(self, request, response, spider):
        if response.status_code in self.retry_http_code:
            if request.retry_times < self.max_retry:
                request.retry_times += 1
                return request
            else:
                self.logger.warning(f'Dropping {request.to_dict()}, status code: {response.status_code}', spider.name)
                return True

    async def process_exception(self, request, exc, spider):
        if exc.__class__.__name__ in self.retry_exceptions:
            if request.retry_times < self.max_retry:
                request.retry_times += 1
                return request
            else:
                self.logger.warning(f'Dropping {request.to_dict()}, exception: {repr(exc)}', spider.name)
                return True
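A minimal sketch of a project-level middleware (the proxy URL is a placeholder). Returning None lets processing continue, returning a Request re-queues it, and any other truthy value drops the message:

from RabbitSpider.middlewares import BaseMiddleware


class ProxyMiddleware(BaseMiddleware):
    async def process_request(self, request, spider):
        # attach a proxy before the downloader runs
        request.proxy = 'http://127.0.0.1:7890'

Register it by adding its dotted path to MIDDLEWARES in settings.py.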

--------------------------------------------------------------------------------
/RabbitSpider/pipelines/__init__.py:
--------------------------------------------------------------------------------
from RabbitSpider.utils.log import Logger
from RabbitSpider.utils.control import SettingManager


class BasePipeline(object):
    def __init__(self, settings):
        self.logger = Logger(settings)
        self.settings: SettingManager = settings

    async def open_spider(self):
        """Initialise storage connections"""
        pass

    async def process_item(self, item, spider):
        """Persistence logic"""
        pass

    async def close_spider(self):
        """Close connections"""
        pass
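A minimal concrete pipeline sketch that appends items to a JSON-lines file (the file name is arbitrary); open_spider and close_spider run once per Engine lifetime:

import json
from RabbitSpider.pipelines import BasePipeline


class JsonLinesPipeline(BasePipeline):
    async def open_spider(self):
        self.file = open('items.jsonl', 'a', encoding='utf-8')

    async def process_item(self, item, spider):
        self.file.write(json.dumps(item.to_dict(), ensure_ascii=False) + '\n')

    async def close_spider(self):
        self.file.close()

Register it by adding its dotted path to ITEM_PIPELINES in settings.py.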

--------------------------------------------------------------------------------
/RabbitSpider/rabbit_execute.py:
--------------------------------------------------------------------------------
import os
import re
import sys
import time
import asyncio
from typing import Type, List
from croniter import croniter
from datetime import datetime
from RabbitSpider.spider import Spider
from RabbitSpider.core.engine import Engine
from RabbitSpider.utils.control import TaskManager, SettingManager
from importlib.util import spec_from_file_location, module_from_spec


async def go(spider_cls: Type[Spider], mode: str = 'auto', task_count: int = 1):
    settings = SettingManager()
    for i in sys.argv[1:]:
        key, value = i.split('=')
        if key == 'mode':
            mode = value
        if key == 'task_count':
            task_count = int(value)
    settings.set('MODE', mode)
    settings.set('TASK_COUNT', task_count)
    settings.set('CHANNEL_SIZE', task_count * 2)
    async with Engine(settings) as engine:
        await engine.start(spider_cls())


async def batch_go(spiders: List[Type[Spider]], task_count: int = 10):
    settings = SettingManager()
    settings.set('MODE', 'auto')
    settings.set('TASK_COUNT', task_count)
    settings.set('CHANNEL_SIZE', task_count * 2)
    task_group: TaskManager = TaskManager(task_count)
    async with Engine(settings) as engine:
        for spider_cls in spiders:
            await task_group.semaphore.acquire()
            task_group.create_task(engine.start(spider_cls()))
        while True:
            if task_group.all_done():
                break
            else:
                await asyncio.sleep(1)


def runner(spider_dir, task_pool, cron_expr):
    spider_classes = []
    loop = asyncio.get_event_loop()
    spider_path = os.path.join('spiders', spider_dir)
    sys.path.extend([os.path.abspath('.'), os.path.abspath('..')])

    for script_name in os.listdir(spider_path):
        if script_name.endswith('.py') and not script_name.startswith('__'):
            script_path = os.path.join(spider_path, script_name)
            with open(script_path, 'r', encoding='utf-8') as file:
                class_name = re.findall(r'class\s+(\w+)\s*\(\w+\)', file.read())[0]
            spec = spec_from_file_location(class_name, script_path)
            module = module_from_spec(spec)
            spec.loader.exec_module(module)
            spider_classes.append(getattr(module, class_name))

    if croniter.is_valid(cron_expr):
        cron_schedule = croniter(cron_expr, datetime.now())
        next_run_time = cron_schedule.get_next(datetime)
        print(f'Next run time: {next_run_time}')
        while True:
            now_time = datetime.now().replace(second=0, microsecond=0)
            if now_time == next_run_time:
                loop.run_until_complete(batch_go(spider_classes, task_pool))
                if next_run_time <= now_time:
                    next_run_time = cron_schedule.get_next(datetime)
                    print(f'Next run time: {next_run_time}')
            else:
                time.sleep(5)
    else:
        loop.run_until_complete(batch_go(spider_classes, task_pool))
        loop.close()

--------------------------------------------------------------------------------
/RabbitSpider/spider/__init__.py:
--------------------------------------------------------------------------------
from asyncio import CancelledError
from typing import AsyncGenerator, Union
from curl_cffi.requests import AsyncSession
from RabbitSpider.utils import event
from RabbitSpider import Request, Response, BaseItem
from RabbitSpider.utils.subscriber import Subscriber


class Spider(object):
    name: str

    def __init__(self):
        self.subscriber = Subscriber()
        self.session = AsyncSession(verify=False)
        self.subscriber.subscribe(self.spider_opened, event.spider_opened)
        self.subscriber.subscribe(self.spider_closed, event.spider_closed)
        self.subscriber.subscribe(self.spider_error, event.spider_error)
        self.subscriber.subscribe(self.request_received, event.request_received)
        self.subscriber.subscribe(self.response_received, event.response_received)
        self.subscriber.subscribe(self.item_scraped, event.item_scraped)

    async def start_requests(self) -> AsyncGenerator[Request, None]:
        """Initial requests"""
        raise NotImplementedError

    async def parse(self, request: Request, response: Response) -> AsyncGenerator[Union[Request, BaseItem, None], None]:
        """Default callback"""
        pass

    async def spider_opened(self) -> None:
        """Fired when the spider starts"""
        pass

    async def spider_closed(self) -> None:
        """Fired when the spider closes"""
        pass

    async def spider_error(self, error: Exception | CancelledError) -> None:
        """Fired when the spider raises an error"""
        pass

    async def request_received(self, request: Request) -> None:
        """Fired when a request is issued"""
        pass

    async def response_received(self, response: Response) -> None:
        """Fired when a response is received"""
        pass

    async def item_scraped(self, item: BaseItem) -> None:
        """Fired when an item is scraped"""
        pass
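A sketch of chaining callbacks across queue hops (URLs are placeholders). The callback travels through RabbitMQ as a string, so it must name a coroutine defined on the spider:

import asyncio
from RabbitSpider import go, Request
from RabbitSpider.spider import Spider


class ListSpider(Spider):
    name = 'list_spider'

    async def start_requests(self):
        yield Request(url='https://example.com/list')

    async def parse(self, request, response):
        # fan out: each detail request is routed to parse_detail when consumed
        for href in response.xpath('//a/@href').getall():
            yield Request(url=f'https://example.com{href}',
                          callback='parse_detail',
                          meta={'referer': request.url})

    async def parse_detail(self, request, response):
        print(request.meta['referer'], response.status_code)


if __name__ == '__main__':
    asyncio.run(go(ListSpider, 'auto', 5))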

--------------------------------------------------------------------------------
/RabbitSpider/templates/project/__init__.py:
--------------------------------------------------------------------------------
default_path = __path__[0]

--------------------------------------------------------------------------------
/RabbitSpider/templates/project/items.tmpl:
--------------------------------------------------------------------------------
from RabbitSpider import BaseItem


class Item(BaseItem):
    # field: str = None
    pass

--------------------------------------------------------------------------------
/RabbitSpider/templates/project/middlewares.tmpl:
--------------------------------------------------------------------------------
from RabbitSpider.middlewares import BaseMiddleware


class Middleware(BaseMiddleware):

    async def process_request(self, request, spider):
        """Request pre-processing"""
        pass

    async def process_response(self, request, response, spider):
        """Response pre-processing"""
        pass

    async def process_exception(self, request, exc, spider):
        """Exception pre-processing"""
        pass

--------------------------------------------------------------------------------
/RabbitSpider/templates/project/pipelines.tmpl:
--------------------------------------------------------------------------------
from RabbitSpider.pipelines import BasePipeline


class Pipeline(BasePipeline):
    async def open_spider(self):
        """Initialise storage connections"""
        pass

    async def process_item(self, item, spider):
        """Persistence logic"""
        self.logger.info(item.to_dict(), spider.name)

    async def close_spider(self):
        """Close connections"""
        pass

--------------------------------------------------------------------------------
/RabbitSpider/templates/project/settings.tmpl:
--------------------------------------------------------------------------------
from ${project} import default_path

BOT_DIR = default_path

# Rabbitmq
RABBIT_HOST = '127.0.0.1'
RABBIT_PORT = 5672
RABBIT_USERNAME = 'yuntom'
RABBIT_PASSWORD = '123456'
RABBIT_VIRTUAL_HOST = '/'

# In-memory dedupe filter
DUPEFILTER_CLASS = 'RabbitSpider.dupefilters.memoryfilter.MemoryFilter'

# Middlewares
MIDDLEWARES = [
    'RabbitSpider.middlewares.allow_http_code.AllowHttpCodeMiddleware',
    'RabbitSpider.middlewares.retry.RetryMiddleware',
    'RabbitSpider.middlewares.download_delay.DownloadDelayMiddleware',
    '${project}.middlewares.Middleware',
]

# Pipelines
ITEM_PIPELINES = ['${project}.pipelines.Pipeline']

# Logging
# LOG_LEVEL = 'WARNING'
# LOG_FILE = './rabbit_log'

# Download delay
# DOWNLOAD_DELAY = (1, 3)

# Maximum number of retries
MAX_RETRY = 5
# Status codes to retry
RETRY_HTTP_CODES = []
# Exceptions to retry
RETRY_EXCEPTIONS = ['RequestsError', 'IncompleteRead', 'DNSError', 'ConnectionError', 'Timeout']
# Status codes allowed through
ALLOW_HTTP_CODES = [200]

--------------------------------------------------------------------------------
/RabbitSpider/templates/project/spiders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YunTom/RabbitSpider/469429964225148d2386ddcdd3b32b580e0051d3/RabbitSpider/templates/project/spiders/__init__.py

--------------------------------------------------------------------------------
/RabbitSpider/templates/project/spiders/src/basic.tmpl:
--------------------------------------------------------------------------------
import os
import asyncio
from RabbitSpider import go
from RabbitSpider import Request
from RabbitSpider.spider import Spider


class ${classname}(Spider):
    name = os.path.basename(__file__).split('.')[0]

    async def start_requests(self):
        pass

    async def parse(self, request, response):
        pass


if __name__ == '__main__':
    asyncio.run(go(${classname}, 'auto', 1))

--------------------------------------------------------------------------------
/RabbitSpider/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YunTom/RabbitSpider/469429964225148d2386ddcdd3b32b580e0051d3/RabbitSpider/utils/__init__.py

--------------------------------------------------------------------------------
/RabbitSpider/utils/cmdline.py:
--------------------------------------------------------------------------------
import argparse
from RabbitSpider.rabbit_execute import runner
from RabbitSpider.utils.template import template_to_file


def execute():
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='command', required=True, help='available subcommands')

    create_parser = subparsers.add_parser('create', help='create a new spider project')
    create_parser.add_argument('project', help='project name')
    create_parser.add_argument('directory', help='directory')
    create_parser.add_argument('filename', help='spider file name')

    run_parser = subparsers.add_parser('run', help='run a spider project')
    run_parser.add_argument('directory', help='directory')
    run_parser.add_argument('-p', '--task_pool', type=int, default=10, help='concurrency')
    run_parser.add_argument('-t', '--cron_expression', type=str, default='', help='crontab expression')
    args = parser.parse_args()
    if args.command == 'create':
        template_to_file(args.project, directory=args.directory, filename=args.filename)
    elif args.command == 'run':
        runner(args.directory, args.task_pool, args.cron_expression)

--------------------------------------------------------------------------------
/RabbitSpider/utils/control.py:
--------------------------------------------------------------------------------
import asyncio
from collections import defaultdict
from importlib import import_module
from asyncio import Task, Future, Semaphore
from RabbitSpider import Request, Response
from RabbitSpider import default_settings
from typing import Final, Dict, List, Callable
from RabbitSpider.core.download import CurlDownload


def load_class(_path):
    if not isinstance(_path, str):
        if callable(_path):
            return _path
        else:
            raise TypeError(f"args expected string or object, got {type(_path)}")
    module, name = _path.rsplit('.', 1)
    mod = import_module(module)
    try:
        cls = getattr(mod, name)
    except AttributeError:
        raise NameError(f"Module {module!r} doesn't define any object named {name!r}")

    return cls


class SettingManager(object):
    def __init__(self):
        try:
            settings = import_module('settings')
        except ModuleNotFoundError:
            settings = default_settings
        self.attribute = {}
        for key in dir(settings):
            if key.isupper():
                self.attribute[key] = getattr(settings, key)

    def __setitem__(self, key, value):
        self.attribute[key] = value

    def __getitem__(self, key):
        return self.attribute.get(key)

    def __delitem__(self, key):
        del self.attribute[key]

    def get(self, key, value=None):
        return self[key] if self[key] else value

    def getlist(self, key):
        return self[key] if self[key] else []

    def set(self, key, value):
        self[key] = value

    def update(self, custom_settings):
        self.attribute.update(custom_settings)


class TaskManager(object):
    def __init__(self, task_count: int):
        self.current_task: Final[set] = set()
        self.semaphore = Semaphore(task_count)

    def create_task(self, coroutine) -> Task:
        task = asyncio.create_task(coroutine)
        self.current_task.add(task)

        def done_callback(_fut: Future):
            self.current_task.remove(task)
            self.semaphore.release()

        task.add_done_callback(done_callback)
        return task

    def all_done(self):
        return len(self.current_task) == 0


class PipelineManager(object):
    def __init__(self, settings):
        self.settings = settings
        self.methods: Dict[str, List[Callable]] = defaultdict(list)
        self._add_pipe(settings.getlist('ITEM_PIPELINES'))

    def _add_pipe(self, pipelines):
        for pipeline in pipelines:
            pipeline_obj = load_class(pipeline)(self.settings)
            if hasattr(pipeline_obj, 'open_spider'):
                self.methods['open_spider'].append(getattr(pipeline_obj, 'open_spider'))
            if hasattr(pipeline_obj, 'process_item'):
                self.methods['process_item'].append(getattr(pipeline_obj, 'process_item'))
            if hasattr(pipeline_obj, 'close_spider'):
                self.methods['close_spider'].append(getattr(pipeline_obj, 'close_spider'))

    async def open_spider(self):
        for method in self.methods['open_spider']:
            await method()

    async def process_item(self, req, spider):
        for method in self.methods['process_item']:
            await method(req, spider)

    async def close_spider(self):
        for method in self.methods['close_spider']:
            await method()


class MiddlewareManager(object):
    def __init__(self, settings):
        self.settings = settings
        self.download = CurlDownload()
        self.methods: Dict[str, List[Callable]] = defaultdict(list)
        self._add_middleware(settings.getlist('MIDDLEWARES'))

    def _add_middleware(self, middlewares):
        for middleware in middlewares:
            middleware_obj = load_class(middleware)(self.settings)
            if hasattr(middleware_obj, 'process_request'):
                self.methods['process_request'].append(getattr(middleware_obj, 'process_request'))
            if hasattr(middleware_obj, 'process_response'):
                self.methods['process_response'].append(getattr(middleware_obj, 'process_response'))
            if hasattr(middleware_obj, 'process_exception'):
                self.methods['process_exception'].append(getattr(middleware_obj, 'process_exception'))

    async def process_request(self, spider, request):
        for method in self.methods['process_request']:
            result = await method(request, spider)
            if isinstance(result, (Request, Response)):
                return result
            if result:
                break
        else:
            return await self.download.fetch(spider.session, request.to_dict())

    async def process_response(self, spider, request, response):
        for method in reversed(self.methods['process_response']):
            result = await method(request, response, spider)
            if isinstance(result, (Request, Response)):
                return result
            if result:
                break
        else:
            return response

    async def process_exception(self, spider, request, exc):
        for method in self.methods['process_exception']:
            result = await method(request, exc, spider)
            if isinstance(result, (Request, Response)):
                return result
            if result:
                break
        else:
            raise exc

    async def send(self, spider, request: Request):
        try:
            resp = await self.process_request(spider, request)
        except Exception as exc:
            resp = await self.process_exception(spider, request, exc)
        if isinstance(resp, Response):
            resp = await self.process_response(spider, request, resp)
        if isinstance(resp, Request):
            return request, None
        if not resp:
            return None, None
        return request, resp


class FilterManager(object):
    def __init__(self, settings):
        filter_cls = settings.get('DUPEFILTER_CLASS')
        if filter_cls:
            self.filter_obj = load_class(filter_cls)(settings)
        else:
            self.filter_obj = None

    def request_seen(self, request: Request) -> bool:
        if self.filter_obj:
            result = self.filter_obj.request_seen(request)
            return result
        else:
            return True
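load_class resolves the dotted paths used throughout the settings; for example:

from RabbitSpider.utils.control import SettingManager, load_class

settings = SettingManager()  # falls back to default_settings when no settings module is importable
settings.set('DUPEFILTER_CLASS', 'RabbitSpider.dupefilters.memoryfilter.MemoryFilter')
print(load_class(settings.get('DUPEFILTER_CLASS')).__name__)  # MemoryFilter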

--------------------------------------------------------------------------------
/RabbitSpider/utils/event.py:
--------------------------------------------------------------------------------
# fired when the spider starts
spider_opened = 'spider_opened'
# fired when the spider closes
spider_closed = 'spider_closed'
# fired on spider error
spider_error = 'spider_error'
# fired when a request is issued
request_received = 'request_received'
# fired when a response is received
response_received = 'response_received'
# fired when an item is scraped
item_scraped = 'item_scraped'

--------------------------------------------------------------------------------
/RabbitSpider/utils/log.py:
--------------------------------------------------------------------------------
import os
import sys
from loguru import logger


class Logger(object):
    def __init__(self, settings):
        logger.remove()
        log_path = os.path.join(settings.get('BOT_DIR'), settings.get('LOG_FILE')) if settings.get(
            'LOG_FILE') and settings.get('LOG_FILE').startswith('.') else settings.get('LOG_FILE')
        if log_path:
            logger.add("%s/rabbit_{time:YYYY-MM-DD}.log" % log_path,
                       level=settings.get('LOG_LEVEL', 'ERROR'),
                       rotation="1 day",
                       retention="1 week",
                       format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {extra[scope]} | {message}")

        logger.add(sink=sys.stdout,
                   colorize=True,
                   level='INFO',
                   format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {extra[scope]} | {message}")

        self._logger = logger

    def info(self, msg, scope='RabbitSpider'):
        self._logger = logger.bind(scope=scope)
        self._logger.info(msg)

    def warning(self, msg, scope='RabbitSpider'):
        self._logger = logger.bind(scope=scope)
        self._logger.warning(msg)

    def error(self, msg, scope='RabbitSpider'):
        self._logger = logger.bind(scope=scope)
        self._logger.error(msg)

    def exception(self, msg, scope='RabbitSpider'):
        self._logger = logger.bind(scope=scope)
        self._logger.exception(msg)

--------------------------------------------------------------------------------
/RabbitSpider/utils/subscriber.py:
--------------------------------------------------------------------------------
import asyncio
from collections import defaultdict
from typing import Dict, Set, Callable


class Subscriber(object):
    def __init__(self):
        self._subscriber: Dict[str, Set[Callable]] = defaultdict(set)

    def subscribe(self, receiver: Callable, event: str):
        self._subscriber[event].add(receiver)

    def unsubscribe(self, receiver: Callable, event: str):
        self._subscriber[event].discard(receiver)

    async def notify(self, event: str, *args, **kwargs):
        await asyncio.gather(*[receiver(*args, **kwargs) for receiver in self._subscriber[event]])
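For example, wiring an extra receiver to one of the event names from utils/event.py:

import asyncio
from RabbitSpider.utils import event
from RabbitSpider.utils.subscriber import Subscriber


async def on_open():
    print('spider opened')


sub = Subscriber()
sub.subscribe(on_open, event.spider_opened)
asyncio.run(sub.notify(event.spider_opened))  # prints 'spider opened'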

--------------------------------------------------------------------------------
/RabbitSpider/utils/template.py:
--------------------------------------------------------------------------------
import os
import shutil
from string import Template
from RabbitSpider.utils.control import SettingManager

settings = SettingManager()


def tmpl_file_path(_path):
    for i in os.listdir(_path):
        if os.path.isfile(os.path.join(_path, i)):
            if i.endswith('tmpl'):
                yield os.path.join(_path, i)
        if os.path.isdir(os.path.join(_path, i)):
            for i in tmpl_file_path(os.path.join(_path, i)):
                yield i


def template_to_file(project, directory, filename):
    if project.lower() == 'test':
        print(f'The project cannot be named {project}')
        return
    try:
        shutil.copytree(settings.get('TEMPLATE_DIR'), project)
    except FileExistsError:
        if not os.path.exists(os.path.join(project, 'spiders', directory)):
            os.mkdir(os.path.abspath(os.path.join(project, 'spiders', directory)))

        if os.path.exists(os.path.join(project, 'spiders', directory, f'{filename}.py')):
            print(f'{project}/spiders/{filename} already exists')
            return
        shutil.copy(os.path.abspath(os.path.join(settings.get('TEMPLATE_DIR'), 'spiders/src/basic.tmpl')),
                    os.path.join(project, 'spiders', directory))
    for file in tmpl_file_path(project):
        with open(file, 'r', encoding='utf-8') as f:
            text = Template(f.read()).substitute(project=project, dir=directory, spider=filename,
                                                 classname='TemplateSpider')
        with open(file.replace('tmpl', 'py'), 'w', encoding='utf-8') as f:
            f.write(text)
        os.remove(file)
    if not os.path.exists(os.path.join(project, 'spiders', directory)):
        os.rename(os.path.abspath(os.path.join(project, 'spiders', 'src')),
                  os.path.abspath(os.path.join(project, 'spiders', directory)))
    os.rename(os.path.join(project, 'spiders', directory, 'basic.py'),
              os.path.join(project, 'spiders', directory, f'{filename}.py'))
    print(f'{project}/{directory}/{filename} created')

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools

setuptools.setup(
    name='RabbitSpider',
    version='2.7.7',
    author='一纸',
    author_email='2395396520@qq.com',
    url='https://github.com/YunTom/RabbitSpider/tree/master',
    packages=['RabbitSpider', 'RabbitSpider.core', 'RabbitSpider.dupefilters', 'RabbitSpider.http',
              'RabbitSpider.spider', 'RabbitSpider.items', 'RabbitSpider.middlewares', 'RabbitSpider.pipelines',
              'RabbitSpider.utils'],
    include_package_data=True,
    entry_points={
        'console_scripts': [
            'rabbit = RabbitSpider.utils.cmdline:execute',
        ],
    },
    python_requires='>=3.10',
    install_requires=[
        'aio-pika>=9.4.1',
        'curl_cffi>=0.6.2',
        'loguru>=0.7.2',
        'parsel>=1.9.1',
        'w3lib>=2.1.2',
        'chardet>=5.2.0',
        'croniter>=2.0.5'
    ],
)

--------------------------------------------------------------------------------