├── .idea
│   ├── .gitignore
│   ├── RabbitSpider.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── MANIFEST.in
├── README.md
├── RabbitSpider
│   ├── __init__.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── download.py
│   │   ├── engine.py
│   │   └── scheduler.py
│   ├── default_settings.py
│   ├── dupefilters
│   │   ├── __init__.py
│   │   └── memoryfilter.py
│   ├── exceptions.py
│   ├── http
│   │   ├── __init__.py
│   │   ├── request.py
│   │   └── response.py
│   ├── items
│   │   ├── __init__.py
│   │   └── item.py
│   ├── middlewares
│   │   ├── __init__.py
│   │   ├── allow_http_code.py
│   │   ├── download_delay.py
│   │   └── retry.py
│   ├── pipelines
│   │   └── __init__.py
│   ├── rabbit_execute.py
│   ├── spider
│   │   └── __init__.py
│   ├── templates
│   │   └── project
│   │       ├── __init__.py
│   │       ├── items.tmpl
│   │       ├── middlewares.tmpl
│   │       ├── pipelines.tmpl
│   │       ├── settings.tmpl
│   │       └── spiders
│   │           ├── __init__.py
│   │           └── src
│   │               └── basic.tmpl
│   └── utils
│       ├── __init__.py
│       ├── cmdline.py
│       ├── control.py
│       ├── event.py
│       ├── log.py
│       ├── subscriber.py
│       └── template.py
└── setup.py
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.idea/RabbitSpider.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include RabbitSpider/templates *
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | A distributed, coroutine-based crawler framework that uses RabbitMQ as its message queue; its structure and usage are similar to Scrapy.
2 |
3 | Supports running tasks in batches. Run modes: auto produces first, then consumes (suited to a single machine; the task shuts down automatically when finished); m only produces; w only consumes (keeps listening for tasks). Scheduled batch runs by directory are also supported.
4 |
5 | The downloader is built on curl_cffi and supports changing the HTTP version and TLS fingerprint.
6 |
7 | pip install RabbitSpider==2.7.7
8 |
9 | Create a project from the command line:
10 | rabbit create [project_name] [directory_name] [spider_filename]
11 |
12 | Scheduled batch runs by directory:
13 | cd [project_name]
14 | rabbit run [directory_name] -p 20 -t "*/10 * * * *"
15 |
16 | -p (optional, default 10) how many spiders run concurrently per batch
17 | -t (optional) crontab expression
18 |
19 | The spider project template is generated automatically,
20 | e.g.: rabbit create shop xxx mama
21 |
22 | import asyncio
23 | from RabbitSpider import go
24 | from RabbitSpider import Request
25 | from RabbitSpider.spider import Spider
26 |
27 |
28 | class MamaSpider(Spider):
29 | name = '_'.join(__file__.replace('\\', '/').rsplit('/')[-2:]).split('.')[0]
30 | custom_settings = {}
31 |
32 | async def start_requests(self):
33 | yield Request(url='https://www.baidu.com')
34 |
35 | async def parse(self, request, response):
36 | pass
37 |
38 |
39 | if __name__ == '__main__':
40 | asyncio.run(go(MamaSpider, 'auto', 1))
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
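The README example above runs a single spider with `go`. To run several spiders from code rather than through the `rabbit run` command, `batch_go` (exported from `RabbitSpider/__init__.py` below) accepts a list of spider classes; a minimal sketch, where the two spider imports are hypothetical project spiders:

import asyncio
from RabbitSpider import batch_go
from shop.spiders.xxx.mama import MamaSpider   # hypothetical spider modules
from shop.spiders.xxx.papa import PapaSpider

if __name__ == '__main__':
    # every spider runs in 'auto' mode, at most 10 concurrent tasks overall
    asyncio.run(batch_go([MamaSpider, PapaSpider], task_count=10))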
/RabbitSpider/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from RabbitSpider.items.item import BaseItem
4 | from RabbitSpider.http.request import Request
5 | from RabbitSpider.http.response import Response
6 | from RabbitSpider.rabbit_execute import go, batch_go
7 |
8 |
9 | sys.path.append(os.path.abspath(os.path.join(os.path.abspath(sys.argv[0]), '..')))
10 | sys.path.append(os.path.abspath(os.path.join(os.path.abspath(sys.argv[0]), '../..')))
11 | sys.path.append(os.path.abspath(os.path.join(os.path.abspath(sys.argv[0]), '../../..')))
12 | sys.path.append(os.path.abspath(os.path.join(os.path.abspath(sys.argv[0]), '../../../..')))
13 |
14 | logo = r"""
15 | ____ __ __ _ __ _____ _ __
16 | / __ \ ____ _ / /_ / /_ (_) / /_ / ___/ ____ (_) ____/ / ___ _____
17 | / /_/ / / __ `/ / __ \ / __ \ / / / __/ \__ \ / __ \ / / / __ / / _ \ / ___/
18 | / _, _/ / /_/ / / /_/ / / /_/ / / / / /_ ___/ / / /_/ / / / / /_/ / / __/ / /
19 | /_/ |_| \__,_/ /_.___/ /_.___/ /_/ \__/ /____/ / .___/ /_/ \__,_/ \___/ /_/
20 | /_/
21 | """
22 |
23 | __all__ = ['Request', 'Response', 'BaseItem', 'go', 'batch_go']
24 | __author__ = '一纸'
25 | __email__ = '2395396520@qq.com'
26 | __version__ = '2.7.7'
27 |
28 | sys.stdout.write(f'\033[0;35;1m{logo}\033[0m')
29 |
--------------------------------------------------------------------------------
/RabbitSpider/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YunTom/RabbitSpider/469429964225148d2386ddcdd3b32b580e0051d3/RabbitSpider/core/__init__.py
--------------------------------------------------------------------------------
/RabbitSpider/core/download.py:
--------------------------------------------------------------------------------
1 | from RabbitSpider import Response
2 | from curl_cffi import CurlHttpVersion
3 | from RabbitSpider.exceptions import RabbitExpect
4 |
5 |
6 | class CurlDownload(object):
7 | def __init__(self):
8 | self.impersonate = 'chrome120'
9 | self.http_version = CurlHttpVersion.V2TLS
10 |
11 | async def fetch(self, session, request) -> Response:
12 | if request['method'].upper() == 'GET':
13 | res = await session.get(request['url'],
14 | params=request.get('params'), cookies=request.get('cookies'),
15 | headers=request.get('headers'), proxy=request.get('proxy'),
16 | allow_redirects=request.get('allow_redirects', True),
17 | http_version=self.http_version,
18 | impersonate=self.impersonate,
19 | timeout=request.get('timeout')
20 | )
21 |
22 | elif request['method'].upper() == 'POST':
23 | res = await session.post(request['url'],
24 | data=request.get('data'), json=request.get('json'),
25 | cookies=request.get('cookies'), headers=request.get('headers'),
26 | proxy=request.get('proxy'),
27 | http_version=self.http_version,
28 | impersonate=self.impersonate,
29 | allow_redirects=request.get('allow_redirects', True),
30 | timeout=request.get('timeout'))
31 |
32 | else:
33 |             raise RabbitExpect(f"{request['method']} request method is not defined; add it yourself!")
34 |
35 | if res:
36 | return Response(res.status_code, res.headers, res.cookies, res.charset, res.content)
37 |
--------------------------------------------------------------------------------
/RabbitSpider/core/engine.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import asyncio
3 | from traceback import print_exc
4 | from asyncio import CancelledError
5 | from aio_pika import IncomingMessage
6 | from RabbitSpider.utils import event
7 | from RabbitSpider.utils.log import Logger
8 | from RabbitSpider import Request, BaseItem
9 | from RabbitSpider.exceptions import RabbitExpect
10 | from RabbitSpider.core.scheduler import Scheduler
11 | from typing import AsyncGenerator, Coroutine, Generator
12 | from RabbitSpider.utils.control import MiddlewareManager, FilterManager, PipelineManager, TaskManager
13 |
14 |
15 | class Engine(object):
16 |
17 | def __init__(self, settings):
18 | self.logger = Logger(settings)
19 | self.mode: str = settings.get('MODE')
20 | self.scheduler = Scheduler(settings)
21 | self.filter = FilterManager(settings)
22 | self.pipeline = PipelineManager(settings)
23 | self.middlewares = MiddlewareManager(settings)
24 | self.task_count: int = settings.get('TASK_COUNT')
25 |
26 | async def __aenter__(self):
27 | await self.scheduler.connect()
28 | await self.pipeline.open_spider()
29 | return self
30 |
31 | async def __aexit__(self, exc_type, exc_val, exc_tb):
32 | await self.scheduler.close()
33 | await self.pipeline.close_spider()
34 |
35 | async def routing(self, spider, result):
36 | async def rule(res):
37 | if isinstance(res, Request):
38 | if self.filter.request_seen(res):
39 |                     self.logger.info(f'produced: {res.to_dict()}', spider.name)
40 | await self.scheduler.producer(queue=spider.name, body=res.to_dict())
41 | elif isinstance(res, BaseItem):
42 | await spider.subscriber.notify(event.item_scraped, res)
43 | await self.pipeline.process_item(res, spider)
44 | elif res is None:
45 | pass
46 | else:
47 |                     raise TypeError('callback returned an unsupported type!')
48 |
49 | if isinstance(result, AsyncGenerator):
50 | async for r in result:
51 | await rule(r)
52 | elif isinstance(result, Generator):
53 | for r in result:
54 | await rule(r)
55 | elif isinstance(result, Coroutine):
56 | await rule(await result)
57 | elif isinstance(result, Request):
58 | await rule(result)
59 | elif result is None:
60 | pass
61 | else:
62 |             raise TypeError('callback returned an unsupported type!')
63 |
64 | async def produce(self, spider):
65 | await self.scheduler.create_queue(spider.name)
66 | await self.scheduler.queue_purge(spider.name)
67 | await self.routing(spider, spider.start_requests())
68 |
69 | async def crawl(self, spider):
70 | task_manager = TaskManager(self.task_count)
71 | while True:
72 | incoming_message: IncomingMessage = await self.scheduler.consumer(spider)
73 | if incoming_message:
74 | await task_manager.semaphore.acquire()
75 | task_manager.create_task(self.deal_resp(spider, incoming_message))
76 | else:
77 | if task_manager.all_done():
78 | await self.scheduler.delete_queue(spider.name)
79 | break
80 |
81 | async def consume(self, spider):
82 | await self.scheduler.consumer(spider, callback=self.deal_resp,
83 | prefetch=self.task_count)
84 | await asyncio.Future()
85 |
86 | async def deal_resp(self, spider, incoming_message: IncomingMessage):
87 | try:
88 | request = Request(**pickle.loads(incoming_message.body))
89 | await spider.subscriber.notify(event.request_received, request)
90 |             self.logger.info(f'consumed: {request.to_dict()}', spider.name)
91 | request, response = await self.middlewares.send(spider, request)
92 | if response:
93 | await spider.subscriber.notify(event.response_received, response)
94 | result = getattr(spider, request.callback)(request, response)
95 | result and await self.routing(spider, result)
96 | elif request:
97 | await self.routing(spider, request)
98 | await incoming_message.ack()
99 |         except Exception:
100 |             print_exc()  # fail fast: cancel every task so the crawl stops on the first unhandled error
101 |             for task in asyncio.all_tasks():
102 |                 task.cancel()
103 |
104 | async def start(self, spider):
105 |         self.logger.info(f'task {spider.name} started')
106 | await spider.subscriber.notify(event.spider_opened)
107 | try:
108 | if self.mode == 'auto':
109 | await self.produce(spider)
110 | await self.crawl(spider)
111 | elif self.mode == 'm':
112 | await self.produce(spider)
113 | elif self.mode == 'w':
114 | await self.consume(spider)
115 | else:
116 |                 raise RabbitExpect('invalid run mode!')
117 |         except CancelledError as exc:
118 |             self.logger.error(f'task {spider.name} failed: {exc}')
119 |             await spider.subscriber.notify(event.spider_error, exc)
120 |         except Exception as exc:
121 |             self.logger.error(f'task {spider.name} failed: {exc}')
122 |             await spider.subscriber.notify(event.spider_error, exc)
123 |         else:
124 |             await spider.subscriber.notify(event.spider_closed)
125 |             self.logger.info(f'task {spider.name} finished')
126 | finally:
127 | await spider.session.close()
128 |
--------------------------------------------------------------------------------
/RabbitSpider/core/scheduler.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | from typing import Callable, Optional
3 | from aio_pika import connect_robust, Message, pool
4 | from aio_pika.exceptions import ChannelNotFoundEntity
5 |
6 |
7 | class Scheduler(object):
8 | def __init__(self, settings):
9 | self.connection = None
10 | self.channel_pool = None
11 | self.host = settings.get('RABBIT_HOST')
12 | self.port = settings.get('RABBIT_PORT')
13 | self.username = settings.get('RABBIT_USERNAME')
14 | self.password = settings.get('RABBIT_PASSWORD')
15 | self.channel_size = settings.get('CHANNEL_SIZE')
16 | self.virtual_host = settings.get('RABBIT_VIRTUAL_HOST')
17 |
18 | async def connect(self):
19 |         self.connection = await connect_robust(host=self.host, port=self.port, login=self.username,
20 |                                                 password=self.password, virtualhost=self.virtual_host, heartbeat=30, timeout=60)
21 | self.channel_pool = pool.Pool(self.connection.channel, max_size=self.channel_size)
22 |
23 | async def create_queue(self, queue: str):
24 | async with self.channel_pool.acquire() as channel:
25 | await channel.declare_queue(name=queue, durable=True, arguments={"x-max-priority": 10}, timeout=60)
26 |
27 | async def producer(self, queue: str, body: dict):
28 | ret = pickle.dumps(body)
29 | async with self.channel_pool.acquire() as channel:
30 | await channel.default_exchange.publish(
31 | Message(body=ret, delivery_mode=2, priority=body['retry_times']), routing_key=queue, timeout=60)
32 |
33 | async def consumer(self, spider, callback: Optional[Callable] = None, prefetch: int = 1):
34 | async with self.channel_pool.acquire() as channel:
35 | try:
36 | queue = await channel.declare_queue(name=spider.name, durable=True, passive=True, timeout=60)
37 | except ChannelNotFoundEntity:
38 | queue = await channel.declare_queue(name=spider.name, durable=True, arguments={"x-max-priority": 10},
39 | timeout=60)
40 | if callback:
41 | await channel.set_qos(prefetch_count=prefetch)
42 | await queue.consume(callback=lambda incoming_message: callback(spider, incoming_message), timeout=60)
43 | else:
44 | return await queue.get(fail=False, timeout=60)
45 |
46 | async def queue_purge(self, queue: str):
47 | async with self.channel_pool.acquire() as channel:
48 | queue = await channel.declare_queue(name=queue, durable=True, passive=True, timeout=60)
49 | await queue.purge()
50 |
51 | async def delete_queue(self, queue: str):
52 | async with self.channel_pool.acquire() as channel:
53 | await channel.queue_delete(queue)
54 |
55 | async def get_message_count(self, queue: str):
56 | async with self.channel_pool.acquire() as channel:
57 | queue = await channel.declare_queue(name=queue, durable=True, passive=True, timeout=60)
58 | return queue.declaration_result.message_count
59 |
60 | async def close(self):
61 | await self.channel_pool.close()
62 | await self.connection.close()
63 |
--------------------------------------------------------------------------------
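Queues are declared with `x-max-priority: 10` and `producer` publishes each message with `priority=body['retry_times']`, so retried requests overtake fresh ones in the queue. A minimal standalone sketch of the scheduler API, assuming a `settings.py` with valid `RABBIT_*` values is importable:

import asyncio
from RabbitSpider.utils.control import SettingManager
from RabbitSpider.core.scheduler import Scheduler

async def main():
    scheduler = Scheduler(SettingManager())
    await scheduler.connect()
    await scheduler.create_queue('demo')
    # the body must carry 'retry_times': it doubles as the message priority
    await scheduler.producer('demo', {'url': 'https://example.com', 'retry_times': 0})
    print(await scheduler.get_message_count('demo'))   # -> 1
    await scheduler.delete_queue('demo')
    await scheduler.close()

asyncio.run(main())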
/RabbitSpider/default_settings.py:
--------------------------------------------------------------------------------
1 | from os.path import abspath, join, dirname
2 |
3 | TEMPLATE_DIR = abspath(join(dirname(__file__), 'templates/project'))
4 |
--------------------------------------------------------------------------------
/RabbitSpider/dupefilters/__init__.py:
--------------------------------------------------------------------------------
1 | import json
2 | import hashlib
3 | from io import BytesIO
4 | from urllib.parse import urlencode
5 | from RabbitSpider.http.request import Request
6 | from RabbitSpider.utils.control import SettingManager
7 |
8 |
9 | class DupeFilter(object):
10 | def __init__(self, settings):
11 | self.settings: SettingManager = settings
12 |
13 | def request_fingerprint(self, request: Request):
14 | if isinstance(request.data, (dict, list, tuple)):
15 | body = urlencode(request.data).encode('utf-8')
16 | elif isinstance(request.data, str):
17 | body = request.data.encode('utf-8')
18 | elif isinstance(request.data, BytesIO):
19 | body = request.data.read()
20 | elif isinstance(request.data, bytes):
21 | body = request.data
22 | else:
23 | body = b""
24 |
25 | if request.json is not None:
26 | body = json.dumps(request.json, separators=(",", ":")).encode()
27 |
28 | sha1 = hashlib.sha1()
29 | if isinstance(request.params, (dict, list, tuple)):
30 | sha1.update(f'{request.url}?{urlencode(request.params)}'.encode('utf-8'))
31 | else:
32 | sha1.update(request.url.encode('utf-8'))
33 | sha1.update(request.method.encode('utf-8'))
34 | sha1.update(body)
35 | sha1.update(str(request.retry_times).encode('utf-8'))
36 | return sha1.hexdigest()
37 |
38 | def request_seen(self, request: Request) -> bool:
39 | pass
40 |
--------------------------------------------------------------------------------
/RabbitSpider/dupefilters/memoryfilter.py:
--------------------------------------------------------------------------------
1 | from RabbitSpider.dupefilters import DupeFilter
2 |
3 |
4 | class MemoryFilter(DupeFilter):
5 | def __init__(self, settings):
6 | super().__init__(settings)
7 | self.repeat = set()
8 |
9 | def request_seen(self, request):
10 | fingerprint = self.request_fingerprint(request)
11 | if fingerprint in self.repeat:
12 | return False
13 | else:
14 | self.repeat.add(fingerprint)
15 | return True
16 |
--------------------------------------------------------------------------------
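Note the inverted naming: `request_seen` returns True for a request that has not been seen yet (so the engine schedules it) and False for a duplicate. Because `retry_times` is part of the fingerprint, a retried request hashes differently and is not filtered out. A small sketch:

from RabbitSpider import Request
from RabbitSpider.utils.control import SettingManager
from RabbitSpider.dupefilters.memoryfilter import MemoryFilter

f = MemoryFilter(SettingManager())
a = Request(url='https://example.com', params={'page': 1})
b = Request(url='https://example.com', params={'page': 1})
print(f.request_seen(a))   # True  -> unseen, would be scheduled
print(f.request_seen(b))   # False -> same fingerprint, dropped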
/RabbitSpider/exceptions.py:
--------------------------------------------------------------------------------
1 | class RabbitExpect(Exception):
2 | def __init__(self, msg):
3 | self.msg = msg
4 |
5 | def __str__(self):
6 | return self.msg
7 |
--------------------------------------------------------------------------------
/RabbitSpider/http/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/RabbitSpider/http/request.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Callable
3 |
4 |
5 | class Request(object):
6 | def __init__(self,
7 | url: str,
8 | params: dict = None,
9 | data: dict | str | bytes | None = None,
10 | json: dict = None,
11 | method: str = 'get',
12 | headers: dict | None = None,
13 | cookies: dict | None = None,
14 | proxy: str | None = None,
15 | timeout: int = 60,
16 | allow_redirects: bool = True,
17 | callback: str | Callable = 'parse',
18 | retry_times: int = 0,
19 | meta: dict | None = None
20 | ):
21 | self._url = url
22 | self.params = params
23 | self.data = data
24 | self.json = json
25 | self.method = method
26 | self.headers = headers
27 | self.cookies = cookies
28 | self.proxy = proxy
29 | self.timeout = timeout
30 | self.allow_redirects = allow_redirects
31 | self._callback = callback
32 | self.retry_times = retry_times
33 | self._meta = meta
34 |
35 | @property
36 | def url(self):
37 | pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
38 | if not re.match(pattern, self._url):
39 | raise ValueError(f'请检查url是否正确{self._url}')
40 | return self._url
41 |
42 | @url.setter
43 | def url(self, value):
44 | self._url = value
45 |
46 | @property
47 | def meta(self):
48 | return self._meta if self._meta else {}
49 |
50 | @property
51 | def callback(self):
52 | return self._callback.__name__ if callable(self._callback) else self._callback
53 |
54 | def to_dict(self):
55 | return {
56 | 'url': self.url,
57 | 'params': self.params,
58 | 'data': self.data,
59 | 'json': self.json,
60 | 'method': self.method,
61 | 'headers': self.headers,
62 | 'cookies': self.cookies,
63 | 'proxy': self.proxy,
64 | 'timeout': self.timeout,
65 | 'allow_redirects': self.allow_redirects,
66 | 'callback': self.callback,
67 | 'meta': self.meta,
68 | 'retry_times': self.retry_times
69 | }
70 |
--------------------------------------------------------------------------------
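A `Request` never crosses the wire as an object: `Scheduler.producer` pickles the `to_dict()` form and `Engine.deal_resp` rebuilds it with `Request(**pickle.loads(...))`. A round-trip sketch:

import pickle
from RabbitSpider import Request

req = Request(url='https://example.com', method='post', json={'q': 1})
payload = pickle.dumps(req.to_dict())        # what the producer publishes
restored = Request(**pickle.loads(payload))  # what the consumer rebuilds
assert restored.callback == 'parse'          # callables are flattened to names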
/RabbitSpider/http/response.py:
--------------------------------------------------------------------------------
1 | import re
2 | import json
3 | import chardet
4 | import parsel
5 | from w3lib.encoding import http_content_type_encoding
6 |
7 |
8 | class Response:
9 | def __init__(self, status_code, headers, cookies, charset, content):
10 | self.content = content
11 | self.charset = charset
12 | self.status_code = status_code
13 | self.headers = {k: v for k, v in headers.items()}
14 | self.cookies = {k: v for k, v in cookies.items()}
15 | self.__r = parsel.Selector(self.text)
16 |
17 |     @property
18 |     def text(self):
19 |         if not self.content:
20 |             return ''
21 |
22 |         def candidates():
23 |             # priority order: the declared charset, the Content-Type header,
24 |             # a chardet guess, then utf-8 and gb18030 as fallbacks
25 |             yield self.charset
26 |             yield http_content_type_encoding(self.headers.get('Content-Type', ''))
27 |             yield chardet.detect(self.content).get('encoding')
28 |             yield 'utf-8'
29 |             yield 'gb18030'
30 |
31 |         for encoding in candidates():
32 |             if not encoding:
33 |                 continue
34 |             try:
35 |                 return self.content.decode(encoding)
36 |             except (UnicodeDecodeError, LookupError):
37 |                 continue
38 |         # last resort: utf-8 with undecodable bytes dropped
39 |         return self.content.decode('utf-8', 'ignore')
40 |
41 |     @property
42 |     def json(self):
43 |         # extract a JSON object from the body, tolerating JSONP-style wrappers
44 |         result = re.findall(r'[.*?(]?(\[?{.*}]?)[).*]?', self.text, re.DOTALL)
45 |         if result:
46 |             return json.loads(result[0], strict=False)
47 |
48 |     def xpath(self, x):
49 |         return self.__r.xpath(x)
50 |
51 |     def css(self, x):
52 |         return self.__r.css(x)
53 |
--------------------------------------------------------------------------------
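`Response` wraps the raw bytes together with a `parsel.Selector`, so callbacks can call `response.xpath(...)` and `response.css(...)` directly, and the `json` property tolerates JSONP-style wrappers. A sketch with hand-built constructor arguments:

from RabbitSpider.http.response import Response

resp = Response(200, {}, {}, 'utf-8', b'<html><title>demo</title></html>')
print(resp.xpath('//title/text()').get())   # 'demo'

jsonp = Response(200, {}, {}, 'utf-8', b'cb({"ok": true});')
print(jsonp.json)                           # {'ok': True}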
/RabbitSpider/items/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 | from collections import defaultdict
3 |
4 |
5 | class ItemMeta(type):
6 | def __new__(mcs, name, bases, attrs):
7 | field: Dict[str, dict] = defaultdict(dict)
8 | for key, value in attrs.items():
9 | if not (callable(value) or key.startswith('_')):
10 | field[key]['value'] = value
11 | if key == '__annotations__':
12 | for k, v in value.items():
13 | field[k]['annotation'] = v
14 | cls_instance = super().__new__(mcs, name, bases, attrs)
15 | cls_instance.FIELDS = field
16 | return cls_instance
17 |
--------------------------------------------------------------------------------
/RabbitSpider/items/item.py:
--------------------------------------------------------------------------------
1 | from RabbitSpider.items import ItemMeta
2 |
3 |
4 | class BaseItem(metaclass=ItemMeta):
5 | def __init__(self):
6 | self._values = {}
7 | for k, v in self.FIELDS.items():
8 | if v.get('value') is not None:
9 | self._values[k] = v['value']
10 |
11 | def __setitem__(self, key, value):
12 | if key in self.FIELDS:
13 |             if self.FIELDS[key].get('annotation'):
14 | if isinstance(value, self.FIELDS[key]['annotation']):
15 | self._values[key] = value
16 | else:
17 | raise TypeError(f"{value} is not type {self.FIELDS[key]['annotation']}")
18 | else:
19 | self._values[key] = value
20 | else:
21 | raise KeyError(f'field {key} undefined')
22 |
23 | def __getitem__(self, item):
24 | return self._values[item]
25 |
26 | def __contains__(self, item):
27 | if item in self._values:
28 | return True
29 | else:
30 | return False
31 |
32 | def __iter__(self):
33 | return iter(self._values)
34 |
35 | def __len__(self) -> int:
36 | return len(self._values)
37 |
38 | def __delitem__(self, v):
39 |         del self._values[v]
40 |
41 | def __setattr__(self, key, value):
42 | if not key.startswith('_'):
43 |             raise AttributeError(f'use item[{key!r}] = value to set fields')
44 | else:
45 | super().__setattr__(key, value)
46 |
47 | def __getattribute__(self, item):
48 | field = super().__getattribute__('FIELDS')
49 | if item in field:
50 |             raise AttributeError(f'use item[{item!r}] to read field values')
51 | else:
52 | return super(BaseItem, self).__getattribute__(item)
53 |
54 | def to_dict(self):
55 | return self._values
56 |
--------------------------------------------------------------------------------
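`ItemMeta` collects class-level annotations and defaults into `FIELDS`, and `BaseItem` then restricts item-style access to those declared fields, type-checking values against their annotations. A usage sketch with a hypothetical item:

from RabbitSpider import BaseItem

class ProductItem(BaseItem):
    title: str
    price: float = 0.0   # default value, recorded by ItemMeta

item = ProductItem()
item['title'] = 'demo'          # checked against the str annotation
print(item.to_dict())           # {'price': 0.0, 'title': 'demo'}
# item['title'] = 123           would raise TypeError
# item['stock'] = 1             would raise KeyError: field undefined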
/RabbitSpider/middlewares/__init__.py:
--------------------------------------------------------------------------------
1 | from RabbitSpider import Request
2 | from RabbitSpider import Response
3 | from RabbitSpider.utils.log import Logger
4 | from RabbitSpider.utils.control import SettingManager
5 |
6 |
7 | class BaseMiddleware:
8 | def __init__(self, settings):
9 | self.logger = Logger(settings)
10 | self.settings: SettingManager = settings
11 |
12 |     async def process_request(self, request, spider) -> None | Request | Response:
13 |         """request pre-processing"""
14 |         pass
15 |
16 |     async def process_response(self, request, response, spider) -> None | bool | Request | Response:
17 |         """response pre-processing"""
18 |         pass
19 |
20 |     async def process_exception(self, request, exc, spider) -> None | Request | Response:
21 |         """exception handling"""
22 |         pass
23 |
--------------------------------------------------------------------------------
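Return values drive `MiddlewareManager` (in utils/control.py below): None passes control to the next middleware and ultimately the downloader, a `Request` or `Response` short-circuits the chain, and any other truthy value (the built-in middlewares return True) drops the request. A hypothetical custom middleware:

from RabbitSpider.middlewares import BaseMiddleware

class DefaultHeadersMiddleware(BaseMiddleware):
    async def process_request(self, request, spider):
        # inject a User-Agent when the request has none; returning None
        # lets the chain continue on to the downloader
        if not request.headers:
            request.headers = {'User-Agent': 'RabbitSpider'}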
/RabbitSpider/middlewares/allow_http_code.py:
--------------------------------------------------------------------------------
1 | from RabbitSpider.middlewares import BaseMiddleware
2 |
3 |
4 | class AllowHttpCodeMiddleware(BaseMiddleware):
5 | def __init__(self, settings):
6 | super().__init__(settings)
7 | self.allow_http_code = settings.getlist('ALLOW_HTTP_CODES')
8 |
9 | async def process_response(self, request, response, spider):
10 | if response.status_code not in self.allow_http_code:
11 |             self.logger.error(f'{request.to_dict()}, disallowed status code: {response.status_code}', spider.name)
12 |             return True  # truthy result drops the request
13 |
--------------------------------------------------------------------------------
/RabbitSpider/middlewares/download_delay.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import random
3 |
4 | from RabbitSpider.middlewares import BaseMiddleware
5 |
6 |
7 | class DownloadDelayMiddleware(BaseMiddleware):
8 | def __init__(self, settings):
9 | super().__init__(settings)
10 | self.download_delay = settings.get('DOWNLOAD_DELAY')
11 |
12 | async def process_request(self, request, spider):
13 | if self.download_delay:
14 | delay = random.uniform(self.download_delay[0], self.download_delay[1])
15 | await asyncio.sleep(delay)
16 |
--------------------------------------------------------------------------------
/RabbitSpider/middlewares/retry.py:
--------------------------------------------------------------------------------
1 | from RabbitSpider.middlewares import BaseMiddleware
2 |
3 |
4 | class RetryMiddleware(BaseMiddleware):
5 | def __init__(self, settings):
6 | super().__init__(settings)
7 | self.retry_http_code = settings.getlist('RETRY_HTTP_CODES')
8 | self.retry_exceptions = settings.getlist('RETRY_EXCEPTIONS')
9 | self.max_retry = settings.get('MAX_RETRY')
10 |
11 | async def process_response(self, request, response, spider):
12 | if response.status_code in self.retry_http_code:
13 | if request.retry_times < self.max_retry:
14 | request.retry_times += 1
15 | return request
16 | else:
17 |                 self.logger.warning(f'dropped {request.to_dict()}, status code: {response.status_code}', spider.name)
18 | return True
19 |
20 | async def process_exception(self, request, exc, spider):
21 | if exc.__class__.__name__ in self.retry_exceptions:
22 | if request.retry_times < self.max_retry:
23 | request.retry_times += 1
24 | return request
25 | else:
26 |                 self.logger.warning(f'dropped {request.to_dict()}, exception: {exc!r}', spider.name)
27 | return True
28 |
--------------------------------------------------------------------------------
/RabbitSpider/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | from RabbitSpider.utils.log import Logger
2 | from RabbitSpider.utils.control import SettingManager
3 |
4 |
5 | class BasePipeline(object):
6 | def __init__(self, settings):
7 | self.logger = Logger(settings)
8 | self.settings: SettingManager = settings
9 |
10 |     async def open_spider(self):
11 |         """initialize the database"""
12 |         pass
13 |
14 |     async def process_item(self, item, spider):
15 |         """persistence logic"""
16 |         pass
17 |
18 |     async def close_spider(self):
19 |         """close connections"""
20 |         pass
21 |
--------------------------------------------------------------------------------
/RabbitSpider/rabbit_execute.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import sys
4 | import time
5 | import asyncio
6 | from typing import Type, List
7 | from croniter import croniter
8 | from datetime import datetime
9 | from RabbitSpider.spider import Spider
10 | from RabbitSpider.core.engine import Engine
11 | from RabbitSpider.utils.control import TaskManager, SettingManager
12 | from importlib.util import spec_from_file_location, module_from_spec
13 |
14 |
15 | async def go(spider_cls: Type[Spider], mode: str = 'auto', task_count: int = 1):
16 | settings = SettingManager()
17 | for i in sys.argv[1:]:
18 | key, value = i.split('=')
19 | if key == 'mode':
20 | mode = value
21 | if key == 'task_count':
22 | task_count = int(value)
23 | settings.set('MODE', mode)
24 | settings.set('TASK_COUNT', task_count)
25 | settings.set('CHANNEL_SIZE', task_count * 2)
26 | async with Engine(settings) as engine:
27 | await engine.start(spider_cls())
28 |
29 |
30 | async def batch_go(spiders: List[Type[Spider]], task_count: int = 10):
31 | settings = SettingManager()
32 | settings.set('MODE', 'auto')
33 | settings.set('TASK_COUNT', task_count)
34 | settings.set('CHANNEL_SIZE', task_count * 2)
35 | task_group: TaskManager = TaskManager(task_count)
36 | async with Engine(settings) as engine:
37 | for spider_cls in spiders:
38 | await task_group.semaphore.acquire()
39 | task_group.create_task(engine.start(spider_cls()))
40 | while True:
41 | if task_group.all_done():
42 | break
43 | else:
44 | await asyncio.sleep(1)
45 |
46 |
47 | def runner(spider_dir, task_pool, cron_expr):
48 | spider_classes = []
49 |     loop = asyncio.new_event_loop()
50 | spider_path = os.path.join('spiders', spider_dir)
51 | sys.path.extend([os.path.abspath('.'), os.path.abspath('..')])
52 |
53 | for script_name in os.listdir(spider_path):
54 | if script_name.endswith('.py') and not script_name.startswith('__'):
55 | script_path = os.path.join(spider_path, script_name)
56 | with open(script_path, 'r', encoding='utf-8') as file:
57 | class_name = re.findall(r'class\s+(\w+)\s*\(\w+\)', file.read())[0]
58 | spec = spec_from_file_location(class_name, script_path)
59 | module = module_from_spec(spec)
60 | spec.loader.exec_module(module)
61 | spider_classes.append(getattr(module, class_name))
62 |
63 | if croniter.is_valid(cron_expr):
64 | cron_schedule = croniter(cron_expr, datetime.now())
65 | next_run_time = cron_schedule.get_next(datetime)
66 |         print(f'next run at: {next_run_time}')
67 |         while True:
68 |             now_time = datetime.now().replace(second=0, microsecond=0)
69 |             if now_time >= next_run_time:
70 |                 loop.run_until_complete(batch_go(spider_classes, task_pool))
71 |                 next_run_time = cron_schedule.get_next(datetime)
72 |                 print(f'next run at: {next_run_time}')
73 |             else:
74 |                 time.sleep(5)
75 |     else:
76 |         loop.run_until_complete(batch_go(spider_classes, task_pool))
77 |     loop.close()
79 |
--------------------------------------------------------------------------------
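`go` also reads `mode=` and `task_count=` key=value overrides from `sys.argv`, so a single spider file can be switched between producer and worker at launch without editing code, e.g. `python mama.py mode=w task_count=20` (the file name is illustrative) runs the same spider as a pure consumer with 20 concurrent tasks.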
/RabbitSpider/spider/__init__.py:
--------------------------------------------------------------------------------
1 | from asyncio import CancelledError
2 | from typing import AsyncGenerator, Union
3 | from curl_cffi.requests import AsyncSession
4 | from RabbitSpider.utils import event
5 | from RabbitSpider import Request, Response, BaseItem
6 | from RabbitSpider.utils.subscriber import Subscriber
7 |
8 |
9 | class Spider(object):
10 | name: str
11 |
12 | def __init__(self):
13 | self.subscriber = Subscriber()
14 | self.session = AsyncSession(verify=False)
15 | self.subscriber.subscribe(self.spider_opened, event.spider_opened)
16 | self.subscriber.subscribe(self.spider_closed, event.spider_closed)
17 | self.subscriber.subscribe(self.spider_error, event.spider_error)
18 | self.subscriber.subscribe(self.request_received, event.request_received)
19 | self.subscriber.subscribe(self.response_received, event.response_received)
20 | self.subscriber.subscribe(self.item_scraped, event.item_scraped)
21 |
22 |     async def start_requests(self) -> AsyncGenerator[Request, None]:
23 |         """initial requests"""
24 |         raise NotImplementedError
25 |
26 |     async def parse(self, request: Request, response: Response) -> AsyncGenerator[Union[Request, BaseItem, None], None]:
27 |         """default callback"""
28 |         pass
29 |
30 |     async def spider_opened(self) -> None:
31 |         """fired when the spider starts"""
32 |         pass
33 |
34 |     async def spider_closed(self) -> None:
35 |         """fired when the spider closes"""
36 |         pass
37 |
38 |     async def spider_error(self, error: Exception | CancelledError) -> None:
39 |         """fired when the spider raises an error"""
40 |         pass
41 |
42 |     async def request_received(self, request: Request) -> None:
43 |         """fired when a request is issued"""
44 |         pass
45 |
46 |     async def response_received(self, response: Response) -> None:
47 |         """fired when a response is received"""
48 |         pass
49 |
50 |     async def item_scraped(self, item: BaseItem) -> None:
51 |         """fired when an item is produced"""
52 |         pass
53 |
--------------------------------------------------------------------------------
/RabbitSpider/templates/project/__init__.py:
--------------------------------------------------------------------------------
1 | default_path = __path__[0]
2 |
--------------------------------------------------------------------------------
/RabbitSpider/templates/project/items.tmpl:
--------------------------------------------------------------------------------
1 | from RabbitSpider import BaseItem
2 |
3 |
4 | class Item(BaseItem):
5 | # field: str = None
6 | pass
7 |
--------------------------------------------------------------------------------
/RabbitSpider/templates/project/middlewares.tmpl:
--------------------------------------------------------------------------------
1 | from RabbitSpider.middlewares import BaseMiddleware
2 |
3 |
4 | class Middleware(BaseMiddleware):
5 |
6 |     async def process_request(self, request, spider):
7 |         """request pre-processing"""
8 |         pass
9 |
10 |     async def process_response(self, request, response, spider):
11 |         """response pre-processing"""
12 |         pass
13 |
14 |     async def process_exception(self, request, exc, spider):
15 |         """exception handling"""
16 |         pass
17 |
--------------------------------------------------------------------------------
/RabbitSpider/templates/project/pipelines.tmpl:
--------------------------------------------------------------------------------
1 | from RabbitSpider.pipelines import BasePipeline
2 |
3 |
4 | class Pipeline(BasePipeline):
5 |     async def open_spider(self):
6 |         """initialize the database"""
7 |         pass
8 |
9 |     async def process_item(self, item, spider):
10 |         """persistence logic"""
11 |         self.logger.info(item.to_dict(), spider.name)
12 |
13 |     async def close_spider(self):
14 |         """close connections"""
15 |         pass
16 |
--------------------------------------------------------------------------------
/RabbitSpider/templates/project/settings.tmpl:
--------------------------------------------------------------------------------
1 | from ${project} import default_path
2 |
3 | BOT_DIR = default_path
4 |
5 | # Rabbitmq
6 | RABBIT_HOST = '127.0.0.1'
7 | RABBIT_PORT = 5672
8 | RABBIT_USERNAME = 'yuntom'
9 | RABBIT_PASSWORD = '123456'
10 | RABBIT_VIRTUAL_HOST = '/'
11 |
12 | # in-memory deduplication
13 | DUPEFILTER_CLASS = 'RabbitSpider.dupefilters.memoryfilter.MemoryFilter'
14 |
15 | # middlewares
16 | MIDDLEWARES = [
17 |     'RabbitSpider.middlewares.allow_http_code.AllowHttpCodeMiddleware',
18 |     'RabbitSpider.middlewares.retry.RetryMiddleware',
19 |     'RabbitSpider.middlewares.download_delay.DownloadDelayMiddleware',
20 |     '${project}.middlewares.Middleware',
21 | ]
22 |
23 | # pipelines
24 | ITEM_PIPELINES = ['${project}.pipelines.Pipeline']
25 |
26 | # logging
27 | # LOG_LEVEL = 'WARNING'
28 | # LOG_FILE = './rabbit_log'
29 |
30 | # download delay
31 | # DOWNLOAD_DELAY = (1, 3)
32 |
33 | # maximum retries
34 | MAX_RETRY = 5
35 | # status codes to retry
36 | RETRY_HTTP_CODES = []
37 | # exceptions to retry
38 | RETRY_EXCEPTIONS = ['RequestsError', 'IncompleteRead', 'DNSError', 'ConnectionError', 'Timeout']
39 | # allowed status codes
40 | ALLOW_HTTP_CODES = [200]
41 |
--------------------------------------------------------------------------------
/RabbitSpider/templates/project/spiders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YunTom/RabbitSpider/469429964225148d2386ddcdd3b32b580e0051d3/RabbitSpider/templates/project/spiders/__init__.py
--------------------------------------------------------------------------------
/RabbitSpider/templates/project/spiders/src/basic.tmpl:
--------------------------------------------------------------------------------
1 | import os
2 | import asyncio
3 | from RabbitSpider import go
4 | from RabbitSpider import Request
5 | from RabbitSpider.spider import Spider
6 |
7 |
8 | class ${classname}(Spider):
9 | name = os.path.basename(__file__).split('.')[0]
10 |
11 | async def start_requests(self):
12 | pass
13 |
14 | async def parse(self, request, response):
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | asyncio.run(go(${classname}, 'auto', 1))
20 |
--------------------------------------------------------------------------------
/RabbitSpider/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YunTom/RabbitSpider/469429964225148d2386ddcdd3b32b580e0051d3/RabbitSpider/utils/__init__.py
--------------------------------------------------------------------------------
/RabbitSpider/utils/cmdline.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from RabbitSpider.rabbit_execute import runner
3 | from RabbitSpider.utils.template import template_to_file
4 |
5 |
6 | def execute():
7 | parser = argparse.ArgumentParser()
8 |     subparsers = parser.add_subparsers(dest='command', required=True, help='available subcommands')
9 |
10 |     create_parser = subparsers.add_parser('create', help='create a new spider project')
11 |     create_parser.add_argument('project', help='project name')
12 |     create_parser.add_argument('directory', help='directory')
13 |     create_parser.add_argument('filename', help='spider filename')
14 |
15 |     run_parser = subparsers.add_parser('run', help='run a spider project')
16 |     run_parser.add_argument('directory', help='directory')
17 |     run_parser.add_argument('-p', '--task_pool', type=int, default=10, help='concurrency')
18 |     run_parser.add_argument('-t', '--cron_expression', type=str, default='', help='crontab expression')
19 | args = parser.parse_args()
20 | if args.command == 'create':
21 | template_to_file(args.project, directory=args.directory, filename=args.filename)
22 | elif args.command == 'run':
23 | runner(args.directory, args.task_pool, args.cron_expression)
24 |
--------------------------------------------------------------------------------
/RabbitSpider/utils/control.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from collections import defaultdict
3 | from importlib import import_module
4 | from asyncio import Task, Future, Semaphore
5 | from RabbitSpider import Request, Response
6 | from RabbitSpider import default_settings
7 | from typing import Final, Dict, List, Callable
8 | from RabbitSpider.core.download import CurlDownload
9 |
10 |
11 | def load_class(_path):
12 | if not isinstance(_path, str):
13 | if callable(_path):
14 | return _path
15 | else:
16 | raise TypeError(f"args expected string or object, got {type(_path)}")
17 | module, name = _path.rsplit('.', 1)
18 | mod = import_module(module)
19 | try:
20 | cls = getattr(mod, name)
21 | except AttributeError:
22 |         raise NameError(f"Module {module!r} doesn't define any object named {name!r}")
23 |
24 | return cls
25 |
26 |
27 | class SettingManager(object):
28 | def __init__(self):
29 | try:
30 | settings = import_module('settings')
31 | except ModuleNotFoundError:
32 | settings = default_settings
33 | self.attribute = {}
34 | for key in dir(settings):
35 | if key.isupper():
36 | self.attribute[key] = getattr(settings, key)
37 |
38 | def __setitem__(self, key, value):
39 | self.attribute[key] = value
40 |
41 | def __getitem__(self, key):
42 | return self.attribute.get(key)
43 |
44 | def __delitem__(self, key):
45 | del self.attribute[key]
46 |
47 | def get(self, key, value=None):
48 |         return self[key] if self[key] is not None else value
49 |
50 | def getlist(self, key):
51 | return self[key] if self[key] else []
52 |
53 | def set(self, key, value):
54 | self[key] = value
55 |
56 | def update(self, custom_settings):
57 | self.attribute.update(custom_settings)
58 |
59 |
60 | class TaskManager(object):
61 | def __init__(self, task_count: int):
62 | self.current_task: Final[set] = set()
63 | self.semaphore = Semaphore(task_count)
64 |
65 | def create_task(self, coroutine) -> Task:
66 | task = asyncio.create_task(coroutine)
67 | self.current_task.add(task)
68 |
69 | def done_callback(_fut: Future):
70 | self.current_task.remove(task)
71 | self.semaphore.release()
72 |
73 | task.add_done_callback(done_callback)
74 | return task
75 |
76 | def all_done(self):
77 | return len(self.current_task) == 0
78 |
79 |
80 | class PipelineManager(object):
81 | def __init__(self, settings):
82 | self.settings = settings
83 | self.methods: Dict[str, List[Callable]] = defaultdict(list)
84 | self._add_pipe(settings.getlist('ITEM_PIPELINES'))
85 |
86 | def _add_pipe(self, pipelines):
87 | for pipeline in pipelines:
88 | pipeline_obj = load_class(pipeline)(self.settings)
89 | if hasattr(pipeline_obj, 'open_spider'):
90 | self.methods['open_spider'].append(getattr(pipeline_obj, 'open_spider'))
91 | if hasattr(pipeline_obj, 'process_item'):
92 | self.methods['process_item'].append(getattr(pipeline_obj, 'process_item'))
93 | if hasattr(pipeline_obj, 'close_spider'):
94 | self.methods['close_spider'].append(getattr(pipeline_obj, 'close_spider'))
95 |
96 | async def open_spider(self):
97 | for method in self.methods['open_spider']:
98 | await method()
99 |
100 | async def process_item(self, req, spider):
101 | for method in self.methods['process_item']:
102 | await method(req, spider)
103 |
104 | async def close_spider(self):
105 | for method in self.methods['close_spider']:
106 | await method()
107 |
108 |
109 | class MiddlewareManager(object):
110 | def __init__(self, settings):
111 | self.settings = settings
112 | self.download = CurlDownload()
113 | self.methods: Dict[str, List[Callable]] = defaultdict(list)
114 | self._add_middleware(settings.getlist('MIDDLEWARES'))
115 |
116 | def _add_middleware(self, middlewares):
117 | for middleware in middlewares:
118 | middleware_obj = load_class(middleware)(self.settings)
119 | if hasattr(middleware_obj, 'process_request'):
120 | self.methods['process_request'].append(getattr(middleware_obj, 'process_request'))
121 | if hasattr(middleware_obj, 'process_response'):
122 | self.methods['process_response'].append(getattr(middleware_obj, 'process_response'))
123 | if hasattr(middleware_obj, 'process_exception'):
124 | self.methods['process_exception'].append(getattr(middleware_obj, 'process_exception'))
125 |
126 | async def process_request(self, spider, request):
127 | for method in self.methods['process_request']:
128 | result = await method(request, spider)
129 | if isinstance(result, (Request, Response)):
130 | return result
131 | if result:
132 | break
133 | else:
134 | return await self.download.fetch(spider.session, request.to_dict())
135 |
136 | async def process_response(self, spider, request, response):
137 | for method in reversed(self.methods['process_response']):
138 | result = await method(request, response, spider)
139 | if isinstance(result, (Request, Response)):
140 | return result
141 | if result:
142 | break
143 | else:
144 | return response
145 |
146 | async def process_exception(self, spider, request, exc):
147 | for method in self.methods['process_exception']:
148 | result = await method(request, exc, spider)
149 | if isinstance(result, (Request, Response)):
150 | return result
151 | if result:
152 | break
153 | else:
154 | raise exc
155 |
156 | async def send(self, spider, request: Request):
157 | try:
158 | resp = await self.process_request(spider, request)
159 | except Exception as exc:
160 | resp = await self.process_exception(spider, request, exc)
161 | if isinstance(resp, Response):
162 | resp = await self.process_response(spider, request, resp)
163 | if isinstance(resp, Request):
164 | return request, None
165 | if not resp:
166 | return None, None
167 | return request, resp
168 |
169 |
170 | class FilterManager(object):
171 | def __init__(self, settings):
172 | filter_cls = settings.get('DUPEFILTER_CLASS')
173 | if filter_cls:
174 | self.filter_obj = load_class(filter_cls)(settings)
175 | else:
176 | self.filter_obj = None
177 |
178 | def request_seen(self, request: Request) -> bool:
179 | if self.filter_obj:
180 | result = self.filter_obj.request_seen(request)
181 | return result
182 | else:
183 | return True
184 |
--------------------------------------------------------------------------------
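`SettingManager` imports `settings.py` from the working directory when it exists and falls back to `default_settings`; only UPPERCASE names are picked up. A quick sketch:

from RabbitSpider.utils.control import SettingManager

settings = SettingManager()              # settings.py if importable, else default_settings
settings.set('TASK_COUNT', 5)
print(settings.get('TASK_COUNT'))        # 5
print(settings.get('MODE', 'auto'))      # fallback value when unset
print(settings.getlist('MIDDLEWARES'))   # [] when unset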
/RabbitSpider/utils/event.py:
--------------------------------------------------------------------------------
1 | # fired when the spider starts
2 | spider_opened = 'spider_opened'
3 | # fired when the spider closes
4 | spider_closed = 'spider_closed'
5 | # fired when the spider raises an error
6 | spider_error = 'spider_error'
7 | # fired when a request is issued
8 | request_received = 'request_received'
9 | # fired when a response is received
10 | response_received = 'response_received'
11 | # fired when an item is produced
12 | item_scraped = 'item_scraped'
13 |
--------------------------------------------------------------------------------
/RabbitSpider/utils/log.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from loguru import logger
4 |
5 |
6 | class Logger(object):
7 | def __init__(self, settings):
8 | logger.remove()
9 | log_path = os.path.join(settings.get('BOT_DIR'), settings.get('LOG_FILE')) if settings.get(
10 | 'LOG_FILE') and settings.get('LOG_FILE').startswith('.') else settings.get('LOG_FILE')
11 | if log_path:
12 | logger.add("%s/rabbit_{time:YYYY-MM-DD}.log" % log_path,
13 | level=settings.get('LOG_LEVEL', 'ERROR'),
14 | rotation="1 day",
15 | retention="1 week",
16 | format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {extra[scope]} | {message}")
17 |
18 | logger.add(sink=sys.stdout,
19 | colorize=True,
20 | level='INFO',
21 | format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {extra[scope]} | {message}")
22 |
23 | self._logger = logger
24 |
25 | def info(self, msg, scope='RabbitSpider'):
26 | self._logger = logger.bind(scope=scope)
27 | self._logger.info(msg)
28 |
29 | def warning(self, msg, scope='RabbitSpider'):
30 | self._logger = logger.bind(scope=scope)
31 | self._logger.warning(msg)
32 |
33 | def error(self, msg, scope='RabbitSpider'):
34 | self._logger = logger.bind(scope=scope)
35 | self._logger.error(msg)
36 |
37 | def exception(self, msg, scope='RabbitSpider'):
38 | self._logger = logger.bind(scope=scope)
39 | self._logger.exception(msg)
40 |
--------------------------------------------------------------------------------
/RabbitSpider/utils/subscriber.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from collections import defaultdict
3 | from typing import Dict, Set, Callable
4 |
5 |
6 | class Subscriber(object):
7 | def __init__(self):
8 | self._subscriber: Dict[str, Set[Callable]] = defaultdict(set)
9 |
10 | def subscribe(self, receiver: Callable, event: str):
11 | self._subscriber[event].add(receiver)
12 |
13 | def unsubscribe(self, receiver: Callable, event: str):
14 | self._subscriber[event].discard(receiver)
15 |
16 | async def notify(self, event: str, *args, **kwargs):
17 | await asyncio.gather(*[receiver(*args, **kwargs) for receiver in self._subscriber[event]])
18 |
--------------------------------------------------------------------------------
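`Subscriber` is a small async pub/sub hub: the engine calls `notify` with one of the event names from `RabbitSpider.utils.event`, and every coroutine subscribed to that event runs concurrently via `asyncio.gather`. A standalone sketch:

import asyncio
from RabbitSpider.utils import event
from RabbitSpider.utils.subscriber import Subscriber

async def on_item(item):
    print('scraped:', item)

sub = Subscriber()
sub.subscribe(on_item, event.item_scraped)
asyncio.run(sub.notify(event.item_scraped, {'title': 'demo'}))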
/RabbitSpider/utils/template.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | from string import Template
4 | from RabbitSpider.utils.control import SettingManager
5 |
6 | settings = SettingManager()
7 |
8 |
9 | def tmpl_file_path(_path):
10 | for i in os.listdir(_path):
11 | if os.path.isfile(os.path.join(_path, i)):
12 | if i.endswith('tmpl'):
13 | yield os.path.join(_path, i)
14 | if os.path.isdir(os.path.join(_path, i)):
15 | for i in tmpl_file_path(os.path.join(_path, i)):
16 | yield i
17 |
18 |
19 | def template_to_file(project, directory, filename):
20 | if project.lower() == 'test':
21 |         print(f'the project name cannot be {project}')
22 | return
23 | try:
24 | shutil.copytree(settings.get('TEMPLATE_DIR'), project)
25 | except FileExistsError:
26 | if not os.path.exists(os.path.join(project, 'spiders', directory)):
27 | os.mkdir(os.path.abspath(os.path.join(project, 'spiders', directory)))
28 |
29 | if os.path.exists(os.path.join(project, 'spiders', directory, f'{filename}.py')):
30 |         print(f'{project}/spiders/{filename} already exists')
31 | return
32 | shutil.copy(os.path.abspath(os.path.join(settings.get('TEMPLATE_DIR'), 'spiders/src/basic.tmpl')),
33 | os.path.join(project, 'spiders', directory))
34 | for file in tmpl_file_path(project):
35 | with open(file, 'r', encoding='utf-8') as f:
36 | text = Template(f.read()).substitute(project=project, dir=directory, spider=filename,
37 | classname='TemplateSpider')
38 |         with open(file.replace('.tmpl', '.py'), 'w', encoding='utf-8') as f:
39 | f.write(text)
40 | os.remove(file)
41 | if not os.path.exists(os.path.join(project, 'spiders', directory)):
42 | os.rename(os.path.abspath(os.path.join(project, 'spiders', 'src')),
43 | os.path.abspath(os.path.join(project, 'spiders', directory)))
44 | os.rename(os.path.join(project, 'spiders', directory, 'basic.py'),
45 | os.path.join(project, 'spiders', directory, f'{filename}.py'))
46 |     print(f'{project}/{directory}/{filename} created')
47 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | setuptools.setup(
4 | name='RabbitSpider',
5 | version='2.7.7',
6 | author='一纸',
7 | author_email='2395396520@qq.com',
8 | url='https://github.com/YunTom/RabbitSpider/tree/master',
9 | packages=['RabbitSpider', 'RabbitSpider.core', 'RabbitSpider.dupefilters', 'RabbitSpider.http',
10 | 'RabbitSpider.spider', 'RabbitSpider.items', 'RabbitSpider.middlewares', 'RabbitSpider.pipelines',
11 | 'RabbitSpider.utils'],
12 | include_package_data=True,
13 | entry_points={
14 | 'console_scripts': [
15 | 'rabbit = RabbitSpider.utils.cmdline:execute',
16 | ],
17 | },
18 | python_requires='>=3.10',
19 | install_requires=[
20 | 'aio-pika>=9.4.1',
21 | 'curl_cffi>=0.6.2',
22 | 'loguru>=0.7.2',
23 | 'parsel>=1.9.1',
24 | 'w3lib>=2.1.2',
25 | 'chardet>=5.2.0',
26 | 'croniter>=2.0.5'
27 | ],
28 | )
29 |
--------------------------------------------------------------------------------