├── .gitignore
├── README.md
├── magical
│   ├── __init__.py
│   ├── cmdline.py
│   ├── sync_spider
│   │   ├── __init__.py
│   │   ├── common
│   │   │   ├── __init__.py
│   │   │   ├── base_spider.py
│   │   │   ├── email_handler.py
│   │   │   ├── log_setting.py
│   │   │   ├── proxy_handler.py
│   │   │   ├── redis_lock.py
│   │   │   ├── spider_util.py
│   │   │   ├── user_agent.json
│   │   │   └── utils.py
│   │   ├── config
│   │   │   ├── __init__.py
│   │   │   ├── default_settings.py
│   │   │   └── settings.py
│   │   ├── core
│   │   │   ├── __init__.py
│   │   │   ├── spider.py
│   │   │   └── start_spider.py
│   │   ├── databases
│   │   │   ├── __init__.py
│   │   │   ├── init_db.py
│   │   │   ├── mysql_pool.py
│   │   │   ├── post_gre_sql_pool.py
│   │   │   └── red_pool.py
│   │   ├── extends_module
│   │   │   ├── __init__.py
│   │   │   ├── base_module
│   │   │   │   ├── __init__.py
│   │   │   │   ├── downloader.py
│   │   │   │   └── pipeline.py
│   │   │   ├── download
│   │   │   │   ├── __init__.py
│   │   │   │   └── retry.py
│   │   │   └── mqs
│   │   │       ├── __init__.py
│   │   │       └── rabbit_mq
│   │   │           ├── __init__.py
│   │   │           └── handler.py
│   │   ├── http
│   │   │   ├── __init__.py
│   │   │   ├── request.py
│   │   │   └── response.py
│   │   └── middleware
│   │       ├── __init__.py
│   │       ├── download
│   │       │   ├── __init__.py
│   │       │   ├── downloader.py
│   │       │   ├── handler.py
│   │       │   └── manager.py
│   │       ├── duplicate
│   │       │   ├── __init__.py
│   │       │   ├── bit_array.py
│   │       │   ├── bloom_filter.py
│   │       │   ├── expire_filter.py
│   │       │   └── handler.py
│   │       └── pipeline
│   │           ├── __init__.py
│   │           ├── handler.py
│   │           └── manager.py
│   ├── template.py
│   ├── templates
│   │   ├── __init__.py
│   │   └── sync_spider
│   │       ├── __init__.py
│   │       ├── base_spider.py.tmpl
│   │       ├── middleware.py.tmpl
│   │       ├── settings.py.tmpl
│   │       ├── spider.py.tmpl
│   │       └── spiders
│   │           ├── __init__.py
│   │           └── __init__.py.tmpl
│   └── utils.py
├── requirements.txt
├── setup.py
└── spiders
    ├── __init__.py
    ├── common
    │   ├── __init__.py
    │   ├── excel.py
    │   ├── proxy.py
    │   ├── settings.py
    │   └── spider_init.py
    ├── test_douban
    │   ├── __init__.py
    │   ├── base_spider.py
    │   ├── middleware.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── douban_spider.py
    └── test_spider
        ├── __init__.py
        ├── base_spider.py
        ├── middleware.py
        ├── settings.py
        ├── spiders
        │   ├── __init__.py
        │   ├── test_common.py
        │   ├── test_excel.py
        │   └── test_proxy.py
        └── static
            ├── __init__.py
            └── test.xls
/.gitignore:
--------------------------------------------------------------------------------
1 | test
2 | __pycache__
3 | .idea
4 | .DS_Store
5 | logs
6 | captcha
7 | file
8 | *.egg-info
9 | build
10 | dist
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Update 2021-07-01
2 | ### Added custom shared components; see the spiders examples for details
3 |
4 | ## Update 2021-07-31
5 | ### Added unified initialization, spider utility classes, and modules
6 | 1. Configuration in `spiders.common.settings.py`:
7 | ```python
8 | SPIDER_INIT_HANDLER = 'spiders.common.spider_init.SpiderInit'
9 | EXCEL = 'spiders.common.excel'
10 | ```
11 | 2. Implementation in `spiders.common.spider_init.py`:
12 | ```python
13 | from magical.sync_spider import load_files
14 |
15 |
16 | class SpiderInit(object):
17 |     def __init__(self, spider):
18 |         self.settings = spider.settings
19 |
20 |         spider.excel = load_files(self.settings['EXCEL'])
21 | ```
22 | 3. Example: `spiders.test_spider.spiders.test_common.py`
23 | ```python
24 | import os
25 | import sys
26 |
27 | file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
28 | sys.path.append(file_path)
29 |
30 | from magical.sync_spider import SyncSpider, run_spider
31 |
32 |
33 | class TestCommonSpider(SyncSpider):
34 |     name = 'test_common'
35 |     settings_path = 'spiders.test_spider.settings'
36 |
37 |     default_custom_setting = {}
38 |
39 |     def __init__(self, *args, **kwargs):
40 |         custom_setting = {}
41 |         kwargs.update(dict(custom_setting=custom_setting))
42 |         super().__init__(*args, **kwargs)
43 |
44 |     def start_spider(self):
45 |         print(self.excel)
46 |
47 |
48 | if __name__ == '__main__':
49 |     run_spider(TestCommonSpider)
50 | ```
51 |
52 | ------------------------------
53 |
54 | ## Introduction
55 |
56 | **magical** is a lightweight crawler framework modeled on scrapy, but without scrapy's complexity: it drops yield and callback functions, keeps the workflow simple, and leaves everything customizable; the framework only wraps a handful of commonly used functions
57 |
58 | ### Project files:
59 | - `spiders` spider project folder
60 | - `settings` spider settings file
61 | - `middleware` middleware file
62 | - `pipeline` pipeline file
63 | - `base_spider` spider base-class file
64 |
65 | ### spider provides four spider classes:
66 | - `SyncSpider` single-threaded spider
67 | - `RedisMessageMQSpider` redis publisher/subscriber spider
68 | - `RabbitMessageMQSpider` rabbitMQ producer/consumer spider
69 | - `ThreadSyncSpider` multi-threaded spider: starts several threads, each instantiating one of the three classes above (see the sketch below)
70 |
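A minimal sketch of the threaded variant, assuming `ThreadSyncSpider` is subclassed the same way as `SyncSpider` and that `run_thread_spider` (exported alongside `run_spider` from `magical.sync_spider`) takes the spider class the same way; the thread configuration is an assumption:

```python
from magical.sync_spider import run_thread_spider, ThreadSyncSpider


class TestThreadSpider(ThreadSyncSpider):
    name = 'test_thread_spider'
    settings_path = 'spiders.test_spider.settings'

    def start_spider(self):
        self.logger.info(f'Hello {self.name}')


if __name__ == '__main__':
    # run_thread_spider mirrors run_spider; the number of worker threads is
    # assumed to be configured in the settings module, not passed here.
    run_thread_spider(TestThreadSpider)
```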
71 | **sync_spider** synchronous version built on `requests`
72 | **async_spider** asynchronous version built on `aiohttp` (too many problems; development has been abandoned)
73 |
74 | ## Creating a project (create the spiders folder first; running the code below generates the code files automatically)
75 | ```python
76 | import os
77 | from magical.cmdline import generate_spider_project, generate_spider_file
78 |
79 |
80 | def main():
81 |     project_path = os.path.dirname(os.path.abspath(__file__))
82 |     spider_name = 'test_spider_pipelines'
83 |
84 |     # Generate a single spider file
85 |     generate_spider_file('sync_spider', project_path, spider_name)
86 |
87 |     # Generate a full spider project
88 |     # generate_spider_project('sync_spider', project_path, spider_name)
89 |
90 |
91 | if __name__ == '__main__':
92 |     main()
93 | ```
94 |
95 | ## Spider
96 | ```python
97 | from magical.sync_spider import run_spider, SyncSpider, Request
98 |
99 |
100 | class TestSpider(SyncSpider):
101 |     name = 'test_spider'
102 |     settings_path = 'spiders.test.settings'
103 |
104 |     default_custom_setting = {}
105 |
106 |     def __init__(self, *args, **kwargs):
107 |         custom_setting = {}
108 |         kwargs.update(dict(custom_setting=custom_setting))
109 |         super().__init__(*args, **kwargs)
110 |
111 |     def start_spider(self):
112 |         self.logger.info(f'Hello {self.name}')
113 |
114 |         # Issue a request
115 |         request = Request(url='http://www.baidu.com/')
116 |         response = self.download(request)
117 |
118 |         title = response.re.findall('<title>(.*?)</title>')
119 |         self.logger.info(f'title: {title}')
120 |
121 |         data = {'title': title[0]}
122 |
123 |         # Hand the data to the pipeline; returns True or False
124 |         pip_res = self.pipeline(data)
125 |         print('pip_res: ', pip_res)
126 |
127 |         # Use redis
128 |         self.red.get('key1')
129 |
130 |         # Use mysql
131 |         self.mysql.select('select * from test;')
132 |
133 |         # Use postgresql
134 |         self.post_gre.select('select * from test;')
135 |
136 |
137 | if __name__ == '__main__':
138 |     run_spider(TestSpider)
139 | ```
140 |
141 | ## Database
142 | Database configuration, using redis as an example
143 | - Single database
144 | ```python
145 | REDIS_CONFIG = {
146 |     'host': '',
147 |     'port': '',
148 |     'db': '',
149 |     'user': '',
150 |     'password': '',
151 |     'decode_responses': True
152 | }
153 |
154 | """red is the default attribute name
155 | Usage:
156 |     self.red.get('key1')
157 |     spider.red.get('key1')
158 | """
159 | ```
160 | - Multiple databases
161 | ```python
162 | REDIS_CONFIG = [
163 |     {
164 |         'name': 'name1',
165 |         'host': '',
166 |         'port': '',
167 |         'db': '',
168 |         'user': '',
169 |         'password': '',
170 |         'decode_responses': True
171 |     },
172 |     {
173 |         'name': 'name2',
174 |         'host': '',
175 |         'port': '',
176 |         'db': '',
177 |         'user': '',
178 |         'password': '',
179 |         'decode_responses': True
180 |     }
181 | ]
182 | """
183 | Usage:
184 |     self.name1.get('key1')
185 |     spider.name1.get('key1')
186 |
187 |     self.name2.get('key1')
188 |     spider.name2.get('key1')
189 | """
190 | ```
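The other database pools are configured the same way. A sketch for mysql, where the `MYSQL_CONFIG` key name is an assumption by analogy with `REDIS_CONFIG` above:

```python
# Assumed settings key, mirroring REDIS_CONFIG; the fields follow the usual mysql client arguments
MYSQL_CONFIG = {
    'host': '',
    'port': '',
    'db': '',
    'user': '',
    'password': ''
}
```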
191 | - Using RedisPool (the default attribute name is red; with multiple connections, access each one via its name field)
192 | ```python
193 |
194 | self.red.get('key1')
195 | self.red.set('key1', 'value1')
196 | ```
197 |
198 | - Using MysqlPool (the default attribute name is mysql; with multiple connections, access each one via its name field)
199 | ```python
200 |
201 | # Execute sql
202 | self.mysql.execute('select * from test;')
203 |
204 | # Query sql
205 | self.mysql.select('select * from test;')
206 |
207 | # Insert a single row
208 | data = {
209 |     'field1': 'data1',
210 |     'field2': 'data2'
211 | }
212 | self.mysql.insert_dict(table_name='table1', info_dict=data, ignore=False, replace=False)
213 |
214 | # Insert multiple rows
215 | data = [
216 |     {
217 |         'field1': 'data1',
218 |         'field2': 'data2'
219 |     },
220 |     {
221 |         'field1': 'data1',
222 |         'field2': 'data2'
223 |     }
224 | ]
225 | self.mysql.insert_list(table_name='table1', info_list=data, ignore=False, replace=False)
226 | ```
227 |
228 | - Using PostGreSqlPool (the default attribute name is post_gre; with multiple connections, access each one via its name field)
229 | ```python
230 |
231 | # Execute sql
232 | self.post_gre.execute('select * from test;')
233 |
234 | # Query sql
235 | self.post_gre.select('select * from test;')
236 |
237 | # Insert a single row (indexes = the table's unique index, used to filter out rows that already exist)
238 | data = {
239 |     'field1': 'data1',
240 |     'field2': 'data2'
241 | }
242 | self.post_gre.insert_conflict_dict(table_name='table1', info_dict=data, indexes=False)
243 |
244 | # Insert multiple rows (indexes = the table's unique index, used to filter out rows that already exist)
245 | data = [
246 |     {
247 |         'field1': 'data1',
248 |         'field2': 'data2'
249 |     },
250 |     {
251 |         'field1': 'data1',
252 |         'field2': 'data2'
253 |     }
254 | ]
255 | self.post_gre.insert_conflict_list(table_name='table1', info_list=data, indexes=False)
256 | ```
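For illustration, a hypothetical call with a real unique index: if `table1` had a unique index on `field1`, passing that index via `indexes` is what lets the pool skip rows that already exist (the exact value format `indexes` expects is an assumption):

```python
# 'field1' is a hypothetical unique-index column, used only for illustration
self.post_gre.insert_conflict_dict(table_name='table1', info_dict=data, indexes='field1')
```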
257 |
258 | ## Download Middleware
259 | ```python
260 | import time
261 | import random
262 | import requests
263 | from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware
264 |
265 |
266 | # Works poorly in practice; not recommended
267 | class DuplicateMiddleware(DownloaderMiddleware):
268 |     """Deduplication middleware"""
269 |
270 |     def __init__(self, spider):
271 |         super().__init__(spider)
272 |
273 |     def process_request(self, request):
274 |
275 |         if request.meta.get('is_filter'):
276 |             # 0 == not present, 1 == present
277 |             if self.duplicate.get(**request.meta['filter_info']) != 0:
278 |                 return None
279 |
280 |         return request
281 |
282 |     def process_response(self, request, response):
283 |
284 |         if response and request.meta.get('is_filter'):
285 |             # A successful request is added to the dedup seed list. 0 == already present, 1 == not present, added successfully
286 |             if self.duplicate.add(**request.meta['filter_info']) == 1:
287 |                 pass
288 |
289 |         return response
290 |
291 |
292 | class HeadersMiddleware(DownloaderMiddleware):
293 |     """Request-headers middleware: rotates the User-Agent at random"""
294 |
295 |     def __init__(self, spider):
296 |         super().__init__(spider)
297 |
298 |     def process_request(self, request):
299 |         request.headers.update({
300 |             'Connection': 'close',
301 |             'user-agent': self.spider.spider_util.random_ua()
302 |         })
303 |         return request
304 |
305 |
306 | class ProxyMiddleware(DownloaderMiddleware):
307 |     """Proxy IP middleware"""
308 |
309 |     def __init__(self, spider):
310 |         super().__init__(spider)
311 |
312 |         # Initialize proxy IPs; num = how many to fetch up front
313 |         # self.proxy_handler(num=1)
314 |
315 |     def process_request(self, request):
316 |         # Fetch one proxy IP
317 |         # request.meta['proxy'] = self.proxy.get_proxy()
318 |         return request
319 |
320 |     def process_response(self, request, response):
321 |         return response
322 |
323 |     def process_exception(self, request, exception):
324 |         self.logger.error(f'ProxyMiddleware.process_exception: {exception}, request: {request}', exc_info=True)
325 |
326 |         if isinstance(
327 |             exception,
328 |             (
329 |                 requests.exceptions.ConnectionError,
330 |                 requests.exceptions.ConnectTimeout,
331 |                 requests.exceptions.ReadTimeout,
332 |                 requests.exceptions.Timeout,
333 |             )
334 |         ):
335 |             self.logger.error(f'ProxyMiddleware - retrying request after exception - request: {request}')
336 |             time.sleep(random.randint(3, 5))
337 |             self.proxy.proxy_handler(request, num=1)
338 |             return self._retry(request)
339 |
340 |         elif isinstance(exception, requests.exceptions.HTTPError):
341 |             self.logger.error(f'ProxyMiddleware - requests.exceptions.HTTPError - request: {request}')
342 |             return None
343 |
344 |         elif isinstance(exception, requests.exceptions.ChunkedEncodingError):
345 |             self.logger.error(f'ProxyMiddleware - requests.exceptions.ChunkedEncodingError - request: {request}')
346 |             return None
347 |
348 |         elif isinstance(exception, requests.exceptions.SSLError):
349 |             self.logger.error(f'ProxyMiddleware - requests.exceptions.SSLError - request: {request}')
350 |             return None
351 |
352 |         return exception
353 |
354 |
355 | class TestSpiderMiddleware(DownloaderMiddleware):
356 |     """Spider-specific middleware"""
357 |
358 |     def __init__(self, spider):
359 |         super().__init__(spider)
360 |
361 |     def process_request(self, request):
362 |         return request
363 |
364 |     def process_response(self, request, response):
365 |         if not request.use_middleware:
366 |             return response
367 |
368 |         return response
369 |
370 |     def process_exception(self, request, exception):
371 |         self.logger.exception(f'TestSpiderMiddleware.process_exception: {exception}, request: {request}')
372 |         return exception
373 | ```
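To turn these middlewares on, register them in the spider's settings. `default_settings.py` ships `DOWNLOAD_MIDDLEWARE_PATH = {}` as a dict; the scrapy-style "dotted path to priority" mapping below is an assumption, so check your settings module for the exact shape:

```python
# In the spider's settings module; the priority numbers are assumptions
DOWNLOAD_MIDDLEWARE_PATH = {
    'spiders.test_spider.middleware.HeadersMiddleware': 100,
    'spiders.test_spider.middleware.ProxyMiddleware': 200,
    'spiders.test_spider.middleware.TestSpiderMiddleware': 300,
}
```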
374 |
375 | ## Pipeline Middleware
376 | ```python
377 | # Assumed import path, mirroring DownloaderMiddleware in the section above
378 | from magical.sync_spider.extends_module.base_module.pipeline import PipelineMiddleware
379 |
380 |
381 | class TestSpiderPipeline(PipelineMiddleware):
382 |
383 |     def __init__(self, spider):
384 |         super().__init__(spider)
385 |
386 |     def process_item(self, item, **kwargs):
387 |         """Process the data
388 |
389 |         Args:
390 |             item : the data to process
391 |             kwargs:
392 |                 table_name: table name
393 |                 replace : True or False (used by the mysql database)
394 |                 ignore : True or False (used by the mysql database)
395 |                 indexes : the table's unique index field (used by the PostGreSql database)
396 |
397 |         Return:
398 |             If the returned type differs from type(item), the process_item functions of the remaining pipelines are not called
399 |         """
400 |         return item
401 |
402 |     def process_exception(self, item, exception, **kwargs):
403 |         if isinstance(exception, Exception):
404 |             self.logger.error(f'TestSpiderPipeline - exception: {exception}')
405 |             return None
406 |
407 |         return exception
404 | ```
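The kwargs documented in `process_item` are supplied at the call site. A sketch from inside a spider, assuming `self.pipeline(...)` forwards its keyword arguments straight through to each pipeline's `process_item`; pipelines themselves are registered via `PIPELINE_MIDDLEWARE_PATH`, analogous to `DOWNLOAD_MIDDLEWARE_PATH` above:

```python
# Inside start_spider(); the kwargs-forwarding behaviour is an assumption
pip_res = self.pipeline(data, table_name='table1', ignore=False, replace=False)
```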
405 |
406 | # Continuously updated...
407 |
--------------------------------------------------------------------------------
/magical/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/5/24 9:18 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/5/24 9:18 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/cmdline.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: cmdline.py
6 | Time: 2021/4/14 3:09 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/4/14 3:09 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import string
14 | import sys
15 | import datetime
16 | from shutil import copy2, copystat
17 | from os.path import join, exists, abspath, dirname
18 |
19 | from magical.template import render_template_file, string_camelcase
20 |
21 | TEMPLATES_TO_RENDER = [
22 | ('${spider_name}', 'spiders', '__init__.py.tmpl'),
23 | ('${spider_name}', 'base_spider.py.tmpl'),
24 | ('${spider_name}', 'middleware.py.tmpl'),
25 | ('${spider_name}', 'settings.py.tmpl')
26 | ]
27 |
28 |
29 | def _copytree(src, dst):
30 | """复制文件
31 |
32 | Args:
33 | src: 模版文件路径(str)
34 | dst: 项目路径(str)
35 | Returns:
36 | """
37 | if not exists(dst):
38 | os.makedirs(dst)
39 |
40 | names = os.listdir(src)
41 |
42 | for name in names:
43 | if name == 'spider.py.tmpl':
44 | continue
45 |
46 | if name == '__init__.py':
47 | continue
48 |
49 | src_name = os.path.join(src, name)
50 | dst_name = os.path.join(dst, name)
51 |
52 | if os.path.isdir(src_name):
53 | _copytree(src_name, dst_name)
54 |
55 | else:
56 | copy2(src_name, dst_name)
57 |
58 | copystat(src, dst)
59 |
60 |
61 | def generate_spider_project(spider_type, project_path=None, spider_name=None):
62 | """生成项目爬虫文件
63 |
64 | Args:
65 | spider_type: 爬虫类型(sync_spider, async_spider)
66 | project_path: 项目路径
67 | spider_name: 爬虫名称
68 | """
69 |     if not spider_type:
70 |         sys.exit('spider_type must not be empty')
71 |
72 |     if not project_path:
73 |         sys.exit('project_path must not be empty')
74 |
75 |     if not spider_name:
76 |         sys.exit('spider_name must not be empty')
77 |
78 | templates_dir = abspath(join(dirname(__file__), f'templates/{spider_type}'))
79 | _copytree(templates_dir, join(abspath(project_path)))
80 | copy2(join(templates_dir, 'spider.py.tmpl'), join(abspath(project_path), 'spiders', f'{spider_name}.py.tmpl'))
81 |
82 | s_path = abspath(project_path).split('/')
83 | spider_path = '.'.join(s_path[s_path.index('spiders'):])
84 | settings_path = spider_path + '.settings'
85 | project_name = s_path[s_path.index('spiders') + 1]
86 |
87 | TEMPLATES_TO_RENDER.append(('${spider_name}', 'spiders', f'{spider_name}.py.tmpl'))
88 |
89 | for paths in TEMPLATES_TO_RENDER:
90 | path = join(*paths)
91 |
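        # Fill the ${spider_name} placeholder with the project path, turning the
        # template-relative path into the real path of the copied file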
92 | tpl_file = string.Template(path).substitute(spider_name=project_path)
93 |
94 | render_template_file(
95 | tpl_file,
96 | project_name=project_name,
97 | settings_path=settings_path,
98 | spider_path=spider_path,
99 | spider_name=spider_name,
100 |             create_time=datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'),
101 | SpiderName=string_camelcase(spider_name),
102 | )
103 |
104 |
105 | def generate_spider_file(spider_type, project_path=None, spider_name=None):
106 | """生成爬虫文件
107 |
108 | Args:
109 | spider_type: 爬虫类型(sync_spider, async_spider)
110 | project_path: 项目路径
111 | spider_name: 爬虫名称
112 | """
113 |     if not spider_type:
114 |         sys.exit('spider_type must not be empty')
115 |
116 |     if not project_path:
117 |         sys.exit('project_path must not be empty')
118 |
119 |     if not spider_name:
120 |         sys.exit('spider_name must not be empty')
121 |
122 | templates_dir = abspath(join(dirname(__file__), f'templates/{spider_type}'))
123 | copy2(join(templates_dir, 'spider.py.tmpl'), join(abspath(project_path), 'spiders', f'{spider_name}.py.tmpl'))
124 |
125 | s_path = abspath(project_path).split('/')
126 | spider_path = '.'.join(s_path[s_path.index('spiders'):])
127 | settings_path = spider_path + '.settings'
128 |
129 | path = join(*('${spider_name}', 'spiders', f'{spider_name}.py.tmpl'))
130 |
131 | tpl_file = string.Template(path).substitute(spider_name=project_path)
132 |
133 | render_template_file(
134 | tpl_file,
135 | settings_path=settings_path,
136 | spider_path=spider_path,
137 | spider_name=spider_name,
138 |         create_time=datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'),
139 | SpiderName=string_camelcase(spider_name),
140 | )
141 |
142 |
143 | if __name__ == '__main__':
144 | generate_spider_project('async_spider', '/Users/qinjiahu/Desktop/project/gn/spider_project/test/test1', 'test1')
145 |
--------------------------------------------------------------------------------
/magical/sync_spider/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/4/10 4:48 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 4:48 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 | from magical.utils import load_objects, load_files
14 | from magical.sync_spider.core.start_spider import run_spider, run_thread_spider
15 | from magical.sync_spider.http.request import Request
16 | from magical.sync_spider.core.spider import SyncSpider, ThreadSyncSpider, RedisMessageMQSpider, RabbitMessageMQSpider
17 | from magical.sync_spider.common.log_setting import get_logger
18 |
19 |
20 | def get_settings(settings_path=None):
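    """Build a Settings object, overlaying values from the module at settings_path when one is given."""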
21 | import importlib
22 | from magical.sync_spider.config.settings import Settings
23 |
24 | settings = Settings()
25 |
26 | if settings_path:
27 | custom_settings = importlib.import_module(settings_path)
28 | settings.load_config(custom_settings)
29 |
30 | return settings
31 |
32 |
33 | class TestSyncSpider(object):
34 | name = 'test_sync_spider'
35 |
36 | def __init__(self, settings_path=None):
37 | self.settings = get_settings(settings_path)
38 | self.loop = None
39 | self.logger = get_logger(self)
40 |
--------------------------------------------------------------------------------
/magical/sync_spider/common/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/4/10 4:48 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 4:48 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/common/base_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: base_spider.py
6 | Time: 2021/4/11 9:06 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/4/11 9:06 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | class BaseSpider(object):
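    """Common base for helper classes: copies the spider's shared handles (redis, logger, postgres, downloader, settings, utils, data) onto the instance."""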
15 |
16 | def __init__(self, spider):
17 | self.red = spider.red
18 | self.logger = spider.logger
19 | self.post_gre = spider.post_gre
20 | self.download = spider.download
21 | self.settings = spider.settings
22 | self.spider_util = spider.spider_util
23 | self.spider_data = spider.spider_data
24 |
--------------------------------------------------------------------------------
/magical/sync_spider/common/email_handler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: email_handler.py
6 | Time: 2021/4/10 4:49 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 4:49 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import time
13 | import smtplib
14 | from email.mime.text import MIMEText
15 | from email.mime.image import MIMEImage
16 | from email.mime.multipart import MIMEMultipart
17 |
18 |
19 | class EmailHandler(object):
20 | mail_host = 'smtp.qq.com'
21 | mail_user = '2027762055@qq.com'
22 | mail_pass = 'lcpzxzargptleibi'
23 | sender = '2027762055@qq.com'
24 | receivers = ['qinjiahu@gnlab.com']
25 |
26 | @staticmethod
27 | def __send_email_image(image_path, title):
28 |         send_str = f'''
29 |             <html>
30 |                 <body><img src="cid:image1"></body>
31 |             </html>
32 |         '''
33 |
34 |         # Build the message
35 |         msg = MIMEMultipart()
36 |
37 |         # Attach the mail body
38 |         content = MIMEText(send_str, _subtype='html', _charset='utf8')
39 |         msg.attach(content)
40 |
41 |         # Build and attach the image object
42 |         img1 = MIMEImage(open(image_path, 'rb').read(), _subtype='octet-stream')
43 |         img1.add_header('Content-ID', 'image1')
44 |         msg.attach(img1)
45 |
46 |         # Mail subject
47 |         msg['Subject'] = title
48 |
49 |         # Mail recipient and sender
50 |         msg['To'] = EmailHandler.receivers[0]
51 |         msg['From'] = EmailHandler.sender
52 |
53 |         try:
54 |             # Log in to the mail server
55 |             server = smtplib.SMTP_SSL("smtp.qq.com", port=465)
56 |             server.login(EmailHandler.sender, EmailHandler.mail_pass)
57 |             server.sendmail(EmailHandler.sender, EmailHandler.receivers, msg.as_string())
58 |             server.quit()
59 |         except smtplib.SMTPException as e:
60 |             print('send_email_image.error: ', e)  # print the error
61 |
62 | @staticmethod
63 |     def __send_email(content, title):
64 |         message = MIMEText(content, 'plain', 'utf-8')
65 |         # Mail subject
66 |         message['Subject'] = title
67 |         # Sender info
68 |         message['From'] = EmailHandler.sender
69 |         # Recipient info
70 |         message['To'] = EmailHandler.receivers[0]
71 |
72 |         # Log in and send the mail
73 |         try:
74 |             smtpObj = smtplib.SMTP()
75 |             # Connect to the server
76 |             smtpObj.connect(EmailHandler.mail_host, 25)
77 |             # Log in to the server
78 |             smtpObj.login(EmailHandler.mail_user, EmailHandler.mail_pass)
79 |             # Send
80 |             smtpObj.sendmail(EmailHandler.sender, EmailHandler.receivers, message.as_string())
81 |             # Quit
82 |             smtpObj.quit()
83 |         except smtplib.SMTPException as e:
84 |             print('__send_email.error: ', e)  # print the error
85 |
86 |     @staticmethod
87 |     def send_email(title, image_path=None):
88 |         content = f'cookie expired, please renew it in time {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))}'
89 |
90 |         if image_path:
91 |             EmailHandler.__send_email_image(image_path, title)
92 |
93 |         else:
94 |             EmailHandler.__send_email(content, title)
95 |
96 |
97 | if __name__ == "__main__":
98 | EmailHandler.send_email('JD 商家后台 Cookie')
99 |
--------------------------------------------------------------------------------
/magical/sync_spider/common/log_setting.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: log_setting.py
6 | Time: 2021/4/10 1:05 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 1:05 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import datetime
14 | import logging
15 | import logging.handlers
18 |
19 |
20 | class Logger(object):
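    """One Logger per spider name: __new__ caches instances per name, and init_flag stops __init__ from configuring the same logger twice."""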
21 | instance = {}
22 | init_flag = {}
23 |
24 |     def __new__(cls, *args, **kwargs):
25 |         spider = kwargs['spider']
26 |         name = spider.name
27 |
28 |         if not cls.instance.get(name):
29 |             cls.instance[name] = super().__new__(cls)
30 |
31 |         return cls.instance[name]
32 |
33 |     def __init__(self, spider):
34 |         name = spider.name
35 |         if Logger.init_flag.get(name):
36 |             return
37 |         Logger.init_flag[name] = True
38 |
39 |         self.logger = logging.getLogger(name)
40 |         if not self.logger.handlers:
41 |             self.logger.setLevel(logging.DEBUG)
42 |             day_date = datetime.datetime.now().strftime("%Y-%m-%d")
43 |             log_path = spider.settings['LOGGER_PATH']
44 |             self.log_path = os.path.join(log_path or 'logs/', f'{day_date}/')
45 |             if not os.path.exists(self.log_path):
46 |                 os.makedirs(self.log_path)
47 |
48 |             self.log_name = f'{self.log_path}{name}.log'
49 |             fh = logging.FileHandler(self.log_name, 'a', encoding='utf-8')
50 |             fh.setLevel(logging.INFO)
51 |             ch = logging.StreamHandler()
52 |             ch.setLevel(logging.INFO)
53 |             formatter = logging.Formatter(
54 |                 '[%(asctime)s] %(filename)s -> %(funcName)s line:%(lineno)d [%(levelname)s] %(message)s')
55 |             fh.setFormatter(formatter)
56 |             ch.setFormatter(formatter)
57 |             self.logger.addHandler(fh)
58 |             self.logger.addHandler(ch)
61 |
62 |
63 | def get_logger(spider):
64 |     return Logger(spider=spider).logger
65 |
--------------------------------------------------------------------------------
/magical/sync_spider/common/proxy_handler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: proxy_handler.py
6 | Time: 2021/5/5 2:48 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/5/5 2:48 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import random
13 |
14 |
15 | class ProxyHandler(object):
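    """Proxy-handler base; __new__ makes each subclass a singleton keyed by its class name."""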
16 | __instance = {}
17 |
18 | def __new__(cls, *args, **kwargs):
19 | if not cls.__instance.get(cls.__name__):
20 | cls.__instance[cls.__name__] = super().__new__(cls)
21 | return cls.__instance[cls.__name__]
22 |
23 | def __init__(self, spider):
24 | self.spider = spider
25 | self.logger = spider.logger
26 |
27 | self.proxy_list = []
28 |
29 | self.proxy_num = spider.settings.get('PROXY_NUM', 1)
30 |
31 |
32 | class GetRedisProxy(ProxyHandler):
33 | def __init__(self, spider):
34 | super().__init__(spider)
35 |
36 | def generate_proxy(self, num):
37 | red_proxy = self.spider.red_proxy
38 | proxy_keys = list(red_proxy.keys('ip_pool_win7*'))
39 |
40 | for i in range(num):
41 | proxy = (red_proxy.get(random.choice(proxy_keys))).split("_")[0]
42 |
43 | new_proxy = {
44 | 'https': f'socks5://{proxy}/',
45 | 'http': f'socks5://{proxy}/'
46 | }
47 |
48 | if self.spider.test_ip(new_proxy):
49 | self.proxy_list.append(new_proxy)
50 |
51 | def proxy_handler(self, request=None, num=None):
52 | if not request:
53 | self.generate_proxy(num or self.proxy_num)
54 |
55 | else:
56 | if request.meta.get('proxy') in self.proxy_list:
57 | self.proxy_list.remove(request.meta.get('proxy'))
58 | self.generate_proxy(num or self.proxy_num)
59 |
60 | def get_proxy(self):
61 | return random.choice(self.proxy_list) if len(self.proxy_list) > 0 else None
62 |
--------------------------------------------------------------------------------
/magical/sync_spider/common/redis_lock.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: redis_lock.py
6 | Time: 2021/5/13 3:52 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/5/13 3:52 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import time
13 |
14 |
15 | class RedisLock(object):
16 | def __init__(self, key, timeout=300, wait_timeout=300, break_wait=None, redis_cli=None):
17 | """
18 | redis超时锁
19 | :param key: 关键字 不同项目区分
20 | :param timeout: 锁超时时间
21 | :param wait_timeout: 等待加锁超时时间 防止多线程竞争时可能出现的 某个线程无限等待
22 | <=0 则不等待 直接加锁失败
23 | :param break_wait: 可自定义函数 灵活控制 wait_timeout 时间 当此函数返回True时 不再wait
24 | :param redis_cli: redis客户端
25 |
26 | 用法示例:
27 | with RedisLock(key="test", timeout=10, wait_timeout=100, redis_uri="") as _lock:
28 | if _lock.locked:
29 | # 用来判断是否加上了锁
30 | # do somethings
31 | """
32 | self.redis_index = -1
33 | if not key:
34 | raise Exception("lock key is empty")
35 | if not redis_cli:
36 | raise Exception("redis_cli is empty")
37 |
38 |         self.redis_conn = redis_cli
39 |         self.lock_key = "redis_lock:{}".format(key)
40 |         # lock expiry time
41 |         self.timeout = timeout
42 |         # how long to wait when acquiring
43 |         self.wait_timeout = wait_timeout
44 |         # wait-interrupt function
45 |         self.break_wait = break_wait
46 |         if self.break_wait is None:
47 |             self.break_wait = lambda: False
48 |         if not callable(self.break_wait):
49 |             raise TypeError(
50 |                 "break_wait must be function or None, but: {}".format(
51 |                     type(self.break_wait)
52 |                 )
53 |             )
54 |
55 |         self.locked = False
56 |
57 | def __enter__(self):
58 | if not self.locked:
59 | self.acquire()
60 | return self
61 |
62 | def __exit__(self, exc_type, exc_val, exc_tb):
63 | self.release()
64 |
65 |     def __repr__(self):
66 |         return "<RedisLock key: {} redis_index: {}>".format(self.lock_key, self.redis_index)
67 |
68 |     def acquire(self):
69 |         start = time.time()
70 |         while 1:
71 |             # try to acquire the lock
72 |             if self.redis_conn.setnx(self.lock_key, time.time()):
73 |                 self.redis_conn.expire(self.lock_key, self.timeout)
74 |                 self.locked = True
75 |                 break
76 |             else:
77 |                 # bug fix: if the locking process was killed before expire was set, the lock would live forever
78 |                 if self.redis_conn.ttl(self.lock_key) < 0:
79 |                     self.redis_conn.delete(self.lock_key)
80 |
81 |             if self.wait_timeout > 0:
82 |                 if time.time() - start > self.wait_timeout:
83 |                     # log.info("failed to acquire the lock")
84 |                     break
85 |             else:
86 |                 # do not wait
87 |                 break
88 |             if self.break_wait():
89 |                 # log.info("break_wait triggered; stop waiting for the lock")
90 |                 break
91 |             # log.debug("waiting for lock: {} wait:{}".format(self, time.time() - start))
92 |             if self.wait_timeout > 10:
93 |                 time.sleep(5)
94 |             else:
95 |                 time.sleep(1)
96 |         return
97 |
98 | def release(self):
99 | if self.locked:
100 | self.redis_conn.delete(self.lock_key)
101 | self.locked = False
102 | return
103 |
104 |     def prolong_life(self, life_time: int) -> int:
105 |         """
106 |         Extend the lock's expiry time
107 |         :param life_time: time to add
108 |         :return:
109 |         """
110 | expire = self.redis_conn.ttl(self.lock_key)
111 | if expire < 0:
112 | return expire
113 | expire += life_time
114 | self.redis_conn.expire(self.lock_key, expire)
115 | return self.redis_conn.ttl(self.lock_key)
116 |
--------------------------------------------------------------------------------
/magical/sync_spider/common/spider_util.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: spider_util.py
6 | Time: 2021/4/11 9:35 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/4/11 9:35 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import json
14 | import random
15 | import time
16 | import datetime
17 | import hashlib
18 |
19 |
20 | class SpiderUtil(object):
21 |
22 | file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'user_agent.json')
23 | with open(file_path, 'r', encoding='utf-8') as f:
24 | usa = json.load(f)
25 |
26 | def __init__(self, spider):
27 | self.spider = spider
28 |
29 | @staticmethod
30 | def random_ua():
31 | return random.choice(SpiderUtil.usa)
32 |
33 | @staticmethod
34 | def microsecond_handler(time_str, symbol='-'):
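        # Shift a timestamp like '2021-01-01T00:00:00.123456+0800' by one millisecond
        # (symbol '-' subtracts, anything else adds) and return it in the same format.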
35 | new_time_str = time_str.replace('T', ' ')
36 |
37 | dt = datetime.datetime.strptime(new_time_str, "%Y-%m-%d %H:%M:%S.%f+0800")
38 | dt1 = time.mktime(dt.timetuple()) + (dt.microsecond / 1000000)
39 | dt1 = dt1 * 1000
40 | dt1 = dt1 - 1 if '-' == symbol else dt1 + 1
41 | dt2 = datetime.datetime.fromtimestamp((int(dt1)) / 1000)
42 |
43 | return (dt2.strftime("%Y-%m-%dT%H:%M:%S.%f+0800")).replace('000', '')
44 |
45 | @staticmethod
46 | def get_sha1_encrypt(string):
47 | return hashlib.sha1(string.encode()).hexdigest()
48 |
49 | @staticmethod
50 | def get_md5_encrypt(string):
51 | new_md5 = hashlib.md5()
52 | new_md5.update(string.encode(encoding='utf-8'))
53 | return new_md5.hexdigest()
54 |
55 |
56 | if __name__ == '__main__':
57 | print(SpiderUtil.random_ua())
58 |
--------------------------------------------------------------------------------
/magical/sync_spider/common/user_agent.json:
--------------------------------------------------------------------------------
1 | [
2 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
3 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
4 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
5 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
6 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
7 | "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
8 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
9 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
10 | "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
11 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
12 | "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
13 | "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
14 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
15 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
16 | "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
17 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
18 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
19 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
20 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
21 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
22 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
23 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
24 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
25 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
26 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
27 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
28 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
29 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
30 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
31 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
32 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
33 | "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
34 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
35 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
36 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
37 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
38 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
39 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
40 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
41 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
42 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
43 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
44 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
45 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
46 | "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
47 | "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
48 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
49 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
50 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
51 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
52 | "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
53 | "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
54 | "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
55 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
56 | "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
57 | "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
58 | "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
59 | "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
60 | "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
61 | "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
62 | "Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62",
63 | "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62",
64 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
65 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52",
66 | "Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51",
67 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51",
68 | "Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50",
69 | "Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50",
70 | "Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11",
71 | "Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11",
72 | "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11",
73 | "Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10",
74 | "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10",
75 | "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10",
76 | "Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1",
77 | "Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01",
78 | "Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01",
79 | "Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01",
80 | "Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
81 | "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01",
82 | "Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01",
83 | "Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01",
84 | "Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01",
85 | "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01",
86 | "Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01",
87 | "Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01",
88 | "Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01",
89 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
90 | "Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
91 | "Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
92 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01",
93 | "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00",
94 | "Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00",
95 | "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00",
96 | "Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00",
97 | "Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00",
98 | "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
99 | "Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00",
100 | "Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00",
101 | "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00",
102 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
103 | "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
104 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
105 | "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
106 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
107 | "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
108 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
109 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
110 | "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
111 | "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0",
112 | "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3",
113 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0",
114 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
115 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
116 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0",
117 | "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0",
118 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
119 | "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0",
120 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
121 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
122 | "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0",
123 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
124 | "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0",
125 | "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0",
126 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
127 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
128 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
129 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0",
130 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0",
131 | "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
132 | "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
133 | "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
134 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0",
135 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
136 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0",
137 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
138 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
139 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0",
140 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
141 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0",
142 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0",
143 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
144 | "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0",
145 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0",
146 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0",
147 | "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
148 | "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0",
149 | "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1",
150 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0",
151 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6",
152 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
153 | "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
154 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
155 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
156 | "Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko ) Version/5.1 Mobile/9B176 Safari/7534.48.3",
157 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
158 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
159 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
160 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
161 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
162 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
163 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
164 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
165 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
166 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
167 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
168 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
169 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
170 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; sv-se) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
171 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ko-kr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
172 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
173 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; it-it) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
174 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-fr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
175 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; es-es) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
176 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
177 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-gb) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
178 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; de-de) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
179 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; sv-SE) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
180 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
181 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
182 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; hu-HU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
183 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
184 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
185 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
186 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
187 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; it-IT) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
188 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
189 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-us) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
190 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-ch) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
191 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
192 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; ar) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
193 | "Mozilla/5.0 (Android 2.2; Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
194 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
195 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
196 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; tr-TR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
197 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
198 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
199 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
200 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
201 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; zh-cn) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5"
202 | ]
--------------------------------------------------------------------------------
/magical/sync_spider/common/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: utils.py
6 | Time: 2021/4/10 9:36 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 9:36 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import time
13 |
14 | from magical.utils import round_half_up, get_fmt_time
15 |
16 | start_time = time.time()
17 | success_rate = 0
18 | success_num = 0
19 | failure_num = 0
20 | end_time = None
21 | req_num = None
22 |
23 |
24 | def _gen_content(name):
25 | global req_num, success_rate
26 | req_num = success_num + failure_num
27 |
28 | success_rate = float(round_half_up(success_num / req_num, 4)) * 100 if req_num else 0
29 |
30 |     return [
31 |         f'Spider name: {name}',
32 |         f'Request success rate: {success_rate}%',
33 |         f'Successful requests: {success_num}',
34 |         f'Failed requests: {failure_num}',
35 |         f'Start time: {get_fmt_time(timestamp=start_time)}',
36 |         f'End time: {get_fmt_time(timestamp=end_time)}',
37 |     ]
38 |
39 |
40 | def call_func(request_func, exception_func, response_func, *args, **kwargs):
41 | global success_num, failure_num, end_time
42 |
43 | failure_num += 1
44 | try:
45 | result = request_func(*args, **kwargs)
46 |
47 | except Exception as exc:
48 | failure_num -= 1
49 | return exception_func(exc)
50 |
51 | else:
52 | failure_num -= 1
53 | success_num += 1
54 | return response_func(result)
55 |
56 | finally:
57 | end_time = time.time()
58 |
59 |
60 | def call_func_item(item_func, exception_func, *args, **kwargs):
61 | try:
62 | return item_func(*args, **kwargs)
63 |
64 | except Exception as exc:
65 | return exception_func(exc)
66 |
--------------------------------------------------------------------------------
/magical/sync_spider/config/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/4/10 2:32 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 2:32 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/config/default_settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: default_settings.py
6 | Time: 2021/4/10 5:19 PM
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 5:19 PM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 | # -------------------------------------------------------------------------------------------------------------------
14 |
15 | # # Deduplication middleware
16 | # FILTER_DUPLICATE_HANDLER = "magical.sync_spider.middleware.duplicate.handler.DuplicateHandler"
17 | #
18 | # # Deduplication filter
19 | # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.bloom_filter.ScalableBloomFilter"
20 | # # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.expire_filter.ExpireFilter"
21 | #
22 | # # Dedup storage backend: redis, or memory (in-process)
23 | # FILTER_QUEUE_TYPE = 'redis'
24 | #
25 | # # Whether to MD5-hash keys before deduplication
26 | # FILTER_USE_MD5 = False
27 | #
28 | # # Which redis instance to dedup against; set the connection name, default is red
29 | # FILTER_REDIS_NAME = 'red'
30 | #
31 | # # Initial dedup capacity
32 | # FILTER_INITIAL_CAPACITY = 100000000
33 | #
34 | # # Dedup (bloom filter) error rate
35 | # FILTER_ERROR_RATE = 0.00001
36 | 
37 | # -------------------------------------------------------------------------------------------------------------------
38 | 
39 | # # RabbitMQ config
40 | # MESSAGE_MQ_CONFIG = {
41 | #     'username': 'admin',
42 | #     'password': 'admin123',
43 | #     'host': '127.0.0.1',
44 | #     'port': 18097
45 | # }
46 | #
47 | # # RabbitMQ prefetch count: consume 10 messages per batch
48 | # MESSAGE_MQ_PREFETCH_COUNT = 10
49 | #
50 | # # RabbitMQ virtual host
51 | # MESSAGE_MQ_VIRTUAL_HOST = 'spider'
52 | #
53 | # # RabbitMQ handler class
54 | # MESSAGE_MQ_HANDLER = 'magical.sync_spider.extends_module.mqs.rabbit_mq.handler.RabbitMQHandler'
55 | 
56 | # -------------------------------------------------------------------------------------------------------------------
57 | 
58 | # Downloader middleware
59 | DOWNLOADER_PATH = "magical.sync_spider.middleware.download.downloader.Downloader"
60 | 
61 | # Download handler middleware
62 | DOWNLOAD_HANDLER_PATH = "magical.sync_spider.middleware.download.handler.DownloadHandler"
63 | 
64 | # Download middleware manager
65 | DOWNLOAD_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.download.manager.DownloadMiddlewareManager"
66 | 
67 | # Download middlewares; several can be configured, the retry middleware is the default
68 | DOWNLOAD_MIDDLEWARE_PATH = {}
69 | 
70 | # -------------------------------------------------------------------------------------------------------------------
71 | 
72 | # Pipeline handler middleware
73 | PIPELINE_HANDLER_PATH = "magical.sync_spider.middleware.pipeline.handler.PipelineHandler"
74 | 
75 | # Pipeline middleware manager
76 | PIPELINE_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.pipeline.manager.PipelineMiddlewareManager"
77 | 
78 | # Pipeline middlewares; several can be configured
79 | PIPELINE_MIDDLEWARE_PATH = {}
80 | 
81 | # -------------------------------------------------------------------------------------------------------------------
82 | 
83 | # Spider base class
84 | BASE_SPIDER_PATH = "magical.sync_spider.common.base_spider.BaseSpider"
85 | 
86 | # Spider utility class
87 | SPIDER_UTIL_PATH = "magical.sync_spider.common.spider_util.SpiderUtil"
88 | 
89 | # Email handler
90 | EMAIL_HANDLER = 'magical.sync_spider.common.email_handler.EmailHandler'
91 | 
92 | # PostgreSQL handler class
93 | POST_GRE_SQL_HANDLER = 'magical.sync_spider.databases.post_gre_sql_pool.PostGreHandle'
94 | 
95 | # MySQL handler class
96 | MYSQL_HANDLER = 'magical.sync_spider.databases.mysql_pool.MysqlHandler'
97 | 
98 | # Redis handler class
99 | REDIS_HANDLER = 'magical.sync_spider.databases.red_pool.RedisHandler'
100 | 
101 | # Proxy IP middleware
102 | # Fetch proxy IPs from redis
103 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetRedisProxy'
104 | # # ZhiMa proxy provider
105 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetZhiMaProxy'
106 | 
107 | # -------------------------------------------------------------------------------------------------------------------
108 | 
109 | # Number of proxy IPs to fetch at initialization
110 | PROXY_NUM = 5
111 | 
112 | # Retry count
113 | RETRY_COUNT = 3
114 | 
115 | # Retry when the response status code is one of the following
116 | RETRY_STATUS_CODES = [500, 502, 503, 504, 400, 403, 408]
117 | 
118 | # Skip SSL verification
119 | REQUEST_VERIFY = False
120 | 
121 | # Request timeout (seconds)
122 | REQUEST_TIMEOUT = 30
123 | 
124 | # Delay for the Cloudflare 5-second challenge (cfscrape)
125 | SCRAPER_DELAY = 30
126 | 
127 | # Number of consumer threads
128 | CONSUMER_THREAD_NUM = 10
129 | 
--------------------------------------------------------------------------------
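
Every upper-case name above can be overridden per project: `Settings.load_config` only picks up upper-case attributes, and `custom_setting` layers on top of that. A sketch of a project settings module; the module path and values are illustrative only:

```python
# spiders/demo/settings.py -- hypothetical project settings module
RETRY_COUNT = 5
REQUEST_TIMEOUT = 15

# opt in to redis-backed deduplication (paths mirror the commented defaults)
FILTER_DUPLICATE_HANDLER = "magical.sync_spider.middleware.duplicate.handler.DuplicateHandler"
FILTER_QUEUE_TYPE = 'redis'

# database connection consumed by InitDatabase (placeholder values)
REDIS_CONFIG = {'host': '127.0.0.1', 'port': 6379, 'db': 0}
```
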
/magical/sync_spider/config/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: settings.py
6 | Time: 2021/3/24 上午9:34
7 | -------------------------------------------------
8 | Change Activity: 2021/3/24 上午9:34
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import json
13 | from importlib import import_module
14 |
15 | from magical.sync_spider.config import default_settings
16 |
17 |
18 | class Attribute(object):
19 |
20 | def __init__(self, value):
21 | self.value = value
22 |
23 |     def __str__(self):
24 |         return "<Attribute %s>" % self.value
25 |
26 | __repr__ = __str__
27 |
28 |
29 | class Settings(object):
30 |
31 |     def __init__(self):
32 | self.attrs = {}
33 | self.load_config(default_settings)
34 |
35 | def __getitem__(self, key):
36 | return self.attrs[key].value if key in self.attrs else None
37 |
38 | def load_config(self, module):
39 | if isinstance(module, str):
40 | module = import_module(module)
41 |
42 | for key in dir(module):
43 | if key.isupper():
44 | self.set(key, getattr(module, key))
45 |
46 | def set(self, key: str, value):
47 | self.attrs[key] = Attribute(value)
48 |
49 | def set_dict(self, values):
50 | for key, value in values.items():
51 | self.set(key, value)
52 |
53 |     def get(self, key, default=None):
54 |         return self[key] if self[key] is not None else default
55 |
56 | def get_int(self, key, default=0):
57 | return int(self.get(key, default))
58 |
59 | def get_float(self, key, default=0.0):
60 | return float(self.get(key, default))
61 |
62 | def get_list(self, key, default=None):
63 | value = self.get(key, default or None)
64 | if isinstance(value, str):
65 | value = value.split(",")
66 | return value
67 |
68 | def get_dict(self, key, default=None):
69 | value = self.get(key, default or None)
70 | if isinstance(value, str):
71 | value = json.loads(value)
72 | return value
73 |
--------------------------------------------------------------------------------
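
`Settings` resolves missing keys to `None`, and the typed getters coerce comma-separated and JSON strings, so spiders can read configuration without guarding. A quick sketch (key names are hypothetical):

```python
from magical.sync_spider.config.settings import Settings

settings = Settings()                                  # default_settings loads first
settings.set_dict({'RETRY_COUNT': '5', 'CODES': '500,502', 'MQ': '{"host": "x"}'})

assert settings['MISSING'] is None                     # unknown keys resolve to None
assert settings.get_int('RETRY_COUNT') == 5            # int coercion
assert settings.get_list('CODES') == ['500', '502']    # comma-string split
assert settings.get_dict('MQ') == {'host': 'x'}        # JSON-string parse
```
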
/magical/sync_spider/core/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py.tmpl.py
6 | Time: 2021/4/10 下午4:49
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 下午4:49
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/core/spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: spider.py
6 | Time: 2021/4/10 下午4:55
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 下午4:55
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import copy
13 | import json
14 | import time
15 | import importlib
16 | from queue import Queue
17 |
18 | import threading
19 | import requests
20 | from sqlalchemy import create_engine
21 |
22 | from magical.utils import load_objects
23 | from magical.sync_spider.common.log_setting import get_logger
24 |
25 | from magical.sync_spider.databases.init_db import InitDatabase
26 |
27 | from magical.sync_spider.config.settings import Settings
28 | from magical.sync_spider.http.response import Response
29 | from magical.sync_spider.http.request import Request
30 |
31 |
32 | # Spider initialization
33 | class InitSpider(object):
34 | name = 'base_init_spider'
35 |
36 | spider_start_time = time.time()
37 |
38 | this = None
39 |
40 | def __init__(self, *args, **kwargs):
41 | self.name = kwargs.get('name', self.name)
42 | self.custom_setting = kwargs.get('custom_setting', {})
43 | self.settings_path = kwargs.get('settings_path')
44 | self.common_settings_path = kwargs.get('common_settings_path')
45 |
46 | self.__load_settings(self.custom_setting)
47 |
48 | self.logger = get_logger(self)
49 | self.__load_dbs()
50 |
51 | self.email_handler = load_objects(self.settings['EMAIL_HANDLER'])
52 | self.spider_util = load_objects(self.settings['SPIDER_UTIL_PATH'])(self)
53 |
54 | if self.settings['PROXY_HANDLER']:
55 | self.proxy = load_objects(self.settings['PROXY_HANDLER'])(self)
56 |
57 | if self.settings['FILTER_DUPLICATE_HANDLER']:
58 | self.duplicate = load_objects(self.settings['FILTER_DUPLICATE_HANDLER'])(self)
59 |
60 | InitSpider.this = self
61 |
62 |     def __load_settings(self, custom_setting=None):
63 |         self.settings = Settings()
64 |         self.settings.set_dict(custom_setting or {})
65 | if self.settings_path:
66 | try:
67 | self.settings.load_config(importlib.import_module(self.common_settings_path))
68 |             except Exception:
69 |                 pass  # the common settings module is optional
70 | self.settings.load_config(importlib.import_module(self.settings_path))
71 |
72 | def __load_dbs(self):
73 | self.dbs = InitDatabase(self).dbs
74 |
75 | for db in self.dbs:
76 | setattr(self, db['name'], db['instance'])
77 |
78 | def __close_dbs(self):
79 | for db in self.dbs:
80 | db['instance'] and db['instance'].close_pool()
81 |
82 | def test_ip(self, proxy):
83 | res = None
84 | try:
85 | res = requests.get('http://www.httpbin.org/ip', proxies=proxy)
86 | res_json = res.json()
87 |
88 |             if res_json.get('origin') in (proxy.get('http') or proxy.get('https') or ''):
89 |                 self.logger.info(f'Usable proxy: {proxy}')
90 |                 return True
91 | 
92 |             else:
93 |                 self.logger.error(f'Unusable proxy: {proxy}')
94 | 
95 |         except Exception as e:
96 |             self.logger.error(f'Proxy test failed: {proxy}, error: {e}, res: {res and res.text}', exc_info=True)
97 |
98 | def close_spider(self):
99 | self.__close_dbs()
100 | self.logger.info(f'Time usage: {time.time() - self.spider_start_time}')
101 | self.logger.info(f'Spider finished!')
102 | self.logger.info(f'Close Spider!')
103 |
104 | @staticmethod
105 | def this_close_spider():
106 | InitSpider.this.close_spider()
107 |
108 | @staticmethod
109 | def get_create_engine(db_type, name, settings_path):
110 | """获取数据库 create_engine 连接,用于 pandas
111 |
112 | Args:
113 | db_type: mysql or post_gre
114 | name: 数据库名称
115 | settings_path: 配置文件路径
116 | """
117 | custom_settings = importlib.import_module(settings_path)
118 |
119 | configs = getattr(custom_settings, f'{db_type.upper()}_CONFIG')
120 |
121 | if isinstance(configs, list):
122 | dbs = list(filter(lambda x: x['name'] == name, configs))
123 | if len(dbs) == 0:
124 |                 raise KeyError(f'{db_type} database {name} does not exist')
125 |
126 | else:
127 | config = dbs[0]
128 | else:
129 | config = configs
130 |
131 | db = config['db']
132 | user = config['user']
133 | host = config['host']
134 | port = config['port']
135 | password = config['password']
136 |
137 | if db_type == 'post_gre':
138 | db_engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}')
139 |
140 | else:
141 | db_engine = create_engine(f'mysql+pymysql://{user}:{password}@{host}:{port}/{db}?charset=utf8mb4')
142 |
143 | return db_engine
144 |
145 |
146 | # Spider base class
147 | class BaseSyncSpider(object):
148 | name = 'base_sync_spider'
149 | spider_data = {}
150 | default_custom_setting = {}
151 | settings_path = None
152 | base_spider = None
153 | common_settings_path = 'spiders.common.settings'
154 |
155 | def __init__(self, *args, **kwargs):
156 | self.custom_setting = kwargs.get('custom_setting', {})
157 | self.custom_setting.update(self.default_custom_setting)
158 |
159 | kwargs['custom_setting'] = self.custom_setting
160 | kwargs['name'] = self.name
161 | kwargs['settings_path'] = self.settings_path
162 | kwargs['common_settings_path'] = self.common_settings_path
163 |
164 | if not kwargs.get('init_spider'):
165 | self.init_spider = InitSpider(*args, **kwargs)
166 |
167 | else:
168 | self.init_spider = kwargs.get('init_spider')
169 |
170 | self.settings = Settings()
171 | self.settings.set_dict({k: v.value for k, v in self.init_spider.settings.attrs.items()})
172 | self.settings.set_dict(copy.deepcopy(self.custom_setting))
173 |
174 | self.download_cls = load_objects(self.settings['DOWNLOADER_PATH'])(self)
175 | self.pipeline_cls = load_objects(self.settings['PIPELINE_HANDLER_PATH'])(self)
176 | self.base_spider = load_objects(self.settings['BASE_SPIDER_PATH'])(self)
177 |
178 | self.__load_mq()
179 |
180 | if self.settings.get('SPIDER_INIT_HANDLER'):
181 | self.spider_init = load_objects(self.settings['SPIDER_INIT_HANDLER'])(self)
182 |
183 | def close_message_mq(self):
184 | message_mq = getattr(self, 'message_mq')
185 | if message_mq:
186 | message_mq.close_mq()
187 |
188 | def __load_mq(self):
189 | message_mq_handler = self.settings['MESSAGE_MQ_HANDLER']
190 | if message_mq_handler:
191 | setattr(self, 'message_mq', load_objects(message_mq_handler)(self))
192 |
193 | def __getattr__(self, item: str):
194 |
195 | if hasattr(self.init_spider, item):
196 | return getattr(self.init_spider, item)
197 |
198 | elif self.base_spider and hasattr(self.base_spider, item):
199 | return getattr(self.base_spider, item)
200 |
201 | else:
202 |             self.logger.error(f'attribute {item} not found on base_spider or init_spider')
203 | return None
204 |
205 | def __download(self, request: Request) -> Response:
206 | response = self.download_cls.fetch(request)
207 | return response
208 |
209 | def download(self, request: Request = None, **kwargs) -> Response:
210 | if not isinstance(request, Request):
211 | request = Request(**kwargs)
212 | try:
213 | response = self.__download(request)
214 | except AttributeError as exc:
215 | self.logger.error(f'AttributeError: {str(exc)}', exc_info=True)
216 |             self.logger.warning('Found an error; handing off to the error handler.')
217 |         except Exception as exc:
218 |             self.logger.error(f'Exception: {str(exc)}', exc_info=True)
219 | else:
220 | if isinstance(response, Request):
221 | return self.download(response)
222 |
223 | return response
224 |
225 | def pipeline(self, item, **kwargs):
226 | return self.pipeline_cls.pipeline(item, **kwargs)
227 |
228 | def test_ip(self, proxy: dict) -> bool:
229 | return self.init_spider.test_ip(proxy)
230 |
231 | @staticmethod
232 | def create_thread(func, **kwargs):
233 | t = threading.Thread(target=func, args=(kwargs,))
234 | t.start()
235 | return t
236 |
237 | @staticmethod
238 | def create_engine(db_type, name, settings_path):
239 | return InitSpider.get_create_engine(db_type, name, settings_path)
240 |
241 |
242 | # Redis pub/sub spider class
243 | class RedisMessageMQSpider(BaseSyncSpider):
244 | name = 'redis_message_mq_spider'
245 |
246 | def __init__(self, *args, **kwargs):
247 | super().__init__(*args, **kwargs)
248 |
249 | self.consumer_thread_num = self.settings['CONSUMER_THREAD_NUM'] or 10
250 | self.spider_queue = Queue(100)
251 |
252 | def start_spider(self):
253 | raise NotImplementedError
254 |
255 | def start(self):
256 | self.logger.info('Start Spider!')
257 |
258 | try:
259 | self.start_spider()
260 |
261 | except Exception as e:
262 | self.logger.error(f'redis_message_mq_spider.start.error: {e}', exc_info=True)
263 |
264 | finally:
265 | self.close_spider()
266 |
267 | def start_thread(self):
268 | """启动爬虫,适用于多线程"""
269 | try:
270 | self.start_spider()
271 |
272 | except Exception as e:
273 | self.logger.error(f'redis_message_mq_spider.start_thread.error: {e}', exc_info=True)
274 |
275 | def __consumer_queue(self, func):
276 | while True:
277 | msg = self.spider_queue.get()
278 | try:
279 | self.logger.info(f'spider_queue.msg: {msg}')
280 | func(msg)
281 |
282 | except Exception as e:
283 | self.logger.exception(e)
284 |
285 | self.spider_queue.task_done()
286 |
287 | def __consumer_mq(self, key):
288 | redis_sub = self.red_mq.subscribe(key)
289 | msgs = redis_sub.listen()
290 |
291 | for msg in msgs:
292 | if msg['type'] == 'message':
293 | self.spider_queue.put(json.loads(msg['data']))
294 |
295 | def producer_mq(self, key, value=None, values=None):
296 | if isinstance(values, list):
297 | for i in values:
298 | if isinstance(i, dict):
299 | i = json.dumps(i, ensure_ascii=False)
300 |
301 | self.red_mq.public(key, i)
302 |
303 | else:
304 | if isinstance(value, dict):
305 | value = json.dumps(value, ensure_ascii=False)
306 |
307 | self.red_mq.public(key, value)
308 |
309 | def producer(self, func=None, **kwargs):
310 | t = threading.Thread(target=func, args=(kwargs,))
311 | t.start()
312 | return t
313 |
314 | def consumer_mq(self, key):
315 | t = threading.Thread(target=self.__consumer_mq, args=(key,))
316 | t.start()
317 | return t
318 |
319 | def consumer_queue(self, func, thread_num=None):
320 | for index in range(thread_num or self.consumer_thread_num):
321 | consumer_thread = threading.Thread(target=self.__consumer_queue, args=(func,))
322 | consumer_thread.daemon = True
323 | consumer_thread.start()
324 |
325 |
326 | # RabbitMQ spider class
327 | class RabbitMessageMQSpider(BaseSyncSpider):
328 | name = 'rabbit_message_mq_spider'
329 |
330 | def __init__(self, *args, **kwargs):
331 | super().__init__(*args, **kwargs)
332 |
333 | self.consumer_thread_num = self.settings['CONSUMER_THREAD_NUM'] or 10
334 | self.spider_queue = Queue(100)
335 |
336 | self.fail_spider_queue = Queue(100)
337 |
338 | def start_spider(self):
339 | raise NotImplementedError
340 |
341 | def start(self):
342 | self.logger.info('Start Spider!')
343 |
344 | try:
345 | self.start_spider()
346 |
347 | except Exception as e:
348 | self.logger.error(f'rabbit_message_mq_spider.start.error: {e}', exc_info=True)
349 |
350 | finally:
351 | self.close_message_mq()
352 | self.close_spider()
353 |
354 | def start_thread(self):
355 | """启动爬虫,适用于多线程"""
356 | try:
357 | self.start_spider()
358 |
359 | except Exception as e:
360 | self.logger.error(f'rabbit_message_mq_spider.start_thread.error: {e}', exc_info=True)
361 |
362 | def __consumer_queue(self, func):
363 | while True:
364 | channel, method, properties, body = self.spider_queue.get()
365 | try:
366 | msg = json.loads(body)
367 |
368 | except json.decoder.JSONDecodeError:
369 | msg = body.decode()
370 |
371 | try:
372 | self.logger.info(f'spider_queue.msg: {msg}')
373 |
374 |                 if func(msg):
375 |                     self.logger.info(f'rabbit mq consumed successfully: {msg}')
376 | 
377 |                 else:
378 |                     self.logger.error(f'rabbit mq consume failed: {msg}')
379 |                     self.fail_spider_queue.put(msg)
380 |
381 | except Exception as e:
382 | self.logger.exception(e)
383 | self.fail_spider_queue.put(msg)
384 |
385 | finally:
386 | self.message_mq.receiver.basic_ack(channel, method)
387 | self.spider_queue.task_done()
388 |
389 | def consumer_queue(self, func, thread_num=None):
390 | for index in range(thread_num or self.consumer_thread_num):
391 | consumer_thread = threading.Thread(target=self.__consumer_queue, args=(func,))
392 | consumer_thread.daemon = True
393 | consumer_thread.start()
394 |
395 | def __consumer_mq_callback(self, channel, method, properties, body):
396 | self.spider_queue.put((channel, method, properties, body))
397 |
398 | def consumer_mq(self, key):
399 | t = threading.Thread(target=self.message_mq.consumer, args=(key, self.__consumer_mq_callback))
400 | t.start()
401 | return t
402 |
403 | def producer_mq(self, key=None, value=None, values=None):
404 | if isinstance(values, list):
405 | for i in values:
406 | if isinstance(i, dict):
407 | i = json.dumps(i, ensure_ascii=False)
408 |
409 | self.message_mq.producer(key, i)
410 |
411 | else:
412 | if isinstance(value, dict):
413 | value = json.dumps(value, ensure_ascii=False)
414 |
415 | self.message_mq.producer(key, value)
416 |
417 | def get_queue_len(self):
418 | return self.spider_queue.qsize()
419 |
420 |
421 | # Single-threaded spider class
422 | class SyncSpider(BaseSyncSpider):
423 | name = 'sync_spider'
424 |
425 | def __init__(self, *args, **kwargs):
426 | super().__init__(*args, **kwargs)
427 |
428 | self.consumer_thread_num = self.settings['CONSUMER_THREAD_NUM'] or 10
429 | self.spider_queue = Queue(1000)
430 |
431 | def start_spider(self):
432 | raise NotImplementedError
433 |
434 | def start(self):
435 | """启动爬虫,适用于单线程"""
436 | self.logger.info('Start Spider!')
437 |
438 | try:
439 | self.start_spider()
440 |
441 | except Exception as e:
442 | self.logger.error(f'sync_spider.start.error: {e}', exc_info=True)
443 |
444 | finally:
445 | self.close_spider()
446 |
447 | def start_thread(self):
448 | """启动爬虫,适用于多线程"""
449 | try:
450 | self.start_spider()
451 |
452 | except Exception as e:
453 | self.logger.error(f'sync_spider.start_thread.error: {e}', exc_info=True)
454 |
455 | def start_mq(self):
456 | """启动爬虫,适用于消息队列, redis mq"""
457 | try:
458 | self.start_spider()
459 |
460 | except Exception as e:
461 | self.logger.error(f'sync_spider.start_mq.error: {e}', exc_info=True)
462 |
463 | def __producer(self, items):
464 | for item in items:
465 | self.spider_queue.put(item)
466 |
467 | self.spider_queue.join()
468 |
469 |     def producer(self, items=None, func=None, **kwargs):
470 | 
471 |         if func:
472 |             t = threading.Thread(target=func, args=(kwargs,))
473 |         else:
474 |             t = threading.Thread(target=self.__producer, args=(items or [],))
475 |
476 | t.start()
477 | return t
478 |
479 | def __consumer(self, func, queue):
480 | spider_queue = queue if queue else self.spider_queue
481 |
482 | while True:
483 | msg = spider_queue.get()
484 | try:
485 | # self.logger.info(f'spider_queue.msg: {msg}')
486 | func(msg)
487 |
488 | except Exception as e:
489 | self.logger.exception(e)
490 |
491 | spider_queue.task_done()
492 |
493 | def consumer(self, func, thread_num=None, queue=None):
494 | for index in range(thread_num or self.consumer_thread_num):
495 | consumer_thread = threading.Thread(target=self.__consumer, args=(func, queue))
496 | consumer_thread.daemon = True
497 | consumer_thread.start()
498 |
499 |
500 | # Multi-threaded spider class
501 | class ThreadSyncSpider(object):
502 | def __init__(self, items, spider_cls, *args, **kwargs):
503 | kwargs['name'] = spider_cls.name
504 | kwargs['settings_path'] = spider_cls.settings_path
505 | kwargs['custom_setting'] = spider_cls.default_custom_setting
506 |
507 | self.init_spider = InitSpider(*args, **kwargs)
508 |
509 | self.items = items
510 | self.spider_cls = spider_cls
511 |
512 | self.tasks = []
513 |
514 | def __start(self, item):
515 | self.spider_cls(init_spider=self.init_spider, **item).start_thread()
516 |
517 | def start(self):
518 | InitSpider.this.logger.info('Start Spider!')
519 |
520 | try:
521 | for item in self.items:
522 | t = threading.Thread(target=self.__start, args=(item,))
523 | t.start()
524 | self.tasks.append(t)
525 |
526 | for task in self.tasks:
527 | task.join()
528 |
529 | except Exception as e:
530 | InitSpider.this.logger.error(f'sync_spider.start.error: {e}', exc_info=True)
531 |
532 | finally:
533 | InitSpider.this_close_spider()
534 |
--------------------------------------------------------------------------------
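
The classes above are meant to be subclassed: `start_spider` is the single abstract hook, and `SyncSpider`'s producer/consumer pair drives a daemon thread pool off `spider_queue`. A sketch of the pattern, assuming the package is importable; the spider name, settings path, URL, and table name are hypothetical:

```python
from magical.sync_spider import SyncSpider, run_spider


class DemoSpider(SyncSpider):
    name = 'demo'
    settings_path = 'spiders.demo.settings'  # hypothetical settings module

    def handle(self, item):
        # each consumer thread pulls one item off spider_queue
        response = self.download(url=item['url'])
        if response:
            self.pipeline({'status': response.status}, table_name='demo')

    def start_spider(self):
        self.consumer(self.handle, thread_num=4)
        # __producer joins the queue, so joining the thread waits for the drain
        self.producer(items=[{'url': 'https://httpbin.org/get'}]).join()


if __name__ == '__main__':
    run_spider(DemoSpider)
```
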
/magical/sync_spider/core/start_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: start_spider.py
6 | Time: 2021/4/14 下午5:21
7 | -------------------------------------------------
8 | Change Activity: 2021/4/14 下午5:21
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from magical.sync_spider.core.spider import ThreadSyncSpider
13 |
14 |
15 | def run_spider(spider_cls, *args, **kwargs):
16 | spider = spider_cls(*args, **kwargs)
17 | spider.start()
18 |
19 |
20 | def run_thread_spider(items, spider_cls, *args, **kwargs):
21 | ThreadSyncSpider(items, spider_cls, *args, **kwargs).start()
22 |
--------------------------------------------------------------------------------
/magical/sync_spider/databases/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py.tmpl.py
6 | Time: 2021/4/10 下午4:48
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 下午4:48
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/databases/init_db.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: init_db.py
6 | Time: 2021/4/29 下午6:58
7 | -------------------------------------------------
8 | Change Activity: 2021/4/29 下午6:58
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from magical.utils import load_objects
13 |
14 |
15 | class InitDatabase(object):
16 | instance = None
17 | init_flag = None
18 |
19 | def __new__(cls, *args, **kwargs):
20 | if not cls.instance:
21 | cls.instance = super().__new__(cls)
22 | return cls.instance
23 |
24 | def __init__(self, spider):
25 | if InitDatabase.init_flag:
26 | return
27 | InitDatabase.init_flag = True
28 |
29 | self.spider = spider
30 | self.logger = spider.logger
31 | self.settings = spider.settings
32 |
33 | self.post_gre_config = self.settings['POST_GRE_CONFIG']
34 | self.mysql_config = self.settings['MYSQL_CONFIG']
35 | self.redis_config = self.settings['REDIS_CONFIG']
36 |
37 | self.dbs = []
38 |
39 | self.__load_dbs()
40 | self.__init_dbs()
41 |
42 | def __set_dict(self, name, instance=None):
43 | self.dbs.append({'name': name, 'instance': instance})
44 |
45 | def __load_dbs(self):
46 | self.sql_handler = load_objects(self.settings['POST_GRE_SQL_HANDLER'])
47 | self.red_handler = load_objects(self.settings['REDIS_HANDLER'])
48 | self.mysql_handler = load_objects(self.settings['MYSQL_HANDLER'])
49 |
50 | def __init_dbs(self):
51 | # redis
52 | if isinstance(self.redis_config, dict):
53 | self.__set_dict('red', self.red_handler(config=self.redis_config))
54 | elif isinstance(self.redis_config, list):
55 | for rc in self.redis_config:
56 | self.__set_dict(rc["name"], self.red_handler(config=rc))
57 | else:
58 |             self.logger.info('No redis config provided')
59 | self.__set_dict('red')
60 |
61 | # PostGreSql
62 | if isinstance(self.post_gre_config, dict):
63 | self.__set_dict('post_gre', self.sql_handler(config=self.post_gre_config, spider=self.spider))
64 | elif isinstance(self.post_gre_config, list):
65 | for pgc in self.post_gre_config:
66 | self.__set_dict(pgc["name"], self.sql_handler(config=pgc, spider=self.spider))
67 | else:
68 |             self.logger.info('No PostgreSQL config provided')
69 | self.__set_dict('post_gre')
70 |
71 | # mysql
72 | if isinstance(self.mysql_config, dict):
73 | self.__set_dict('mysql', self.mysql_handler(config=self.mysql_config, spider=self.spider))
74 | elif isinstance(self.mysql_config, list):
75 | for my in self.mysql_config:
76 | self.__set_dict(my["name"], self.mysql_handler(config=my, spider=self.spider))
77 | else:
78 |             self.logger.info('No mysql config provided')
79 | self.__set_dict('mysql')
80 |
--------------------------------------------------------------------------------
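
`InitDatabase` accepts each `*_CONFIG` setting as either a single dict, registered under the default attribute name (`red` / `post_gre` / `mysql`), or a list of dicts, each registered under its own `name`. A sketch of both shapes with placeholder credentials:

```python
# single instance -> available as spider.red
REDIS_CONFIG = {'host': '127.0.0.1', 'port': 6379, 'db': 0}

# multiple instances -> available as spider.mysql and spider.mysql_report
MYSQL_CONFIG = [
    {'name': 'mysql', 'host': '127.0.0.1', 'port': 3306,
     'user': 'root', 'password': 'secret', 'db': 'spider', 'charset': 'utf8mb4'},
    {'name': 'mysql_report', 'host': '127.0.0.1', 'port': 3306,
     'user': 'root', 'password': 'secret', 'db': 'report', 'charset': 'utf8mb4'},
]
```
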
/magical/sync_spider/databases/mysql_pool.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: mysql_pool.py
6 | Time: 2021/4/22 上午12:41
7 | -------------------------------------------------
8 | Change Activity: 2021/4/22 上午12:41
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import pymysql
13 | from DBUtils.PooledDB import PooledDB
14 |
15 |
16 | class MysqlHandler(object):
17 | __instance = {}
18 | __init = {}
19 |
20 | def __new__(cls, *args, **kwargs):
21 | config = kwargs['config']
22 | name = config.get('name', 'mysql')
23 |
24 | if not cls.__instance.get(name):
25 | cls.__instance[name] = super().__new__(cls)
26 |
27 | return cls.__instance[name]
28 |
29 | def __init__(self, config, spider):
30 | name = config.get('name', 'mysql')
31 | if MysqlHandler.__init.get(name):
32 | return
33 | MysqlHandler.__init[name] = True
34 |
35 | self.log = spider.logger
36 | self.config = config
37 |
38 | self.pool = PooledDB(
39 | creator=pymysql,
40 | maxconnections=0,
41 | mincached=5,
42 | maxcached=5,
43 | maxshared=3,
44 | blocking=True,
45 | maxusage=None,
46 | setsession=[],
47 | ping=0,
48 | host=self.config['host'],
49 | port=self.config['port'],
50 | user=self.config['user'],
51 | password=self.config['password'],
52 | database=self.config['db'],
53 | charset=self.config['charset']
54 | )
55 |
56 | def get_pool(self):
57 | conn = self.pool.connection()
58 | cur = conn.cursor()
59 | return conn, cur
60 |
61 | def execute(self, sql, info_data=None):
62 | conn, cur = self.get_pool()
63 | try:
64 | if isinstance(info_data, dict):
65 | cur.execute(sql, info_data)
66 | elif isinstance(info_data, list):
67 | cur.executemany(sql, info_data)
68 | else:
69 | cur.execute(sql)
70 | conn.commit()
71 | return True
72 |
73 | except pymysql.err.IntegrityError as e:
74 | self.log.info(f'pymysql.err.IntegrityError: {e}')
75 | self.log.info(f"execute failed: {sql}")
76 | return False
77 |
78 | except Exception as e:
79 | self.log.info(f'mysql db: {e}')
80 | self.log.info(f"execute failed: {sql}")
81 | return False
82 |
83 | finally:
84 | cur.close()
85 | conn.close()
86 |
87 | def insert_dict(self, table_name, info_dict, ignore=False, replace=False):
88 | fs = ','.join(list(map(lambda x: '`' + x + '`', [*info_dict.keys()])))
89 | vs = ','.join(list(map(lambda x: '%(' + x + ')s', [*info_dict.keys()])))
90 |
91 | sql = f"insert into `{table_name}` ({fs}) values ({vs});"
92 | if ignore:
93 | sql = f"insert ignore into `{table_name}` ({fs}) values ({vs});"
94 | elif replace:
95 | sql = f"replace into {table_name} ({fs}) values ({vs});"
96 |
97 | try:
98 | return self.execute(sql, info_dict)
99 |
100 | except Exception as e:
101 | self.log.info(f'insert_dict.mysql db: {e}')
102 | self.log.info("insert_dict.failed: " + sql + "\t" + str(info_dict.values()))
103 |
104 | def insert_list(self, table_name, info_list, ignore=False, replace=False):
105 | keys = list(info_list[0].keys())
106 | fs = ', '.join(keys)
107 | vs = ', '.join(list(map(lambda x: '%(' + x + ')s', keys)))
108 |
109 | sql = f"insert into {table_name} ({fs}) values ({vs});"
110 | if ignore:
111 | sql = f"insert ignore into {table_name} ({fs}) values ({vs});"
112 | elif replace:
113 | sql = f"replace into {table_name} ({fs}) values ({vs});"
114 |
115 | try:
116 | return self.execute(sql, info_list)
117 | except Exception as e:
118 | self.log.info(f'insert_list.mysql db: {e}')
119 |
120 |     def select(self, sql):
121 |         conn, cur = self.get_pool()
122 |         cur.execute(sql)
123 |         result = cur.fetchall()
124 |         cur.close()  # close the cursor before its connection
125 |         conn.close()
126 |         return result
127 |
128 | def close_pool(self):
129 | self.pool.close()
130 |
--------------------------------------------------------------------------------
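
A usage sketch for the pooled handler, assuming a spider whose settings define `MYSQL_CONFIG` so `InitDatabase` has registered `spider.mysql`; the table and columns are hypothetical:

```python
def save_articles(spider):
    row = {'title': 'demo', 'url': 'https://example.com'}

    # single row as insert ... ignore (duplicate keys are skipped)
    spider.mysql.insert_dict('articles', row, ignore=True)

    # executemany over a list of dicts sharing the same keys
    spider.mysql.insert_list('articles', [row, {**row, 'title': 'demo2'}])

    for record in spider.mysql.select('select id, title from articles limit 10;'):
        print(record)
```
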
/magical/sync_spider/databases/post_gre_sql_pool.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: post_gre_sql_pool.py
6 | Time: 2021/4/10 下午4:49
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 下午4:49
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import psycopg2
13 | from DBUtils.PooledDB import PooledDB
14 |
15 |
16 | class PostGreHandle(object):
17 | __instance = {}
18 | __init = {}
19 |
20 | def __new__(cls, *args, **kwargs):
21 | config = kwargs['config']
22 | name = config.get('name', 'post_gre')
23 |
24 | if not cls.__instance.get(name):
25 | cls.__instance[name] = super().__new__(cls)
26 |
27 | return cls.__instance[name]
28 |
29 | def __init__(self, config, spider):
30 | name = config.get('name', 'post_gre')
31 | if PostGreHandle.__init.get(name):
32 | return
33 | PostGreHandle.__init[name] = True
34 |
35 | self.log = spider.logger
36 | self.config = config
37 |
38 | self.pool = PooledDB(
39 | creator=psycopg2,
40 | maxconnections=0,
41 | mincached=5,
42 | maxcached=5,
43 | maxshared=3,
44 | blocking=True,
45 | maxusage=None,
46 | setsession=[],
47 | ping=0,
48 | host=self.config['host'],
49 | port=self.config['port'],
50 | user=self.config['user'],
51 | password=self.config['password'],
52 | database=self.config['db']
53 | )
54 |
55 | def get_pool(self):
56 | conn = self.pool.connection()
57 | cur = conn.cursor()
58 | return conn, cur
59 |
60 | def execute(self, sql, info_data=None):
61 | conn, cur = self.get_pool()
62 | try:
63 | if isinstance(info_data, dict):
64 | cur.execute(sql, info_data)
65 | elif isinstance(info_data, list):
66 | cur.executemany(sql, info_data)
67 | else:
68 | cur.execute(sql)
69 | conn.commit()
70 | return True
71 |
72 | except Exception as e:
73 | self.log.info(f'sql db: {e}')
74 | self.log.info(f"execute failed: {sql}")
75 | return False
76 |
77 | finally:
78 | cur.close()
79 | conn.close()
80 |
81 | def insert_conflict_list(self, table_name, info_list, indexes=None):
82 | keys = list(info_list[0].keys())
83 | fs = ', '.join(keys)
84 | vs = ', '.join(list(map(lambda x: '%(' + x + ')s', keys)))
85 |
86 | sql = f"insert into {table_name} ({fs}) values ({vs}) on conflict ({indexes}) do nothing;"
87 |
88 | try:
89 | return self.execute(sql, info_list)
90 | except Exception as e:
91 | self.log.exception(f'insert_conflict_list.sql db: {e}')
92 | return False
93 |
94 | def insert_conflict_dict(self, table_name, info_dict, indexes=None):
95 | fs = ', '.join(list(info_dict.keys()))
96 | vs = ', '.join(list(map(lambda x: '%(' + x + ')s', [*info_dict.keys()])))
97 | sql = f"insert into {table_name} ({fs}) values ({vs}) on conflict ({indexes}) do nothing;"
98 |
99 | try:
100 | return self.execute(sql, info_dict)
101 | except Exception as e:
102 | self.log.exception(f'insert_conflict_dict.sql db: {e}')
103 | self.log.error("insert_conflict_dict.failed: " + sql + "\t" + str(info_dict.values()))
104 | return False
105 |
106 |     def select(self, sql):
107 |         conn, cur = self.get_pool()
108 | 
109 |         try:
110 |             cur.execute(sql)
111 |             result = cur.fetchall()
112 | 
113 |         finally:
114 |             cur.close()  # close the cursor before its connection
115 |             conn.close()
116 |         return result
117 |
118 | def close_pool(self):
119 | self.pool.close()
120 |
--------------------------------------------------------------------------------
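
The PostgreSQL handler mirrors the MySQL one but deduplicates through `on conflict ... do nothing`, so the unique-index columns must be passed as `indexes`. A sketch with a hypothetical table and index column:

```python
def save_item(spider, item):
    # a row whose url already exists is silently skipped
    spider.post_gre.insert_conflict_dict('articles', item, indexes='url')


def save_items(spider, items):
    # executemany over a list of same-shaped dicts
    spider.post_gre.insert_conflict_list('articles', items, indexes='url')
```
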
/magical/sync_spider/databases/red_pool.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: red_pool.py
6 | Time: 2021/4/10 下午4:49
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 下午4:49
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import json
13 | import redis
14 | import copy
15 |
16 |
17 | class RedisBase(redis.StrictRedis):
18 | __instance = {}
19 | __init = {}
20 |
21 | def __new__(cls, *args, **kwargs):
22 | config = kwargs['config']
23 | name = config.get('name', 'red')
24 |
25 | if not cls.__instance.get(name):
26 | cls.__instance[name] = super().__new__(cls)
27 |
28 | return cls.__instance[name]
29 |
30 | def __init__(self, config):
31 | name = config.get('name', 'red')
32 | if RedisBase.__init.get(name):
33 | return
34 | RedisBase.__init[name] = True
35 |
36 | new_config = copy.deepcopy(config)
37 |
38 | if new_config.get('name'):
39 | del new_config['name']
40 |
41 | super().__init__(**new_config)
42 |
43 | def public(self, key, msg):
44 | self.publish(key, msg)
45 | return True
46 |
47 | def subscribe(self, key):
48 | pub = self.pubsub()
49 | pub.subscribe(key)
50 | return pub
51 |
52 | def set_str(self, key, value, **kwargs):
53 | return self.set(key, value, **kwargs)
54 |
55 | def set_dict(self, key, value):
56 | if isinstance(value, (list, dict)):
57 | value = json.dumps(value, ensure_ascii=False)
58 | return self.set(key, value)
59 |
60 | def get_dict(self, key):
61 | data = self.get(key)
62 | return json.loads(data) if data else {}
63 |
64 | def get_list(self, key):
65 | data = self.get(key)
66 | return json.loads(data) if data else []
67 |
68 | def get_str(self, key):
69 | return self.get(key)
70 |
71 | def close_pool(self):
72 | self.connection_pool.disconnect()
73 |
74 | def _pipeline(self):
75 | pipe = self.pipeline(transaction=True)
76 | pipe.multi()
77 | return pipe
78 |
79 |
80 | class RedisHandler(RedisBase):
81 | def __init__(self, config):
82 | super().__init__(config=config)
83 |
84 | def get_str(self, key):
85 | return self.get(key)
86 |
87 | def set_bit(self, table, offsets, values):
88 | if isinstance(offsets, list):
89 | if not isinstance(values, list):
90 | values = [values] * len(offsets)
91 | else:
92 |                 assert len(offsets) == len(values), "offsets must correspond one-to-one with values"
93 |
94 | pipe = self._pipeline()
95 |
96 | for offset, value in zip(offsets, values):
97 | pipe.setbit(table, offset, value)
98 |
99 | return pipe.execute()
100 |
101 | else:
102 | return self.setbit(table, offsets, values)
103 |
104 | def get_bit(self, table, offsets):
105 | if isinstance(offsets, list):
106 | pipe = self._pipeline()
107 | for offset in offsets:
108 | pipe.getbit(table, offset)
109 |
110 | return pipe.execute()
111 |
112 | else:
113 | return self.getbit(table, offsets)
114 |
115 | def bit_count(self, table):
116 | return self.bitcount(table)
117 |
--------------------------------------------------------------------------------
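
`set_bit`/`get_bit` accept either a single offset or a list; the list form batches all SETBIT/GETBIT calls through one transactional pipeline, which is what the bloom filter relies on. A sketch with hypothetical key and channel names:

```python
def demo_bits(spider):
    spider.red.set_bit('seen_bits', [3, 7, 11], 1)   # three SETBITs, one pipeline
    print(spider.red.get_bit('seen_bits', [3, 4]))   # -> [1, 0]
    print(spider.red.bit_count('seen_bits'))         # -> 3


def demo_pub(spider):
    # feeds subscribers listening on the same channel (e.g. a pub/sub spider)
    spider.red.public('jobs', '{"url": "https://example.com"}')
```
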
/magical/sync_spider/extends_module/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py.tmpl.py
6 | Time: 2021/4/11 上午12:40
7 | -------------------------------------------------
8 | Change Activity: 2021/4/11 上午12:40
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/extends_module/base_module/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py.py
6 | Time: 2021/4/18 上午11:21
7 | -------------------------------------------------
8 | Change Activity: 2021/4/18 上午11:21
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/extends_module/base_module/downloader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: downloader.py
6 | Time: 2021/4/11 上午12:41
7 | -------------------------------------------------
8 | Change Activity: 2021/4/11 上午12:41
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | class DownloaderMiddleware(object):
15 | __instance = {}
16 |
17 |     def __new__(cls, *args, **kwargs):
18 |         if not cls.__instance.get(cls):  # one instance per concrete middleware class
19 |             cls.__instance[cls] = super().__new__(cls)
20 |         return cls.__instance[cls]
21 |
22 | def __init__(self, spider, **kwargs):
23 | self.spider = spider
24 | self.proxy = spider.proxy
25 | self.logger = spider.logger
26 | self.settings = spider.settings
27 | # self.duplicate = spider.duplicate
28 | self.max_retry_count = spider.settings.get_int("RETRY_COUNT")
29 | self.retry_status_codes = spider.settings.get_list("RETRY_STATUS_CODES")
30 |
31 | def process_request(self, request):
32 | return request
33 |
34 | def process_response(self, request, response):
35 | return response
36 |
37 | def process_exception(self, request, exception):
38 | return exception
39 |
40 | def _retry(self, request):
41 | retry_count = request.meta.get('retry_count', 0) + 1
42 | if retry_count < self.max_retry_count:
43 | retry_request = request.copy()
44 | retry_request.meta["retry_count"] = retry_count
45 | return retry_request
46 |
--------------------------------------------------------------------------------
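
Custom download middlewares subclass `DownloaderMiddleware` and override any of the three hooks; returning a `Request` from `process_response` re-enters the download cycle. A sketch, where the header and the captcha marker are hypothetical:

```python
from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware


class HeaderMiddleware(DownloaderMiddleware):

    def process_request(self, request):
        # stamp an extra header before the request goes out
        request.headers.setdefault('X-Requested-With', 'XMLHttpRequest')
        return request

    def process_response(self, request, response):
        # hand back a retry Request when the page looks like a captcha wall
        if 'captcha' in response.text:
            return self._retry(request) or response
        return response
```
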
/magical/sync_spider/extends_module/base_module/pipeline.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: pipeline.py
6 | Time: 2021/5/17 下午5:56
7 | -------------------------------------------------
8 | Change Activity: 2021/5/17 下午5:56
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | class PipelineMiddleware(object):
15 | __instance = {}
16 |
17 |     def __new__(cls, *args, **kwargs):
18 |         if not cls.__instance.get(cls):  # one instance per concrete middleware class
19 |             cls.__instance[cls] = super().__new__(cls)
20 |         return cls.__instance[cls]
21 |
22 | def __init__(self, spider, **kwargs):
23 | self.spider = spider
24 | self.logger = spider.logger
25 | self.settings = spider.settings
26 |
27 | def process_item(self, item, **kwargs):
28 | """数据处理
29 |
30 | Args:
31 | item : 要处理的数据
32 | kwargs:
33 | table_name: 表名称
34 | replace : True or False (mysql 数据库使用)
35 | ignore : True or False (mysql 数据库使用)
36 | indexes : 数据库表唯一索引字段 (PostGreSql 数据库使用)
37 |
38 | Return:
39 | 返回的数据类型如果不等于 type(item) 则不会调用后面的 pipeline process_item 函数
40 | """
41 | return item
42 |
43 | def process_exception(self, item, exception, **kwargs):
44 | return exception
45 |
--------------------------------------------------------------------------------
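
Pipeline middlewares work the same way: override `process_item` and return the item with its type unchanged so downstream `process_item` hooks keep running. A sketch that trims string fields; the field shapes are hypothetical:

```python
from magical.sync_spider.extends_module.base_module.pipeline import PipelineMiddleware


class CleanPipeline(PipelineMiddleware):

    def process_item(self, item, **kwargs):
        if isinstance(item, dict):
            # strip whitespace on every string value before it is written
            item = {k: v.strip() if isinstance(v, str) else v for k, v in item.items()}
        return item  # same type as the input, so later hooks still run
```
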
/magical/sync_spider/extends_module/download/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py.tmpl.py
6 | Time: 2021/4/11 上午12:40
7 | -------------------------------------------------
8 | Change Activity: 2021/4/11 上午12:40
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/extends_module/download/retry.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: retry.py
6 | Time: 2021/4/11 上午12:51
7 | -------------------------------------------------
8 | Change Activity: 2021/4/11 上午12:51
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware
13 |
14 |
15 | class RetryMiddleware(DownloaderMiddleware):
16 | RETRY_EXCEPTIONS = ()
17 |
18 | def __init__(self, spider):
19 | super().__init__(spider)
20 |
21 | def process_response(self, request, response):
22 | if not request.use_middleware:
23 | return response
24 | if not request.meta.get("is_retry", False):
25 | return response
26 | if response.status in self.retry_status_codes:
27 | return self._retry(request) or response
28 | return response
29 |
30 | def process_exception(self, request, exception):
31 | if isinstance(exception, self.RETRY_EXCEPTIONS) and request.meta.get("is_retry", False):
32 | return self._retry(request)
33 |
--------------------------------------------------------------------------------
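
`RETRY_EXCEPTIONS` is an empty tuple by default, so out of the box only status codes trigger retries, and only for requests sent with `meta={'is_retry': True}`. A sketch that also retries on network errors; using the `requests` exception classes here is an assumption about the HTTP stack:

```python
import requests

from magical.sync_spider.extends_module.download.retry import RetryMiddleware


class NetworkRetryMiddleware(RetryMiddleware):
    # retry on connection resets and timeouts as well as on status codes
    RETRY_EXCEPTIONS = (
        requests.exceptions.ConnectionError,
        requests.exceptions.Timeout,
    )
```
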
/magical/sync_spider/extends_module/mqs/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py.py
6 | Time: 2021/5/7 下午11:00
7 | -------------------------------------------------
8 | Change Activity: 2021/5/7 下午11:00
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/extends_module/mqs/rabbit_mq/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py.py
6 | Time: 2021/5/7 下午11:00
7 | -------------------------------------------------
8 | Change Activity: 2021/5/7 下午11:00
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/extends_module/mqs/rabbit_mq/handler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: handler.py
6 | Time: 2021/5/7 下午11:23
7 | -------------------------------------------------
8 | Change Activity: 2021/5/7 下午11:23
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import threading
13 | from functools import partial
14 |
15 | import pika
16 |
17 |
18 | class MQBase(object):
19 | """消息队列基类, 该类线程不安全的"""
20 |
21 | def __init__(self, spider, ack=True):
22 | """当开启手动消息确认, 要考虑消息重入的情况, 默认开启手动消息确认
23 |
24 | Args:
25 | spider = 爬虫对象
26 | ack = 是否自动确认消息
27 | """
28 |         # thread-local storage keeps the rabbit channel thread-safe
29 | self.local = threading.local()
30 |
31 | self.spider = spider
32 | self.logger = spider.logger
33 |
34 | self._conn = None
35 | self._properties = pika.BasicProperties(delivery_mode=2, )
36 |
37 | self.virtual_host = spider.settings['MESSAGE_MQ_VIRTUAL_HOST']
38 | self.prefetch_count = spider.settings['MESSAGE_MQ_PREFETCH_COUNT']
39 | self.rabbit_config = spider.settings['MESSAGE_MQ_CONFIG']
40 |
41 | self.port = self.rabbit_config['port']
42 | self.host = self.rabbit_config['host']
43 | self.username = self.rabbit_config['username']
44 | self.password = self.rabbit_config['password']
45 |
46 | self.ack = ack
47 |
48 | def close(self):
49 | try:
50 | if hasattr(self.local, 'channel'):
51 | self.local.channel.close()
52 | self.logger.info('rabbit mq channel closed!')
53 |
54 |             if self._conn and self._conn.is_open:
55 |                 self._conn.close()
56 |                 self.logger.info('rabbit mq connection closed!')
57 |
58 | except Exception as e:
59 | self.logger.error(f'rabbit mq closed error: {e}')
60 |
61 | def _check_channel(self):
62 | if not hasattr(self.local, 'channel'):
63 | channel = self._rabbit_mq_init()
64 | self.local.channel = channel
65 |
66 | def _rabbit_mq_init(self):
67 | """初始化 连接 rabbit mq"""
68 | credentials = pika.PlainCredentials(username=self.username, password=self.password)
69 | parameters = pika.ConnectionParameters(
70 | host=self.host,
71 | port=self.port,
72 | virtual_host=self.virtual_host,
73 | credentials=credentials,
74 | heartbeat=0
75 | )
76 | self._conn = pika.BlockingConnection(parameters)
77 | channel = self._conn.channel()
78 |
79 | if self.ack:
80 | channel.confirm_delivery()
81 |
82 |         self.logger.info('rabbit mq connected successfully!')
83 |
84 | return channel
85 |
86 |
87 | class MQSender(MQBase):
88 |
89 | def __init__(self, *args, **kwargs):
90 | super().__init__(*args, **kwargs)
91 |
92 | def try_send(self, queue_name, msg):
93 | try:
94 | self._check_channel()
95 |
96 | self.local.channel.queue_declare(queue=queue_name, durable=True)
97 | self.local.channel.basic_publish(
98 | exchange='',
99 | routing_key=queue_name,
100 | body=msg.encode(),
101 | properties=self._properties
102 | )
103 |             self.logger.info(f'rabbit MQ message published, msg: {msg}')
104 |             success = True
105 |         except Exception as e:
106 |             self.logger.exception(e)
107 |             self.logger.error(f'rabbit MQ message publish failed, msg: {msg}')
108 | success = False
109 |
110 | return success
111 |
112 | def push(self, queue_name, msg):
113 | ret = self.try_send(queue_name, msg) or self.try_send(queue_name, msg)
114 | return ret
115 |
116 |
117 | class MQReceiver(MQBase):
118 |
119 | def __init__(self, *args, **kwargs):
120 | super().__init__(*args, **kwargs)
121 |
122 | def basic_ack(self, channel, method):
123 | return self._conn.add_callback_threadsafe(partial(channel.basic_ack, method.delivery_tag))
124 |
125 | def start(self, queue_name, callback):
126 | """开始消费"""
127 | self._check_channel()
128 |
129 | self.local.channel.queue_declare(queue=queue_name, durable=True, auto_delete=False)
130 | self.local.channel.basic_qos(prefetch_count=self.prefetch_count)
131 | self.local.channel.basic_consume(queue_name, callback, auto_ack=not self.ack)
132 | self.local.channel.start_consuming()
133 |
134 |
135 | class RabbitMQHandler(object):
136 | def __init__(self, spider):
137 | self.spider = spider
138 |
139 | self.sender = MQSender(spider)
140 | self.receiver = MQReceiver(spider)
141 |
142 | def close_mq(self):
143 | # self.sender.close()
144 | self.receiver.close()
145 |
146 | def producer(self, queue_name, value):
147 | self.sender.push(queue_name, value)
148 |
149 | def consumer(self, queue_name, callback=None):
150 | self.receiver.start(queue_name, callback)
151 |
--------------------------------------------------------------------------------
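
Tying the handler back to `RabbitMessageMQSpider`: `consumer_mq` feeds `spider_queue` through `MQReceiver`, the `consumer_queue` workers ack every message in a `finally`, and a falsy return routes the message to `fail_spider_queue`. A sketch; the queue name and settings module are hypothetical, and the settings must define the `MESSAGE_MQ_*` values:

```python
from magical.sync_spider.core.spider import RabbitMessageMQSpider


class MQDemoSpider(RabbitMessageMQSpider):
    name = 'mq_demo'
    settings_path = 'spiders.mq_demo.settings'  # must set MESSAGE_MQ_CONFIG etc.

    def handle_msg(self, msg):
        # truthy -> logged as consumed; falsy -> pushed onto fail_spider_queue
        return bool(msg)

    def start_spider(self):
        self.consumer_queue(self.handle_msg, thread_num=4)
        self.producer_mq('demo_queue', value={'url': 'https://example.com'})
        self.consumer_mq('demo_queue').join()  # blocks while consuming
```
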
/magical/sync_spider/http/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py.tmpl.py
6 | Time: 2021/4/10 下午10:27
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 下午10:27
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/http/request.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: request.py
6 | Time: 2021/4/10 下午4:55
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 下午4:55
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from pickle import dumps, loads
13 | from urllib.parse import urlencode
14 |
15 |
16 | class Request(object):
17 |     def __init__(self, url, params=None, method='GET', data=None, headers=None, meta=None,
18 |                  json=None, encoding='utf-8', use_middleware=True, session=True, s5=False,
19 |                  **kwargs):
20 | self.s5 = s5
21 | self.url = url
22 |         self.data = data or {}
23 | self.json = json
24 | self.params = params
25 | self.method = method
26 | self.encoding = encoding
27 |         self.headers = headers or {}
28 | self.session = session
29 | self.use_middleware = use_middleware
30 | self.kwargs = kwargs
31 |
32 | self.meta = self._load_meta(meta)
33 |
34 | def copy(self, *args, **kwargs):
35 | keys = [
36 | 'url', 'method', 'data', 'json', 'params', 'headers', 'meta', 'session',
37 | 'use_middleware', 's5'
38 | ]
39 | for key in keys:
40 | kwargs.setdefault(key, getattr(self, key))
41 | cls = kwargs.pop('cls', self.__class__)
42 | return cls(*args, **kwargs)
43 |
44 | def dumps(self):
45 | return dumps(self)
46 |
47 | def loads(self):
48 | return loads(self)
49 |
50 | @staticmethod
51 | def _load_meta(custom_meta):
52 | meta = {
53 | 'test_key': 'test_key1',
54 | 'proxy': None,
55 | 'retry_count': 0
56 | }
57 |
58 | if isinstance(custom_meta, dict):
59 | meta.update(custom_meta)
60 | return meta
61 |
62 | def __str__(self):
63 | return " %s %s>" % (
64 | self.meta['retry_count'],
65 | self.method,
66 | self.url + urlencode(self.meta.get('params')) if self.meta.get('params') else self.url
67 | )
68 |
69 | __repr__ = __str__
70 |
--------------------------------------------------------------------------------
/magical/sync_spider/http/response.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: response.py
6 | Time: 2021/4/10 下午10:28
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 下午10:28
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import re
13 | from lxml import etree
14 |
15 |
16 | class Response(object):
17 | def __init__(self, response, request):
18 | self.response = response
19 | self.request = request
20 |
21 | self.meta = request.meta
22 | self.url = response.url
23 | self.status = response.status_code
24 | self.text = response.text
25 | self.headers = response.headers
26 | self.cookies = response.cookies
27 |
28 | def set_encoding(self, encoding):
29 | self.response.encoding = encoding
30 | self.text = self.response.text
31 |
32 | def json(self):
33 | try:
34 | return self.response.json()
35 | except Exception as e:
36 | return None
37 |
38 | def __str__(self):
39 | return "" % (self.status, self.url)
40 |
41 | __repr__ = __str__
42 |
43 | @property
44 | def re(self):
45 | return Regex(self.text)
46 |
47 | @property
48 | def selector(self):
49 | selector = etree.HTML(self.text)
50 | return selector
51 |
52 | def css(self, css_select: str):
53 | return self.selector.cssselect(css_select)
54 |
55 | def xpath(self, xpath_str: str) -> list:
56 | result_list = self.selector.xpath(xpath_str)
57 | return result_list
58 |
59 |
60 | class Regex(object):
61 | def __init__(self, html):
62 | self.html = html
63 |
64 | def findall(self, pattern, flags=0):
65 | return re.findall(pattern, self.html, flags)
66 |
67 | def search(self, pattern, flags=0):
68 | return re.search(pattern, self.html, flags)
69 |
70 | def match(self, pattern, flags=0):
71 | return re.match(pattern, self.html, flags)
72 |
73 |     def sub(self, pattern, repl, count=0, flags=0):
74 |         return re.sub(pattern, repl, self.html, count, flags)
75 |
--------------------------------------------------------------------------------
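
The `Response` helpers wrap `lxml`, `re`, and the underlying `requests` JSON decoding. A parsing sketch, assuming `response` came back from `spider.download(...)`; the XPath and CSS selectors are hypothetical:

```python
def parse(response):
    titles = response.xpath('//h2/a/text()')   # lxml XPath, returns a list
    links = response.css('h2 > a')             # requires the cssselect package
    match = response.re.search(r'(\d{4})')     # thin wrapper over re.search
    data = response.json()                     # None when the body is not JSON
    return titles, links, match and match.group(1), data
```
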
/magical/sync_spider/middleware/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py.tmpl.py
6 | Time: 2021/4/10 下午11:24
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 下午11:24
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/middleware/download/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py.py
6 | Time: 2021/4/18 下午12:37
7 | -------------------------------------------------
8 | Change Activity: 2021/4/18 下午12:37
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/middleware/download/downloader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: downloader.py
6 | Time: 2021/4/10 下午11:27
7 | -------------------------------------------------
8 | Change Activity: 2021/4/10 下午11:27
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from magical.utils import load_objects
13 |
14 |
15 | class Downloader(object):
16 | """Downloader中间件"""
17 |
18 | def __init__(self, spider):
19 | handler_cls = spider.settings['DOWNLOAD_HANDLER_PATH']
20 | handler_manager_cls = spider.settings['DOWNLOAD_MIDDLEWARE_MANAGER_PATH']
21 | self.handler = load_objects(handler_cls)(spider)
22 | self.middleware = load_objects(handler_manager_cls)(spider)
23 |
24 |     def _download(self, request):
25 |         """Perform the actual request.
26 |         Args:
27 |             request: Request object
28 |         Returns:
29 |             Response object
30 |         """
31 | resp = self.handler.fetch(request)
32 | return resp
33 |
34 |     def fetch(self, request):
35 |         """Download through the middleware chain.
36 |         Args:
37 |             request: Request object
38 |         Returns:
39 |             Response object
40 |         """
41 | resp = self.middleware.download(self._download, request)
42 | return resp
43 |
--------------------------------------------------------------------------------
/magical/sync_spider/middleware/download/handler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: handler.py
6 | Time: 2021/4/18 下午12:37
7 | -------------------------------------------------
8 | Change Activity: 2021/4/18 下午12:37
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import urllib3
13 | import cfscrape
14 | import requests
15 | from urllib.parse import urlparse
16 | from requests import adapters
17 |
18 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
19 | adapters.DEFAULT_RETRIES = 5
20 |
21 | from magical.sync_spider.http.response import Response
22 |
23 |
24 | class DownloadHandler(object):
25 |     """Download request handler"""
26 |
27 | def __init__(self, spider, **kwargs):
28 | self.spider = spider
29 | self.kwargs = kwargs
30 | self.logger = spider.logger
31 | self.settings = spider.settings
32 |
33 | self.session = requests.session()
34 | self.scrape = cfscrape.create_scraper(delay=self.settings['SCRAPER_DELAY'])
35 | self.scrape_session = cfscrape.create_scraper(sess=self.session, delay=self.settings['SCRAPER_DELAY'])
36 |
37 | def fetch(self, request):
38 |         """Start the download.
39 | 
40 |         Args:
41 |             request: the request object
42 |         Returns:
43 |             a response object
44 |         """
45 |
46 | if request.s5:
47 | instance = self.scrape
48 | if request.session:
49 | instance = self.scrape_session
50 |
51 | elif request.session:
52 | instance = self.session
53 |
54 | else:
55 | instance = requests
56 |
57 | if request.method == 'POST':
58 | response = instance.post(
59 | request.url,
60 | data=request.data,
61 | json=request.json,
62 | headers=request.headers,
63 | params=request.params,
64 | proxies=request.meta.get('proxy'),
65 | verify=self.settings['REQUEST_VERIFY'],
66 | timeout=self.settings['REQUEST_TIMEOUT'],
67 | **request.kwargs
68 | )
69 | else:
70 | response = instance.get(
71 | request.url,
72 | headers=request.headers,
73 | params=request.params,
74 | proxies=request.meta.get('proxy'),
75 | verify=self.settings['REQUEST_VERIFY'],
76 | timeout=self.settings['REQUEST_TIMEOUT'],
77 | **request.kwargs
78 | )
79 |
80 | response.encoding = request.encoding
81 |
82 | res = Response(response, request)
83 |
84 | self.logger.debug(f"Downloaded ({res.status}) {str(request)}")
85 | return res
86 |
--------------------------------------------------------------------------------
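
The flag pair on the request selects the transport in `fetch`: `s5=True` routes through cfscrape (sharing the session when `session=True` as well), `session=True` alone reuses the long-lived `requests.Session`, and neither flag means a one-off `requests` call. A hedged sketch of a proxied download; `spider` is assumed to be an already-initialized spider instance:

```python
from magical.sync_spider.http.request import Request

request = Request(url='https://httpbin.org/ip')

# the handler passes request.meta['proxy'] straight to requests as proxies=,
# so it must already have requests' proxies-dict shape:
request.meta['proxy'] = {'http': 'http://127.0.0.1:8888',
                         'https': 'http://127.0.0.1:8888'}

response = spider.download(request)  # spider: initialized SyncSpider (assumed)
print(response.status)
```
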
/magical/sync_spider/middleware/download/manager.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: manager.py
6 | Time: 2021/4/18 下午12:37
7 | -------------------------------------------------
8 | Change Activity: 2021/4/18 下午12:37
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from collections import defaultdict
13 |
14 | from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware
15 | from magical.sync_spider.http.request import Request
16 | from magical.sync_spider.common.utils import call_func
17 | from magical.utils import load_objects
18 |
19 |
20 | class DownloadMiddlewareManager(object):
21 | def __init__(self, spider):
22 | self.methods = defaultdict(list)
23 | self.spider = spider
24 | self.settings = spider.settings
25 | self.middleware_s = self._load_middleware()
26 |
27 | for miw in self.middleware_s:
28 | self._add_middleware(miw)
29 |
30 | def _load_middleware(self):
31 | middleware_s = []
32 | middleware_s_dict = self.settings["DOWNLOAD_MIDDLEWARE_PATH"]
33 | middleware_s_list = sorted(middleware_s_dict.items(), key=lambda x: x[1])
34 |
35 | for middleware_key, value in middleware_s_list:
36 | middleware = load_objects(middleware_key)
37 | if issubclass(middleware, DownloaderMiddleware):
38 | middleware_instance = middleware(self.spider)
39 | middleware_s.append(middleware_instance)
40 | return middleware_s
41 |
42 | def _add_middleware(self, miw):
43 | if hasattr(miw, "process_request"):
44 | self.methods['process_request'].append(miw.process_request)
45 |
46 | if hasattr(miw, "process_response"):
47 | self.methods['process_response'].append(miw.process_response)
48 |
49 | if hasattr(miw, "process_exception"):
50 | self.methods['process_exception'].append(miw.process_exception)
51 |
52 | def download(self, download_func, request):
53 | this = self
54 |
55 | def process_request(request):
56 | for method in this.methods['process_request']:
57 | request = method(request)
58 | if not request:
59 | return request
60 | response = download_func(request)
61 | return response
62 |
63 | def process_response(response):
64 | for method in this.methods['process_response']:
65 | response = method(request, response)
66 | if isinstance(response, Request) or not response:
67 | return response
68 | return response
69 |
70 | def process_exception(exception):
71 | for method in this.methods['process_exception']:
72 | response = method(request, exception)
73 | if isinstance(response, Request) or not response:
74 | return response
75 | return exception
76 |
77 | resp = call_func(process_request, process_exception, process_response, request)
78 |
79 | return resp
80 |
--------------------------------------------------------------------------------
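
A minimal custom middleware for this manager, as a sketch. The hook names are exactly those `_add_middleware` collects; priorities in `DOWNLOAD_MIDDLEWARE_PATH` are sorted ascending, so lower numbers run first. The class itself is hypothetical:

```python
import time

from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware


class TimingMiddleware(DownloaderMiddleware):
    """Hypothetical example: measure how long each download takes."""

    def process_request(self, request):
        request.meta['_start'] = time.time()
        return request  # a falsy return here makes the manager skip the download

    def process_response(self, request, response):
        elapsed = time.time() - request.meta.get('_start', time.time())
        self.logger.debug(f'{request} finished in {elapsed:.2f}s')
        return response  # a Request or falsy return stops the remaining response hooks


# enabled via settings, e.g.:
# DOWNLOAD_MIDDLEWARE_PATH = {'spiders.my_project.middleware.TimingMiddleware': 50}
```
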
/magical/sync_spider/middleware/duplicate/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/5/13 下午1:45
7 | -------------------------------------------------
8 | Change Activity: 2021/5/13 下午1:45
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/middleware/duplicate/bit_array.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: bit_array.py
6 | Time: 2021/5/13 下午3:45
7 | -------------------------------------------------
8 | Change Activity: 2021/5/13 下午3:45
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from __future__ import absolute_import
13 |
14 | import bitarray
15 |
16 |
17 | class BitArray:
18 | def set_all(self, value):
19 | pass
20 |
21 |     def set(self, key, offsets, values):
22 |         raise NotImplementedError("this method must be implemented")
23 | 
24 |     def get(self, key, offsets):
25 |         raise NotImplementedError("this method must be implemented")
26 | 
27 |     def count(self, key, value=True):
28 |         raise NotImplementedError("this method must be implemented")
29 |
30 |
31 | class MemoryBitArray(BitArray):
32 | def __init__(self, num_bits):
33 | self.num_bits = num_bits
34 | self.bit_array = bitarray.bitarray(num_bits, endian="little")
35 |
36 | self.set_all(0)
37 |
38 | def set_all(self, value):
39 | self.bit_array.setall(value)
40 |
41 | def set(self, key, offsets, values):
42 | old_values = []
43 |
44 | if isinstance(offsets, list):
45 | if not isinstance(values, list):
46 | values = [values] * len(offsets)
47 | else:
48 |                 assert len(offsets) == len(values), "offsets must correspond one-to-one with values"
49 |
50 | for offset, value in zip(offsets, values):
51 | old_values.append(int(self.bit_array[offset]))
52 | self.bit_array[offset] = value
53 |
54 | else:
55 | old_values = int(self.bit_array[offsets])
56 | self.bit_array[offsets] = values
57 |
58 | return old_values
59 |
60 | def get(self, key, offsets):
61 | if isinstance(offsets, list):
62 | return [self.bit_array[offset] for offset in offsets]
63 | else:
64 | return self.bit_array[offsets]
65 |
66 | def count(self, key, value=True):
67 | return self.bit_array.count(value)
68 |
69 |
70 | class RedisBitArray(BitArray):
71 | redis_db = None
72 |
73 | def __init__(self, spider):
74 | red_name = spider.settings['FILTER_REDIS_NAME']
75 |         self.red = getattr(spider, red_name) if red_name else spider.red
76 |
77 | self.count_cached_name = "{}_count_cached"
78 |
79 | def set(self, key, offsets, values):
80 | return self.red.set_bit(key, offsets, values)
81 |
82 | def get(self, key, offsets):
83 | return self.red.get_bit(key, offsets)
84 |
85 | def count(self, key, value=True):
86 |         # check the redis cache first; if it misses, count and cache the result
87 |         count = self.red.get_str(self.count_cached_name.format(key))
88 |         if count:
89 |             return int(count)
90 |         else:
91 |             count = self.red.bit_count(key)
92 |             # the cached count expires after half an hour
93 |             self.red.set_str(self.count_cached_name.format(key), count, ex=1800)
94 | return count
95 |
--------------------------------------------------------------------------------
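
A small sanity-check sketch for `MemoryBitArray` (requires the `bitarray` package; the `key` argument is ignored by the in-memory backend):

```python
from magical.sync_spider.middleware.duplicate.bit_array import MemoryBitArray

bits = MemoryBitArray(num_bits=16)

# set three offsets to 1; the previous values come back (all 0 on a fresh array)
print(bits.set(None, [1, 3, 5], 1))   # [0, 0, 0]
print(bits.get(None, [1, 2, 3]))      # [True, False, True]
print(bits.count(None))               # 3
```
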
/magical/sync_spider/middleware/duplicate/bloom_filter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: bloom_filter.py
6 | Time: 2021/5/13 下午3:31
7 | -------------------------------------------------
8 | Change Activity: 2021/5/13 下午3:31
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import hashlib
13 | import math
14 | import threading
15 | import time
16 |
17 | from struct import unpack, pack
18 |
19 | from magical.sync_spider.middleware.duplicate import bit_array
20 | from magical.sync_spider.common.redis_lock import RedisLock
21 |
22 |
23 | def make_hash_funcs(num_slices, num_bits):
24 | if num_bits >= (1 << 31):
25 | fmt_code, chunk_size = "Q", 8
26 | elif num_bits >= (1 << 15):
27 | fmt_code, chunk_size = "I", 4
28 | else:
29 | fmt_code, chunk_size = "H", 2
30 | total_hash_bits = 8 * num_slices * chunk_size
31 | if total_hash_bits > 384:
32 | hash_fn = hashlib.sha512
33 | elif total_hash_bits > 256:
34 | hash_fn = hashlib.sha384
35 | elif total_hash_bits > 160:
36 | hash_fn = hashlib.sha256
37 | elif total_hash_bits > 128:
38 | hash_fn = hashlib.sha1
39 | else:
40 | hash_fn = hashlib.md5
41 |
42 | fmt = fmt_code * (hash_fn().digest_size // chunk_size)
43 | num_salts, extra = divmod(num_slices, len(fmt))
44 | if extra:
45 | num_salts += 1
46 | salts = tuple(hash_fn(hash_fn(pack("I", i)).digest()) for i in range(num_salts))
47 |
48 | def _make_hash_funcs(key):
49 | if isinstance(key, str):
50 | key = key.encode("utf-8")
51 | else:
52 | key = str(key).encode("utf-8")
53 |
54 | i = 0
55 | for salt in salts:
56 | h = salt.copy()
57 | h.update(key)
58 | for uint in unpack(fmt, h.digest()):
59 | yield uint % num_bits
60 | i += 1
61 | if i >= num_slices:
62 | return
63 |
64 | return _make_hash_funcs
65 |
66 |
67 | class BloomFilter(object):
68 | def __init__(self, spider, filter_queue_type):
69 | self.capacity = spider.settings.get_int('FILTER_INITIAL_CAPACITY')
70 | self.error_rate = spider.settings.get_float('FILTER_ERROR_RATE')
71 |
72 | if not (0 < self.error_rate < 1):
73 | raise ValueError("Error_Rate must be between 0 and 1.")
74 |
75 | if not self.capacity > 0:
76 | raise ValueError("Capacity must be > 0")
77 |
78 | num_slices = int(math.ceil(math.log(1.0 / self.error_rate, 2)))
79 | bits_per_slice = int(
80 | math.ceil(
81 | (self.capacity * abs(math.log(self.error_rate)))
82 | / (num_slices * (math.log(2) ** 2))
83 | )
84 | )
85 |
86 | self.num_slices = num_slices
87 | self.bits_per_slice = bits_per_slice
88 | self.num_bits = num_slices * bits_per_slice
89 | self.make_hashes = make_hash_funcs(self.num_slices, self.bits_per_slice)
90 |
91 | self._is_at_capacity = False
92 | self._check_capacity_time = 0
93 |
94 | if filter_queue_type == 'memory':
95 | self.bit_array = bit_array.MemoryBitArray(self.num_bits)
96 | self.bit_array.set_all(False)
97 |
98 | elif filter_queue_type == 'redis':
99 | self.bit_array = bit_array.RedisBitArray(spider)
100 |
101 | else:
102 | raise ValueError("not support this filter_queue_type")
103 |
104 | def is_at_capacity(self, filter_key):
105 | if self._is_at_capacity:
106 | return self._is_at_capacity
107 |
108 | bit_count = self.bit_array.count(filter_key)
109 | if bit_count and bit_count / self.num_bits > 0.5:
110 | self._is_at_capacity = True
111 |
112 | return self._is_at_capacity
113 |
114 | def get(self, filter_key, value):
115 | is_list = isinstance(value, list)
116 | keys = value if is_list else [value]
117 | is_exists = []
118 |
119 | offsets = []
120 | for key in keys:
121 | hashes = self.make_hashes(key)
122 | offset = 0
123 | for k in hashes:
124 | offsets.append(offset + k)
125 | offset += self.bits_per_slice
126 |
127 | old_values = self.bit_array.get(filter_key, offsets)
128 |
129 | for i in range(0, len(old_values), self.num_slices):
130 | is_exists.append(int(all(old_values[i: i + self.num_slices])))
131 |
132 | return is_exists if is_list else is_exists[0]
133 |
134 | def add(self, filter_key, value):
135 | if self.is_at_capacity(filter_key):
136 | raise IndexError("BloomFilter is at capacity")
137 |
138 | is_list = isinstance(value, list)
139 | keys = value if is_list else [value]
140 | is_added = []
141 |
142 | offsets = []
143 | for key in keys:
144 | hashes = self.make_hashes(key)
145 | offset = 0
146 | for k in hashes:
147 | offsets.append(offset + k)
148 | offset += self.bits_per_slice
149 |
150 | old_values = self.bit_array.set(filter_key, offsets, 1)
151 | for i in range(0, len(old_values), self.num_slices):
152 | is_added.append(1 ^ int(all(old_values[i: i + self.num_slices])))
153 |
154 | return is_added if is_list else is_added[0]
155 |
156 |
157 | class ScalableBloomFilter(object):
158 | def __init__(self, spider):
159 | self.spider = spider
160 | red_name = spider.settings['FILTER_REDIS_NAME']
161 |         self.red = getattr(spider, red_name) if red_name else spider.red
162 |
163 | self.filter_queue_type = spider.settings['FILTER_QUEUE_TYPE']
164 |
165 | self.filters = []
166 | self.filters.append(self.create_filter())
167 |
168 | self._thread_lock = threading.RLock()
169 | self._check_capacity_time = 0
170 |
171 | def create_filter(self):
172 | return BloomFilter(self.spider, self.filter_queue_type)
173 |
174 | def __check_filter_capacity(self, filter_key):
175 | if not self._check_capacity_time or time.time() - self._check_capacity_time > 1800:
176 | if self.filter_queue_type == 'memory':
177 | with self._thread_lock:
178 | while True:
179 | if self.filters[-1].is_at_capacity(filter_key):
180 | self.filters.append(self.create_filter())
181 | else:
182 | break
183 |
184 | self._check_capacity_time = time.time()
185 | else:
186 |                     # global lock: only one process actually creates the new filter at a time; once it finishes, the others simply append the filter it just created
187 | with RedisLock(key="ScalableBloomFilter", timeout=300, wait_timeout=300, redis_cli=self.red) as lock:
188 | if lock.locked:
189 | while True:
190 | if self.filters[-1].is_at_capacity(filter_key):
191 | self.filters.append(self.create_filter())
192 | else:
193 | break
194 |
195 | self._check_capacity_time = time.time()
196 |
197 | def get(self, filter_key, value):
198 | self.__check_filter_capacity(filter_key)
199 |
200 | is_list = isinstance(value, list)
201 | keys = value if is_list else [value]
202 | not_exist_keys = list(set(keys))
203 |
204 |         # check whether the keys already exist in earlier bloom filters
205 |         # keys found at one level are recorded; missing keys continue down the chain
206 |         for f in reversed(self.filters):
207 |             # existence flags for the current filter
208 |             current_filter_is_exists = f.get(filter_key, not_exist_keys)
209 | 
210 |             not_exist_keys_temp = []
211 | 
212 |             for checked_key, is_exist in zip(not_exist_keys, current_filter_is_exists):
213 |                 # keys missing from the current filter must be checked further down
214 |                 if not is_exist:
215 |                     not_exist_keys_temp.append(checked_key)
216 | 
217 |             not_exist_keys = not_exist_keys_temp
218 | 
219 |             if not not_exist_keys:
220 |                 break
221 | 
222 |         # mark which keys already exist; for duplicates within the batch, only the first occurrence counts as missing, the rest count as existing
223 | for i, key in enumerate(keys):
224 | for j, not_exist_key in enumerate(not_exist_keys):
225 | if key == not_exist_key:
226 | keys[i] = 0
227 | not_exist_keys.pop(j)
228 | break
229 | else:
230 | keys[i] = 1
231 |
232 | is_exists = keys
233 | return is_exists if is_list else is_exists[0]
234 |
235 | def add(self, filter_key, value, skip_check=False):
236 | self.__check_filter_capacity(filter_key)
237 |
238 | current_filter = self.filters[-1]
239 |
240 | if skip_check:
241 | return current_filter.add(filter_key, value)
242 |
243 | else:
244 | is_list = isinstance(value, list)
245 | keys = value if is_list else [value]
246 | not_exist_keys = list(set(keys))
247 |
248 |             # check whether the keys already exist in earlier bloom filters
249 |             # keys found at one level are recorded; missing keys continue down the chain
250 |             for f in reversed(self.filters):
251 |                 # existence flags for the current filter
252 |                 current_filter_is_exists = f.get(filter_key, not_exist_keys)
253 | 
254 |                 not_exist_keys_temp = []
255 | 
256 |                 for key, is_exist in zip(not_exist_keys, current_filter_is_exists):
257 |                     # keys missing from the current filter must be checked further down
258 |                     if not is_exist:
259 |                         not_exist_keys_temp.append(key)
260 | 
261 |                 not_exist_keys = not_exist_keys_temp
262 | 
263 |                 if not not_exist_keys:
264 |                     break
265 | 
266 |             # keys still missing everywhere are recorded in the current filter
267 |             if not_exist_keys:
268 |                 current_filter.add(filter_key, not_exist_keys)
269 | 
270 |             # mark which keys were newly added; for duplicates within the batch, only the first occurrence counts as new, the rest count as existing
271 | for i, key in enumerate(keys):
272 | for j, not_exist_key in enumerate(not_exist_keys):
273 | if key == not_exist_key:
274 | keys[i] = 1
275 | not_exist_keys.pop(j)
276 | break
277 | else:
278 | keys[i] = 0
279 |
280 | is_added = keys
281 | return is_added if is_list else is_added[0]
282 |
283 | @property
284 | def capacity(self):
285 | return sum(f.capacity for f in self.filters)
286 |
--------------------------------------------------------------------------------
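
`BloomFilter.__init__` applies the standard sizing formulas: `k = ceil(log2(1/p))` hash slices and `m = ceil(n * |ln p| / (k * ln(2)^2))` bits per slice. A quick check against the (commented-out) defaults in the settings template, capacity 1e8 at error rate 1e-5:

```python
import math

capacity, error_rate = 100_000_000, 0.00001  # FILTER_INITIAL_CAPACITY / FILTER_ERROR_RATE

num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))          # 17
bits_per_slice = int(math.ceil(
    (capacity * abs(math.log(error_rate)))
    / (num_slices * (math.log(2) ** 2))
))                                                                  # ~141 million
num_bits = num_slices * bits_per_slice                              # ~2.4 billion bits

print(num_slices, bits_per_slice, num_bits // 8 // 2 ** 20, 'MiB')  # about 285 MiB
```
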
/magical/sync_spider/middleware/duplicate/expire_filter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: expire_filter.py
6 | Time: 2021/5/13 下午2:26
7 | -------------------------------------------------
8 | Change Activity: 2021/5/13 下午2:26
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import time
13 |
14 |
15 | class ExpireFilter(object):
16 |
17 | def __init__(self, spider):
18 | red_name = spider.settings['FILTER_REDIS_NAME']
19 |
20 | self.expire = spider.settings['FILTER_REDIS_KEY_EXPIRE']
21 |         self.red = getattr(spider, red_name) if red_name else spider.red
22 |
23 | @property
24 | def current_timestamp(self):
25 | return int(time.time())
26 |
27 | def get(self, filter_key, value):
28 | return self.red.zscore(filter_key, value)
29 |
30 | def add(self, filter_key, value):
31 | return self.red.zadd(filter_key, value)
32 |
33 | # def del_expire_key(self):
34 | # self.red.zremrangebyscore(self.name, "-inf", self.current_timestamp - self.expire_time)
35 | #
36 | # def record_expire_time(self):
37 | # if self.expire_time_record_key:
38 | # self.red.hset(self.expire_time_record_key, key=self.name, value=self.expire_time)
39 |
--------------------------------------------------------------------------------
/magical/sync_spider/middleware/duplicate/handler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: handler.py
6 | Time: 2021/5/13 下午1:45
7 | -------------------------------------------------
8 | Change Activity: 2021/5/13 下午1:45
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import copy
13 |
14 | from magical.utils import load_objects
15 |
16 |
17 | class DuplicateHandler(object):
18 |
19 | def __init__(self, spider):
20 | self.spider = spider
21 |
22 | self.use_md5 = self.spider.settings['FILTER_USE_MD5']
23 | self.filter_method = load_objects(self.spider.settings['FILTER_METHOD_MANAGER'])(spider)
24 |
25 | def __deal_data(self, filter_data):
26 | if self.use_md5:
27 | value = self.spider.spider_util.get_md5_encrypt(filter_data)
28 |
29 | else:
30 | value = copy.deepcopy(filter_data)
31 |
32 | return value
33 |
34 | def get(self, key, value):
35 | return self.filter_method.get(key, self.__deal_data(value))
36 |
37 | def add(self, key, value):
38 | return self.filter_method.add(key, self.__deal_data(value))
39 |
--------------------------------------------------------------------------------
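
A hedged usage sketch for the handler; note the settings template ships the whole `FILTER_*` block commented out ("not used for now"), so this assumes those settings are enabled and `spider` is a fully initialized instance:

```python
from magical.sync_spider.middleware.duplicate.handler import DuplicateHandler

handler = DuplicateHandler(spider)     # spider: initialized instance (assumed)

url = 'https://example.com/item/1'
if not handler.get('seen_urls', url):  # falsy -> not seen before
    handler.add('seen_urls', url)
    print('new url, crawl it')
```
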
/magical/sync_spider/middleware/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/5/17 下午5:54
7 | -------------------------------------------------
8 | Change Activity: 2021/5/17 下午5:54
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/sync_spider/middleware/pipeline/handler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: handler.py
6 | Time: 2021/5/17 下午6:07
7 | -------------------------------------------------
8 | Change Activity: 2021/5/17 下午6:07
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from magical.utils import load_objects
13 |
14 |
15 | class PipelineHandler(object):
16 |
17 | def __init__(self, spider, **kwargs):
18 | self.spider = spider
19 | self.kwargs = kwargs
20 | self.logger = spider.logger
21 | self.settings = spider.settings
22 |
23 | handler_manager_cls = self.settings['PIPELINE_MIDDLEWARE_MANAGER_PATH']
24 | self.middleware = load_objects(handler_manager_cls)(spider)
25 |
26 | def pipeline(self, item, **kwargs):
27 | return self.middleware.pipeline(item, **kwargs)
28 |
--------------------------------------------------------------------------------
/magical/sync_spider/middleware/pipeline/manager.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: manager.py
6 | Time: 2021/5/17 下午5:54
7 | -------------------------------------------------
8 | Change Activity: 2021/5/17 下午5:54
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from collections import defaultdict
13 |
14 | from magical.sync_spider.extends_module.base_module.pipeline import PipelineMiddleware
15 | from magical.sync_spider.common.utils import call_func_item
16 | from magical.utils import load_objects
17 |
18 |
19 | class PipelineMiddlewareManager(object):
20 | def __init__(self, spider):
21 | self.methods = defaultdict(list)
22 | self.spider = spider
23 | self.settings = spider.settings
24 | self.middleware_s = self.__load_middleware()
25 |
26 | for miw in self.middleware_s:
27 | self.__add_middleware(miw)
28 |
29 | def __load_middleware(self):
30 | middleware_s = []
31 | middleware_s_dict = self.settings["PIPELINE_MIDDLEWARE_PATH"]
32 | middleware_s_list = sorted(middleware_s_dict.items(), key=lambda x: x[1])
33 |
34 | for middleware_key, value in middleware_s_list:
35 | middleware = load_objects(middleware_key)
36 | if issubclass(middleware, PipelineMiddleware):
37 | middleware_instance = middleware(self.spider)
38 | middleware_s.append(middleware_instance)
39 | return middleware_s
40 |
41 | def __add_middleware(self, miw):
42 | if hasattr(miw, "process_item"):
43 | self.methods['process_item'].append(miw.process_item)
44 |
45 | if hasattr(miw, "process_exception"):
46 | self.methods['process_exception'].append(miw.process_exception)
47 |
48 | def pipeline(self, item, **kwargs):
49 |
50 |         def process_item(item):
51 |             for method in self.methods['process_item']:
52 |                 item = method(item, **kwargs)
53 |                 if not item:
54 |                     return item
55 | 
56 |             return item
57 |
58 | def process_exception(exception):
59 | for method in self.methods['process_exception']:
60 | exception = method(item, exception)
61 | if not exception:
62 | return exception
63 | return exception
64 |
65 | return call_func_item(process_item, process_exception, item)
66 |
--------------------------------------------------------------------------------
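
A minimal pipeline middleware sketch with the two hooks `__add_middleware` collects; as on the download side, lower priority numbers run first. The class is hypothetical, and `self.logger` is assumed to exist on the base class as it does for `DownloaderMiddleware`:

```python
from magical.sync_spider.extends_module.base_module.pipeline import PipelineMiddleware


class LoggingPipeline(PipelineMiddleware):
    """Hypothetical example: log every item flowing through the pipeline."""

    def process_item(self, item, **kwargs):
        self.logger.info(f'item: {item}')  # logger assumed on the base class
        return item                        # a falsy return stops the later hooks

    def process_exception(self, item, exception):
        return exception


# enabled via settings, e.g.:
# PIPELINE_MIDDLEWARE_PATH = {'spiders.my_project.pipeline.LoggingPipeline': 10}
```
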
/magical/template.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: template.py
6 | Time: 2021/4/14 下午3:37
7 | -------------------------------------------------
8 | Change Activity: 2021/4/14 下午3:37
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import re
14 | import string
15 |
16 |
17 | def render_template_file(path, **kwargs):
18 | with open(path, 'rb') as fp:
19 | raw = fp.read().decode('utf8')
20 |
21 | content = string.Template(raw).substitute(**kwargs)
22 |
23 | render_path = path[:-len('.tmpl')] if path.endswith('.tmpl') else path
24 | with open(render_path, 'wb') as fp:
25 | fp.write(content.encode('utf8'))
26 | if path.endswith('.tmpl'):
27 | os.remove(path)
28 |
29 |
30 | CAMELCASE_INVALID_CHARS = re.compile(r'[^a-zA-Z\d]')
31 |
32 |
33 | def string_camelcase(value):
34 |     return CAMELCASE_INVALID_CHARS.sub('', value.title())
35 |
--------------------------------------------------------------------------------
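
`render_template_file` is a thin wrapper over `string.Template`: it substitutes `${...}` placeholders, writes the result next to the template without the `.tmpl` suffix, and deletes the template file. A sketch with hypothetical paths; note `substitute` raises `KeyError` if any placeholder is left unfilled:

```python
from magical.template import render_template_file, string_camelcase

spider_name = 'douban_spider'

# writes /tmp/spider.py and removes /tmp/spider.py.tmpl (paths hypothetical)
render_template_file(
    '/tmp/spider.py.tmpl',
    spider_name=spider_name,
    SpiderName=string_camelcase(spider_name),  # -> 'DoubanSpider'
)
```
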
/magical/templates/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/4/22 上午12:05
7 | -------------------------------------------------
8 | Change Activity: 2021/4/22 上午12:05
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/templates/sync_spider/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/4/22 上午12:08
7 | -------------------------------------------------
8 | Change Activity: 2021/4/22 上午12:08
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/templates/sync_spider/base_spider.py.tmpl:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: base_spider.py
6 | Time: ${create_time}
7 | -------------------------------------------------
8 | Change Activity: ${create_time}
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 | from magical.sync_spider.common.base_spider import BaseSpider
14 |
15 |
16 | class ${SpiderName}BaseSpider(BaseSpider):
17 | def __init__(self, spider):
18 | super().__init__(spider)
19 |
--------------------------------------------------------------------------------
/magical/templates/sync_spider/middleware.py.tmpl:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: middleware.py
6 | Time: ${create_time}
7 | -------------------------------------------------
8 | Change Activity: ${create_time}
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import random
13 | import time
14 |
15 | import requests
16 |
17 | from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware
18 |
19 |
20 | # ------------------------------------------------default middleware------------------------------------------------
21 |
22 |
23 | class HeadersMiddleware(DownloaderMiddleware):
24 |     """Request-header middleware"""
25 |
26 | def __init__(self, spider):
27 | super().__init__(spider)
28 |
29 | def process_request(self, request):
30 | request.headers.update({'Connection': 'close'})
31 | return request
32 |
33 |
34 | class ProxyMiddleware(DownloaderMiddleware):
35 |     """Proxy IP middleware"""
36 |
37 | def __init__(self, spider):
38 | super().__init__(spider)
39 |
40 | self.proxy.proxy_handler(num=1)
41 |
42 | def process_request(self, request):
43 | request.meta['proxy'] = self.proxy.get_proxy()
44 | return request
45 |
46 | def process_response(self, request, response):
47 | return response
48 |
49 | def process_exception(self, request, exception):
50 | self.logger.error(f'ProxyMiddleware.process_exception: {exception}, request: {request}', exc_info=True)
51 |
52 | if isinstance(exception, (
53 | requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout,
54 | requests.exceptions.ReadTimeout, requests.exceptions.Timeout)):
55 |             self.logger.error(f'ProxyMiddleware - retrying failed request - request: {request}')
56 | time.sleep(random.randint(3, 5))
57 | self.proxy.proxy_handler(request, num=1)
58 | return self._retry(request)
59 |
60 | return exception
61 |
62 |
63 | class RequestErrorMiddleware(DownloaderMiddleware):
64 |     """Request-error middleware"""
65 |
66 | def __init__(self, spider):
67 | super().__init__(spider)
68 |
69 | def process_exception(self, request, exception):
70 | self.logger.error(f'RequestErrorMiddleware.process_exception: {exception}, request: {request}', exc_info=True)
71 |
72 | if isinstance(exception, (
73 | requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout,
74 | requests.exceptions.ReadTimeout, requests.exceptions.Timeout)):
75 |             self.logger.error(f'RequestErrorMiddleware - retrying failed request - request: {request}')
76 | time.sleep(random.randint(3, 5))
77 | return self._retry(request)
78 |
79 | elif isinstance(exception, requests.exceptions.HTTPError):
80 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.HTTPError - request: {request}')
81 | return None
82 |
83 | elif isinstance(exception, requests.exceptions.ChunkedEncodingError):
84 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.ChunkedEncodingError - request: {request}')
85 | return None
86 |
87 | elif isinstance(exception, requests.exceptions.SSLError):
88 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.SSLError - request: {request}')
89 | return None
90 |
91 | return exception
92 |
93 |
94 | # -------------------------------------------------spider middleware-------------------------------------------------
95 |
96 |
97 | class ${SpiderName}Middleware(DownloaderMiddleware):
98 |
99 | def __init__(self, spider):
100 | super().__init__(spider)
101 |
102 | def process_request(self, request):
103 | return request
104 |
105 | def process_response(self, request, response):
106 | if not request.use_middleware:
107 | return response
108 |
109 | return response
110 |
111 | def process_exception(self, request, exception):
112 | self.logger.error(f'${SpiderName}Middleware.process_exception: {exception}, request: {request}')
113 | return exception
114 |
--------------------------------------------------------------------------------
/magical/templates/sync_spider/settings.py.tmpl:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: settings.py
6 | Time: ${create_time}
7 | -------------------------------------------------
8 | Change Activity: ${create_time}
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from magical.utils import log_path
13 |
14 | # project settings
15 |
16 |
17 | # -------------------------------------------------------------------------------------------------------------------
18 |
19 | # project name
20 | PROJECT_NAME = '${project_name}'
21 | 
22 | # logger path
23 | LOGGER_PATH = log_path(__file__)
24 | 
25 | # retry count
26 | RETRY_COUNT = 10
27 | 
28 | # pipeline middlewares; several can be configured
29 | # PIPELINE_MIDDLEWARE_PATH = {
30 | #     "${spider_path}.pipeline.${SpiderName}Pipeline": 10
31 | # }
32 | 
33 | # download middlewares; several can be configured
34 | DOWNLOAD_MIDDLEWARE_PATH = {
35 |     # "${spider_path}.middleware.DuplicateMiddleware": 7,
36 |     # "${spider_path}.middleware.HeadersMiddleware": 8,
37 |     # "${spider_path}.middleware.ProxyMiddleware": 9,
38 |     "${spider_path}.middleware.RequestErrorMiddleware": 10,
39 |     "${spider_path}.middleware.${SpiderName}Middleware": 100
40 | }
41 | 
42 | # spider base class shared by the project's spiders
43 | BASE_SPIDER_PATH = "${spider_path}.base_spider.${SpiderName}BaseSpider"
44 |
45 | # user-agent
46 | UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
47 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
48 |
49 | # -------------------------------------------------------------------------------------------------------------------
50 |
51 |
52 | # default settings
53 |
54 | # downloader
55 | DOWNLOADER_PATH = "magical.sync_spider.middleware.download.downloader.Downloader"
56 | 
57 | # download handler
58 | DOWNLOAD_HANDLER_PATH = "magical.sync_spider.middleware.download.handler.DownloadHandler"
59 | 
60 | # download middleware manager
61 | DOWNLOAD_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.download.manager.DownloadMiddlewareManager"
62 | 
63 | # download middlewares; several can be configured
64 | # DOWNLOAD_MIDDLEWARE_PATH = {}
65 |
66 | # -------------------------------------------------------------------------------------------------------------------
67 |
68 | # pipeline handler
69 | # PIPELINE_HANDLER_PATH = "magical.sync_spider.middleware.pipeline.handler.PipelineHandler"
70 | 
71 | # pipeline middleware manager
72 | # PIPELINE_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.pipeline.manager.PipelineMiddlewareManager"
73 | 
74 | # pipeline middlewares; several can be configured
75 | # PIPELINE_MIDDLEWARE_PATH = {}
76 |
77 | # -------------------------------------------------------------------------------------------------------------------
78 | # not used for now; known issues
79 | # # duplicate-filter handler
80 | # FILTER_DUPLICATE_HANDLER = "magical.sync_spider.middleware.duplicate.handler.DuplicateHandler"
81 | #
82 | # # duplicate-filter implementation
83 | # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.bloom_filter.ScalableBloomFilter"
84 | # # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.expire_filter.ExpireFilter"
85 | #
86 | # # filter backend: redis, or memory = in-process
87 | # FILTER_QUEUE_TYPE = 'redis'
88 | #
89 | # # whether filter values are md5-hashed first
90 | # FILTER_USE_MD5 = False
91 | #
92 | # # which redis instance to use for deduplication; connection name, defaults to red
93 | # FILTER_REDIS_NAME = 'red'
94 | #
95 | # # initial filter capacity
96 | # FILTER_INITIAL_CAPACITY = 100000000
97 | #
98 | # # acceptable error rate
99 | # FILTER_ERROR_RATE = 0.00001
100 |
101 | # -------------------------------------------------------------------------------------------------------------------
102 |
103 | # # rabbit mq config
104 | # MESSAGE_MQ_CONFIG = {
105 | #     'username': 'admin',
106 | #     'password': 'admin123',
107 | #     'host': '127.0.0.1',
108 | #     'port': 9999
109 | # }
110 | #
111 | # # rabbit mq prefetch: consume 10 messages per batch
112 | # MESSAGE_MQ_PREFETCH_COUNT = 10
113 | #
114 | # # rabbit mq virtual host
115 | # MESSAGE_MQ_VIRTUAL_HOST = 'spider'
116 | #
117 | # # rabbit mq handler class
118 | # MESSAGE_MQ_HANDLER = 'magical.sync_spider.extends_module.mqs.rabbit_mq.handler.RabbitMQHandler'
119 |
120 | # -------------------------------------------------------------------------------------------------------------------
121 |
122 | # spider base class
123 | # BASE_SPIDER_PATH = "magical.sync_spider.common.base_spider.BaseSpider"
124 | 
125 | # spider utility class
126 | SPIDER_UTIL_PATH = "magical.sync_spider.common.spider_util.SpiderUtil"
127 | 
128 | # proxy IP middleware
129 | # fetch proxies from redis
130 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetRedisProxy'
131 | # # ZhiMa proxy IPs
132 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetZhiMaProxy'
133 | 
134 | # email
135 | EMAIL_HANDLER = 'magical.sync_spider.common.email_handler.EmailHandler'
136 | 
137 | # PostgreSQL handler class
138 | POST_GRE_SQL_HANDLER = 'magical.sync_spider.databases.post_gre_sql_pool.PostGreHandle'
139 | 
140 | # mysql handler class
141 | MYSQL_HANDLER = 'magical.sync_spider.databases.mysql_pool.MysqlHandler'
142 | 
143 | # redis handler class
144 | REDIS_HANDLER = 'magical.sync_spider.databases.red_pool.RedisHandler'
145 |
146 | # -------------------------------------------------------------------------------------------------------------------
147 |
148 | # number of proxy IPs fetched at initialization
149 | PROXY_NUM = 5
150 | 
151 | # retry count
152 | # RETRY_COUNT = 3
153 | 
154 | # retry when the response status code is one of the following
155 | RETRY_STATUS_CODES = [500, 502, 503, 504, 400, 403, 408]
156 | 
157 | # skip ssl verification
158 | REQUEST_VERIFY = False
159 | 
160 | # request timeout in seconds
161 | REQUEST_TIMEOUT = 30
162 | 
163 | # cfscrape delay for the cloudflare 5-second shield
164 | SCRAPER_DELAY = 30
165 | 
166 | # number of consumer threads
167 | CONSUMER_THREAD_NUM = 10
168 |
169 | # -------------------------------------------------------------------------------------------------------------------
170 |
171 | """
172 | 数据库配置
173 |
174 | 单个数据库
175 | REDIS_CONFIG = {
176 | 'host': '',
177 | 'host': '',
178 | 'db': '',
179 | 'user': '',
180 | 'password': '',
181 | 'decode_responses': True
182 | }
183 | 使用:
184 | red 默认变量名称
185 | self.red.get('key1')
186 | spider.red.get('key1')
187 |
188 | 多个数据库
189 | REDIS_CONFIG = [
190 | {
191 | 'name': 'name1',
192 | 'host': '',
193 | 'host': '',
194 | 'db': '',
195 | 'user': '',
196 | 'password': '',
197 | 'decode_responses': True
198 | },
199 | {
200 | 'name': 'name2',
201 | 'host': '',
202 | 'host': '',
203 | 'db': '',
204 | 'user': '',
205 | 'password': '',
206 | 'decode_responses': True
207 | },
208 | ]
209 | 使用:
210 | self.name1.get('key1')
211 | spider.name1.get('key1')
212 |
213 | self.name2.get('key1')
214 | spider.name2.get('key1')
215 | """
216 |
--------------------------------------------------------------------------------
/magical/templates/sync_spider/spider.py.tmpl:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: ${spider_name}.py
6 | Time: ${create_time}
7 | -------------------------------------------------
8 | Change Activity: ${create_time}
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import sys
14 |
15 | file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
16 | sys.path.append(file_path)
17 |
18 | from magical.sync_spider import SyncSpider, Request, run_spider
19 |
20 |
21 | class ${SpiderName}Spider(SyncSpider):
22 | name = '${spider_name}'
23 | settings_path = '${settings_path}'
24 |
25 | default_custom_setting = {}
26 |
27 | def __init__(self, *args, **kwargs):
28 | custom_setting = {}
29 | kwargs.update(dict(custom_setting=custom_setting))
30 | super().__init__(*args, **kwargs)
31 |
32 | def start_spider(self):
33 | self.logger.info(f'Hello {self.name}')
34 |
35 | request = Request(url='http://www.baidu.com/')
36 | response = self.download(request)
37 |
38 |         title = response.re.findall('<title>(.*?)</title>')
39 | self.logger.info(f'title: {title}')
40 |
41 |
42 | if __name__ == '__main__':
43 | run_spider(${SpiderName}Spider)
44 |
--------------------------------------------------------------------------------
/magical/templates/sync_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/4/22 上午12:08
7 | -------------------------------------------------
8 | Change Activity: 2021/4/22 上午12:08
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/templates/sync_spider/spiders/__init__.py.tmpl:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: ${create_time}
7 | -------------------------------------------------
8 | Change Activity: ${create_time}
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/magical/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: utils.py
6 | Time: 2021/4/20 上午12:31
7 | -------------------------------------------------
8 | Change Activity: 2021/4/20 上午12:31
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import time
14 | import datetime
15 |
16 | from importlib import import_module
17 | from decimal import Decimal, ROUND_HALF_UP
18 |
19 |
20 | def log_path(project_path):
21 |     """Resolve the logs directory.
22 | 
23 |     Args:
24 |         project_path: an absolute path inside the project
25 |     Returns: the log directory path
26 |     """
27 | s_path = os.path.basename(os.path.abspath(project_path))
28 |
29 | if s_path == 'spiders':
30 | return os.path.join(os.path.dirname(project_path), 'logs')
31 |
32 | else:
33 | return log_path(os.path.dirname(project_path))
34 |
35 |
36 | # load a python module from a dotted path
37 | def load_files(path):
38 | return import_module(path)
39 |
40 |
41 | # load an object (class or function) from a dotted module path
42 | def load_objects(path):
43 | try:
44 | dot = path.rindex('.')
45 |     except ValueError:
46 | raise ValueError("Error loading object '%s': not a full path" % path)
47 |
48 | module, name = path[:dot], path[dot + 1:]
49 | mod = import_module(module)
50 |
51 | try:
52 | obj = getattr(mod, name)
53 | except AttributeError:
54 | raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
55 |
56 | return obj
57 |
58 |
59 | def round_half_up(digit, n=2):
60 | return Decimal(str(digit)).quantize(Decimal('0.' + '0' * n), rounding=ROUND_HALF_UP)
61 |
62 |
63 | def get_fmt_time(fmt="%Y-%m-%d %H:%M:%S", timestamp=None):
64 | if timestamp:
65 | return time.strftime(fmt, time.localtime(int(timestamp)))
66 | return datetime.datetime.now().strftime(fmt)
67 |
--------------------------------------------------------------------------------
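
A short check of the helpers above. `load_objects` needs a full dotted path ending in an attribute name; `round_half_up` gives true half-up rounding, which the float-based built-in `round()` does not guarantee:

```python
from magical.utils import load_objects, round_half_up, get_fmt_time

cls = load_objects('magical.sync_spider.middleware.download.handler.DownloadHandler')
print(cls.__name__)              # DownloadHandler

print(round_half_up(2.675))      # Decimal('2.68'); plain round(2.675, 2) gives 2.67
print(get_fmt_time('%Y-%m-%d'))  # today's date
```
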
/requirements.txt:
--------------------------------------------------------------------------------
1 | redis==3.5.3
2 | pandas==1.2.3
3 | PyExecJS==1.5.1
4 | opencv_python==4.5.1.48
5 | requests==2.25.1
6 | PyMySQL==0.9.3
7 | urllib3==1.26.4
8 | rsa==4.7.2
9 | SQLAlchemy==1.4.5
10 | xlwt==1.3.0
11 | psycopg2==2.7.7
12 | lxml==4.6.3
13 | DBUtils==1.3
14 | pycryptodome==3.10.1
15 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: setup.py
6 | Time: 2021/4/21 下午9:49
7 | -------------------------------------------------
8 | Change Activity: 2021/4/21 下午9:49
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | from setuptools import setup, find_packages
15 |
16 |
17 | setup(
18 | name='magical',
19 | version='1.1.0',
20 |     description='A lightweight crawler framework modeled on scrapy',
21 | author='magical developers',
22 | maintainer='qinjiahu',
23 | maintainer_email='qinless@qinless.com',
24 | license='BSD',
25 |
26 | packages=find_packages(exclude=(
27 | 'examples', 'examples.*', 'public', 'public.*', 'test', 'test.*', '.gitee', '.gitee.*',
28 | 'public', 'public.*', 'spiders', 'spiders.*', 'logs', 'logs.*'
29 | )),
30 |
31 | package_data={
32 | '': ['*.py.tmpl', '*.json']
33 | },
34 |
35 | include_package_data=True,
36 | zip_safe=False,
37 |
38 | classifiers=[
39 | 'Framework :: Crawler',
40 | 'Environment :: Console',
41 | 'Programming Language :: Python :: 3.6',
42 | 'Programming Language :: Python :: Implementation :: CPython',
43 | 'Programming Language :: Python :: Implementation :: PyPy',
44 | 'Topic :: Software Development :: Libraries :: Application Frameworks',
45 | 'Topic :: Software Development :: Libraries :: Python Modules',
46 | ],
47 |     python_requires='>=3.6',
48 | install_requires=[]
49 | )
50 |
--------------------------------------------------------------------------------
/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/7/1 上午11:38
7 | -------------------------------------------------
8 | Change Activity: 2021/7/1 上午11:38
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/spiders/common/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/7/1 上午11:38
7 | -------------------------------------------------
8 | Change Activity: 2021/7/1 上午11:38
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/spiders/common/excel.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: excel.py
6 | Time: 2021/7/1 上午11:39
7 | -------------------------------------------------
8 | Change Activity: 2021/7/1 上午11:39
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import xlwt
13 |
14 |
15 | def excel_style():
16 |     # create a font for the style
17 |     font = xlwt.Font()
18 |     # the font name must match a font installed on the system
19 |     font.name = u'微软雅黑'
20 |     font.height = 240
21 | 
22 |     alignment = xlwt.Alignment()
23 |     # center horizontally
24 |     alignment.horz = xlwt.Alignment.HORZ_CENTER
25 |     # center vertically
26 |     alignment.vert = xlwt.Alignment.VERT_CENTER
27 | 
28 |     borders = xlwt.Borders()  # create borders
29 |     # add thin border lines
30 |     borders.left = xlwt.Borders.THIN
31 |     borders.right = xlwt.Borders.THIN
32 |     borders.top = xlwt.Borders.THIN
33 |     borders.bottom = xlwt.Borders.THIN
34 |     # colour the borders
35 |     borders.left_colour = 23
36 |     borders.right_colour = 23
37 |     borders.top_colour = 23
38 |     borders.bottom_colour = 23
39 | 
40 |     # initialize the style
41 |     style = xlwt.XFStyle()
42 |     # attach the font
43 |     style.font = font
44 |     # attach the alignment
45 |     style.alignment = alignment
46 |     style.borders = borders
47 |
48 | return style
49 |
50 |
51 | def write_excel(data, headers, name, path_name):
52 | workbook = xlwt.Workbook()
53 | sheet = workbook.add_sheet(name)
54 |
55 | style = excel_style()
56 |
57 | num = 1
58 | for k, v in headers.items():
59 | if k.startswith('$'):
60 | continue
61 | sheet.col(num).width = 100 * 50
62 | sheet.write(0, num, v, style)
63 | num += 1
64 |
65 | col = 0
66 | for n in range(0, len(data)):
67 | num = 1
68 |
69 | pattern = xlwt.Pattern()
70 | if n % 2 == 0:
71 | pattern.pattern = xlwt.Pattern.SOLID_PATTERN
72 | pattern.pattern_fore_colour = 22
73 | else:
74 | pattern.pattern = xlwt.Pattern.SOLID_PATTERN
75 | pattern.pattern_fore_colour = 1
76 |
77 | style.pattern = pattern
78 |
79 | sheet.row(col).height_mismatch = True
80 | sheet.row(col).height = 30 * 20
81 |
82 |         # pull values out of each row by header key
83 | for key in headers.keys():
84 | item = data[n][key]
85 | sheet.write(n + 1, num, item, style)
86 | num += 1
87 |
88 | # for k, v in data[n].items():
89 | # sheet.write(n + 1, num, v, style)
90 | # num += 1
91 |
92 | col += 1
93 |
94 | sheet.row(col).height_mismatch = True
95 | sheet.row(col).height = 30 * 20
96 |
97 | workbook.save(f'{path_name}')
98 |     print(f'{name}.xls written successfully')
99 |
100 |
101 | if __name__ == '__main__':
102 | data_list = [
103 | {'desc': 'desc1', 'name': 'name1', 'plat': 'plat1'},
104 | {'desc': 'desc2', 'name': 'name2', 'plat': 'plat2'},
105 | {'desc': 'desc3', 'name': 'name3', 'plat': 'plat3'},
106 | {'desc': 'desc4', 'name': 'name4', 'plat': 'plat4'},
107 | {'desc': 'desc5', 'name': 'name5', 'plat': 'plat5'},
108 | ]
109 |
110 | title = {'desc': '描述', 'name': '店铺名称', 'plat': '渠道'}
111 | excel_name = 'test'
112 | write_excel(data_list, title, excel_name, excel_name)
113 |
--------------------------------------------------------------------------------
/spiders/common/proxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: proxy.py
6 | Time: 2021/7/1 上午11:39
7 | -------------------------------------------------
8 | Change Activity: 2021/7/1 上午11:39
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | class GetProxy(object):
15 |
16 |     # spider is the running spider instance
17 | def __init__(self, spider):
18 | self.logger = spider.logger
19 |
20 | def get_proxy(self):
21 |         self.logger.info('fetching one proxy IP')
22 | return None
23 |
--------------------------------------------------------------------------------
/spiders/common/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: settings.py
6 | Time: 2021/7/1 上午11:39
7 | -------------------------------------------------
8 | Change Activity: 2021/7/1 上午11:39
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | # unified initialization hook for extra spider utilities and modules
13 | SPIDER_INIT_HANDLER = 'spiders.common.spider_init.SpiderInit'
14 |
15 | EXCEL = 'spiders.common.excel'
16 | PROXY_HANDLER = 'spiders.common.proxy.GetProxy'
17 |
18 | REDIS_CONFIG = {
19 | 'host': '127.0.0.1',
20 | 'port': '6379',
21 | 'db': '0',
22 | 'decode_responses': True
23 | }
24 |
25 |
--------------------------------------------------------------------------------
/spiders/common/spider_init.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: spider_init.py
6 | Time: 2021/7/31 下午5:07
7 | -------------------------------------------------
8 | Change Activity: 2021/7/31 下午5:07
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from magical.sync_spider import load_files
13 |
14 |
15 | class SpiderInit(object):
16 | def __init__(self, spider):
17 | self.settings = spider.settings
18 |
19 | spider.excel = load_files(self.settings['EXCEL'])
20 |
--------------------------------------------------------------------------------
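
After `SpiderInit` runs, the module named by the `EXCEL` setting is attached to the spider instance, so spiders can call the excel helpers directly. A sketch from inside a spider method (`self` is a `SyncSpider`; the data shape matches the `write_excel` demo above):

```python
# inside a SyncSpider subclass method (sketch)
data_list = [{'desc': 'desc1', 'name': 'name1', 'plat': 'plat1'}]
headers = {'desc': '描述', 'name': '店铺名称', 'plat': '渠道'}

self.excel.write_excel(data_list, headers, 'test', 'test.xls')
```
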
/spiders/test_douban/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/7/13 上午11:29
7 | -------------------------------------------------
8 | Change Activity: 2021/7/13 上午11:29
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | from magical.cmdline import generate_spider_project, generate_spider_file
14 |
15 |
16 | def main():
17 | project_path = os.path.dirname(os.path.abspath(__file__))
18 | spider_name = 'douban_spider'
19 |
20 | generate_spider_project('sync_spider', project_path, spider_name)
21 | # generate_spider_file('sync_spider', project_path, spider_name)
22 |
23 |
24 | if __name__ == '__main__':
25 | main()
26 |
--------------------------------------------------------------------------------
/spiders/test_douban/base_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: base_spider.py
6 | Time: 2021/13/13 11:30:06
7 | -------------------------------------------------
8 | Change Activity: 2021/13/13 11:30:06
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 | from magical.sync_spider.common.base_spider import BaseSpider
14 |
15 |
16 | class DoubanSpiderBaseSpider(BaseSpider):
17 | def __init__(self, spider):
18 | super().__init__(spider)
19 |
--------------------------------------------------------------------------------
/spiders/test_douban/middleware.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: middleware.py
6 | Time: 2021/13/13 11:30:06
7 | -------------------------------------------------
8 | Change Activity: 2021/13/13 11:30:06
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import random
13 | import time
14 |
15 | import requests
16 |
17 | from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware
18 |
19 |
20 | # ------------------------------------------------default middleware------------------------------------------------
21 |
22 |
23 | class HeadersMiddleware(DownloaderMiddleware):
24 |     """Request-header middleware"""
25 |
26 | def __init__(self, spider):
27 | super().__init__(spider)
28 |
29 | def process_request(self, request):
30 | request.headers.update({'Connection': 'close'})
31 | return request
32 |
33 |
34 | class ProxyMiddleware(DownloaderMiddleware):
35 |     """Proxy IP middleware"""
36 |
37 | def __init__(self, spider):
38 | super().__init__(spider)
39 |
40 | self.proxy.proxy_handler(num=1)
41 |
42 | def process_request(self, request):
43 | request.meta['proxy'] = self.proxy.get_proxy()
44 | return request
45 |
46 | def process_response(self, request, response):
47 | return response
48 |
49 | def process_exception(self, request, exception):
50 | self.logger.error(f'ProxyMiddleware.process_exception: {exception}, request: {request}', exc_info=True)
51 |
52 | if isinstance(exception, (
53 | requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout,
54 | requests.exceptions.ReadTimeout, requests.exceptions.Timeout)):
55 |             self.logger.error(f'ProxyMiddleware - retrying failed request - request: {request}')
56 | time.sleep(random.randint(3, 5))
57 | self.proxy.proxy_handler(request, num=1)
58 | return self._retry(request)
59 |
60 | return exception
61 |
62 |
63 | class RequestErrorMiddleware(DownloaderMiddleware):
64 |     """Request-error middleware"""
65 |
66 | def __init__(self, spider):
67 | super().__init__(spider)
68 |
69 | def process_exception(self, request, exception):
70 | self.logger.error(f'RequestErrorMiddleware.process_exception: {exception}, request: {request}', exc_info=True)
71 |
72 | if isinstance(exception, (
73 | requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout,
74 | requests.exceptions.ReadTimeout, requests.exceptions.Timeout)):
75 |             self.logger.error(f'RequestErrorMiddleware - retrying failed request - request: {request}')
76 | time.sleep(random.randint(3, 5))
77 | return self._retry(request)
78 |
79 | elif isinstance(exception, requests.exceptions.HTTPError):
80 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.HTTPError - request: {request}')
81 | return None
82 |
83 | elif isinstance(exception, requests.exceptions.ChunkedEncodingError):
84 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.ChunkedEncodingError - request: {request}')
85 | return None
86 |
87 | elif isinstance(exception, requests.exceptions.SSLError):
88 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.SSLError - request: {request}')
89 | return None
90 |
91 | return exception
92 |
93 |
94 | # -------------------------------------------------spider middleware-------------------------------------------------
95 |
96 |
97 | class DoubanSpiderMiddleware(DownloaderMiddleware):
98 |
99 | def __init__(self, spider):
100 | super().__init__(spider)
101 |
102 | def process_request(self, request):
103 | return request
104 |
105 | def process_response(self, request, response):
106 | if not request.use_middleware:
107 | return response
108 |
109 | return response
110 |
111 | def process_exception(self, request, exception):
112 | self.logger.error(f'DoubanSpiderMiddleware.process_exception: {exception}, request: {request}')
113 | return exception
114 |
--------------------------------------------------------------------------------
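Taken together, these classes show the downloader-middleware contract: `process_request` runs before each download, `process_response` after it, and `process_exception` on errors, with `self._retry(request)` re-queuing the request. A minimal sketch of a custom hook built on the same base class (the class name and UA pool below are hypothetical, not part of this repo):

```python
import random

from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware

# Hypothetical UA pool; swap in whatever browsers you need.
UA_POOL = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
]


class RandomUserAgentMiddleware(DownloaderMiddleware):
    """Rotate the User-Agent header before every request."""

    def process_request(self, request):
        # Pick a UA at random before the request goes out.
        request.headers.update({'User-Agent': random.choice(UA_POOL)})
        return request
```

It would then be registered in `DOWNLOAD_MIDDLEWARE_PATH` alongside the others, e.g. `"spiders.test_douban.middleware.RandomUserAgentMiddleware": 8`.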
/spiders/test_douban/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: settings.py
6 | Time: 2021/13/13 11:30:06
7 | -------------------------------------------------
8 | Change Activity: 2021/13/13 11:30:06
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from magical.utils import log_path
13 |
14 | # project settings
15 |
16 |
17 | # -------------------------------------------------------------------------------------------------------------------
18 |
19 | # Project name
20 | PROJECT_NAME = 'test_douban'
21 |
22 | # Logger path
23 | LOGGER_PATH = log_path(__file__)
24 |
25 | # Retry count
26 | RETRY_COUNT = 10
27 |
28 | # Pipeline middleware; multiple entries may be configured
29 | # PIPELINE_MIDDLEWARE_PATH = {
30 | # "spiders.test_douban.pipeline.DoubanSpiderPipeline": 10
31 | # }
32 |
33 | # Download middleware; multiple entries may be configured
34 | DOWNLOAD_MIDDLEWARE_PATH = {
35 | # "spiders.test_douban.middleware.DuplicateMiddleware": 7,
36 | # "spiders.test_douban.middleware.HeadersMiddleware": 8,
37 | # "spiders.test_douban.middleware.ProxyMiddleware": 9,
38 | "spiders.test_douban.middleware.RequestErrorMiddleware": 10,
39 | "spiders.test_douban.middleware.DoubanSpiderMiddleware": 100
40 | }
41 |
42 | # Spider base class, shared across spiders
43 | BASE_SPIDER_PATH = "spiders.test_douban.base_spider.DoubanSpiderBaseSpider"
44 |
45 | # user-agent
46 | UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
47 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
48 |
49 | # -------------------------------------------------------------------------------------------------------------------
50 |
51 |
52 | # default settings
53 |
54 | # Downloader
55 | DOWNLOADER_PATH = "magical.sync_spider.middleware.download.downloader.Downloader"
56 |
57 | # Download handler
58 | DOWNLOAD_HANDLER_PATH = "magical.sync_spider.middleware.download.handler.DownloadHandler"
59 |
60 | # Download middleware manager
61 | DOWNLOAD_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.download.manager.DownloadMiddlewareManager"
62 |
63 | # Download middleware; multiple entries may be configured
64 | # DOWNLOAD_MIDDLEWARE_PATH = {}
65 |
66 | # -------------------------------------------------------------------------------------------------------------------
67 |
68 | # Pipeline handler
69 | # PIPELINE_HANDLER_PATH = "magical.sync_spider.middleware.pipeline.handler.PipelineHandler"
70 |
71 | # Pipeline middleware manager
72 | # PIPELINE_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.pipeline.manager.PipelineMiddlewareManager"
73 |
74 | # Pipeline middleware; multiple entries may be configured
75 | # PIPELINE_MIDDLEWARE_PATH = {}
76 |
77 | # -------------------------------------------------------------------------------------------------------------------
78 | # Disabled for now; known issues
79 | # # Deduplication middleware
80 | # FILTER_DUPLICATE_HANDLER = "magical.sync_spider.middleware.duplicate.handler.DuplicateHandler"
81 | #
82 | # # Deduplication filter
83 | # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.bloom_filter.ScalableBloomFilter"
84 | # # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.expire_filter.ExpireFilter"
85 | #
86 | # # Deduplication queue: 'redis', or 'memory' for in-memory
87 | # FILTER_QUEUE_TYPE = 'redis'
88 | #
89 | # # Whether to md5-hash deduplication keys
90 | # FILTER_USE_MD5 = False
91 | #
92 | # # Which redis instance to use for deduplication; set the connection name, default is 'red'
93 | # FILTER_REDIS_NAME = 'red'
94 | #
95 | # # Initial deduplication capacity
96 | # FILTER_INITIAL_CAPACITY = 100000000
97 | #
98 | # # Deduplication error rate
99 | # FILTER_ERROR_RATE = 0.00001
100 |
101 | # -------------------------------------------------------------------------------------------------------------------
102 |
103 | # # RabbitMQ config
104 | # MESSAGE_MQ_CONFIG = {
105 | # 'username': 'admin',
106 | # 'password': 'admin123',
107 | # 'host': '127.0.0.1',
108 | # 'port': 9999
109 | # }
110 | #
111 | # # RabbitMQ prefetch count: consume 10 messages per batch
112 | # MESSAGE_MQ_PREFETCH_COUNT = 10
113 | #
114 | # # RabbitMQ virtual host
115 | # MESSAGE_MQ_VIRTUAL_HOST = 'spider'
116 | #
117 | # # RabbitMQ handler class
118 | # MESSAGE_MQ_HANDLER = 'magical.sync_spider.extends_module.mqs.rabbit_mq.handler.RabbitMQHandler'
119 |
120 | # -------------------------------------------------------------------------------------------------------------------
121 |
122 | # Spider base class, shared across spiders
123 | # BASE_SPIDER_PATH = "magical.sync_spider.common.base_spider.BaseSpider"
124 |
125 | # Spider utility class
126 | SPIDER_UTIL_PATH = "magical.sync_spider.common.spider_util.SpiderUtil"
127 |
128 | # Proxy IP middleware
129 | # Fetch proxy IPs from redis
130 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetRedisProxy'
131 | # # Zhima proxy IPs
132 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetZhiMaProxy'
133 |
134 | # Email
135 | EMAIL_HANDLER = 'magical.sync_spider.common.email_handler.EmailHandler'
136 |
137 | # PostgreSQL handler class
138 | POST_GRE_SQL_HANDLER = 'magical.sync_spider.databases.post_gre_sql_pool.PostGreHandle'
139 |
140 | # MySQL handler class
141 | MYSQL_HANDLER = 'magical.sync_spider.databases.mysql_pool.MysqlHandler'
142 |
143 | # Redis handler class
144 | REDIS_HANDLER = 'magical.sync_spider.databases.red_pool.RedisHandler'
145 |
146 | # -------------------------------------------------------------------------------------------------------------------
147 |
148 | # Number of proxy IPs to initialize
149 | PROXY_NUM = 5
150 |
151 | # Retry count
152 | # RETRY_COUNT = 3
153 |
154 | # Retry when the response status code is one of the following
155 | RETRY_STATUS_CODES = [500, 502, 503, 504, 400, 403, 408]
156 |
157 | # Skip SSL verification
158 | REQUEST_VERIFY = False
159 |
160 | # Request timeout, in seconds
161 | REQUEST_TIMEOUT = 30
162 |
163 | # Number of consumer threads
164 | CONSUMER_THREAD_NUM = 10
165 |
166 | # -------------------------------------------------------------------------------------------------------------------
167 |
168 | """
169 | 数据库配置
170 |
171 | 单个数据库
172 | REDIS_CONFIG = {
173 | 'host': '',
174 | 'host': '',
175 | 'db': '',
176 | 'user': '',
177 | 'password': '',
178 | 'decode_responses': True
179 | }
180 | 使用:
181 | red 默认变量名称
182 | self.red.get('key1')
183 | spider.red.get('key1')
184 |
185 | 多个数据库
186 | REDIS_CONFIG = [
187 | {
188 | 'name': 'name1',
189 | 'host': '',
190 | 'host': '',
191 | 'db': '',
192 | 'user': '',
193 | 'password': '',
194 | 'decode_responses': True
195 | },
196 | {
197 | 'name': 'name2',
198 | 'host': '',
199 | 'host': '',
200 | 'db': '',
201 | 'user': '',
202 | 'password': '',
203 | 'decode_responses': True
204 | },
205 | ]
206 | 使用:
207 | self.name1.get('key1')
208 | spider.name1.get('key1')
209 |
210 | self.name2.get('key1')
211 | spider.name2.get('key1')
212 | """
213 |
--------------------------------------------------------------------------------
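The docstring at the end of this settings file sketches the `REDIS_CONFIG` shape. A filled-in single-instance example, assuming a local Redis on the default port (all values illustrative):

```python
# Illustrative values for a local Redis; adjust host/port/db to your setup.
REDIS_CONFIG = {
    'host': '127.0.0.1',
    'port': 6379,
    'db': 0,
    'user': '',
    'password': '',
    'decode_responses': True,
}
```

With this in place the connection is exposed as `self.red` / `spider.red`, per the usage notes in the docstring.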
/spiders/test_douban/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/13/13 11:30:06
7 | -------------------------------------------------
8 | Change Activity: 2021/13/13 11:30:06
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/spiders/test_douban/spiders/douban_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: douban_spider.py
6 | Time: 2021/13/13 11:30:06
7 | -------------------------------------------------
8 | Change Activity: 2021/13/13 11:30:06
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import json
13 | import os
14 | import sys
15 |
16 | file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
17 | sys.path.append(file_path)
18 |
19 | from magical.sync_spider import SyncSpider, run_spider, load_files
20 |
21 |
22 | class DoubanSpiderSpider(SyncSpider):
23 | name = 'douban_spider'
24 | settings_path = 'spiders.test_douban.settings'
25 |
26 | default_custom_setting = {}
27 |
28 | def __init__(self, *args, **kwargs):
29 | custom_setting = {}
30 | kwargs.update(dict(custom_setting=custom_setting))
31 | super().__init__(*args, **kwargs)
32 |
33 | self.ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
34 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
35 |
36 | self.excel = load_files(self.settings['EXCEL'])
37 |
38 | def get_list(self, start=0, limit=100, tag='热门'):
39 |
40 | self.logger.info(f'start: {start}, tag: {tag}')
41 |
42 | headers = {
43 | 'Host': 'movie.douban.com',
44 | 'Referer': 'https://movie.douban.com/tv/',
45 | 'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
46 | 'sec-ch-ua-mobile': '?0',
47 | 'Sec-Fetch-Dest': 'empty',
48 | 'Sec-Fetch-Mode': 'cors',
49 | 'Sec-Fetch-Site': 'same-origin',
50 | 'User-Agent': self.ua,
51 | 'X-Requested-With': 'XMLHttpRequest'
52 | }
53 | params = {
54 | 'type': 'tv',
55 | 'tag': tag,
56 | 'sort': 'recommend',
57 | 'page_limit': limit,
58 | 'page_start': start
59 | }
60 | url = 'https://movie.douban.com/j/search_subjects'
61 | response = self.download(url=url, headers=headers, params=params)
62 | subjects = response.json().get('subjects', [])
63 |
64 | if len(subjects) > 0:
65 | self.red.sadd('dbList', *[json.dumps(i, ensure_ascii=False) for i in subjects])
66 |
67 | if len(subjects) < limit:
68 | return
69 |
70 | else:
71 | start += limit
72 | return self.get_list(start=start, limit=limit, tag=tag)
73 |
74 | def get_info(self, list_info):
75 | info_url = list_info['url']
76 | headers = {
77 | 'Host': 'movie.douban.com',
78 | 'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
79 | 'User-Agent': self.ua,
80 | 'Upgrade-Insecure-Requests': '1'
81 | }
82 |
83 | try:
84 | response = self.download(url=info_url, headers=headers)
85 |
86 | # Year
87 | year = response.re.findall(r'\((.*?)\)')
88 | print('year: ', year)
89 |
90 | # Director
91 | dao_yan = response.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')
92 | print('dao_yan: ', dao_yan)
93 |
94 | # Screenwriter
95 | bian_ju = response.xpath('//*[@id="info"]/span[2]/span[2]/a/text()')
96 | print('bian_ju: ', bian_ju)
97 |
98 | # Cast; the pattern assumes Douban's rel="v:starring" anchor markup
99 | zhu_yan = response.re.findall(r'rel="v:starring">(.*?)</a>')
100 | print('zhu_yan: ', zhu_yan)
101 |
102 | # Genre; the pattern assumes Douban's property="v:genre" span markup
103 | lei_xing = response.re.findall(r'<span property="v:genre">(.*?)</span>')
104 | print('lei_xing: ', lei_xing)
105 |
106 | # Country/region of production; the patterns below assume the info block's <span class="pl">label:</span> value<br/> layout
107 | di_qu = response.re.findall(r'制片国家/地区:</span> (.*?)<br/>')
108 | print('di_qu: ', di_qu)
109 |
110 | # Language
111 | yu_yan = response.re.findall(r'语言:</span> (.*?)<br/>')
112 | print('yu_yan: ', yu_yan)
113 |
114 | # Premiere
115 | shou_bo = response.xpath('//*[@id="info"]/span[10]/text()')
116 | print('shou_bo: ', shou_bo)
117 |
118 | # Episode count
119 | ji_shu = response.re.findall(r'集数:</span> (.*?)<br/>')
120 | print('ji_shu: ', ji_shu)
121 |
122 | # Per-episode runtime
123 | dan_ji = response.re.findall(r'单集片长:</span> (.*?)<br/>')
124 | print('dan_ji: ', dan_ji)
125 |
126 | # Overall Douban rating
127 | score = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')
128 | print('score: ', score)
129 |
130 | # Number of ratings
131 | comment_num = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')
132 | print('comment_num: ', comment_num)
133 |
134 | # Number of short reviews
135 | duan_ping_num = response.xpath('//*[@id="comments-section"]/div[1]/h2/span/a/text()')
136 | print('duan_ping_num: ', duan_ping_num)
137 |
138 | # Number of group discussions
139 |
140 | # Plot synopsis
141 | desc = response.xpath('//*[@id="link-report"]/span/text()')
142 | print('desc: ', desc)
143 |
144 | # Tags
145 | tag = response.xpath('//*[@id="content"]/div[2]/div[2]/div[4]/div/a/text()')
146 | print('tag: ', tag)
147 |
148 | # Streaming platforms
149 |
150 | # Currently watching
151 | zai_kan = response.xpath('//*[@id="subject-others-interests"]/div/a[1]/text()')
152 | print('zai_kan: ', zai_kan)
153 |
154 | # Have watched
155 | kan_guo = response.xpath('//*[@id="subject-others-interests"]/div/a[2]/text()')
156 | print('kan_guo: ', kan_guo)
157 |
158 | # Want to watch
159 | xiang_kan = response.xpath('//*[@id="subject-others-interests"]/div/a[3]/text()')
160 | print('xiang_kan: ', xiang_kan)
161 |
162 | data = {
163 | 'url': list_info['url'],
164 | 'year': ', '.join(year),
165 | 'dao_yan': ', '.join(dao_yan),
166 | 'bian_ju': ', '.join(bian_ju),
167 | 'zhu_yan': ', '.join(zhu_yan),
168 | 'lei_xing': ', '.join(lei_xing),
169 | 'di_qu': ', '.join(di_qu),
170 | 'yu_yan': ', '.join(yu_yan),
171 | 'shou_bo': ', '.join(shou_bo),
172 | 'ji_shu': ', '.join(ji_shu),
173 | 'dan_ji': ', '.join(dan_ji),
174 | 'score': ', '.join(score),
175 | 'comment_num': ', '.join(comment_num),
176 | 'duan_ping_num': ', '.join(duan_ping_num),
177 | 'desc': (', '.join(desc)).strip(),
178 | 'tag': ', '.join(tag),
179 | 'zai_kan': ', '.join(zai_kan),
180 | 'kan_guo': ', '.join(kan_guo),
181 | 'xiang_kan': ', '.join(xiang_kan),
182 | }
183 |
184 | self.red.sadd('dbInfo', json.dumps(data, ensure_ascii=False))
185 |
186 | except Exception as e:
187 | self.logger.error(f'error: {e}, list_info: {list_info}', exc_info=True)
188 |
189 | def to_excel(self, items):
190 | def handler(x):
191 | if not x.get('year'):
192 | return False
193 |
194 | if not x.get('dao_yan'):
195 | return False
196 |
197 | return True
198 |
199 | data_list = list(filter(handler, map(lambda x: json.loads(x), items)))
200 | title = {
201 | 'url': '链接',
202 | 'year': '年份',
203 | 'dao_yan': '导演',
204 | 'bian_ju': '编剧',
205 | 'zhu_yan': '主演',
206 | 'lei_xing': '类型',
207 | 'di_qu': '制片国家/地区',
208 | 'yu_yan': '语言',
209 | 'shou_bo': '首播',
210 | 'ji_shu': '集数',
211 | 'dan_ji': '单集片长',
212 | 'score': '豆瓣评分(总分)',
213 | 'comment_num': '评价人数(总分)',
214 | 'duan_ping_num': '短评数',
215 | 'desc': '剧情简介',
216 | 'tag': '标签',
217 | 'zai_kan': '在看人数',
218 | 'kan_guo': '看过人数',
219 | 'xiang_kan': '想看人数',
220 | }
221 | excel_name = '豆瓣电影'
222 |
223 | self.excel.write_excel(data_list, title, excel_name, '../static/豆瓣电影.xls')
224 |
225 | def start_spider(self):
226 | # self.get_list()
227 |
228 | # data_list = list(self.red.smembers('dbList'))
229 | # for i in data_list:
230 | # self.get_info(json.loads(i))
231 |
232 | self.to_excel(list(self.red.smembers('dbInfo')))
233 |
234 |
235 | if __name__ == '__main__':
236 | run_spider(DoubanSpiderSpider)
237 |
--------------------------------------------------------------------------------
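`start_spider` above ships with stages 1 and 2 commented out, so the spider is run in passes with intermediate state held in the `dbList` and `dbInfo` Redis sets. A sketch of the fully enabled three-stage flow, using only the calls already present in this file:

```python
def start_spider(self):
    # Stage 1: page through the JSON list API into the 'dbList' set.
    self.get_list()

    # Stage 2: fetch a detail page for every queued list entry.
    for raw in self.red.smembers('dbList'):
        self.get_info(json.loads(raw))

    # Stage 3: export everything accumulated in 'dbInfo' to Excel.
    self.to_excel(list(self.red.smembers('dbInfo')))
```

Keeping the stages independent means a failed detail pass can be rerun without re-crawling the list pages.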
/spiders/test_spider/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/7/1 上午11:39
7 | -------------------------------------------------
8 | Change Activity: 2021/7/1 上午11:39
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 | import os
14 | from magical.cmdline import generate_spider_project, generate_spider_file
15 |
16 |
17 | def main():
18 | project_path = os.path.dirname(os.path.abspath(__file__))
19 | spider_name = 'test_common'
20 |
21 | # generate_spider_project('sync_spider', project_path, spider_name)
22 | generate_spider_file('sync_spider', project_path, spider_name)
23 |
24 |
25 | if __name__ == '__main__':
26 | main()
27 |
--------------------------------------------------------------------------------
/spiders/test_spider/base_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: base_spider.py
6 | Time: 2021/01/01 11:40:25
7 | -------------------------------------------------
8 | Change Activity: 2021/01/01 11:40:25
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 | from magical.sync_spider.common.base_spider import BaseSpider
14 |
15 |
16 | class TestExcelBaseSpider(BaseSpider):
17 | def __init__(self, spider):
18 | super().__init__(spider)
19 |
--------------------------------------------------------------------------------
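Since the settings point `BASE_SPIDER_PATH` at this class, anything defined on it is shared by every spider in the project. A sketch of what a shared helper could look like (the `clean_text` method is hypothetical, not part of the repo):

```python
from magical.sync_spider.common.base_spider import BaseSpider


class TestExcelBaseSpider(BaseSpider):
    def __init__(self, spider):
        super().__init__(spider)

    # Hypothetical shared helper: available to every spider whose
    # settings point BASE_SPIDER_PATH at this class.
    def clean_text(self, text):
        return (text or '').strip()
```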
/spiders/test_spider/middleware.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: middleware.py
6 | Time: 2021/01/01 11:40:25
7 | -------------------------------------------------
8 | Change Activity: 2021/01/01 11:40:25
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import random
13 | import time
14 |
15 | import requests
16 |
17 | from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware
18 |
19 |
20 | # ------------------------------------------------default middleware------------------------------------------------
21 |
22 |
23 | class HeadersMiddleware(DownloaderMiddleware):
24 | """请求头处理中间件"""
25 |
26 | def __init__(self, spider):
27 | super().__init__(spider)
28 |
29 | def process_request(self, request):
30 | request.headers.update({'Connection': 'close'})
31 | return request
32 |
33 |
34 | class ProxyMiddleware(DownloaderMiddleware):
35 | """代理IP中间件"""
36 |
37 | def __init__(self, spider):
38 | super().__init__(spider)
39 |
40 | self.proxy.proxy_handler(num=1)
41 |
42 | def process_request(self, request):
43 | request.meta['proxy'] = self.proxy.get_proxy()
44 | return request
45 |
46 | def process_response(self, request, response):
47 | return response
48 |
49 | def process_exception(self, request, exception):
50 | self.logger.error(f'ProxyMiddleware.process_exception: {exception}, request: {request}', exc_info=True)
51 |
52 | if isinstance(
53 | exception,
54 | (
55 | requests.exceptions.ConnectionError,
56 | requests.exceptions.ConnectTimeout,
57 | requests.exceptions.ReadTimeout,
58 | requests.exceptions.Timeout,
59 | )
60 | ):
61 | self.logger.error(f'ProxyMiddleware - retrying request after exception - request: {request}')
62 | time.sleep(random.randint(3, 5))
63 | self.proxy.proxy_handler(request, num=1)
64 | return self._retry(request)
65 |
66 | return exception
67 |
68 |
69 | class RequestErrorMiddleware(DownloaderMiddleware):
70 | """请求异常中间件"""
71 |
72 | def __init__(self, spider):
73 | super().__init__(spider)
74 |
75 | def process_exception(self, request, exception):
76 | self.logger.error(f'RequestErrorMiddleware.process_exception: {exception}, request: {request}', exc_info=True)
77 |
78 | if isinstance(
79 | exception,
80 | (
81 | requests.exceptions.ConnectionError,
82 | requests.exceptions.ConnectTimeout,
83 | requests.exceptions.ReadTimeout,
84 | requests.exceptions.Timeout,
85 | )
86 | ):
87 | self.logger.error(f'RequestErrorMiddleware - retrying request after exception - request: {request}')
88 | time.sleep(random.randint(3, 5))
89 | return self._retry(request)
90 |
91 | elif isinstance(exception, requests.exceptions.HTTPError):
92 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.HTTPError - request: {request}')
93 | return None
94 |
95 | elif isinstance(exception, requests.exceptions.ChunkedEncodingError):
96 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.ChunkedEncodingError - request: {request}')
97 | return None
98 |
99 | elif isinstance(exception, requests.exceptions.SSLError):
100 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.SSLError - request: {request}')
101 | return None
102 |
103 | return exception
104 |
105 |
106 | # -------------------------------------------------spider middleware-------------------------------------------------
107 |
108 |
109 | class TestExcelMiddleware(DownloaderMiddleware):
110 |
111 | def __init__(self, spider):
112 | super().__init__(spider)
113 |
114 | def process_request(self, request):
115 | return request
116 |
117 | def process_response(self, request, response):
118 | if not request.use_middleware:
119 | return response
120 |
121 | return response
122 |
123 | def process_exception(self, request, exception):
124 | self.logger.error(f'TestExcelMiddleware.process_exception: {exception}, request: {request}')
125 | return exception
126 |
--------------------------------------------------------------------------------
/spiders/test_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: settings.py
6 | Time: 2021/01/01 11:40:25
7 | -------------------------------------------------
8 | Change Activity: 2021/01/01 11:40:25
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from magical.utils import log_path
13 |
14 | # project settings
15 |
16 |
17 | # -------------------------------------------------------------------------------------------------------------------
18 |
19 | # Project name
20 | PROJECT_NAME = 'test_spider'
21 |
22 | # Logger path
23 | LOGGER_PATH = log_path(__file__)
24 |
25 | # Retry count
26 | RETRY_COUNT = 10
27 |
28 | # Pipeline middleware; multiple entries may be configured
29 | # PIPELINE_MIDDLEWARE_PATH = {
30 | # "spiders.test_spider.pipeline.TestExcelPipeline": 10
31 | # }
32 |
33 | # Download middleware; multiple entries may be configured
34 | DOWNLOAD_MIDDLEWARE_PATH = {
35 | # "spiders.test_spider.middleware.DuplicateMiddleware": 7,
36 | # "spiders.test_spider.middleware.HeadersMiddleware": 8,
37 | # "spiders.test_spider.middleware.ProxyMiddleware": 9,
38 | "spiders.test_spider.middleware.RequestErrorMiddleware": 10,
39 | "spiders.test_spider.middleware.TestExcelMiddleware": 100
40 | }
41 |
42 | # Spider base class, shared across spiders
43 | BASE_SPIDER_PATH = "spiders.test_spider.base_spider.TestExcelBaseSpider"
44 |
45 | # user-agent
46 | UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
47 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
48 |
49 | # -------------------------------------------------------------------------------------------------------------------
50 |
51 |
52 | # default settings
53 |
54 | # Downloader
55 | DOWNLOADER_PATH = "magical.sync_spider.middleware.download.downloader.Downloader"
56 |
57 | # Download handler
58 | DOWNLOAD_HANDLER_PATH = "magical.sync_spider.middleware.download.handler.DownloadHandler"
59 |
60 | # Download middleware manager
61 | DOWNLOAD_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.download.manager.DownloadMiddlewareManager"
62 |
63 | # Download middleware; multiple entries may be configured
64 | # DOWNLOAD_MIDDLEWARE_PATH = {}
65 |
66 | # -------------------------------------------------------------------------------------------------------------------
67 |
68 | # Pipeline handler
69 | # PIPELINE_HANDLER_PATH = "magical.sync_spider.middleware.pipeline.handler.PipelineHandler"
70 |
71 | # Pipeline middleware manager
72 | # PIPELINE_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.pipeline.manager.PipelineMiddlewareManager"
73 |
74 | # Pipeline middleware; multiple entries may be configured
75 | # PIPELINE_MIDDLEWARE_PATH = {}
76 |
77 | # -------------------------------------------------------------------------------------------------------------------
78 | # Disabled for now; known issues
79 | # # Deduplication middleware
80 | # FILTER_DUPLICATE_HANDLER = "magical.sync_spider.middleware.duplicate.handler.DuplicateHandler"
81 | #
82 | # # Deduplication filter
83 | # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.bloom_filter.ScalableBloomFilter"
84 | # # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.expire_filter.ExpireFilter"
85 | #
86 | # # Deduplication queue: 'redis', or 'memory' for in-memory
87 | # FILTER_QUEUE_TYPE = 'redis'
88 | #
89 | # # Whether to md5-hash deduplication keys
90 | # FILTER_USE_MD5 = False
91 | #
92 | # # Which redis instance to use for deduplication; set the connection name, default is 'red'
93 | # FILTER_REDIS_NAME = 'red'
94 | #
95 | # # Initial deduplication capacity
96 | # FILTER_INITIAL_CAPACITY = 100000000
97 | #
98 | # # Deduplication error rate
99 | # FILTER_ERROR_RATE = 0.00001
100 |
101 | # -------------------------------------------------------------------------------------------------------------------
102 |
103 | # # RabbitMQ config
104 | # MESSAGE_MQ_CONFIG = {
105 | # 'username': 'admin',
106 | # 'password': 'admin123',
107 | # 'host': '127.0.0.1',
108 | # 'port': 9999
109 | # }
110 | #
111 | # # RabbitMQ prefetch count: consume 10 messages per batch
112 | # MESSAGE_MQ_PREFETCH_COUNT = 10
113 | #
114 | # # RabbitMQ virtual host
115 | # MESSAGE_MQ_VIRTUAL_HOST = 'spider'
116 | #
117 | # # RabbitMQ handler class
118 | # MESSAGE_MQ_HANDLER = 'magical.sync_spider.extends_module.mqs.rabbit_mq.handler.RabbitMQHandler'
119 |
120 | # -------------------------------------------------------------------------------------------------------------------
121 |
122 | # Spider base class, shared across spiders
123 | # BASE_SPIDER_PATH = "magical.sync_spider.common.base_spider.BaseSpider"
124 |
125 | # Spider utility class
126 | SPIDER_UTIL_PATH = "magical.sync_spider.common.spider_util.SpiderUtil"
127 |
128 | # Proxy IP middleware
129 | # Fetch proxy IPs from redis
130 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetRedisProxy'
131 | # # Zhima proxy IPs
132 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetZhiMaProxy'
133 |
134 | # Email
135 | EMAIL_HANDLER = 'magical.sync_spider.common.email_handler.EmailHandler'
136 |
137 | # PostgreSQL handler class
138 | POST_GRE_SQL_HANDLER = 'magical.sync_spider.databases.post_gre_sql_pool.PostGreHandle'
139 |
140 | # MySQL handler class
141 | MYSQL_HANDLER = 'magical.sync_spider.databases.mysql_pool.MysqlHandler'
142 |
143 | # Redis handler class
144 | REDIS_HANDLER = 'magical.sync_spider.databases.red_pool.RedisHandler'
145 |
146 | # -------------------------------------------------------------------------------------------------------------------
147 |
148 | # Number of proxy IPs to initialize
149 | PROXY_NUM = 5
150 |
151 | # Retry count
152 | # RETRY_COUNT = 3
153 |
154 | # Retry when the response status code is one of the following
155 | RETRY_STATUS_CODES = [500, 502, 503, 504, 400, 403, 408]
156 |
157 | # Skip SSL verification
158 | REQUEST_VERIFY = False
159 |
160 | # Request timeout, in seconds
161 | REQUEST_TIMEOUT = 30
162 |
163 | # Number of consumer threads
164 | CONSUMER_THREAD_NUM = 10
165 |
166 | # -------------------------------------------------------------------------------------------------------------------
167 |
168 | """
169 | 数据库配置
170 |
171 | 单个数据库
172 | REDIS_CONFIG = {
173 | 'host': '',
174 | 'host': '',
175 | 'db': '',
176 | 'user': '',
177 | 'password': '',
178 | 'decode_responses': True
179 | }
180 | 使用:
181 | red 默认变量名称
182 | self.red.get('key1')
183 | spider.red.get('key1')
184 |
185 | 多个数据库
186 | REDIS_CONFIG = [
187 | {
188 | 'name': 'name1',
189 | 'host': '',
190 | 'host': '',
191 | 'db': '',
192 | 'user': '',
193 | 'password': '',
194 | 'decode_responses': True
195 | },
196 | {
197 | 'name': 'name2',
198 | 'host': '',
199 | 'host': '',
200 | 'db': '',
201 | 'user': '',
202 | 'password': '',
203 | 'decode_responses': True
204 | },
205 | ]
206 | 使用:
207 | self.name1.get('key1')
208 | spider.name1.get('key1')
209 |
210 | self.name2.get('key1')
211 | spider.name2.get('key1')
212 | """
213 |
--------------------------------------------------------------------------------
/spiders/test_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/01/01 11:40:25
7 | -------------------------------------------------
8 | Change Activity: 2021/01/01 11:40:25
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/spiders/test_spider/spiders/test_common.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: test_common.py
6 | Time: 2021/31/31 17:34:58
7 | -------------------------------------------------
8 | Change Activity: 2021/31/31 17:34:58
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import sys
14 |
15 | file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
16 | sys.path.append(file_path)
17 |
18 | from magical.sync_spider import SyncSpider, run_spider
19 |
20 |
21 | class TestCommonSpider(SyncSpider):
22 | name = 'test_common'
23 | settings_path = 'spiders.test_spider.settings'
24 |
25 | default_custom_setting = {}
26 |
27 | def __init__(self, *args, **kwargs):
28 | custom_setting = {}
29 | kwargs.update(dict(custom_setting=custom_setting))
30 | super().__init__(*args, **kwargs)
31 |
32 | def start_spider(self):
33 | print(self.excel)
34 |
35 |
36 | if __name__ == '__main__':
37 | run_spider(TestCommonSpider)
38 |
--------------------------------------------------------------------------------
/spiders/test_spider/spiders/test_excel.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: test_excel.py
6 | Time: 2021/01/01 11:40:25
7 | -------------------------------------------------
8 | Change Activity: 2021/01/01 11:40:25
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import sys
14 |
15 | file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
16 | sys.path.append(file_path)
17 |
18 | from magical.sync_spider import SyncSpider, load_files, run_spider
19 |
20 |
21 | class TestExcelSpider(SyncSpider):
22 | name = 'test_excel'
23 | settings_path = 'spiders.test_spider.settings'
24 |
25 | default_custom_setting = {}
26 |
27 | def __init__(self, *args, **kwargs):
28 | custom_setting = {}
29 | kwargs.update(dict(custom_setting=custom_setting))
30 | super().__init__(*args, **kwargs)
31 |
32 | self.excel = load_files(self.settings['EXCEL'])
33 |
34 | def start_spider(self):
35 | data_list = [
36 | {'desc': 'desc1', 'name': 'name1', 'plat': 'plat1'},
37 | {'desc': 'desc2', 'name': 'name2', 'plat': 'plat2'},
38 | {'desc': 'desc3', 'name': 'name3', 'plat': 'plat3'},
39 | {'desc': 'desc4', 'name': 'name4', 'plat': 'plat4'},
40 | {'desc': 'desc5', 'name': 'name5', 'plat': 'plat5'},
41 | ]
42 |
43 | title = {'desc': '描述', 'name': '店铺名称', 'plat': '渠道'}
44 | excel_name = 'test'
45 | excel_file_path = '../static/test.xls'
46 | self.excel.write_excel(data_list, title, excel_name, excel_file_path)
47 |
48 |
49 | if __name__ == '__main__':
50 | run_spider(TestExcelSpider)
51 |
--------------------------------------------------------------------------------
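To sanity-check the exported workbook, the legacy `xlrd` reader still opens `.xls` files. A small sketch, assuming it is run from the repository root (the path is the one the spider writes to):

```python
import xlrd  # pip install xlrd; the classic .xls reader

book = xlrd.open_workbook('spiders/test_spider/static/test.xls')
sheet = book.sheet_by_index(0)

# Print every row, header row included.
for row_idx in range(sheet.nrows):
    print(sheet.row_values(row_idx))
```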
/spiders/test_spider/spiders/test_proxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: test_proxy.py
6 | Time: 2021/01/01 11:40:08
7 | -------------------------------------------------
8 | Change Activity: 2021/01/01 11:40:08
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import sys
14 |
15 | file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
16 | sys.path.append(file_path)
17 |
18 | from magical.sync_spider import SyncSpider, Request, run_spider
19 |
20 |
21 | class TestProxySpider(SyncSpider):
22 | name = 'test_proxy'
23 | settings_path = 'spiders.test_spider.settings'
24 |
25 | default_custom_setting = {}
26 |
27 | def __init__(self, *args, **kwargs):
28 | custom_setting = {}
29 | kwargs.update(dict(custom_setting=custom_setting))
30 | super().__init__(*args, **kwargs)
31 |
32 | def start_spider(self):
33 | print(self.proxy.get_proxy())
34 |
35 | self.download(
36 | url='',
37 | params={},
38 | method='POST',
39 | data={},
40 | headers={},
41 | meta={
42 | 'proxy': self.proxy.get_proxy()
43 | }
44 | )
45 |
46 | request = Request(
47 | url='',
48 | params={},
49 | method='POST',
50 | data={},
51 | headers={},
52 | meta={
53 | 'proxy': self.proxy.get_proxy()
54 | }
55 | )
56 | self.download(request)
57 |
58 |
59 | if __name__ == '__main__':
60 | run_spider(TestProxySpider)
61 |
--------------------------------------------------------------------------------
/spiders/test_spider/static/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/7/1 上午11:48
7 | -------------------------------------------------
8 | Change Activity: 2021/7/1 上午11:48
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 | pass
16 |
17 |
18 | if __name__ == '__main__':
19 | main()
20 |
--------------------------------------------------------------------------------
/spiders/test_spider/static/test.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qinLess/magical/4a6d38e55b3c8396c10712927028db4329faf888/spiders/test_spider/static/test.xls
--------------------------------------------------------------------------------