├── .gitignore
├── README.md
├── magical
│   ├── __init__.py
│   ├── cmdline.py
│   ├── sync_spider
│   │   ├── __init__.py
│   │   ├── common
│   │   │   ├── __init__.py
│   │   │   ├── base_spider.py
│   │   │   ├── email_handler.py
│   │   │   ├── log_setting.py
│   │   │   ├── proxy_handler.py
│   │   │   ├── redis_lock.py
│   │   │   ├── spider_util.py
│   │   │   ├── user_agent.json
│   │   │   └── utils.py
│   │   ├── config
│   │   │   ├── __init__.py
│   │   │   ├── default_settings.py
│   │   │   └── settings.py
│   │   ├── core
│   │   │   ├── __init__.py
│   │   │   ├── spider.py
│   │   │   └── start_spider.py
│   │   ├── databases
│   │   │   ├── __init__.py
│   │   │   ├── init_db.py
│   │   │   ├── mysql_pool.py
│   │   │   ├── post_gre_sql_pool.py
│   │   │   └── red_pool.py
│   │   ├── extends_module
│   │   │   ├── __init__.py
│   │   │   ├── base_module
│   │   │   │   ├── __init__.py
│   │   │   │   ├── downloader.py
│   │   │   │   └── pipeline.py
│   │   │   ├── download
│   │   │   │   ├── __init__.py
│   │   │   │   └── retry.py
│   │   │   └── mqs
│   │   │       ├── __init__.py
│   │   │       └── rabbit_mq
│   │   │           ├── __init__.py
│   │   │           └── handler.py
│   │   ├── http
│   │   │   ├── __init__.py
│   │   │   ├── request.py
│   │   │   └── response.py
│   │   └── middleware
│   │       ├── __init__.py
│   │       ├── download
│   │       │   ├── __init__.py
│   │       │   ├── downloader.py
│   │       │   ├── handler.py
│   │       │   └── manager.py
│   │       ├── duplicate
│   │       │   ├── __init__.py
│   │       │   ├── bit_array.py
│   │       │   ├── bloom_filter.py
│   │       │   ├── expire_filter.py
│   │       │   └── handler.py
│   │       └── pipeline
│   │           ├── __init__.py
│   │           ├── handler.py
│   │           └── manager.py
│   ├── template.py
│   ├── templates
│   │   ├── __init__.py
│   │   └── sync_spider
│   │       ├── __init__.py
│   │       ├── base_spider.py.tmpl
│   │       ├── middleware.py.tmpl
│   │       ├── settings.py.tmpl
│   │       ├── spider.py.tmpl
│   │       └── spiders
│   │           ├── __init__.py
│   │           └── __init__.py.tmpl
│   └── utils.py
├── requirements.txt
├── setup.py
└── spiders
    ├── __init__.py
    ├── common
    │   ├── __init__.py
    │   ├── excel.py
    │   ├── proxy.py
    │   ├── settings.py
    │   └── spider_init.py
    ├── test_douban
    │   ├── __init__.py
    │   ├── base_spider.py
    │   ├── middleware.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── douban_spider.py
    └── test_spider
        ├── __init__.py
        ├── base_spider.py
        ├── middleware.py
        ├── settings.py
        ├── spiders
        │   ├── __init__.py
        │   ├── test_common.py
        │   ├── test_excel.py
        │   └── test_proxy.py
        └── static
            ├── __init__.py
            └── test.xls

/.gitignore:
--------------------------------------------------------------------------------
test
__pycache__
.idea
.DS_Store
logs
captcha
file
*.egg-info
build
dist
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## 2021-07-01 update
### Added custom shared components; see the examples under `spiders` for details

## 2021-07-31 update
### Added unified initialization of spider utility classes and modules
1. Configuration in `spiders.common.settings.py`
```python
SPIDER_INIT_HANDLER = 'spiders.common.spider_init.SpiderInit'
EXCEL = 'spiders.common.excel'
```
2. Implementation in `spiders.common.spider_init.py`
```python
from magical.sync_spider import load_files


class SpiderInit(object):
    def __init__(self, spider):
        self.settings = spider.settings

        spider.excel = load_files(self.settings['EXCEL'])
```
3. Example in `spiders.test_spider.spiders.test_common.py`
```python
import os
import sys

file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(file_path)

from magical.sync_spider import SyncSpider, run_spider


class TestCommonSpider(SyncSpider):
    name = 'test_common'
    settings_path = 'spiders.test_spider.settings'

    default_custom_setting = {}

    def __init__(self, *args, **kwargs):
        custom_setting = {}
        kwargs.update(dict(custom_setting=custom_setting))
        super().__init__(*args, **kwargs)

    def start_spider(self):
        print(self.excel)


if __name__ == '__main__':
    run_spider(TestCommonSpider)
```

------------------------------

## Introduction

**magical** is a lightweight crawler framework modeled on scrapy, but without scrapy's complexity: it drops `yield` and callback functions, keeps the control flow simple, and leaves everything customizable. The framework itself is just a thin wrapper around a set of commonly used helpers.

### Project files:
- `spiders`: directory containing the spider projects
- `settings`: spider configuration file
- `middleware`: middleware file
- `pipeline`: pipeline file
- `base_spider`

### spider provides 3 spider classes, plus a threaded wrapper (see the sketch below):
- `SyncSpider`: single-threaded spider
- `RedisMessageMQSpider`: redis publisher/subscriber spider
- `RabbitMessageMQSpider`: rabbitMQ producer/consumer spider
- `ThreadSyncSpider`: multi-threaded spider that starts several threads, each instantiating one of the three classes above

**sync_spider**: synchronous version based on `requests`
**async_spider**: asynchronous version based on `aiohttp` (too many issues; development abandoned)
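This README only shows `SyncSpider` examples. `run_thread_spider` and `ThreadSyncSpider` are both re-exported from `magical.sync_spider` (see `magical/sync_spider/__init__.py` below), but their signatures are not shown anywhere in this dump, so everything beyond the import line in this sketch, in particular passing a plain `SyncSpider` subclass straight to `run_thread_spider`, is an assumption:

```python
from magical.sync_spider import SyncSpider, run_thread_spider


class MyThreadedSpider(SyncSpider):
    name = 'my_threaded_spider'
    settings_path = 'spiders.test_spider.settings'

    default_custom_setting = {}

    def start_spider(self):
        self.logger.info(f'Hello {self.name}')


if __name__ == '__main__':
    # Assumption: run_thread_spider mirrors run_spider and wraps the class in
    # ThreadSyncSpider threads; its real signature is not shown in this dump.
    run_thread_spider(MyThreadedSpider)
```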
## Creating a project (create the `spiders` directory first; running the code below generates the code files automatically)
```python
import os
from magical.cmdline import generate_spider_project, generate_spider_file


def main():
    project_path = os.path.dirname(os.path.abspath(__file__))
    spider_name = 'test_spider_pipelines'

    # Generate a single spider file
    generate_spider_file('sync_spider', project_path, spider_name)

    # Generate a spider project
    # generate_spider_project('sync_spider', project_path, spider_name)


if __name__ == '__main__':
    main()
```

## Spider
```python
from magical.sync_spider import run_spider, SyncSpider, Request


class TestSpider(SyncSpider):
    name = 'test_spider'
    settings_path = 'spiders.test.settings'

    default_custom_setting = {}

    def __init__(self, *args, **kwargs):
        custom_setting = {}
        kwargs.update(dict(custom_setting=custom_setting))
        super().__init__(*args, **kwargs)

    def start_spider(self):
        self.logger.info(f'Hello {self.name}')

        # Send a request
        request = Request(url='http://www.baidu.com/')
        response = self.download(request)

        title = response.re.findall('<title>(.*?)</title>')
        self.logger.info(f'title: {title}')

        data = {'title': title[0]}

        # Run the data through the pipeline; returns True or False
        pip_res = self.pipeline(data)
        print('pip_res: ', pip_res)

        # Use redis
        self.red.get('key1')

        # Use mysql
        self.mysql.select('select * from test;')

        # Use postgresql
        self.post_gre.select('select * from test;')


if __name__ == '__main__':
    run_spider(TestSpider)
```
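Beyond `url`, the middleware examples later in this README show that every request carries `headers`, a `meta` dict (used for `proxy`, `is_filter`, `filter_info`) and a `use_middleware` flag. A small sketch of driving those from inside a spider method; it assumes a fresh `Request` starts with empty `headers`/`meta` dicts, which the `update`/`get` calls in the middleware section imply but the dump never shows:

```python
from magical.sync_spider import Request

request = Request(url='http://www.baidu.com/s?wd=python')

# These attributes are all referenced by the middleware examples below;
# setting them after construction avoids guessing at constructor kwargs.
request.headers.update({'referer': 'http://www.baidu.com/'})
request.meta['is_filter'] = False
request.meta['proxy'] = None

response = self.download(request)
```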
## Database
Database configuration, using redis as the example
- Single database
```python
REDIS_CONFIG = {
    'host': '',
    'port': '',
    'db': '',
    'user': '',
    'password': '',
    'decode_responses': True
}

"""red is the default variable name
Usage:
    self.red.get('key1')
    spider.red.get('key1')
"""
```
- Multiple databases
```python
REDIS_CONFIG = [
    {
        'name': 'name1',
        'host': '',
        'port': '',
        'db': '',
        'user': '',
        'password': '',
        'decode_responses': True
    },
    {
        'name': 'name2',
        'host': '',
        'port': '',
        'db': '',
        'user': '',
        'password': '',
        'decode_responses': True
    }
]
"""
Usage:
    self.name1.get('key1')
    spider.name1.get('key1')

    self.name2.get('key1')
    spider.name2.get('key1')
"""
```
- RedisPool usage (default access name `red`; with multiple connections, access via the `name` field)
```python

self.red.get('key1')
self.red.set('key1', 'value1')
```

- MysqlPool usage (default access name `mysql`; with multiple connections, access via the `name` field)
```python

# Execute SQL
self.mysql.execute('select * from test;')

# Query SQL
self.mysql.select('select * from test;')

# Insert a single row
data = {
    'field1': 'data1',
    'field2': 'data2'
}
self.mysql.insert_dict(table_name='table1', info_dict=data, ignore=False, replace=False)

# Insert multiple rows
data = [
    {
        'field1': 'data1',
        'field2': 'data2'
    },
    {
        'field1': 'data1',
        'field2': 'data2'
    }
]
self.mysql.insert_list(table_name='table1', info_list=data, ignore=False, replace=False)
```

- PostGreSqlPool usage (default access name `post_gre`; with multiple connections, access via the `name` field)
```python

# Execute SQL
self.post_gre.execute('select * from test;')

# Query SQL
self.post_gre.select('select * from test;')

# Insert a single row (indexes = the table's unique index, used to filter out rows that already exist)
data = {
    'field1': 'data1',
    'field2': 'data2'
}
self.post_gre.insert_conflict_dict(table_name='table1', info_dict=data, indexes=False)

# Insert multiple rows (indexes = the table's unique index, used to filter out rows that already exist)
data = [
    {
        'field1': 'data1',
        'field2': 'data2'
    },
    {
        'field1': 'data1',
        'field2': 'data2'
    }
]
self.post_gre.insert_conflict_list(table_name='table1', info_list=data, indexes=False)
```

## Download Middleware
```python
import random
import time

import requests

from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware


# Works poorly; not recommended
class DuplicateMiddleware(DownloaderMiddleware):
    """Deduplication middleware"""

    def __init__(self, spider):
        super().__init__(spider)

    def process_request(self, request):

        if request.meta.get('is_filter'):
            # 0 == not present, 1 == present
            if self.duplicate.get(**request.meta['filter_info']) != 0:
                return None

        return request

    def process_response(self, request, response):

        if response and request.meta.get('is_filter'):
            # On success, add to the dedup seed set. 0 == already present, 1 == was absent and has been added
            if self.duplicate.add(**request.meta['filter_info']) == 1:
                pass

        return response


class HeadersMiddleware(DownloaderMiddleware):
    """Headers middleware: rotates the User-Agent randomly"""

    def __init__(self, spider):
        super().__init__(spider)

    def process_request(self, request):
        request.headers.update({
            'Connection': 'close',
            'user-agent': self.spider.spider_util.random_ua()
        })
        return request


class ProxyMiddleware(DownloaderMiddleware):
    """Proxy IP middleware"""

    def __init__(self, spider):
        super().__init__(spider)

        # Initialize proxy IPs; num = how many to prefetch
        # self.proxy_handler(num=1)

    def process_request(self, request):
        # Fetch one proxy IP
        # request.meta['proxy'] = self.proxy.get_proxy()
        return request

    def process_response(self, request, response):
        return response

    def process_exception(self, request, exception):
        self.logger.error(f'ProxyMiddleware.process_exception: {exception}, request: {request}', exc_info=True)

        if isinstance(
            exception,
            (
                requests.exceptions.ConnectionError,
                requests.exceptions.ConnectTimeout,
                requests.exceptions.ReadTimeout,
                requests.exceptions.Timeout,
            )
        ):
            self.logger.error(f'ProxyMiddleware - retrying failed request - request: {request}')
            time.sleep(random.randint(3, 5))
            self.proxy.proxy_handler(request, num=1)
            return self._retry(request)

        elif isinstance(exception, requests.exceptions.HTTPError):
            self.logger.error(f'ProxyMiddleware - requests.exceptions.HTTPError - request: {request}')
            return None

        elif isinstance(exception, requests.exceptions.ChunkedEncodingError):
            self.logger.error(f'ProxyMiddleware - requests.exceptions.ChunkedEncodingError - request: {request}')
            return None

        elif isinstance(exception, requests.exceptions.SSLError):
            self.logger.error(f'ProxyMiddleware - requests.exceptions.SSLError - request: {request}')
            return None

        return exception


class TestSpiderMiddleware(DownloaderMiddleware):
    """Spider middleware"""

    def __init__(self, spider):
        super().__init__(spider)

    def process_request(self, request):
        return request

    def process_response(self, request, response):
        if not request.use_middleware:
            return response

        return response

    def process_exception(self, request, exception):
        self.logger.exception(f'TestSpiderMiddleware.process_exception: {exception}, request: {request}')
        return exception
```
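To make the framework load these classes they have to be registered in the project settings. `default_settings.py` (later in this dump) defines `DOWNLOAD_MIDDLEWARE_PATH = {}` and notes that several middlewares can be configured, but it does not show the mapping format; the scrapy-style `dotted.path: priority` mapping below is therefore only a guess:

```python
# In your project's settings.py. The format is an assumption (scrapy-style
# "dotted path -> priority"); only the setting name DOWNLOAD_MIDDLEWARE_PATH
# is confirmed by magical/sync_spider/config/default_settings.py.
DOWNLOAD_MIDDLEWARE_PATH = {
    'spiders.test_spider.middleware.HeadersMiddleware': 100,
    'spiders.test_spider.middleware.ProxyMiddleware': 200,
    'spiders.test_spider.middleware.TestSpiderMiddleware': 300,
}
```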
## Pipeline Middleware
```python
from magical.sync_spider.extends_module.base_module.pipeline import PipelineMiddleware


class TestSpiderPipeline(PipelineMiddleware):

    def __init__(self, spider):
        super().__init__(spider)

    def process_item(self, item, **kwargs):
        """Process the data

        Args:
            item : the data to process
            kwargs:
                table_name: table name
                replace   : True or False (used with the mysql database)
                ignore    : True or False (used with the mysql database)
                indexes   : the table's unique index fields (used with the PostGreSql database)

        Return:
            If the returned type differs from type(item), the process_item functions of the
            remaining pipelines are not called
        """
        return item

    def process_exception(self, item, exception, **kwargs):
        if isinstance(exception, Exception):
            self.logger.error(f'TestSpiderPipeline - exception: {exception}')
            return None

        return exception
```
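Registration and invocation, pieced together from the settings file and the docstring above. As with the download middleware, only the setting name `PIPELINE_MIDDLEWARE_PATH` is confirmed; the priority-mapping format is an assumption, and so is the claim that keyword arguments passed to `self.pipeline` are forwarded unchanged to `process_item`:

```python
# settings.py: mapping format is an assumption; only the name
# PIPELINE_MIDDLEWARE_PATH comes from default_settings.py.
PIPELINE_MIDDLEWARE_PATH = {
    'spiders.test_spider.middleware.TestSpiderPipeline': 100,
}

# In a spider: the kwargs mirror the ones documented in process_item above
# (assumption: self.pipeline(data, **kwargs) forwards them unchanged).
pip_res = self.pipeline(data, table_name='table1', ignore=True)
```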
# Continuously updated ······
--------------------------------------------------------------------------------
/magical/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
Author: qinLess
File: __init__.py
Time: 2021/5/24 21:18
-------------------------------------------------
Change Activity: 2021/5/24 21:18
-------------------------------------------------
Desc:
"""


def main():
    pass


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/magical/cmdline.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
Author: qinLess
File: cmdline.py
Time: 2021/4/14 15:09
-------------------------------------------------
Change Activity: 2021/4/14 15:09
-------------------------------------------------
Desc:
"""
import os
import string
import sys
import datetime
from shutil import copy2, copystat
from os.path import join, exists, abspath, dirname

from magical.template import render_template_file, string_camelcase

TEMPLATES_TO_RENDER = [
    ('${spider_name}', 'spiders', '__init__.py.tmpl'),
    ('${spider_name}', 'base_spider.py.tmpl'),
    ('${spider_name}', 'middleware.py.tmpl'),
    ('${spider_name}', 'settings.py.tmpl')
]


def _copytree(src, dst):
    """Copy template files

    Args:
        src: template directory path (str)
        dst: project path (str)
    Returns:
    """
    if not exists(dst):
        os.makedirs(dst)

    names = os.listdir(src)

    for name in names:
        if name == 'spider.py.tmpl':
            continue

        if name == '__init__.py':
            continue

        src_name = os.path.join(src, name)
        dst_name = os.path.join(dst, name)

        if os.path.isdir(src_name):
            _copytree(src_name, dst_name)

        else:
            copy2(src_name, dst_name)

    copystat(src, dst)


def generate_spider_project(spider_type, project_path=None, spider_name=None):
    """Generate the spider project files

    Args:
        spider_type: spider type ('sync_spider', 'async_spider')
        project_path: project path
        spider_name: spider name
    """
    if not spider_type:
        sys.exit('spider_type is not null')

    if not project_path:
        sys.exit('project_path is not null')

    if not spider_name:
        sys.exit('spider_name is not null')

    templates_dir = abspath(join(dirname(__file__), f'templates/{spider_type}'))
    _copytree(templates_dir, join(abspath(project_path)))
    copy2(join(templates_dir, 'spider.py.tmpl'), join(abspath(project_path), 'spiders', f'{spider_name}.py.tmpl'))

    s_path = abspath(project_path).split('/')
    spider_path = '.'.join(s_path[s_path.index('spiders'):])
    settings_path = spider_path + '.settings'
    project_name = s_path[s_path.index('spiders') + 1]

    TEMPLATES_TO_RENDER.append(('${spider_name}', 'spiders', f'{spider_name}.py.tmpl'))

    for paths in TEMPLATES_TO_RENDER:
        path = join(*paths)

        tpl_file = string.Template(path).substitute(spider_name=project_path)

        render_template_file(
            tpl_file,
            project_name=project_name,
            settings_path=settings_path,
            spider_path=spider_path,
            spider_name=spider_name,
            create_time=datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'),
            SpiderName=string_camelcase(spider_name),
        )


def generate_spider_file(spider_type, project_path=None, spider_name=None):
    """Generate a single spider file

    Args:
        spider_type: spider type ('sync_spider', 'async_spider')
        project_path: project path
        spider_name: spider name
    """
    if not spider_type:
        sys.exit('spider_type is not null')

    if not project_path:
        sys.exit('project_path is not null')

    if not spider_name:
        sys.exit('spider_name is not null')

    templates_dir = abspath(join(dirname(__file__), f'templates/{spider_type}'))
    copy2(join(templates_dir, 'spider.py.tmpl'), join(abspath(project_path), 'spiders', f'{spider_name}.py.tmpl'))

    s_path = abspath(project_path).split('/')
    spider_path = '.'.join(s_path[s_path.index('spiders'):])
    settings_path = spider_path + '.settings'

    path = join(*('${spider_name}', 'spiders', f'{spider_name}.py.tmpl'))

    tpl_file = string.Template(path).substitute(spider_name=project_path)

    render_template_file(
        tpl_file,
        settings_path=settings_path,
        spider_path=spider_path,
        spider_name=spider_name,
        create_time=datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'),
        SpiderName=string_camelcase(spider_name),
140 | ) 141 | 142 | 143 | if __name__ == '__main__': 144 | generate_spider_project('async_spider', '/Users/qinjiahu/Desktop/project/gn/spider_project/test/test1', 'test1') 145 | -------------------------------------------------------------------------------- /magical/sync_spider/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.tmpl.py 6 | Time: 2021/4/10 下午4:48 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午4:48 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | from magical.utils import load_objects, load_files 14 | from magical.sync_spider.core.start_spider import run_spider, run_thread_spider 15 | from magical.sync_spider.http.request import Request 16 | from magical.sync_spider.core.spider import SyncSpider, ThreadSyncSpider, RedisMessageMQSpider, RabbitMessageMQSpider 17 | from magical.sync_spider.common.log_setting import get_logger 18 | 19 | 20 | def get_settings(settings_path=None): 21 | import importlib 22 | from magical.sync_spider.config.settings import Settings 23 | 24 | settings = Settings() 25 | 26 | if settings_path: 27 | custom_settings = importlib.import_module(settings_path) 28 | settings.load_config(custom_settings) 29 | 30 | return settings 31 | 32 | 33 | class TestSyncSpider(object): 34 | name = 'test_sync_spider' 35 | 36 | def __init__(self, settings_path=None): 37 | self.settings = get_settings(settings_path) 38 | self.loop = None 39 | self.logger = get_logger(self) 40 | -------------------------------------------------------------------------------- /magical/sync_spider/common/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.tmpl.py 6 | Time: 2021/4/10 下午4:48 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午4:48 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /magical/sync_spider/common/base_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: base_spider.py 6 | Time: 2021/4/11 下午9:06 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/11 下午9:06 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | class BaseSpider(object): 15 | 16 | def __init__(self, spider): 17 | self.red = spider.red 18 | self.logger = spider.logger 19 | self.post_gre = spider.post_gre 20 | self.download = spider.download 21 | self.settings = spider.settings 22 | self.spider_util = spider.spider_util 23 | self.spider_data = spider.spider_data 24 | -------------------------------------------------------------------------------- /magical/sync_spider/common/email_handler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: email_handler.py 6 | Time: 2021/4/10 下午4:49 7 | 
-------------------------------------------------
Change Activity: 2021/4/10 下午4:49
-------------------------------------------------
Desc:
"""
import time
import smtplib
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart


class EmailHandler(object):
    mail_host = 'smtp.qq.com'
    mail_user = '2027762055@qq.com'
    mail_pass = 'lcpzxzargptleibi'
    sender = '2027762055@qq.com'
    receivers = ['qinjiahu@gnlab.com']

    @staticmethod
    def __send_email_image(image_path, title):
        send_str = f'''
        <html>
        <body><img src="cid:image1" alt="image1"></body>
        </html>
        '''

        # Build the message
        msg = MIMEMultipart()

        # Add the mail body
        content = MIMEText(send_str, _subtype='html', _charset='utf8')
        msg.attach(content)

        # Build and attach the image object
        img1 = MIMEImage(open(image_path, 'rb').read(), _subtype='octet-stream')
        img1.add_header('Content-ID', 'image1')
        msg.attach(img1)

        # Mail subject
        msg['Subject'] = title

        # Mail recipient and sender
        msg['To'] = EmailHandler.receivers[0]
        msg['From'] = EmailHandler.sender

        try:
            # Log in to the mailbox
            server = smtplib.SMTP_SSL("smtp.qq.com", port=465)
            server.login(EmailHandler.sender, EmailHandler.mail_pass)
            server.sendmail(EmailHandler.sender, EmailHandler.receivers, msg.as_string())
            server.quit()
        except smtplib.SMTPException as e:
            print('send_email_image.error: ', e)  # Print the error

    @staticmethod
    def __send_email(content, title):
        message = MIMEText(content, 'plain', 'utf-8')
        # Mail subject
        message['Subject'] = title
        # Sender info
        message['From'] = EmailHandler.sender
        # Recipient info
        message['To'] = EmailHandler.receivers[0]

        # Log in and send the mail
        try:
            smtpObj = smtplib.SMTP()
            # Connect to the server
            smtpObj.connect(EmailHandler.mail_host, 25)
            # Log in to the server
            smtpObj.login(EmailHandler.mail_user, EmailHandler.mail_pass)
            # Send
            smtpObj.sendmail(EmailHandler.sender, EmailHandler.receivers, message.as_string())
            # Quit
            smtpObj.quit()
        except smtplib.SMTPException as e:
            print('__send_email.error: ', e)  # Print the error

    @staticmethod
    def send_email(title, image_path=None):
        content = f'Cookie expired, please replenish it in time {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))}'

        if image_path:
            EmailHandler.__send_email_image(image_path, title)

        else:
            EmailHandler.__send_email(content, title)


if __name__ == "__main__":
    EmailHandler.send_email('JD merchant backend Cookie')
--------------------------------------------------------------------------------
/magical/sync_spider/common/log_setting.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
Author: qinLess
File: log_setting.py
Time: 2021/4/10 13:05
-------------------------------------------------
Change Activity: 2021/4/10 13:05
-------------------------------------------------
Desc:
"""
import logging.config

import os
import datetime
import logging
import logging.handlers


class Logger(object):
    instance = {}
    init_flag = {}

    def __new__(cls, *args, **kwargs):
        spider = kwargs['spider']
        name = spider.name

        if not cls.instance.get(name):
            cls.instance[name] = super().__new__(cls)

        return cls.instance[name]

def __init__(self, spider): 34 | name = spider.name 35 | if Logger.init_flag.get(name): 36 | return 37 | Logger.init_flag[name] = True 38 | 39 | self.logger = logging.getLogger(name) 40 | if not self.logger.handlers: 41 | self.logger.setLevel(logging.DEBUG) 42 | day_date = datetime.datetime.now().strftime("%Y-%m-%d") 43 | log_path = spider.settings['LOGGER_PATH'] 44 | self.log_path = os.path.join(log_path or 'logs/', f'{day_date}/') 45 | if not os.path.exists(self.log_path): 46 | os.makedirs(self.log_path) 47 | 48 | self.log_name = f'{self.log_path}{name + ".log"}' 49 | fh = logging.FileHandler(self.log_name, 'a', encoding='utf-8') 50 | fh.setLevel(logging.INFO) 51 | ch = logging.StreamHandler() 52 | ch.setLevel(logging.INFO) 53 | formatter = logging.Formatter( 54 | '[%(asctime)s] %(filename)s -> %(funcName)s line:%(lineno)d [%(levelname)s] %(message)s') 55 | fh.setFormatter(formatter) 56 | ch.setFormatter(formatter) 57 | self.logger.addHandler(fh) 58 | self.logger.addHandler(ch) 59 | fh.close() 60 | ch.close() 61 | 62 | 63 | def get_logger(spider): 64 | return Logger(spider=spider).logger 65 | -------------------------------------------------------------------------------- /magical/sync_spider/common/proxy_handler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: proxy_handler.py 6 | Time: 2021/5/5 下午2:48 7 | ------------------------------------------------- 8 | Change Activity: 2021/5/5 下午2:48 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import random 13 | 14 | 15 | class ProxyHandler(object): 16 | __instance = {} 17 | 18 | def __new__(cls, *args, **kwargs): 19 | if not cls.__instance.get(cls.__name__): 20 | cls.__instance[cls.__name__] = super().__new__(cls) 21 | return cls.__instance[cls.__name__] 22 | 23 | def __init__(self, spider): 24 | self.spider = spider 25 | self.logger = spider.logger 26 | 27 | self.proxy_list = [] 28 | 29 | self.proxy_num = spider.settings.get('PROXY_NUM', 1) 30 | 31 | 32 | class GetRedisProxy(ProxyHandler): 33 | def __init__(self, spider): 34 | super().__init__(spider) 35 | 36 | def generate_proxy(self, num): 37 | red_proxy = self.spider.red_proxy 38 | proxy_keys = list(red_proxy.keys('ip_pool_win7*')) 39 | 40 | for i in range(num): 41 | proxy = (red_proxy.get(random.choice(proxy_keys))).split("_")[0] 42 | 43 | new_proxy = { 44 | 'https': f'socks5://{proxy}/', 45 | 'http': f'socks5://{proxy}/' 46 | } 47 | 48 | if self.spider.test_ip(new_proxy): 49 | self.proxy_list.append(new_proxy) 50 | 51 | def proxy_handler(self, request=None, num=None): 52 | if not request: 53 | self.generate_proxy(num or self.proxy_num) 54 | 55 | else: 56 | if request.meta.get('proxy') in self.proxy_list: 57 | self.proxy_list.remove(request.meta.get('proxy')) 58 | self.generate_proxy(num or self.proxy_num) 59 | 60 | def get_proxy(self): 61 | return random.choice(self.proxy_list) if len(self.proxy_list) > 0 else None 62 | -------------------------------------------------------------------------------- /magical/sync_spider/common/redis_lock.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: redis_lock.py 6 | Time: 2021/5/13 下午3:52 7 | ------------------------------------------------- 8 | Change Activity: 2021/5/13 下午3:52 9 | 
-------------------------------------------------
Desc:
"""
import time


class RedisLock(object):
    def __init__(self, key, timeout=300, wait_timeout=300, break_wait=None, redis_cli=None):
        """
        Redis lock with a timeout
        :param key: key used to distinguish projects
        :param timeout: lock expiry time
        :param wait_timeout: how long to wait to acquire the lock; guards against a thread
                             waiting forever when multiple threads compete.
                             <= 0 means don't wait; acquisition fails immediately
        :param break_wait: optional function for fine-grained control of wait_timeout;
                           when it returns True, stop waiting
        :param redis_cli: redis client

        Usage example:
            with RedisLock(key="test", timeout=10, wait_timeout=100, redis_uri="") as _lock:
                if _lock.locked:
                    # used to check whether the lock was acquired
                    # do something
        """
        self.redis_index = -1
        if not key:
            raise Exception("lock key is empty")
        if not redis_cli:
            raise Exception("redis_cli is empty")

        self.redis_conn = redis_cli
        self.lock_key = "redis_lock:{}".format(key)
        # Lock expiry time
        self.timeout = timeout
        # How long to wait for the lock
        self.wait_timeout = wait_timeout
        # Wait-interruption function
        self.break_wait = break_wait
        if self.break_wait is None:
            self.break_wait = lambda: False
        if not callable(self.break_wait):
            raise TypeError(
                "break_wait must be function or None, but: {}".format(
                    type(self.break_wait)
                )
            )

        self.locked = False

    def __enter__(self):
        if not self.locked:
            self.acquire()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.release()

    def __repr__(self):
        return "<RedisLock key: {} index: {}>".format(self.lock_key, self.redis_index)

    def acquire(self):
        start = time.time()
        while 1:
            # Try to acquire the lock
            if self.redis_conn.setnx(self.lock_key, time.time()):
                self.redis_conn.expire(self.lock_key, self.timeout)
                self.locked = True
                break
            else:
                # Bug fix: if the holder was killed during acquisition, expire may never
                # have been set and the lock would live forever
                if self.redis_conn.ttl(self.lock_key) < 0:
                    self.redis_conn.delete(self.lock_key)

                if self.wait_timeout > 0:
                    if time.time() - start > self.wait_timeout:
                        # log.info("failed to acquire the lock")
                        break
                else:
                    # Don't wait
                    break
                if self.break_wait():
                    # log.info("break_wait triggered; no longer waiting for the lock")
                    break
                # log.debug("waiting for the lock: {} wait:{}".format(self, time.time() - start))
                if self.wait_timeout > 10:
                    time.sleep(5)
                else:
                    time.sleep(1)
        return

    def release(self):
        if self.locked:
            self.redis_conn.delete(self.lock_key)
            self.locked = False
        return

    def prolong_life(self, life_time: int) -> int:
        """
        Extend this lock's expiry time
        :param life_time: time to add
        :return:
        """
        expire = self.redis_conn.ttl(self.lock_key)
        if expire < 0:
            return expire
        expire += life_time
        self.redis_conn.expire(self.lock_key, expire)
        return self.redis_conn.ttl(self.lock_key)
--------------------------------------------------------------------------------
/magical/sync_spider/common/spider_util.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
Author: qinLess
File: spider_util.py
Time: 2021/4/11 21:35
-------------------------------------------------
Change Activity: 2021/4/11 21:35
-------------------------------------------------
Desc:
"""
import os
import json
import random
import time
import datetime
import hashlib


class SpiderUtil(object):

    file_path =
os.path.join(os.path.dirname(os.path.abspath(__file__)), 'user_agent.json') 23 | with open(file_path, 'r', encoding='utf-8') as f: 24 | usa = json.load(f) 25 | 26 | def __init__(self, spider): 27 | self.spider = spider 28 | 29 | @staticmethod 30 | def random_ua(): 31 | return random.choice(SpiderUtil.usa) 32 | 33 | @staticmethod 34 | def microsecond_handler(time_str, symbol='-'): 35 | new_time_str = time_str.replace('T', ' ') 36 | 37 | dt = datetime.datetime.strptime(new_time_str, "%Y-%m-%d %H:%M:%S.%f+0800") 38 | dt1 = time.mktime(dt.timetuple()) + (dt.microsecond / 1000000) 39 | dt1 = dt1 * 1000 40 | dt1 = dt1 - 1 if '-' == symbol else dt1 + 1 41 | dt2 = datetime.datetime.fromtimestamp((int(dt1)) / 1000) 42 | 43 | return (dt2.strftime("%Y-%m-%dT%H:%M:%S.%f+0800")).replace('000', '') 44 | 45 | @staticmethod 46 | def get_sha1_encrypt(string): 47 | return hashlib.sha1(string.encode()).hexdigest() 48 | 49 | @staticmethod 50 | def get_md5_encrypt(string): 51 | new_md5 = hashlib.md5() 52 | new_md5.update(string.encode(encoding='utf-8')) 53 | return new_md5.hexdigest() 54 | 55 | 56 | if __name__ == '__main__': 57 | print(SpiderUtil.random_ua()) 58 | -------------------------------------------------------------------------------- /magical/sync_spider/common/user_agent.json: -------------------------------------------------------------------------------- 1 | [ 2 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36", 3 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36", 4 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36", 5 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36", 6 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36", 7 | "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36", 8 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36", 9 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36", 10 | "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36", 11 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36", 12 | "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36", 13 | "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36", 14 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36", 15 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36", 16 | "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36", 17 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36", 18 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36", 19 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36", 20 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 
Safari/537.36", 21 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36", 22 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36", 23 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F", 24 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10", 25 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36", 26 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36", 27 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36", 28 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36", 29 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36", 30 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36", 31 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36", 32 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36", 33 | "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36", 34 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36", 35 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36", 36 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36", 37 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36", 38 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36", 39 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 40 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 41 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 42 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 43 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 44 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", 45 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36", 46 | "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36", 47 | "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36", 48 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17", 49 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17", 50 | 
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15", 51 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14", 52 | "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16", 53 | "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14", 54 | "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14", 55 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14", 56 | "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02", 57 | "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00", 58 | "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00", 59 | "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00", 60 | "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00", 61 | "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0", 62 | "Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62", 63 | "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62", 64 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 65 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52", 66 | "Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51", 67 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51", 68 | "Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50", 69 | "Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50", 70 | "Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11", 71 | "Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11", 72 | "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11", 73 | "Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10", 74 | "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10", 75 | "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10", 76 | "Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1", 77 | "Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01", 78 | "Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01", 79 | "Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01", 80 | "Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01", 81 | "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01", 82 | "Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01", 83 | "Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01", 84 | "Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01", 85 | "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01", 86 | "Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01", 87 | "Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01", 88 | "Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01", 89 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01", 90 | "Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01", 91 | "Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01", 92 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01", 93 | "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00", 94 | "Opera/9.80 (X11; 
Linux i686; U; it) Presto/2.7.62 Version/11.00", 95 | "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00", 96 | "Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00", 97 | "Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00", 98 | "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00", 99 | "Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00", 100 | "Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00", 101 | "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00", 102 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1", 103 | "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0", 104 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0", 105 | "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0", 106 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0", 107 | "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0", 108 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0", 109 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0", 110 | "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0", 111 | "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0", 112 | "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3", 113 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0", 114 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0", 115 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0", 116 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0", 117 | "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0", 118 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0", 119 | "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0", 120 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0", 121 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0", 122 | "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0", 123 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0", 124 | "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0", 125 | "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0", 126 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1", 127 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1", 128 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0", 129 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0", 130 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0", 131 | "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0", 132 | "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0", 133 | "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0", 134 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0", 135 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0", 136 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0", 137 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0", 138 | 
"Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0", 139 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0", 140 | "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0", 141 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0", 142 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0", 143 | "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0", 144 | "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0", 145 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0", 146 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0", 147 | "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0", 148 | "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0", 149 | "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1", 150 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0", 151 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6", 152 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A", 153 | "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25", 154 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2", 155 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10", 156 | "Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko ) Version/5.1 Mobile/9B176 Safari/7534.48.3", 157 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1", 158 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1", 159 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 160 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 161 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 162 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 163 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 164 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 165 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 166 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 167 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 168 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 169 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 170 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; sv-se) 
AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 171 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ko-kr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 172 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 173 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; it-it) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 174 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-fr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 175 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; es-es) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 176 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 177 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-gb) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 178 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; de-de) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 179 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; sv-SE) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 180 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 181 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 182 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; hu-HU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 183 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 184 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 185 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 186 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 187 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; it-IT) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 188 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 189 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-us) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 190 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-ch) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 191 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 192 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; ar) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 193 | "Mozilla/5.0 (Android 2.2; Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 194 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 195 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 196 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; tr-TR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 197 | 
"Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 198 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 199 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 200 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 201 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; zh-cn) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5" 202 | ] -------------------------------------------------------------------------------- /magical/sync_spider/common/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: utils.py 6 | Time: 2021/4/10 下午9:36 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午9:36 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import time 13 | 14 | from magical.utils import round_half_up, get_fmt_time 15 | 16 | start_time = time.time() 17 | success_rate = 0 18 | success_num = 0 19 | failure_num = 0 20 | end_time = None 21 | req_num = None 22 | 23 | 24 | def _gen_content(name): 25 | global req_num, success_rate 26 | req_num = success_num + failure_num 27 | 28 | success_rate = float(round_half_up(success_num / req_num, 4)) * 100 if req_num else 0 29 | 30 | return [ 31 | f'爬虫名称: {name}', 32 | f'请求成功率: {success_rate}%', 33 | f'请求成功次数: {success_num}', 34 | f'请求失败次数: {failure_num}', 35 | f'开始时间: {get_fmt_time(timestamp=start_time)}', 36 | f'结束时间: {get_fmt_time(timestamp=end_time)}', 37 | ] 38 | 39 | 40 | def call_func(request_func, exception_func, response_func, *args, **kwargs): 41 | global success_num, failure_num, end_time 42 | 43 | failure_num += 1 44 | try: 45 | result = request_func(*args, **kwargs) 46 | 47 | except Exception as exc: 48 | failure_num -= 1 49 | return exception_func(exc) 50 | 51 | else: 52 | failure_num -= 1 53 | success_num += 1 54 | return response_func(result) 55 | 56 | finally: 57 | end_time = time.time() 58 | 59 | 60 | def call_func_item(item_func, exception_func, *args, **kwargs): 61 | try: 62 | return item_func(*args, **kwargs) 63 | 64 | except Exception as exc: 65 | return exception_func(exc) 66 | -------------------------------------------------------------------------------- /magical/sync_spider/config/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.tmpl.py 6 | Time: 2021/4/10 下午2:32 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午2:32 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /magical/sync_spider/config/default_settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: default_settings.py 6 | Time: 2021/4/10 下午5:19 7 | 
-------------------------------------------------
Change Activity: 2021/4/10 下午5:19
-------------------------------------------------
Desc:
"""

# -------------------------------------------------------------------------------------------------------------------

# # Deduplication middleware
# FILTER_DUPLICATE_HANDLER = "magical.sync_spider.middleware.duplicate.handler.DuplicateHandler"
#
# # Deduplication filter
# FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.bloom_filter.ScalableBloomFilter"
# # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.expire_filter.ExpireFilter"
#
# # Deduplication queue: redis, or memory = in-memory
# FILTER_QUEUE_TYPE = 'redis'
#
# # Whether dedup keys are md5-hashed
# FILTER_USE_MD5 = False
#
# # Which redis instance to dedup against; set to a connection name, default 'red'
# FILTER_REDIS_NAME = 'red'
#
# # Initial dedup capacity
# FILTER_INITIAL_CAPACITY = 100000000
#
# # Dedup error rate
# FILTER_ERROR_RATE = 0.00001

# -------------------------------------------------------------------------------------------------------------------

# # rabbit mq configuration
# MESSAGE_MQ_CONFIG = {
#     'username': 'admin',
#     'password': 'admin123',
#     'host': '127.0.0.1',
#     'port': 18097
# }
#
# # rabbit mq consumption batch: consume 10 messages at a time
# MESSAGE_MQ_PREFETCH_COUNT = 10
#
# # rabbit mq virtual host
# MESSAGE_MQ_VIRTUAL_HOST = 'spider'
#
# # rabbit mq handler class
# MESSAGE_MQ_HANDLER = 'magical.sync_spider.extends_module.mqs.rabbit_mq.handler.RabbitMQHandler'

# -------------------------------------------------------------------------------------------------------------------

# Download middleware
DOWNLOADER_PATH = "magical.sync_spider.middleware.download.downloader.Downloader"

# Download handler middleware
DOWNLOAD_HANDLER_PATH = "magical.sync_spider.middleware.download.handler.DownloadHandler"

# Download scheduler
DOWNLOAD_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.download.manager.DownloadMiddlewareManager"

# Middlewares; several can be configured, the retry middleware is the default
DOWNLOAD_MIDDLEWARE_PATH = {}

# -------------------------------------------------------------------------------------------------------------------

# Pipeline handler middleware
PIPELINE_HANDLER_PATH = "magical.sync_spider.middleware.pipeline.handler.PipelineHandler"

# Pipeline scheduler
PIPELINE_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.pipeline.manager.PipelineMiddlewareManager"

# Pipeline middlewares; several can be configured
PIPELINE_MIDDLEWARE_PATH = {}

# -------------------------------------------------------------------------------------------------------------------

# Common spider base class
BASE_SPIDER_PATH = "magical.sync_spider.common.base_spider.BaseSpider"

# Spider utility class
SPIDER_UTIL_PATH = "magical.sync_spider.common.spider_util.SpiderUtil"

# Email
EMAIL_HANDLER = 'magical.sync_spider.common.email_handler.EmailHandler'

# PostgreSQL handler class
POST_GRE_SQL_HANDLER = 'magical.sync_spider.databases.post_gre_sql_pool.PostGreHandle'

# mysql handler class
MYSQL_HANDLER = 'magical.sync_spider.databases.mysql_pool.MysqlHandler'

# redis handler class
REDIS_HANDLER = 'magical.sync_spider.databases.red_pool.RedisHandler'

# Proxy IP middleware
# Fetch proxy IPs from redis
# PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetRedisProxy'
# # ZhiMa proxy IPs
# PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetZhiMaProxy'

# -------------------------------------------------------------------------------------------------------------------
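# Any of these defaults can be overridden from a project-level settings module
# (Settings.load_config in config/settings.py copies every UPPERCASE attribute).
# A minimal sketch of a project settings.py; the values below are illustrative only:
#
# LOGGER_PATH = 'logs/'
# RETRY_COUNT = 5
# REQUEST_TIMEOUT = 60
# REDIS_CONFIG = {
#     'host': '127.0.0.1',
#     'port': 6379,
#     'db': 0,
#     'user': '',
#     'password': '',
#     'decode_responses': True
# }

# -------------------------------------------------------------------------------------------------------------------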
107 | # ------------------------------------------------------------------------------------------------------------------- 108 | 109 | # Number of proxy IPs fetched at start-up 110 | PROXY_NUM = 5 111 | 112 | # Retry attempts 113 | RETRY_COUNT = 3 114 | 115 | # Retry when the response status is one of these codes 116 | RETRY_STATUS_CODES = [500, 502, 503, 504, 400, 403, 408] 117 | 118 | # Skip ssl verification 119 | REQUEST_VERIFY = False 120 | 121 | # Request timeout in seconds 122 | REQUEST_TIMEOUT = 30 123 | 124 | # Delay for the Cloudflare 5-second shield (cfscrape) 125 | SCRAPER_DELAY = 30 126 | 127 | # Number of consumer threads 128 | CONSUMER_THREAD_NUM = 10 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /magical/sync_spider/config/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: settings.py 6 | Time: 2021/3/24 上午9:34 7 | ------------------------------------------------- 8 | Change Activity: 2021/3/24 上午9:34 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import json 13 | from importlib import import_module 14 | 15 | from magical.sync_spider.config import default_settings 16 | 17 | 18 | class Attribute(object): 19 | 20 | def __init__(self, value): 21 | self.value = value 22 | 23 | def __str__(self): 24 | return "<Attribute: %s>" % self.value 25 | 26 | __repr__ = __str__ 27 | 28 | 29 | class Settings(object): 30 | 31 | def __init__(self): 32 | self.attrs = {} 33 | self.load_config(default_settings) 34 | 35 | def __getitem__(self, key): 36 | return self.attrs[key].value if key in self.attrs else None 37 | 38 | def load_config(self, module): 39 | if isinstance(module, str): 40 | module = import_module(module) 41 | 42 | for key in dir(module): 43 | if key.isupper(): 44 | self.set(key, getattr(module, key)) 45 | 46 | def set(self, key: str, value): 47 | self.attrs[key] = Attribute(value) 48 | 49 | def set_dict(self, values): 50 | for key, value in values.items(): 51 | self.set(key, value) 52 | 53 | def get(self, key, default=None): 54 | return self[key] if self[key] is not None else default  # only missing keys fall back; falsy values like 0 are kept 55 | 56 | def get_int(self, key, default=0): 57 | return int(self.get(key, default)) 58 | 59 | def get_float(self, key, default=0.0): 60 | return float(self.get(key, default)) 61 | 62 | def get_list(self, key, default=None): 63 | value = self.get(key, default or None) 64 | if isinstance(value, str): 65 | value = value.split(",") 66 | return value 67 | 68 | def get_dict(self, key, default=None): 69 | value = self.get(key, default or None) 70 | if isinstance(value, str): 71 | value = json.loads(value) 72 | return value 73 |
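How the three layers compose, as a minimal sketch (the module path and override values are invented):

```python
from magical.sync_spider.config.settings import Settings

settings = Settings()                                # framework defaults load in __init__
settings.load_config('spiders.my_project.settings')  # hypothetical project module, overrides defaults
settings.set_dict({'RETRY_COUNT': 5})                # per-spider custom_setting wins last

print(settings['RETRY_COUNT'])                       # -> 5
print(settings.get_int('REQUEST_TIMEOUT'))           # -> 30, from the defaults
print(settings.get('NO_SUCH_KEY', 'fallback'))       # -> 'fallback'
```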
-------------------------------------------------------------------------------- /magical/sync_spider/core/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.tmpl.py 6 | Time: 2021/4/10 下午4:49 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午4:49 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /magical/sync_spider/core/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: spider.py 6 | Time: 2021/4/10 下午4:55 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午4:55 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import copy 13 | import json 14 | import time 15 | import importlib 16 | from queue import Queue 17 | 18 | import threading 19 | import requests 20 | from sqlalchemy import create_engine 21 | 22 | from magical.utils import load_objects 23 | from magical.sync_spider.common.log_setting import get_logger 24 | 25 | from magical.sync_spider.databases.init_db import InitDatabase 26 | 27 | from magical.sync_spider.config.settings import Settings 28 | from magical.sync_spider.http.response import Response 29 | from magical.sync_spider.http.request import Request 30 | 31 | 32 | # Spider initialization: shared settings, logger, databases, proxy and dedup state 33 | class InitSpider(object): 34 | name = 'base_init_spider' 35 | 36 | spider_start_time = time.time() 37 | 38 | this = None 39 | 40 | def __init__(self, *args, **kwargs): 41 | self.name = kwargs.get('name', self.name) 42 | self.custom_setting = kwargs.get('custom_setting', {}) 43 | self.settings_path = kwargs.get('settings_path') 44 | self.common_settings_path = kwargs.get('common_settings_path') 45 | 46 | self.__load_settings(self.custom_setting) 47 | 48 | self.logger = get_logger(self) 49 | self.__load_dbs() 50 | 51 | self.email_handler = load_objects(self.settings['EMAIL_HANDLER']) 52 | self.spider_util = load_objects(self.settings['SPIDER_UTIL_PATH'])(self) 53 | 54 | if self.settings['PROXY_HANDLER']: 55 | self.proxy = load_objects(self.settings['PROXY_HANDLER'])(self) 56 | 57 | if self.settings['FILTER_DUPLICATE_HANDLER']: 58 | self.duplicate = load_objects(self.settings['FILTER_DUPLICATE_HANDLER'])(self) 59 | 60 | InitSpider.this = self 61 | 62 | def __load_settings(self, custom_setting=None):  # None instead of a mutable default argument 63 | self.settings = Settings() 64 | self.settings.set_dict(custom_setting or {}) 65 | if self.settings_path: 66 | try: 67 | self.settings.load_config(importlib.import_module(self.common_settings_path)) 68 | except Exception: 69 | pass  # the shared common settings module is optional 70 | self.settings.load_config(importlib.import_module(self.settings_path)) 71 | 72 | def __load_dbs(self): 73 | self.dbs = InitDatabase(self).dbs 74 | 75 | for db in self.dbs: 76 | setattr(self, db['name'], db['instance']) 77 | 78 | def __close_dbs(self): 79 | for db in self.dbs: 80 | db['instance'] and db['instance'].close_pool() 81 | 82 | def test_ip(self, proxy): 83 | res = None 84 | try: 85 | res = requests.get('http://www.httpbin.org/ip', proxies=proxy) 86 | res_json = res.json() 87 | 88 | if res_json.get('origin') in proxy.get('http', proxy.get('https', '')): 89 | self.logger.info(f'usable proxy: {proxy}') 90 | return True 91 | 92 | else: 93 | self.logger.error(f'unusable proxy: {proxy}') 94 | 95 | except Exception as e: 96 | self.logger.error(f'proxy test error: {proxy}, error: {e}, res: {res and res.text}', exc_info=True) 97 | 98 | def close_spider(self): 99 | self.__close_dbs() 100 | self.logger.info(f'Time usage: {time.time() - self.spider_start_time}') 101 | self.logger.info(f'Spider finished!') 102 | self.logger.info(f'Close Spider!') 103 | 104 | @staticmethod 105 | def this_close_spider(): 106 | InitSpider.this.close_spider() 107 | 108 | @staticmethod 109 | def get_create_engine(db_type, name, settings_path): 110 | """Build a SQLAlchemy create_engine connection from the project settings, e.g. for pandas 111 | 112 | Args: 113 | db_type: mysql or post_gre 114 | name: database connection name 115 | settings_path: settings module path 116 | """ 117 | custom_settings = importlib.import_module(settings_path) 118 | 119 | configs = getattr(custom_settings, f'{db_type.upper()}_CONFIG') 120 |
121 | if isinstance(configs, list): 122 | dbs = list(filter(lambda x: x['name'] == name, configs)) 123 | if len(dbs) == 0: 124 | raise KeyError(f'{db_type} database config named {name} not found') 125 | 126 | else: 127 | config = dbs[0] 128 | else: 129 | config = configs 130 | 131 | db = config['db'] 132 | user = config['user'] 133 | host = config['host'] 134 | port = config['port'] 135 | password = config['password'] 136 | 137 | if db_type == 'post_gre': 138 | db_engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}') 139 | 140 | else: 141 | db_engine = create_engine(f'mysql+pymysql://{user}:{password}@{host}:{port}/{db}?charset=utf8mb4') 142 | 143 | return db_engine 144 | 145 | 146 | # Spider base class 147 | class BaseSyncSpider(object): 148 | name = 'base_sync_spider' 149 | spider_data = {} 150 | default_custom_setting = {} 151 | settings_path = None 152 | base_spider = None 153 | common_settings_path = 'spiders.common.settings' 154 | 155 | def __init__(self, *args, **kwargs): 156 | self.custom_setting = kwargs.get('custom_setting', {}) 157 | self.custom_setting.update(self.default_custom_setting) 158 | 159 | kwargs['custom_setting'] = self.custom_setting 160 | kwargs['name'] = self.name 161 | kwargs['settings_path'] = self.settings_path 162 | kwargs['common_settings_path'] = self.common_settings_path 163 | 164 | if not kwargs.get('init_spider'): 165 | self.init_spider = InitSpider(*args, **kwargs) 166 | 167 | else: 168 | self.init_spider = kwargs.get('init_spider') 169 | 170 | self.settings = Settings() 171 | self.settings.set_dict({k: v.value for k, v in self.init_spider.settings.attrs.items()}) 172 | self.settings.set_dict(copy.deepcopy(self.custom_setting)) 173 | 174 | self.download_cls = load_objects(self.settings['DOWNLOADER_PATH'])(self) 175 | self.pipeline_cls = load_objects(self.settings['PIPELINE_HANDLER_PATH'])(self) 176 | self.base_spider = load_objects(self.settings['BASE_SPIDER_PATH'])(self) 177 | 178 | self.__load_mq() 179 | 180 | if self.settings.get('SPIDER_INIT_HANDLER'): 181 | self.spider_init = load_objects(self.settings['SPIDER_INIT_HANDLER'])(self) 182 | 183 | def close_message_mq(self): 184 | message_mq = getattr(self, 'message_mq') 185 | if message_mq: 186 | message_mq.close_mq() 187 | 188 | def __load_mq(self): 189 | message_mq_handler = self.settings['MESSAGE_MQ_HANDLER'] 190 | if message_mq_handler: 191 | setattr(self, 'message_mq', load_objects(message_mq_handler)(self)) 192 | 193 | def __getattr__(self, item: str): 194 | 195 | if hasattr(self.init_spider, item): 196 | return getattr(self.init_spider, item) 197 | 198 | elif self.base_spider and hasattr(self.base_spider, item): 199 | return getattr(self.base_spider, item) 200 | 201 | else: 202 | self.logger.error(f'{item} attribute not found on base_spider or init_spider') 203 | return None 204 | 205 | def __download(self, request: Request) -> Response: 206 | response = self.download_cls.fetch(request) 207 | return response 208 | 209 | def download(self, request: Request = None, **kwargs) -> Response: 210 | response = None  # returned as-is when the download raises 211 | request = request if isinstance(request, Request) else Request(**kwargs) 212 | try: 213 | response = self.__download(request) 214 | except AttributeError as exc: 215 | self.logger.error(f'AttributeError: {str(exc)}', exc_info=True) 216 | self.logger.warning('found an error, posting to the error callback.') 217 | except Exception as exc: 218 | self.logger.error(f'Exception: {str(exc)}', exc_info=True) 219 | else: 220 | if isinstance(response, Request): 221 | return self.download(response) 222 | 223 | return response 224 | 225 | def pipeline(self,
item, **kwargs): 226 | return self.pipeline_cls.pipeline(item, **kwargs) 227 | 228 | def test_ip(self, proxy: dict) -> bool: 229 | return self.init_spider.test_ip(proxy) 230 | 231 | @staticmethod 232 | def create_thread(func, **kwargs): 233 | t = threading.Thread(target=func, args=(kwargs,)) 234 | t.start() 235 | return t 236 | 237 | @staticmethod 238 | def create_engine(db_type, name, settings_path): 239 | return InitSpider.get_create_engine(db_type, name, settings_path) 240 | 241 | 242 | # redis publish/subscribe spider class 243 | class RedisMessageMQSpider(BaseSyncSpider): 244 | name = 'redis_message_mq_spider' 245 | 246 | def __init__(self, *args, **kwargs): 247 | super().__init__(*args, **kwargs) 248 | 249 | self.consumer_thread_num = self.settings['CONSUMER_THREAD_NUM'] or 10 250 | self.spider_queue = Queue(100) 251 | 252 | def start_spider(self): 253 | raise NotImplementedError 254 | 255 | def start(self): 256 | self.logger.info('Start Spider!') 257 | 258 | try: 259 | self.start_spider() 260 | 261 | except Exception as e: 262 | self.logger.error(f'redis_message_mq_spider.start.error: {e}', exc_info=True) 263 | 264 | finally: 265 | self.close_spider() 266 | 267 | def start_thread(self): 268 | """Start the spider (multi-threaded entry point)""" 269 | try: 270 | self.start_spider() 271 | 272 | except Exception as e: 273 | self.logger.error(f'redis_message_mq_spider.start_thread.error: {e}', exc_info=True) 274 | 275 | def __consumer_queue(self, func): 276 | while True: 277 | msg = self.spider_queue.get() 278 | try: 279 | self.logger.info(f'spider_queue.msg: {msg}') 280 | func(msg) 281 | 282 | except Exception as e: 283 | self.logger.exception(e) 284 | 285 | self.spider_queue.task_done() 286 | 287 | def __consumer_mq(self, key): 288 | redis_sub = self.red_mq.subscribe(key) 289 | msgs = redis_sub.listen() 290 | 291 | for msg in msgs: 292 | if msg['type'] == 'message': 293 | self.spider_queue.put(json.loads(msg['data'])) 294 | 295 | def producer_mq(self, key, value=None, values=None): 296 | if isinstance(values, list): 297 | for i in values: 298 | if isinstance(i, dict): 299 | i = json.dumps(i, ensure_ascii=False) 300 | 301 | self.red_mq.public(key, i) 302 | 303 | else: 304 | if isinstance(value, dict): 305 | value = json.dumps(value, ensure_ascii=False) 306 | 307 | self.red_mq.public(key, value) 308 | 309 | def producer(self, func=None, **kwargs): 310 | t = threading.Thread(target=func, args=(kwargs,)) 311 | t.start() 312 | return t 313 | 314 | def consumer_mq(self, key): 315 | t = threading.Thread(target=self.__consumer_mq, args=(key,)) 316 | t.start() 317 | return t 318 | 319 | def consumer_queue(self, func, thread_num=None): 320 | for index in range(thread_num or self.consumer_thread_num): 321 | consumer_thread = threading.Thread(target=self.__consumer_queue, args=(func,)) 322 | consumer_thread.daemon = True 323 | consumer_thread.start() 324 | 325 | 326 | # rabbit MQ producer/consumer spider class 327 | class RabbitMessageMQSpider(BaseSyncSpider): 328 | name = 'rabbit_message_mq_spider' 329 | 330 | def __init__(self, *args, **kwargs): 331 | super().__init__(*args, **kwargs) 332 | 333 | self.consumer_thread_num = self.settings['CONSUMER_THREAD_NUM'] or 10 334 | self.spider_queue = Queue(100) 335 | 336 | self.fail_spider_queue = Queue(100) 337 | 338 | def start_spider(self): 339 | raise NotImplementedError 340 | 341 | def start(self): 342 | self.logger.info('Start Spider!') 343 | 344 | try: 345 | self.start_spider() 346 | 347 | except Exception as e: 348 | self.logger.error(f'rabbit_message_mq_spider.start.error: {e}', exc_info=True) 349 | 350 | finally: 351 |
self.close_message_mq() 352 | self.close_spider() 353 | 354 | def start_thread(self): 355 | """Start the spider (multi-threaded entry point)""" 356 | try: 357 | self.start_spider() 358 | 359 | except Exception as e: 360 | self.logger.error(f'rabbit_message_mq_spider.start_thread.error: {e}', exc_info=True) 361 | 362 | def __consumer_queue(self, func): 363 | while True: 364 | channel, method, properties, body = self.spider_queue.get() 365 | try: 366 | msg = json.loads(body) 367 | 368 | except json.decoder.JSONDecodeError: 369 | msg = body.decode() 370 | 371 | try: 372 | self.logger.info(f'spider_queue.msg: {msg}') 373 | 374 | if func(msg): 375 | self.logger.info(f'rabbit mq message consumed successfully: {msg}') 376 | 377 | else: 378 | self.logger.error(f'rabbit mq message consume failed: {msg}') 379 | self.fail_spider_queue.put(msg) 380 | 381 | except Exception as e: 382 | self.logger.exception(e) 383 | self.fail_spider_queue.put(msg) 384 | 385 | finally: 386 | self.message_mq.receiver.basic_ack(channel, method) 387 | self.spider_queue.task_done() 388 | 389 | def consumer_queue(self, func, thread_num=None): 390 | for index in range(thread_num or self.consumer_thread_num): 391 | consumer_thread = threading.Thread(target=self.__consumer_queue, args=(func,)) 392 | consumer_thread.daemon = True 393 | consumer_thread.start() 394 | 395 | def __consumer_mq_callback(self, channel, method, properties, body): 396 | self.spider_queue.put((channel, method, properties, body)) 397 | 398 | def consumer_mq(self, key): 399 | t = threading.Thread(target=self.message_mq.consumer, args=(key, self.__consumer_mq_callback)) 400 | t.start() 401 | return t 402 | 403 | def producer_mq(self, key=None, value=None, values=None): 404 | if isinstance(values, list): 405 | for i in values: 406 | if isinstance(i, dict): 407 | i = json.dumps(i, ensure_ascii=False) 408 | 409 | self.message_mq.producer(key, i) 410 | 411 | else: 412 | if isinstance(value, dict): 413 | value = json.dumps(value, ensure_ascii=False) 414 | 415 | self.message_mq.producer(key, value) 416 | 417 | def get_queue_len(self): 418 | return self.spider_queue.qsize() 419 | 420 | 421 | # Single-threaded spider class 422 | class SyncSpider(BaseSyncSpider): 423 | name = 'sync_spider' 424 | 425 | def __init__(self, *args, **kwargs): 426 | super().__init__(*args, **kwargs) 427 | 428 | self.consumer_thread_num = self.settings['CONSUMER_THREAD_NUM'] or 10 429 | self.spider_queue = Queue(1000) 430 | 431 | def start_spider(self): 432 | raise NotImplementedError 433 | 434 | def start(self): 435 | """Start the spider (single-threaded entry point)""" 436 | self.logger.info('Start Spider!') 437 | 438 | try: 439 | self.start_spider() 440 | 441 | except Exception as e: 442 | self.logger.error(f'sync_spider.start.error: {e}', exc_info=True) 443 | 444 | finally: 445 | self.close_spider() 446 | 447 | def start_thread(self): 448 | """Start the spider (multi-threaded entry point)""" 449 | try: 450 | self.start_spider() 451 | 452 | except Exception as e: 453 | self.logger.error(f'sync_spider.start_thread.error: {e}', exc_info=True) 454 | 455 | def start_mq(self): 456 | """Start the spider (message-queue entry point, e.g. redis mq)""" 457 | try: 458 | self.start_spider() 459 | 460 | except Exception as e: 461 | self.logger.error(f'sync_spider.start_mq.error: {e}', exc_info=True) 462 | 463 | def __producer(self, items): 464 | for item in items: 465 | self.spider_queue.put(item) 466 | 467 | self.spider_queue.join() 468 | 469 | def producer(self, items=None, func=None, **kwargs):  # None instead of a mutable default 470 | 471 | if func: 472 | t = threading.Thread(target=func, args=(kwargs,)) 473 | else: 474 | t = threading.Thread(target=self.__producer, args=(items or [],)) 475 | 476 | t.start() 477 | return t 478 | 479 | def
__consumer(self, func, queue): 480 | spider_queue = queue if queue else self.spider_queue 481 | 482 | while True: 483 | msg = spider_queue.get() 484 | try: 485 | # self.logger.info(f'spider_queue.msg: {msg}') 486 | func(msg) 487 | 488 | except Exception as e: 489 | self.logger.exception(e) 490 | 491 | spider_queue.task_done() 492 | 493 | def consumer(self, func, thread_num=None, queue=None): 494 | for index in range(thread_num or self.consumer_thread_num): 495 | consumer_thread = threading.Thread(target=self.__consumer, args=(func, queue)) 496 | consumer_thread.daemon = True 497 | consumer_thread.start() 498 | 499 | 500 | # Multi-threaded spider runner 501 | class ThreadSyncSpider(object): 502 | def __init__(self, items, spider_cls, *args, **kwargs): 503 | kwargs['name'] = spider_cls.name 504 | kwargs['settings_path'] = spider_cls.settings_path 505 | kwargs['custom_setting'] = spider_cls.default_custom_setting 506 | 507 | self.init_spider = InitSpider(*args, **kwargs) 508 | 509 | self.items = items 510 | self.spider_cls = spider_cls 511 | 512 | self.tasks = [] 513 | 514 | def __start(self, item): 515 | self.spider_cls(init_spider=self.init_spider, **item).start_thread() 516 | 517 | def start(self): 518 | InitSpider.this.logger.info('Start Spider!') 519 | 520 | try: 521 | for item in self.items: 522 | t = threading.Thread(target=self.__start, args=(item,)) 523 | t.start() 524 | self.tasks.append(t) 525 | 526 | for task in self.tasks: 527 | task.join() 528 | 529 | except Exception as e: 530 | InitSpider.this.logger.error(f'sync_spider.start.error: {e}', exc_info=True) 531 | 532 | finally: 533 | InitSpider.this_close_spider() 534 | -------------------------------------------------------------------------------- /magical/sync_spider/core/start_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: start_spider.py 6 | Time: 2021/4/14 下午5:21 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/14 下午5:21 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | from magical.sync_spider.core.spider import ThreadSyncSpider 13 | 14 | 15 | def run_spider(spider_cls, *args, **kwargs): 16 | spider = spider_cls(*args, **kwargs) 17 | spider.start() 18 | 19 | 20 | def run_thread_spider(items, spider_cls, *args, **kwargs): 21 | ThreadSyncSpider(items, spider_cls, *args, **kwargs).start() 22 |
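A sketch of the multi-threaded entry point: `run_thread_spider` creates one spider instance per item, each on its own thread, all sharing one `InitSpider`. The spider class, settings module and per-item settings below are placeholders:

```python
from magical.sync_spider import SyncSpider, run_thread_spider  # assuming run_thread_spider is re-exported like run_spider


class DemoSpider(SyncSpider):
    name = 'demo_spider'
    settings_path = 'spiders.demo.settings'  # hypothetical settings module

    def start_spider(self):
        self.logger.info(self.settings['TAG'])


# each dict becomes the kwargs of one spider instance / thread
items = [
    {'custom_setting': {'TAG': 'worker-1'}},
    {'custom_setting': {'TAG': 'worker-2'}},
]

run_thread_spider(items, DemoSpider)
```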
-------------------------------------------------------------------------------- /magical/sync_spider/databases/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.tmpl.py 6 | Time: 2021/4/10 下午4:48 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午4:48 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /magical/sync_spider/databases/init_db.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: init_db.py 6 | Time: 2021/4/29 下午6:58 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/29 下午6:58 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | from magical.utils import load_objects 13 | 14 | 15 | class InitDatabase(object): 16 | instance = None 17 | init_flag = None 18 | 19 | def __new__(cls, *args, **kwargs): 20 | if not cls.instance: 21 | cls.instance = super().__new__(cls) 22 | return cls.instance 23 | 24 | def __init__(self, spider): 25 | if InitDatabase.init_flag: 26 | return 27 | InitDatabase.init_flag = True 28 | 29 | self.spider = spider 30 | self.logger = spider.logger 31 | self.settings = spider.settings 32 | 33 | self.post_gre_config = self.settings['POST_GRE_CONFIG'] 34 | self.mysql_config = self.settings['MYSQL_CONFIG'] 35 | self.redis_config = self.settings['REDIS_CONFIG'] 36 | 37 | self.dbs = [] 38 | 39 | self.__load_dbs() 40 | self.__init_dbs() 41 | 42 | def __set_dict(self, name, instance=None): 43 | self.dbs.append({'name': name, 'instance': instance}) 44 | 45 | def __load_dbs(self): 46 | self.sql_handler = load_objects(self.settings['POST_GRE_SQL_HANDLER']) 47 | self.red_handler = load_objects(self.settings['REDIS_HANDLER']) 48 | self.mysql_handler = load_objects(self.settings['MYSQL_HANDLER']) 49 | 50 | def __init_dbs(self): 51 | # redis 52 | if isinstance(self.redis_config, dict): 53 | self.__set_dict('red', self.red_handler(config=self.redis_config)) 54 | elif isinstance(self.redis_config, list): 55 | for rc in self.redis_config: 56 | self.__set_dict(rc["name"], self.red_handler(config=rc)) 57 | else: 58 | self.logger.info('no redis config provided') 59 | self.__set_dict('red') 60 | 61 | # PostGreSql 62 | if isinstance(self.post_gre_config, dict): 63 | self.__set_dict('post_gre', self.sql_handler(config=self.post_gre_config, spider=self.spider)) 64 | elif isinstance(self.post_gre_config, list): 65 | for pgc in self.post_gre_config: 66 | self.__set_dict(pgc["name"], self.sql_handler(config=pgc, spider=self.spider)) 67 | else: 68 | self.logger.info('no postgresql config provided') 69 | self.__set_dict('post_gre') 70 | 71 | # mysql 72 | if isinstance(self.mysql_config, dict): 73 | self.__set_dict('mysql', self.mysql_handler(config=self.mysql_config, spider=self.spider)) 74 | elif isinstance(self.mysql_config, list): 75 | for my in self.mysql_config: 76 | self.__set_dict(my["name"], self.mysql_handler(config=my, spider=self.spider)) 77 | else: 78 | self.logger.info('no mysql config provided') 79 | self.__set_dict('mysql') 80 | -------------------------------------------------------------------------------- /magical/sync_spider/databases/mysql_pool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: mysql_pool.py 6 | Time: 2021/4/22 上午12:41 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/22 上午12:41 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import pymysql 13 | from DBUtils.PooledDB import PooledDB 14 | 15 | 16 | class MysqlHandler(object): 17 | __instance = {} 18 | __init = {} 19 | 20 | def __new__(cls, *args, **kwargs): 21 | config = kwargs['config'] 22 | name = config.get('name', 'mysql') 23 | 24 | if not cls.__instance.get(name): 25 | cls.__instance[name] = super().__new__(cls) 26 | 27 | return cls.__instance[name] 28 | 29 | def __init__(self, config, spider): 30 | name = config.get('name', 'mysql') 31 | if MysqlHandler.__init.get(name): 32 | return 33 | MysqlHandler.__init[name] = True 34 | 35 | self.log = 
spider.logger 36 | self.config = config 37 | 38 | self.pool = PooledDB( 39 | creator=pymysql, 40 | maxconnections=0, 41 | mincached=5, 42 | maxcached=5, 43 | maxshared=3, 44 | blocking=True, 45 | maxusage=None, 46 | setsession=[], 47 | ping=0, 48 | host=self.config['host'], 49 | port=self.config['port'], 50 | user=self.config['user'], 51 | password=self.config['password'], 52 | database=self.config['db'], 53 | charset=self.config['charset'] 54 | ) 55 | 56 | def get_pool(self): 57 | conn = self.pool.connection() 58 | cur = conn.cursor() 59 | return conn, cur 60 | 61 | def execute(self, sql, info_data=None): 62 | conn, cur = self.get_pool() 63 | try: 64 | if isinstance(info_data, dict): 65 | cur.execute(sql, info_data) 66 | elif isinstance(info_data, list): 67 | cur.executemany(sql, info_data) 68 | else: 69 | cur.execute(sql) 70 | conn.commit() 71 | return True 72 | 73 | except pymysql.err.IntegrityError as e: 74 | self.log.info(f'pymysql.err.IntegrityError: {e}') 75 | self.log.info(f"execute failed: {sql}") 76 | return False 77 | 78 | except Exception as e: 79 | self.log.info(f'mysql db: {e}') 80 | self.log.info(f"execute failed: {sql}") 81 | return False 82 | 83 | finally: 84 | cur.close() 85 | conn.close() 86 | 87 | def insert_dict(self, table_name, info_dict, ignore=False, replace=False): 88 | fs = ','.join(list(map(lambda x: '`' + x + '`', [*info_dict.keys()]))) 89 | vs = ','.join(list(map(lambda x: '%(' + x + ')s', [*info_dict.keys()]))) 90 | 91 | sql = f"insert into `{table_name}` ({fs}) values ({vs});" 92 | if ignore: 93 | sql = f"insert ignore into `{table_name}` ({fs}) values ({vs});" 94 | elif replace: 95 | sql = f"replace into {table_name} ({fs}) values ({vs});" 96 | 97 | try: 98 | return self.execute(sql, info_dict) 99 | 100 | except Exception as e: 101 | self.log.info(f'insert_dict.mysql db: {e}') 102 | self.log.info("insert_dict.failed: " + sql + "\t" + str(info_dict.values())) 103 | 104 | def insert_list(self, table_name, info_list, ignore=False, replace=False): 105 | keys = list(info_list[0].keys()) 106 | fs = ', '.join(keys) 107 | vs = ', '.join(list(map(lambda x: '%(' + x + ')s', keys))) 108 | 109 | sql = f"insert into {table_name} ({fs}) values ({vs});" 110 | if ignore: 111 | sql = f"insert ignore into {table_name} ({fs}) values ({vs});" 112 | elif replace: 113 | sql = f"replace into {table_name} ({fs}) values ({vs});" 114 | 115 | try: 116 | return self.execute(sql, info_list) 117 | except Exception as e: 118 | self.log.info(f'insert_list.mysql db: {e}') 119 | 120 | def select(self, sql): 121 | conn, cur = self.get_pool() 122 | cur.execute(sql) 123 | result = cur.fetchall() 124 | conn.close() 125 | cur.close() 126 | return result 127 | 128 | def close_pool(self): 129 | self.pool.close() 130 | -------------------------------------------------------------------------------- /magical/sync_spider/databases/post_gre_sql_pool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: post_gre_sql_pool.py 6 | Time: 2021/4/10 下午4:49 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午4:49 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import psycopg2 13 | from DBUtils.PooledDB import PooledDB 14 | 15 | 16 | class PostGreHandle(object): 17 | __instance = {} 18 | __init = {} 19 | 20 | def __new__(cls, *args, **kwargs): 21 | config = kwargs['config'] 22 | name = 
config.get('name', 'post_gre') 23 | 24 | if not cls.__instance.get(name): 25 | cls.__instance[name] = super().__new__(cls) 26 | 27 | return cls.__instance[name] 28 | 29 | def __init__(self, config, spider): 30 | name = config.get('name', 'post_gre') 31 | if PostGreHandle.__init.get(name): 32 | return 33 | PostGreHandle.__init[name] = True 34 | 35 | self.log = spider.logger 36 | self.config = config 37 | 38 | self.pool = PooledDB( 39 | creator=psycopg2, 40 | maxconnections=0, 41 | mincached=5, 42 | maxcached=5, 43 | maxshared=3, 44 | blocking=True, 45 | maxusage=None, 46 | setsession=[], 47 | ping=0, 48 | host=self.config['host'], 49 | port=self.config['port'], 50 | user=self.config['user'], 51 | password=self.config['password'], 52 | database=self.config['db'] 53 | ) 54 | 55 | def get_pool(self): 56 | conn = self.pool.connection() 57 | cur = conn.cursor() 58 | return conn, cur 59 | 60 | def execute(self, sql, info_data=None): 61 | conn, cur = self.get_pool() 62 | try: 63 | if isinstance(info_data, dict): 64 | cur.execute(sql, info_data) 65 | elif isinstance(info_data, list): 66 | cur.executemany(sql, info_data) 67 | else: 68 | cur.execute(sql) 69 | conn.commit() 70 | return True 71 | 72 | except Exception as e: 73 | self.log.info(f'sql db: {e}') 74 | self.log.info(f"execute failed: {sql}") 75 | return False 76 | 77 | finally: 78 | cur.close() 79 | conn.close() 80 | 81 | def insert_conflict_list(self, table_name, info_list, indexes=None): 82 | keys = list(info_list[0].keys()) 83 | fs = ', '.join(keys) 84 | vs = ', '.join(list(map(lambda x: '%(' + x + ')s', keys))) 85 | 86 | sql = f"insert into {table_name} ({fs}) values ({vs}) on conflict ({indexes}) do nothing;" 87 | 88 | try: 89 | return self.execute(sql, info_list) 90 | except Exception as e: 91 | self.log.exception(f'insert_conflict_list.sql db: {e}') 92 | return False 93 | 94 | def insert_conflict_dict(self, table_name, info_dict, indexes=None): 95 | fs = ', '.join(list(info_dict.keys())) 96 | vs = ', '.join(list(map(lambda x: '%(' + x + ')s', [*info_dict.keys()]))) 97 | sql = f"insert into {table_name} ({fs}) values ({vs}) on conflict ({indexes}) do nothing;" 98 | 99 | try: 100 | return self.execute(sql, info_dict) 101 | except Exception as e: 102 | self.log.exception(f'insert_conflict_dict.sql db: {e}') 103 | self.log.error("insert_conflict_dict.failed: " + sql + "\t" + str(info_dict.values())) 104 | return False 105 | 106 | def select(self, sql): 107 | conn, cur = self.get_pool() 108 | 109 | try: 110 | cur.execute(sql) 111 | result = cur.fetchall() 112 | 113 | finally: 114 | conn.close() 115 | cur.close() 116 | return result 117 | 118 | def close_pool(self): 119 | self.pool.close() 120 | -------------------------------------------------------------------------------- /magical/sync_spider/databases/red_pool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: red_pool.py 6 | Time: 2021/4/10 下午4:49 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午4:49 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import json 13 | import redis 14 | import copy 15 | 16 | 17 | class RedisBase(redis.StrictRedis): 18 | __instance = {} 19 | __init = {} 20 | 21 | def __new__(cls, *args, **kwargs): 22 | config = kwargs['config'] 23 | name = config.get('name', 'red') 24 | 25 | if not cls.__instance.get(name): 26 | cls.__instance[name] = 
super().__new__(cls) 27 | 28 | return cls.__instance[name] 29 | 30 | def __init__(self, config): 31 | name = config.get('name', 'red') 32 | if RedisBase.__init.get(name): 33 | return 34 | RedisBase.__init[name] = True 35 | 36 | new_config = copy.deepcopy(config) 37 | 38 | if new_config.get('name'): 39 | del new_config['name'] 40 | 41 | super().__init__(**new_config) 42 | 43 | def public(self, key, msg): 44 | self.publish(key, msg) 45 | return True 46 | 47 | def subscribe(self, key): 48 | pub = self.pubsub() 49 | pub.subscribe(key) 50 | return pub 51 | 52 | def set_str(self, key, value, **kwargs): 53 | return self.set(key, value, **kwargs) 54 | 55 | def set_dict(self, key, value): 56 | if isinstance(value, (list, dict)): 57 | value = json.dumps(value, ensure_ascii=False) 58 | return self.set(key, value) 59 | 60 | def get_dict(self, key): 61 | data = self.get(key) 62 | return json.loads(data) if data else {} 63 | 64 | def get_list(self, key): 65 | data = self.get(key) 66 | return json.loads(data) if data else [] 67 | 68 | def get_str(self, key): 69 | return self.get(key) 70 | 71 | def close_pool(self): 72 | self.connection_pool.disconnect() 73 | 74 | def _pipeline(self): 75 | pipe = self.pipeline(transaction=True) 76 | pipe.multi() 77 | return pipe 78 | 79 | 80 | class RedisHandler(RedisBase): 81 | def __init__(self, config): 82 | super().__init__(config=config) 83 | 84 | def get_str(self, key): 85 | return self.get(key) 86 | 87 | def set_bit(self, table, offsets, values): 88 | if isinstance(offsets, list): 89 | if not isinstance(values, list): 90 | values = [values] * len(offsets) 91 | else: 92 | assert len(offsets) == len(values), "offsets值要与values值一一对应" 93 | 94 | pipe = self._pipeline() 95 | 96 | for offset, value in zip(offsets, values): 97 | pipe.setbit(table, offset, value) 98 | 99 | return pipe.execute() 100 | 101 | else: 102 | return self.setbit(table, offsets, values) 103 | 104 | def get_bit(self, table, offsets): 105 | if isinstance(offsets, list): 106 | pipe = self._pipeline() 107 | for offset in offsets: 108 | pipe.getbit(table, offset) 109 | 110 | return pipe.execute() 111 | 112 | else: 113 | return self.getbit(table, offsets) 114 | 115 | def bit_count(self, table): 116 | return self.bitcount(table) 117 | -------------------------------------------------------------------------------- /magical/sync_spider/extends_module/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.tmpl.py 6 | Time: 2021/4/11 上午12:40 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/11 上午12:40 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /magical/sync_spider/extends_module/base_module/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.py 6 | Time: 2021/4/18 上午11:21 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/18 上午11:21 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 
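Looking back at `red_pool.py`, the bit helpers batch list-valued offsets through one transactional pipeline. A short usage sketch (key names and connection details are placeholders):

```python
from magical.sync_spider.databases.red_pool import RedisHandler

# __new__ reads kwargs['config'], so config must be passed as a keyword
red = RedisHandler(config={'host': '127.0.0.1', 'port': 6379, 'db': 0, 'decode_responses': True})

red.set_bit('seen_ids', [3, 17, 42], 1)   # one pipeline round-trip for all offsets
print(red.get_bit('seen_ids', [3, 4]))    # -> [1, 0]
print(red.bit_count('seen_ids'))          # -> 3

red.set_dict('job:1', {'status': 'ok'})   # JSON round-trip helpers
print(red.get_dict('job:1'))              # -> {'status': 'ok'}
red.close_pool()
```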
-------------------------------------------------------------------------------- /magical/sync_spider/extends_module/base_module/downloader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: downloader.py 6 | Time: 2021/4/11 上午12:41 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/11 上午12:41 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | class DownloaderMiddleware(object): 15 | __instance = {} 16 | 17 | def __new__(cls, *args, **kwargs): 18 | if not cls.__instance: 19 | cls.__instance = super().__new__(cls) 20 | return cls.__instance 21 | 22 | def __init__(self, spider, **kwargs): 23 | self.spider = spider 24 | self.proxy = spider.proxy 25 | self.logger = spider.logger 26 | self.settings = spider.settings 27 | # self.duplicate = spider.duplicate 28 | self.max_retry_count = spider.settings.get_int("RETRY_COUNT") 29 | self.retry_status_codes = spider.settings.get_list("RETRY_STATUS_CODES") 30 | 31 | def process_request(self, request): 32 | return request 33 | 34 | def process_response(self, request, response): 35 | return response 36 | 37 | def process_exception(self, request, exception): 38 | return exception 39 | 40 | def _retry(self, request): 41 | retry_count = request.meta.get('retry_count', 0) + 1 42 | if retry_count < self.max_retry_count: 43 | retry_request = request.copy() 44 | retry_request.meta["retry_count"] = retry_count 45 | return retry_request 46 | -------------------------------------------------------------------------------- /magical/sync_spider/extends_module/base_module/pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: pipeline.py 6 | Time: 2021/5/17 下午5:56 7 | ------------------------------------------------- 8 | Change Activity: 2021/5/17 下午5:56 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | class PipelineMiddleware(object): 15 | __instance = {} 16 | 17 | def __new__(cls, *args, **kwargs): 18 | if not cls.__instance: 19 | cls.__instance = super().__new__(cls) 20 | return cls.__instance 21 | 22 | def __init__(self, spider, **kwargs): 23 | self.spider = spider 24 | self.logger = spider.logger 25 | self.settings = spider.settings 26 | 27 | def process_item(self, item, **kwargs): 28 | """数据处理 29 | 30 | Args: 31 | item : 要处理的数据 32 | kwargs: 33 | table_name: 表名称 34 | replace : True or False (mysql 数据库使用) 35 | ignore : True or False (mysql 数据库使用) 36 | indexes : 数据库表唯一索引字段 (PostGreSql 数据库使用) 37 | 38 | Return: 39 | 返回的数据类型如果不等于 type(item) 则不会调用后面的 pipeline process_item 函数 40 | """ 41 | return item 42 | 43 | def process_exception(self, item, exception, **kwargs): 44 | return exception 45 | -------------------------------------------------------------------------------- /magical/sync_spider/extends_module/download/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.tmpl.py 6 | Time: 2021/4/11 上午12:40 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/11 上午12:40 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def 
main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /magical/sync_spider/extends_module/download/retry.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: retry.py 6 | Time: 2021/4/11 上午12:51 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/11 上午12:51 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware 13 | 14 | 15 | class RetryMiddleware(DownloaderMiddleware): 16 | RETRY_EXCEPTIONS = () 17 | 18 | def __init__(self, spider): 19 | super().__init__(spider) 20 | 21 | def process_response(self, request, response): 22 | if not request.use_middleware: 23 | return response 24 | if not request.meta.get("is_retry", False): 25 | return response 26 | if response.status in self.retry_status_codes: 27 | return self._retry(request) or response 28 | return response 29 | 30 | def process_exception(self, request, exception): 31 | if isinstance(exception, self.RETRY_EXCEPTIONS) and request.meta.get("is_retry", False): 32 | return self._retry(request) 33 | -------------------------------------------------------------------------------- /magical/sync_spider/extends_module/mqs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.py 6 | Time: 2021/5/7 下午11:00 7 | ------------------------------------------------- 8 | Change Activity: 2021/5/7 下午11:00 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /magical/sync_spider/extends_module/mqs/rabbit_mq/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.py 6 | Time: 2021/5/7 下午11:00 7 | ------------------------------------------------- 8 | Change Activity: 2021/5/7 下午11:00 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /magical/sync_spider/extends_module/mqs/rabbit_mq/handler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: handler.py 6 | Time: 2021/5/7 下午11:23 7 | ------------------------------------------------- 8 | Change Activity: 2021/5/7 下午11:23 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import threading 13 | from functools import partial 14 | 15 | import pika 16 | 17 | 18 | class MQBase(object): 19 | """消息队列基类, 该类线程不安全的""" 20 | 21 | def __init__(self, spider, ack=True): 22 | """当开启手动消息确认, 要考虑消息重入的情况, 默认开启手动消息确认 23 | 24 | Args: 25 | spider = 爬虫对象 26 | ack = 是否自动确认消息 27 | """ 28 | # 使用 线程局部变量,保证rabbit连接线程安全 29 | self.local = 
threading.local() 30 | 31 | self.spider = spider 32 | self.logger = spider.logger 33 | 34 | self._conn = None 35 | self._properties = pika.BasicProperties(delivery_mode=2)  # persistent messages 36 | 37 | self.virtual_host = spider.settings['MESSAGE_MQ_VIRTUAL_HOST'] 38 | self.prefetch_count = spider.settings['MESSAGE_MQ_PREFETCH_COUNT'] 39 | self.rabbit_config = spider.settings['MESSAGE_MQ_CONFIG'] 40 | 41 | self.port = self.rabbit_config['port'] 42 | self.host = self.rabbit_config['host'] 43 | self.username = self.rabbit_config['username'] 44 | self.password = self.rabbit_config['password'] 45 | 46 | self.ack = ack 47 | 48 | def close(self): 49 | try: 50 | if hasattr(self.local, 'channel'): 51 | self.local.channel.close() 52 | self.logger.info('rabbit mq channel closed!') 53 | 54 | if self._conn and self._conn.is_open:  # only close a connection that is actually open 55 | self._conn.close() 56 | self.logger.info('rabbit mq connection closed!') 57 | 58 | except Exception as e: 59 | self.logger.error(f'rabbit mq closed error: {e}') 60 | 61 | def _check_channel(self): 62 | if not hasattr(self.local, 'channel'): 63 | channel = self._rabbit_mq_init() 64 | self.local.channel = channel 65 | 66 | def _rabbit_mq_init(self): 67 | """Initialize the rabbit mq connection""" 68 | credentials = pika.PlainCredentials(username=self.username, password=self.password) 69 | parameters = pika.ConnectionParameters( 70 | host=self.host, 71 | port=self.port, 72 | virtual_host=self.virtual_host, 73 | credentials=credentials, 74 | heartbeat=0 75 | ) 76 | self._conn = pika.BlockingConnection(parameters) 77 | channel = self._conn.channel() 78 | 79 | if self.ack: 80 | channel.confirm_delivery() 81 | 82 | self.logger.info('rabbit mq connected successfully!') 83 | 84 | return channel 85 | 86 | 87 | class MQSender(MQBase): 88 | 89 | def __init__(self, *args, **kwargs): 90 | super().__init__(*args, **kwargs) 91 | 92 | def try_send(self, queue_name, msg): 93 | try: 94 | self._check_channel() 95 | 96 | self.local.channel.queue_declare(queue=queue_name, durable=True) 97 | self.local.channel.basic_publish( 98 | exchange='', 99 | routing_key=queue_name, 100 | body=msg.encode(), 101 | properties=self._properties 102 | ) 103 | self.logger.info(f'rabbit MQ publish succeeded, msg: {msg}') 104 | success = True 105 | except Exception as e: 106 | self.logger.exception(e) 107 | self.logger.error(f'rabbit MQ publish failed, msg: {msg}') 108 | success = False 109 | 110 | return success 111 | 112 | def push(self, queue_name, msg): 113 | ret = self.try_send(queue_name, msg) or self.try_send(queue_name, msg)  # one retry on failure 114 | return ret 115 | 116 | 117 | class MQReceiver(MQBase): 118 | 119 | def __init__(self, *args, **kwargs): 120 | super().__init__(*args, **kwargs) 121 | 122 | def basic_ack(self, channel, method): 123 | return self._conn.add_callback_threadsafe(partial(channel.basic_ack, method.delivery_tag)) 124 | 125 | def start(self, queue_name, callback): 126 | """Start consuming""" 127 | self._check_channel() 128 | 129 | self.local.channel.queue_declare(queue=queue_name, durable=True, auto_delete=False) 130 | self.local.channel.basic_qos(prefetch_count=self.prefetch_count) 131 | self.local.channel.basic_consume(queue_name, callback, auto_ack=not self.ack) 132 | self.local.channel.start_consuming() 133 | 134 |
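The sender/receiver pair above is what `RabbitMessageMQSpider` drives. A hedged end-to-end sketch (queue name, settings module and payloads are invented, and the spider class is assumed to be re-exported like `SyncSpider`):

```python
from magical.sync_spider import run_spider, RabbitMessageMQSpider


class DemoMQSpider(RabbitMessageMQSpider):
    name = 'demo_mq_spider'
    settings_path = 'spiders.demo.settings'  # hypothetical; must define MESSAGE_MQ_CONFIG etc.

    def handle(self, msg):
        self.logger.info(f'consumed: {msg}')
        return True  # False (or an exception) routes the message to fail_spider_queue

    def start_spider(self):
        self.producer_mq('demo_queue', values=[{'id': 1}, {'id': 2}])  # publish
        self.consumer_queue(self.handle)       # daemon worker threads drain spider_queue
        self.consumer_mq('demo_queue').join()  # blocking basic_consume loop


if __name__ == '__main__':
    run_spider(DemoMQSpider)
```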
135 | class RabbitMQHandler(object): 136 | def __init__(self, spider): 137 | self.spider = spider 138 | 139 | self.sender = MQSender(spider) 140 | self.receiver = MQReceiver(spider) 141 | 142 | def close_mq(self): 143 | # self.sender.close() 144 | self.receiver.close() 145 | 146 | def producer(self, queue_name, value): 147 | self.sender.push(queue_name, value) 148 | 149 | def consumer(self, queue_name, callback=None): 150 | self.receiver.start(queue_name, callback) 151 | -------------------------------------------------------------------------------- /magical/sync_spider/http/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.tmpl.py 6 | Time: 2021/4/10 下午10:27 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午10:27 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /magical/sync_spider/http/request.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: request.py 6 | Time: 2021/4/10 下午4:55 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午4:55 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | from pickle import dumps, loads 13 | from urllib.parse import urlencode 14 | 15 | 16 | class Request(object): 17 | def __init__(self, url, params=None, method='GET', data=None, headers=None, meta=None, 18 | json=None, encoding='utf-8', use_middleware=True, session=True, s5=False, 19 | **kwargs): 20 | self.s5 = s5 21 | self.url = url 22 | self.data = data or {}  # fresh dict per request instead of a shared mutable default 23 | self.json = json 24 | self.params = params 25 | self.method = method 26 | self.encoding = encoding 27 | self.headers = headers or {} 28 | self.session = session 29 | self.use_middleware = use_middleware 30 | self.kwargs = kwargs 31 | 32 | self.meta = self._load_meta(meta) 33 | 34 | def copy(self, *args, **kwargs): 35 | keys = [ 36 | 'url', 'method', 'data', 'json', 'params', 'headers', 'meta', 'session', 37 | 'use_middleware', 's5' 38 | ] 39 | for key in keys: 40 | kwargs.setdefault(key, getattr(self, key)) 41 | cls = kwargs.pop('cls', self.__class__) 42 | return cls(*args, **kwargs) 43 | 44 | def dumps(self): 45 | return dumps(self) 46 | 47 | # pickle.loads re-exposed as a static helper: Request.loads(data) 48 | loads = staticmethod(loads) 49 | 50 | @staticmethod 51 | def _load_meta(custom_meta): 52 | meta = { 53 | 'test_key': 'test_key1', 54 | 'proxy': None, 55 | 'retry_count': 0 56 | } 57 | 58 | if isinstance(custom_meta, dict): 59 | meta.update(custom_meta) 60 | return meta 61 | 62 | def __str__(self): 63 | return "<Request retry=%s %s %s>" % ( 64 | self.meta['retry_count'], 65 | self.method, 66 | self.url + urlencode(self.params) if self.params else self.url 67 | ) 68 | 69 | __repr__ = __str__ 70 |
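A short sketch of building and reusing `Request` objects directly (URL, headers and meta values are placeholders):

```python
from magical.sync_spider.http.request import Request

req = Request(
    url='https://httpbin.org/post',
    method='POST',
    data={'q': 'magical'},
    headers={'User-Agent': 'demo-agent/1.0'},
    meta={'is_retry': True},  # lets RetryMiddleware act on this request
    s5=False,                 # True would route through the cfscrape (Cloudflare) session
)

clone = req.copy()             # preserves url/method/data/headers/meta/...
raw = req.dumps()              # pickle, e.g. for pushing onto a queue
restored = Request.loads(raw)  # the static loads helper above
print(restored)                # -> <Request retry=0 POST https://httpbin.org/post>
```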
-------------------------------------------------------------------------------- /magical/sync_spider/http/response.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: response.py 6 | Time: 2021/4/10 下午10:28 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午10:28 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import re 13 | from lxml import etree 14 | 15 | 16 | class Response(object): 17 | def __init__(self, response, request): 18 | self.response = response 19 | self.request = request 20 | 21 | self.meta = request.meta 22 | self.url = response.url 23 | self.status = response.status_code 24 | self.text = response.text 25 | self.headers = response.headers 26 | self.cookies = response.cookies 27 | 28 | def set_encoding(self, encoding): 29 | self.response.encoding = encoding 30 | self.text = self.response.text 31 | 32 | def json(self): 33 | try: 34 | return self.response.json() 35 | except Exception: 36 | return None  # body is not valid JSON 37 | 38 | def __str__(self): 39 | return "<Response %s %s>" % (self.status, self.url) 40 | 41 | __repr__ = __str__ 42 | 43 | @property 44 | def re(self): 45 | return Regex(self.text) 46 | 47 | @property 48 | def selector(self): 49 | selector = etree.HTML(self.text) 50 | return selector 51 | 52 | def css(self, css_select: str): 53 | return self.selector.cssselect(css_select) 54 | 55 | def xpath(self, xpath_str: str) -> list: 56 | result_list = self.selector.xpath(xpath_str) 57 | return result_list 58 | 59 | 60 | class Regex(object): 61 | def __init__(self, html): 62 | self.html = html 63 | 64 | def findall(self, pattern, flags=0): 65 | return re.findall(pattern, self.html, flags) 66 | 67 | def search(self, pattern, flags=0): 68 | return re.search(pattern, self.html, flags) 69 | 70 | def match(self, pattern, flags=0): 71 | return re.match(pattern, self.html, flags) 72 | 73 | def sub(self, pattern, repl, count=0, flags=0):  # count=0 replaces all occurrences, mirroring re.sub 74 | return re.sub(pattern, repl, self.html, count, flags) 75 |
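And the matching `Response` surface, assuming a running spider instance (URL and patterns are placeholders):

```python
# `spider` stands for any running SyncSpider instance
resp = spider.download(url='https://example.com/')  # kwargs form builds the Request internally

resp.set_encoding('utf-8')                 # re-decode if the site mislabels its charset
title = resp.xpath('//title/text()')       # lxml XPath, returns a list
links = resp.re.findall(r'href="(.*?)"')   # regex helpers over resp.text
payload = resp.json()                      # None when the body is not JSON
print(resp.status, title, len(links), payload)
```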
-------------------------------------------------------------------------------- /magical/sync_spider/middleware/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.tmpl.py 6 | Time: 2021/4/10 下午11:24 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午11:24 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /magical/sync_spider/middleware/download/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.py 6 | Time: 2021/4/18 下午12:37 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/18 下午12:37 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /magical/sync_spider/middleware/download/downloader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: downloader.py 6 | Time: 2021/4/10 下午11:27 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/10 下午11:27 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | from magical.utils import load_objects 13 | 14 | 15 | class Downloader(object): 16 | """Downloader: couples the download handler with the middleware manager""" 17 | 18 | def __init__(self, spider): 19 | handler_cls = spider.settings['DOWNLOAD_HANDLER_PATH'] 20 | handler_manager_cls = spider.settings['DOWNLOAD_MIDDLEWARE_MANAGER_PATH'] 21 | self.handler = load_objects(handler_cls)(spider) 22 | self.middleware = load_objects(handler_manager_cls)(spider) 23 | 24 | def _download(self, request): 25 | """Perform the raw request 26 | Args: 27 | request: a Request object 28 | Returns: 29 | a Response object 30 | """ 31 | resp = self.handler.fetch(request) 32 | return resp 33 | 34 | def fetch(self, request): 35 | """Download through the middleware chain 36 | Args: 37 | request: a Request object 38 | Returns: 39 | a Response object 40 | """ 41 | resp = self.middleware.download(self._download, request) 42 | return resp 43 | -------------------------------------------------------------------------------- /magical/sync_spider/middleware/download/handler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: handler.py 6 | Time: 2021/4/18 下午12:37 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/18 下午12:37 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import urllib3 13 | import cfscrape 14 | import requests 15 | from urllib.parse import urlparse 16 | from requests import adapters 17 | 18 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 19 | adapters.DEFAULT_RETRIES = 5 20 | 21 | from magical.sync_spider.http.response import Response 22 | 23 | 24 | class DownloadHandler(object): 25 | """Performs the actual HTTP request""" 26 | 27 | def __init__(self, spider, **kwargs): 28 | self.spider = spider 29 | self.kwargs = kwargs 30 | self.logger = spider.logger 31 | self.settings = spider.settings 32 | 33 | self.session = requests.session() 34 | self.scrape = cfscrape.create_scraper(delay=self.settings['SCRAPER_DELAY']) 35 | self.scrape_session = cfscrape.create_scraper(sess=self.session, delay=self.settings['SCRAPER_DELAY']) 36 | 37 | def fetch(self, request): 38 | """Start the download 39 | 40 | Args: 41 | request: a Request object 42 | Returns: 43 | a Response object 44 | """ 45 | 46 | if request.s5: 47 | instance = self.scrape 48 | if request.session: 49 | instance = self.scrape_session 50 | 51 | elif request.session: 52 | instance = self.session 53 | 54 | else: 55 | instance = requests 56 | 57 | if request.method == 'POST': 58 | response = instance.post( 59 | request.url, 60 | data=request.data, 61 | json=request.json, 62 | headers=request.headers, 63 | params=request.params, 64 | proxies=request.meta.get('proxy'), 65 | verify=self.settings['REQUEST_VERIFY'], 66 | timeout=self.settings['REQUEST_TIMEOUT'], 67 | **request.kwargs 68 | ) 69 | else: 70 | response = instance.get( 71 | request.url, 72 | headers=request.headers, 73 | params=request.params, 74 | proxies=request.meta.get('proxy'), 75 | verify=self.settings['REQUEST_VERIFY'], 76 | timeout=self.settings['REQUEST_TIMEOUT'], 77 | **request.kwargs 78 | ) 79 | 80 | response.encoding = request.encoding 81 | 82 | res = Response(response, request) 83 | 84 | self.logger.debug(f"Downloaded ({res.status}) {str(request)}") 85 | return res 86 | -------------------------------------------------------------------------------- /magical/sync_spider/middleware/download/manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: manager.py 6 | Time: 2021/4/18 下午12:37 7 | ------------------------------------------------- 8 | Change Activity: 2021/4/18 下午12:37 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | from collections import 
defaultdict 13 | 14 | from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware 15 | from magical.sync_spider.http.request import Request 16 | from magical.sync_spider.common.utils import call_func 17 | from magical.utils import load_objects 18 | 19 | 20 | class DownloadMiddlewareManager(object): 21 | def __init__(self, spider): 22 | self.methods = defaultdict(list) 23 | self.spider = spider 24 | self.settings = spider.settings 25 | self.middleware_s = self._load_middleware() 26 | 27 | for miw in self.middleware_s: 28 | self._add_middleware(miw) 29 | 30 | def _load_middleware(self): 31 | middleware_s = [] 32 | middleware_s_dict = self.settings["DOWNLOAD_MIDDLEWARE_PATH"] 33 | middleware_s_list = sorted(middleware_s_dict.items(), key=lambda x: x[1])  # lower priority value runs earlier 34 | 35 | for middleware_key, value in middleware_s_list: 36 | middleware = load_objects(middleware_key) 37 | if issubclass(middleware, DownloaderMiddleware): 38 | middleware_instance = middleware(self.spider) 39 | middleware_s.append(middleware_instance) 40 | return middleware_s 41 | 42 | def _add_middleware(self, miw): 43 | if hasattr(miw, "process_request"): 44 | self.methods['process_request'].append(miw.process_request) 45 | 46 | if hasattr(miw, "process_response"): 47 | self.methods['process_response'].append(miw.process_response) 48 | 49 | if hasattr(miw, "process_exception"): 50 | self.methods['process_exception'].append(miw.process_exception) 51 | 52 | def download(self, download_func, request): 53 | this = self 54 | 55 | def process_request(request): 56 | for method in this.methods['process_request']: 57 | request = method(request) 58 | if not request: 59 | return request  # a falsy return from a middleware aborts the download 60 | response = download_func(request) 61 | return response 62 | 63 | def process_response(response): 64 | for method in this.methods['process_response']: 65 | response = method(request, response) 66 | if isinstance(response, Request) or not response: 67 | return response  # a returned Request triggers a re-download upstream 68 | return response 69 | 70 | def process_exception(exception): 71 | for method in this.methods['process_exception']: 72 | response = method(request, exception) 73 | if isinstance(response, Request) or not response: 74 | return response 75 | return exception 76 | 77 | resp = call_func(process_request, process_exception, process_response, request) 78 | 79 | return resp 80 |
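For orientation, a custom middleware only needs to subclass `DownloaderMiddleware` and appear in `DOWNLOAD_MIDDLEWARE_PATH`; the module and class names here are invented:

```python
# spiders/common/middleware.py -- hypothetical module
from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware


class UserAgentMiddleware(DownloaderMiddleware):
    def process_request(self, request):
        # runs before the fetch; returning a falsy value aborts the download
        request.headers.setdefault('User-Agent', 'magical-demo/1.0')
        return request

    def process_response(self, request, response):
        # returning a Request re-enters the download loop, exactly like RetryMiddleware
        if response.status in self.retry_status_codes:
            return self._retry(request) or response
        return response


# in the project settings module:
DOWNLOAD_MIDDLEWARE_PATH = {
    'spiders.common.middleware.UserAgentMiddleware': 100,
}
```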
absolute_import
13 |
14 | import bitarray
15 |
16 |
17 | class BitArray:
18 |     def set_all(self, value):
19 |         pass
20 |
21 |     def set(self, key, offsets, values):
22 |         raise NotImplementedError("this method must be implemented")
23 |
24 |     def get(self, key, offsets):
25 |         raise NotImplementedError("this method must be implemented")
26 |
27 |     def count(self, key, value=True):
28 |         raise NotImplementedError("this method must be implemented")
29 |
30 |
31 | class MemoryBitArray(BitArray):
32 |     def __init__(self, num_bits):
33 |         self.num_bits = num_bits
34 |         self.bit_array = bitarray.bitarray(num_bits, endian="little")
35 |
36 |         self.set_all(0)
37 |
38 |     def set_all(self, value):
39 |         self.bit_array.setall(value)
40 |
41 |     def set(self, key, offsets, values):
42 |         old_values = []
43 |
44 |         if isinstance(offsets, list):
45 |             if not isinstance(values, list):
46 |                 values = [values] * len(offsets)
47 |             else:
48 |                 assert len(offsets) == len(values), "offsets must correspond one-to-one with values"
49 |
50 |             for offset, value in zip(offsets, values):
51 |                 old_values.append(int(self.bit_array[offset]))
52 |                 self.bit_array[offset] = value
53 |
54 |         else:
55 |             old_values = int(self.bit_array[offsets])
56 |             self.bit_array[offsets] = values
57 |
58 |         return old_values
59 |
60 |     def get(self, key, offsets):
61 |         if isinstance(offsets, list):
62 |             return [self.bit_array[offset] for offset in offsets]
63 |         else:
64 |             return self.bit_array[offsets]
65 |
66 |     def count(self, key, value=True):
67 |         return self.bit_array.count(value)
68 |
69 |
70 | class RedisBitArray(BitArray):
71 |     redis_db = None
72 |
73 |     def __init__(self, spider):
74 |         red_name = spider.settings['FILTER_REDIS_NAME']
75 |         self.red = getattr(spider, red_name) if red_name else spider.red
76 |
77 |         self.count_cached_name = "{}_count_cached"
78 |
79 |     def set(self, key, offsets, values):
80 |         return self.red.set_bit(key, offsets, values)
81 |
82 |     def get(self, key, offsets):
83 |         return self.red.get_bit(key, offsets)
84 |
85 |     def count(self, key, value=True):
86 |         # check the redis cache first; only count the bits when it is missing
87 |         count = self.red.get_str(self.count_cached_name.format(key))
88 |         if count:
89 |             return int(count)
90 |         else:
91 |             count = self.red.bit_count(key)
92 |             # cache expires after half an hour
93 |             self.red.set_str(self.count_cached_name.format(key), count, ex=1800)
94 |             return count
95 |
-------------------------------------------------------------------------------- /magical/sync_spider/middleware/duplicate/bloom_filter.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: bloom_filter.py
6 | Time: 2021/5/13 下午3:31
7 | -------------------------------------------------
8 | Change Activity: 2021/5/13 下午3:31
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import hashlib
13 | import math
14 | import threading
15 | import time
16 |
17 | from struct import unpack, pack
18 |
19 | from magical.sync_spider.middleware.duplicate import bit_array
20 | from magical.sync_spider.common.redis_lock import RedisLock
21 |
22 |
23 | def make_hash_funcs(num_slices, num_bits):
24 |     if num_bits >= (1 << 31):
25 |         fmt_code, chunk_size = "Q", 8
26 |     elif num_bits >= (1 << 15):
27 |         fmt_code, chunk_size = "I", 4
28 |     else:
29 |         fmt_code, chunk_size = "H", 2
30 |     total_hash_bits = 8 * num_slices * chunk_size
31 |     if total_hash_bits > 384:
32 |         hash_fn = hashlib.sha512
33 |     elif total_hash_bits > 256:
34 |         hash_fn = hashlib.sha384
35 |     elif total_hash_bits > 160:
36 |         hash_fn = hashlib.sha256
37 |
elif total_hash_bits > 128: 38 | hash_fn = hashlib.sha1 39 | else: 40 | hash_fn = hashlib.md5 41 | 42 | fmt = fmt_code * (hash_fn().digest_size // chunk_size) 43 | num_salts, extra = divmod(num_slices, len(fmt)) 44 | if extra: 45 | num_salts += 1 46 | salts = tuple(hash_fn(hash_fn(pack("I", i)).digest()) for i in range(num_salts)) 47 | 48 | def _make_hash_funcs(key): 49 | if isinstance(key, str): 50 | key = key.encode("utf-8") 51 | else: 52 | key = str(key).encode("utf-8") 53 | 54 | i = 0 55 | for salt in salts: 56 | h = salt.copy() 57 | h.update(key) 58 | for uint in unpack(fmt, h.digest()): 59 | yield uint % num_bits 60 | i += 1 61 | if i >= num_slices: 62 | return 63 | 64 | return _make_hash_funcs 65 | 66 | 67 | class BloomFilter(object): 68 | def __init__(self, spider, filter_queue_type): 69 | self.capacity = spider.settings.get_int('FILTER_INITIAL_CAPACITY') 70 | self.error_rate = spider.settings.get_float('FILTER_ERROR_RATE') 71 | 72 | if not (0 < self.error_rate < 1): 73 | raise ValueError("Error_Rate must be between 0 and 1.") 74 | 75 | if not self.capacity > 0: 76 | raise ValueError("Capacity must be > 0") 77 | 78 | num_slices = int(math.ceil(math.log(1.0 / self.error_rate, 2))) 79 | bits_per_slice = int( 80 | math.ceil( 81 | (self.capacity * abs(math.log(self.error_rate))) 82 | / (num_slices * (math.log(2) ** 2)) 83 | ) 84 | ) 85 | 86 | self.num_slices = num_slices 87 | self.bits_per_slice = bits_per_slice 88 | self.num_bits = num_slices * bits_per_slice 89 | self.make_hashes = make_hash_funcs(self.num_slices, self.bits_per_slice) 90 | 91 | self._is_at_capacity = False 92 | self._check_capacity_time = 0 93 | 94 | if filter_queue_type == 'memory': 95 | self.bit_array = bit_array.MemoryBitArray(self.num_bits) 96 | self.bit_array.set_all(False) 97 | 98 | elif filter_queue_type == 'redis': 99 | self.bit_array = bit_array.RedisBitArray(spider) 100 | 101 | else: 102 | raise ValueError("not support this filter_queue_type") 103 | 104 | def is_at_capacity(self, filter_key): 105 | if self._is_at_capacity: 106 | return self._is_at_capacity 107 | 108 | bit_count = self.bit_array.count(filter_key) 109 | if bit_count and bit_count / self.num_bits > 0.5: 110 | self._is_at_capacity = True 111 | 112 | return self._is_at_capacity 113 | 114 | def get(self, filter_key, value): 115 | is_list = isinstance(value, list) 116 | keys = value if is_list else [value] 117 | is_exists = [] 118 | 119 | offsets = [] 120 | for key in keys: 121 | hashes = self.make_hashes(key) 122 | offset = 0 123 | for k in hashes: 124 | offsets.append(offset + k) 125 | offset += self.bits_per_slice 126 | 127 | old_values = self.bit_array.get(filter_key, offsets) 128 | 129 | for i in range(0, len(old_values), self.num_slices): 130 | is_exists.append(int(all(old_values[i: i + self.num_slices]))) 131 | 132 | return is_exists if is_list else is_exists[0] 133 | 134 | def add(self, filter_key, value): 135 | if self.is_at_capacity(filter_key): 136 | raise IndexError("BloomFilter is at capacity") 137 | 138 | is_list = isinstance(value, list) 139 | keys = value if is_list else [value] 140 | is_added = [] 141 | 142 | offsets = [] 143 | for key in keys: 144 | hashes = self.make_hashes(key) 145 | offset = 0 146 | for k in hashes: 147 | offsets.append(offset + k) 148 | offset += self.bits_per_slice 149 | 150 | old_values = self.bit_array.set(filter_key, offsets, 1) 151 | for i in range(0, len(old_values), self.num_slices): 152 | is_added.append(1 ^ int(all(old_values[i: i + self.num_slices]))) 153 | 154 | return is_added if is_list else 
is_added[0]
155 |
156 |
157 | class ScalableBloomFilter(object):
158 |     def __init__(self, spider):
159 |         self.spider = spider
160 |         red_name = spider.settings['FILTER_REDIS_NAME']
161 |         self.red = getattr(spider, red_name) if red_name else spider.red
162 |
163 |         self.filter_queue_type = spider.settings['FILTER_QUEUE_TYPE']
164 |
165 |         self.filters = []
166 |         self.filters.append(self.create_filter())
167 |
168 |         self._thread_lock = threading.RLock()
169 |         self._check_capacity_time = 0
170 |
171 |     def create_filter(self):
172 |         return BloomFilter(self.spider, self.filter_queue_type)
173 |
174 |     def __check_filter_capacity(self, filter_key):
175 |         if not self._check_capacity_time or time.time() - self._check_capacity_time > 1800:
176 |             if self.filter_queue_type == 'memory':
177 |                 with self._thread_lock:
178 |                     while True:
179 |                         if self.filters[-1].is_at_capacity(filter_key):
180 |                             self.filters.append(self.create_filter())
181 |                         else:
182 |                             break
183 |
184 |                     self._check_capacity_time = time.time()
185 |             else:
186 |                 # global lock: only one process actually creates the new filter at a time; once it is done, the other processes simply append the filter it just created
187 |                 with RedisLock(key="ScalableBloomFilter", timeout=300, wait_timeout=300, redis_cli=self.red) as lock:
188 |                     if lock.locked:
189 |                         while True:
190 |                             if self.filters[-1].is_at_capacity(filter_key):
191 |                                 self.filters.append(self.create_filter())
192 |                             else:
193 |                                 break
194 |
195 |                         self._check_capacity_time = time.time()
196 |
197 |     def get(self, filter_key, value):
198 |         self.__check_filter_capacity(filter_key)
199 |
200 |         is_list = isinstance(value, list)
201 |         keys = value if is_list else [value]
202 |         not_exist_keys = list(set(keys))
203 |
204 |         # check whether the keys already exist in the earlier bloom filters
205 |         # record the keys found at each filter level; keys not found keep moving down the chain
206 |         for f in reversed(self.filters):
207 |             # which keys the current filter already contains
208 |             current_filter_is_exists = f.get(filter_key, not_exist_keys)
209 |
210 |             not_exist_keys_temp = []
211 |
212 |             for checked_key, is_exist in zip(not_exist_keys, current_filter_is_exists):
213 |                 # keys missing from the current filter must be checked further down
214 |                 if not is_exist:
215 |                     not_exist_keys_temp.append(checked_key)
216 |
217 |             not_exist_keys = not_exist_keys_temp
218 |
219 |             if not not_exist_keys:
220 |                 break
221 |
222 |         # mark which keys already existed; of duplicated input keys that are absent, only the first counts as absent, the rest count as existing
223 |         for i, key in enumerate(keys):
224 |             for j, not_exist_key in enumerate(not_exist_keys):
225 |                 if key == not_exist_key:
226 |                     keys[i] = 0
227 |                     not_exist_keys.pop(j)
228 |                     break
229 |             else:
230 |                 keys[i] = 1
231 |
232 |         is_exists = keys
233 |         return is_exists if is_list else is_exists[0]
234 |
235 |     def add(self, filter_key, value, skip_check=False):
236 |         self.__check_filter_capacity(filter_key)
237 |
238 |         current_filter = self.filters[-1]
239 |
240 |         if skip_check:
241 |             return current_filter.add(filter_key, value)
242 |
243 |         else:
244 |             is_list = isinstance(value, list)
245 |             keys = value if is_list else [value]
246 |             not_exist_keys = list(set(keys))
247 |
248 |             # check whether the keys already exist in the earlier bloom filters
249 |             # record the keys found at each filter level; keys not found keep moving down the chain
250 |             for f in reversed(self.filters):
251 |                 # which keys the current filter already contains
252 |                 current_filter_is_exists = f.get(filter_key, not_exist_keys)
253 |
254 |                 not_exist_keys_temp = []
255 |
256 |                 for key, is_exist in zip(not_exist_keys, current_filter_is_exists):
257 |                     # keys missing from the current filter must be checked further down
258 |                     if not is_exist:
259 |                         not_exist_keys_temp.append(key)
260 |
261 |                 not_exist_keys = not_exist_keys_temp
262 |
263 |                 if not not_exist_keys:
264 |                     break
265 |
266 |             # some keys are still unseen, record them
267 |             if not_exist_keys:
268 |                 current_filter.add(filter_key, not_exist_keys)
269 |
270 |             # mark which keys were newly added; of duplicated input keys that were absent, only the first counts as added, the rest count as already existing
271 |             for i, key in enumerate(keys):
272 |                 for j, not_exist_key in enumerate(not_exist_keys):
273 |                     if key == not_exist_key:
274 |                         keys[i] = 1
275 |                         not_exist_keys.pop(j)
276 |                         break
277 |                 else:
278 |                     keys[i] = 0
279 |
280 |             is_added = keys
281 |             return is_added if is_list else is_added[0]
282 |
283 |     @property
284 |     def capacity(self):
285 |         return sum(f.capacity for f in self.filters)
286 |
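287 |
288 | # A minimal self-check sketch (illustrative, not part of the framework): it
289 | # exercises the in-memory filter path only. The stub spider below is
290 | # hypothetical and just mimics the settings/red attributes the filters read.
291 | if __name__ == '__main__':
292 |     class _StubSettings(dict):
293 |         def get_int(self, key):
294 |             return int(self[key])
295 |
296 |         def get_float(self, key):
297 |             return float(self[key])
298 |
299 |     class _StubSpider:
300 |         red = None
301 |         settings = _StubSettings(FILTER_QUEUE_TYPE='memory', FILTER_REDIS_NAME='',
302 |                                  FILTER_INITIAL_CAPACITY=10000, FILTER_ERROR_RATE=0.001)
303 |
304 |     sbf = ScalableBloomFilter(_StubSpider())
305 |     print(sbf.add('urls', 'https://example.com/a'))  # 1 -> newly added
306 |     print(sbf.add('urls', 'https://example.com/a'))  # 0 -> already seen
307 |     print(sbf.get('urls', 'https://example.com/a'))  # 1 -> exists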
-------------------------------------------------------------------------------- /magical/sync_spider/middleware/duplicate/expire_filter.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: expire_filter.py
6 | Time: 2021/5/13 下午2:26
7 | -------------------------------------------------
8 | Change Activity: 2021/5/13 下午2:26
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import time
13 |
14 |
15 | class ExpireFilter(object):
16 |
17 |     def __init__(self, spider):
18 |         red_name = spider.settings['FILTER_REDIS_NAME']
19 |
20 |         self.expire = spider.settings['FILTER_REDIS_KEY_EXPIRE']
21 |         self.red = getattr(spider, red_name) if red_name else spider.red
22 |
23 |     @property
24 |     def current_timestamp(self):
25 |         return int(time.time())
26 |
27 |     def get(self, filter_key, value):
28 |         return self.red.zscore(filter_key, value)
29 |
30 |     def add(self, filter_key, value):
31 |         return self.red.zadd(filter_key, value)
32 |
33 |     # def del_expire_key(self):
34 |     #     self.red.zremrangebyscore(self.name, "-inf", self.current_timestamp - self.expire_time)
35 |     #
36 |     # def record_expire_time(self):
37 |     #     if self.expire_time_record_key:
38 |     #         self.red.hset(self.expire_time_record_key, key=self.name, value=self.expire_time)
39 |
-------------------------------------------------------------------------------- /magical/sync_spider/middleware/duplicate/handler.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: handler.py
6 | Time: 2021/5/13 下午1:45
7 | -------------------------------------------------
8 | Change Activity: 2021/5/13 下午1:45
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import copy
13 |
14 | from magical.utils import load_objects
15 |
16 |
17 | class DuplicateHandler(object):
18 |
19 |     def __init__(self, spider):
20 |         self.spider = spider
21 |
22 |         self.use_md5 = self.spider.settings['FILTER_USE_MD5']
23 |         self.filter_method = load_objects(self.spider.settings['FILTER_METHOD_MANAGER'])(spider)
24 |
25 |     def __deal_data(self, filter_data):
26 |         if self.use_md5:
27 |             value = self.spider.spider_util.get_md5_encrypt(filter_data)
28 |
29 |         else:
30 |             value = copy.deepcopy(filter_data)
31 |
32 |         return value
33 |
34 |     def get(self, key, value):
35 |         return self.filter_method.get(key, self.__deal_data(value))
36 |
37 |     def add(self, key, value):
38 |         return self.filter_method.add(key, self.__deal_data(value))
39 |
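40 | # Usage sketch (assumes a spider wired up with the FILTER_* settings;
41 | # 'seen_urls' is just an illustrative dedup key):
42 | #   handler = DuplicateHandler(spider)
43 | #   if not handler.get('seen_urls', url):
44 | #       handler.add('seen_urls', url)
45 | #       ...  # first time we meet this url, go crawl it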
-------------------------------------------------------------------------------- /magical/sync_spider/middleware/pipeline/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py.py
6 | Time: 2021/5/17 下午5:54
7 | -------------------------------------------------
8 | Change Activity: 2021/5/17 下午5:54
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 |     pass
16 |
17 |
18 | if __name__ == '__main__':
19 |     main()
20 |
-------------------------------------------------------------------------------- /magical/sync_spider/middleware/pipeline/handler.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: handler.py
6 | Time: 2021/5/17 下午6:07
7 | -------------------------------------------------
8 | Change Activity: 2021/5/17 下午6:07
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from magical.utils import load_objects
13 |
14 |
15 | class PipelineHandler(object):
16 |
17 |     def __init__(self, spider, **kwargs):
18 |         self.spider = spider
19 |         self.kwargs = kwargs
20 |         self.logger = spider.logger
21 |         self.settings = spider.settings
22 |
23 |         handler_manager_cls = self.settings['PIPELINE_MIDDLEWARE_MANAGER_PATH']
24 |         self.middleware = load_objects(handler_manager_cls)(spider)
25 |
26 |     def pipeline(self, item, **kwargs):
27 |         return self.middleware.pipeline(item, **kwargs)
28 |
-------------------------------------------------------------------------------- /magical/sync_spider/middleware/pipeline/manager.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: manager.py
6 | Time: 2021/5/17 下午5:54
7 | -------------------------------------------------
8 | Change Activity: 2021/5/17 下午5:54
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from collections import defaultdict
13 |
14 | from magical.sync_spider.extends_module.base_module.pipeline import PipelineMiddleware
15 | from magical.sync_spider.common.utils import call_func_item
16 | from magical.utils import load_objects
17 |
18 |
19 | class PipelineMiddlewareManager(object):
20 |     def __init__(self, spider):
21 |         self.methods = defaultdict(list)
22 |         self.spider = spider
23 |         self.settings = spider.settings
24 |         self.middleware_s = self.__load_middleware()
25 |
26 |         for miw in self.middleware_s:
27 |             self.__add_middleware(miw)
28 |
29 |     def __load_middleware(self):
30 |         middleware_s = []
31 |         middleware_s_dict = self.settings["PIPELINE_MIDDLEWARE_PATH"]
32 |         middleware_s_list = sorted(middleware_s_dict.items(), key=lambda x: x[1])
33 |
34 |         for middleware_key, value in middleware_s_list:
35 |             middleware = load_objects(middleware_key)
36 |             if issubclass(middleware, PipelineMiddleware):
37 |                 middleware_instance = middleware(self.spider)
38 |                 middleware_s.append(middleware_instance)
39 |         return middleware_s
40 |
41 |     def __add_middleware(self, miw):
42 |         if hasattr(miw, "process_item"):
43 |             self.methods['process_item'].append(miw.process_item)
44 |
45 |         if hasattr(miw, "process_exception"):
46 |             self.methods['process_exception'].append(miw.process_exception)
47 |
48 |     def pipeline(self, item, **kwargs):
49 |
50 |         def process_item(item):
51 |             for method in self.methods['process_item']:
52 |                 item = method(item, **kwargs)
53 |                 if not item:  # a falsy return short-circuits the remaining middleware
54 |                     return item
55 |
56 |             return item
57 |
58 |         def process_exception(exception):
59 |             for method in self.methods['process_exception']:
60 |                 exception = method(item, exception)
61 |                 if not exception:
62 |                     return exception
63 |             return exception
64 |
65 |         return call_func_item(process_item, process_exception, item)
66 |
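67 | # A hedged sketch of a custom pipeline middleware (not shipped with the
68 | # framework; it assumes the PipelineMiddleware base keeps a `spider`
69 | # attribute the way DownloaderMiddleware subclasses do). Register its dotted
70 | # path with a priority in PIPELINE_MIDDLEWARE_PATH, e.g.
71 | # {"spiders.test_spider.pipeline.PrintPipeline": 10}:
72 | #
73 | # class PrintPipeline(PipelineMiddleware):
74 | #     def process_item(self, item, **kwargs):
75 | #         self.spider.logger.info(f'item: {item}')
76 | #         return item  # return the item to keep the chain going
77 | #
78 | #     def process_exception(self, item, exception):
79 | #         self.spider.logger.error(f'pipeline error: {exception}')
80 | #         return exception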
-------------------------------------------------------------------------------- /magical/template.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: template.py
6 | Time: 2021/4/14 下午3:37
7 | -------------------------------------------------
8 | Change Activity: 2021/4/14 下午3:37
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import re
14 | import string
15 |
16 |
17 | def render_template_file(path, **kwargs):
18 |     with open(path, 'rb') as fp:
19 |         raw = fp.read().decode('utf8')
20 |
21 |     content = string.Template(raw).substitute(**kwargs)
22 |
23 |     render_path = path[:-len('.tmpl')] if path.endswith('.tmpl') else path
24 |     with open(render_path, 'wb') as fp:
25 |         fp.write(content.encode('utf8'))
26 |     if path.endswith('.tmpl'):
27 |         os.remove(path)
28 |
29 |
30 | CAMELCASE_INVALID_CHARS = re.compile(r'[^a-zA-Z\d]')
31 |
32 |
33 | def string_camelcase(string):
34 |     return CAMELCASE_INVALID_CHARS.sub('', string.title())
35 |
-------------------------------------------------------------------------------- /magical/templates/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/4/22 上午12:05
7 | -------------------------------------------------
8 | Change Activity: 2021/4/22 上午12:05
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 |     pass
16 |
17 |
18 | if __name__ == '__main__':
19 |     main()
20 |
-------------------------------------------------------------------------------- /magical/templates/sync_spider/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/4/22 上午12:08
7 | -------------------------------------------------
8 | Change Activity: 2021/4/22 上午12:08
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 |     pass
16 |
17 |
18 | if __name__ == '__main__':
19 |     main()
20 |
-------------------------------------------------------------------------------- /magical/templates/sync_spider/base_spider.py.tmpl: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: base_spider.py
6 | Time: ${create_time}
7 | -------------------------------------------------
8 | Change Activity: ${create_time}
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 | from magical.sync_spider.common.base_spider import BaseSpider
14 |
15 |
16 | class ${SpiderName}BaseSpider(BaseSpider):
17 |     def __init__(self, spider):
18 |         super().__init__(spider)
19 |
-------------------------------------------------------------------------------- /magical/templates/sync_spider/middleware.py.tmpl: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 
| Author: qinLess 5 | File: middleware.py 6 | Time: ${create_time} 7 | ------------------------------------------------- 8 | Change Activity: ${create_time} 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import random 13 | import time 14 | 15 | import requests 16 | 17 | from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware 18 | 19 | 20 | # ------------------------------------------------default middleware------------------------------------------------ 21 | 22 | 23 | class HeadersMiddleware(DownloaderMiddleware): 24 | """请求头处理中间件""" 25 | 26 | def __init__(self, spider): 27 | super().__init__(spider) 28 | 29 | def process_request(self, request): 30 | request.headers.update({'Connection': 'close'}) 31 | return request 32 | 33 | 34 | class ProxyMiddleware(DownloaderMiddleware): 35 | """代理IP中间件""" 36 | 37 | def __init__(self, spider): 38 | super().__init__(spider) 39 | 40 | self.proxy.proxy_handler(num=1) 41 | 42 | def process_request(self, request): 43 | request.meta['proxy'] = self.proxy.get_proxy() 44 | return request 45 | 46 | def process_response(self, request, response): 47 | return response 48 | 49 | def process_exception(self, request, exception): 50 | self.logger.error(f'ProxyMiddleware.process_exception: {exception}, request: {request}', exc_info=True) 51 | 52 | if isinstance(exception, ( 53 | requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout, 54 | requests.exceptions.ReadTimeout, requests.exceptions.Timeout)): 55 | self.logger.error(f'ProxyMiddleware - 请求异常重试 - request: {request}') 56 | time.sleep(random.randint(3, 5)) 57 | self.proxy.proxy_handler(request, num=1) 58 | return self._retry(request) 59 | 60 | return exception 61 | 62 | 63 | class RequestErrorMiddleware(DownloaderMiddleware): 64 | """请求异常中间件""" 65 | 66 | def __init__(self, spider): 67 | super().__init__(spider) 68 | 69 | def process_exception(self, request, exception): 70 | self.logger.error(f'RequestErrorMiddleware.process_exception: {exception}, request: {request}', exc_info=True) 71 | 72 | if isinstance(exception, ( 73 | requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout, 74 | requests.exceptions.ReadTimeout, requests.exceptions.Timeout)): 75 | self.logger.error(f'RequestErrorMiddleware - 请求异常重试 - request: {request}') 76 | time.sleep(random.randint(3, 5)) 77 | return self._retry(request) 78 | 79 | elif isinstance(exception, requests.exceptions.HTTPError): 80 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.HTTPError - request: {request}') 81 | return None 82 | 83 | elif isinstance(exception, requests.exceptions.ChunkedEncodingError): 84 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.ChunkedEncodingError - request: {request}') 85 | return None 86 | 87 | elif isinstance(exception, requests.exceptions.SSLError): 88 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.SSLError - request: {request}') 89 | return None 90 | 91 | return exception 92 | 93 | 94 | # -------------------------------------------------spider middleware------------------------------------------------- 95 | 96 | 97 | class ${SpiderName}Middleware(DownloaderMiddleware): 98 | 99 | def __init__(self, spider): 100 | super().__init__(spider) 101 | 102 | def process_request(self, request): 103 | return request 104 | 105 | def process_response(self, request, response): 106 | if not request.use_middleware: 107 | return response 108 | 109 | return response 110 | 111 | def process_exception(self, 
request, exception): 112 | self.logger.error(f'${SpiderName}Middleware.process_exception: {exception}, request: {request}') 113 | return exception 114 | -------------------------------------------------------------------------------- /magical/templates/sync_spider/settings.py.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: settings.py 6 | Time: ${create_time} 7 | ------------------------------------------------- 8 | Change Activity: ${create_time} 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | from magical.utils import log_path 13 | 14 | # project settings 15 | 16 | 17 | # ------------------------------------------------------------------------------------------------------------------- 18 | 19 | # 项目名称 20 | PROJECT_NAME = '${project_name}' 21 | 22 | # logger 路径 23 | LOGGER_PATH = log_path(__file__) 24 | 25 | # 重试次数 26 | RETRY_COUNT = 10 27 | 28 | # 管道中间件,可配置多个 29 | # PIPELINE_MIDDLEWARE_PATH = { 30 | # "${spider_path}.pipeline.${SpiderName}Pipeline": 10 31 | # } 32 | 33 | # 下载中间件,可配置多个 34 | DOWNLOAD_MIDDLEWARE_PATH = { 35 | # "${spider_path}.middleware.DuplicateMiddleware": 7, 36 | # "${spider_path}.middleware.HeadersMiddleware": 8, 37 | # "${spider_path}.middleware.ProxyMiddleware": 9, 38 | "${spider_path}.middleware.RequestErrorMiddleware": 10, 39 | "${spider_path}.middleware.${SpiderName}Middleware": 100 40 | } 41 | 42 | # 爬虫公共类,基类 43 | BASE_SPIDER_PATH = "${spider_path}.base_spider.${SpiderName}BaseSpider" 44 | 45 | # user-agent 46 | UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \ 47 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36' 48 | 49 | # ------------------------------------------------------------------------------------------------------------------- 50 | 51 | 52 | # default settings 53 | 54 | # 下载中间件 55 | DOWNLOADER_PATH = "magical.sync_spider.middleware.download.downloader.Downloader" 56 | 57 | # 下载处理中间件 58 | DOWNLOAD_HANDLER_PATH = "magical.sync_spider.middleware.download.handler.DownloadHandler" 59 | 60 | # 下载调度器 61 | DOWNLOAD_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.download.manager.DownloadMiddlewareManager" 62 | 63 | # 下载中间件,可配置多个 64 | # DOWNLOAD_MIDDLEWARE_PATH = {} 65 | 66 | # ------------------------------------------------------------------------------------------------------------------- 67 | 68 | # 管道处理中间件 69 | # PIPELINE_HANDLER_PATH = "magical.sync_spider.middleware.pipeline.handler.PipelineHandler" 70 | 71 | # 管道调度器 72 | # PIPELINE_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.pipeline.manager.PipelineMiddlewareManager" 73 | 74 | # 管道中间件,可配置多个 75 | # PIPELINE_MIDDLEWARE_PATH = {} 76 | 77 | # ------------------------------------------------------------------------------------------------------------------- 78 | # 暂时不使用,存在问题 79 | # # 去重中间件 80 | # FILTER_DUPLICATE_HANDLER = "magical.sync_spider.middleware.duplicate.handler.DuplicateHandler" 81 | # 82 | # # 去重过滤器 83 | # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.bloom_filter.ScalableBloomFilter" 84 | # # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.expire_filter.ExpireFilter" 85 | # 86 | # # 去重队列,redis, memory = 内存 87 | # FILTER_QUEUE_TYPE = 'redis' 88 | # 89 | # # 去重是否 md5 加密 90 | # FILTER_USE_MD5 = False 91 | # 92 | # # 使用那个 redis 实例 去重,配置连接 name,默认为 red 93 | # FILTER_REDIS_NAME = 'red' 94 | # 95 | # # 去重初始容量 96 | # 
FILTER_INITIAL_CAPACITY = 100000000
97 | #
98 | # # dedup false-positive rate
99 | # FILTER_ERROR_RATE = 0.00001
100 |
101 | # -------------------------------------------------------------------------------------------------------------------
102 |
103 | # # rabbit mq config
104 | # MESSAGE_MQ_CONFIG = {
105 | #     'username': 'admin',
106 | #     'password': 'admin123',
107 | #     'host': '127.0.0.1',
108 | #     'port': 9999
109 | # }
110 | #
111 | # # rabbit mq prefetch: consume 10 messages per batch
112 | # MESSAGE_MQ_PREFETCH_COUNT = 10
113 | #
114 | # # rabbit mq virtual host
115 | # MESSAGE_MQ_VIRTUAL_HOST = 'spider'
116 | #
117 | # # rabbit mq handler class
118 | # MESSAGE_MQ_HANDLER = 'magical.sync_spider.extends_module.mqs.rabbit_mq.handler.RabbitMQHandler'
119 |
120 | # -------------------------------------------------------------------------------------------------------------------
121 |
122 | # spider common base class
123 | # BASE_SPIDER_PATH = "magical.sync_spider.common.base_spider.BaseSpider"
124 |
125 | # spider utility class
126 | SPIDER_UTIL_PATH = "magical.sync_spider.common.spider_util.SpiderUtil"
127 |
128 | # proxy IP handlers
129 | # fetch proxies from redis
130 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetRedisProxy'
131 | # # ZhiMa proxy IPs
132 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetZhiMaProxy'
133 |
134 | # email handler
135 | EMAIL_HANDLER = 'magical.sync_spider.common.email_handler.EmailHandler'
136 |
137 | # PostgreSQL handler class
138 | POST_GRE_SQL_HANDLER = 'magical.sync_spider.databases.post_gre_sql_pool.PostGreHandle'
139 |
140 | # MySQL handler class
141 | MYSQL_HANDLER = 'magical.sync_spider.databases.mysql_pool.MysqlHandler'
142 |
143 | # Redis handler class
144 | REDIS_HANDLER = 'magical.sync_spider.databases.red_pool.RedisHandler'
145 |
146 | # -------------------------------------------------------------------------------------------------------------------
147 |
148 | # number of proxy IPs fetched at startup
149 | PROXY_NUM = 5
150 |
151 | # retry count
152 | # RETRY_COUNT = 3
153 |
154 | # retry when the response status is one of these codes
155 | RETRY_STATUS_CODES = [500, 502, 503, 504, 400, 403, 408]
156 |
157 | # skip SSL certificate verification
158 | REQUEST_VERIFY = False
159 |
160 | # request timeout (seconds)
161 | REQUEST_TIMEOUT = 30
162 |
163 | # cfscrape delay for the Cloudflare 5-second shield
164 | SCRAPER_DELAY = 30
165 |
166 | # number of consumer threads
167 | CONSUMER_THREAD_NUM = 10
168 |
169 | # -------------------------------------------------------------------------------------------------------------------
170 |
171 | """
172 | Database configuration
173 |
174 | Single database
175 | REDIS_CONFIG = {
176 |     'host': '',
177 |     'port': '',
178 |     'db': '',
179 |     'user': '',
180 |     'password': '',
181 |     'decode_responses': True
182 | }
183 | Usage:
184 |     red is the default attribute name
185 |     self.red.get('key1')
186 |     spider.red.get('key1')
187 |
188 | Multiple databases
189 | REDIS_CONFIG = [
190 |     {
191 |         'name': 'name1',
192 |         'host': '',
193 |         'port': '',
194 |         'db': '',
195 |         'user': '',
196 |         'password': '',
197 |         'decode_responses': True
198 |     },
199 |     {
200 |         'name': 'name2',
201 |         'host': '',
202 |         'port': '',
203 |         'db': '',
204 |         'user': '',
205 |         'password': '',
206 |         'decode_responses': True
207 |     },
208 | ]
209 | Usage:
210 |     self.name1.get('key1')
211 |     spider.name1.get('key1')
212 |
213 |     self.name2.get('key1')
214 |     spider.name2.get('key1')
215 | """
216 |
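217 |
218 | # e.g. a concrete single-instance redis config (placeholder values; mirrors
219 | # the shape used in spiders/common/settings.py):
220 | # REDIS_CONFIG = {
221 | #     'host': '127.0.0.1',
222 | #     'port': '6379',
223 | #     'db': '0',
224 | #     'decode_responses': True
225 | # }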
-------------------------------------------------------------------------------- /magical/templates/sync_spider/spider.py.tmpl: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: ${spider_name}.py
6 | Time: ${create_time}
7 | -------------------------------------------------
8 | Change Activity: ${create_time}
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import sys
14 |
15 | file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
16 | sys.path.append(file_path)
17 |
18 | from magical.sync_spider import SyncSpider, Request, run_spider
19 |
20 |
21 | class ${SpiderName}Spider(SyncSpider):
22 |     name = '${spider_name}'
23 |     settings_path = '${settings_path}'
24 |
25 |     default_custom_setting = {}
26 |
27 |     def __init__(self, *args, **kwargs):
28 |         custom_setting = {}
29 |         kwargs.update(dict(custom_setting=custom_setting))
30 |         super().__init__(*args, **kwargs)
31 |
32 |     def start_spider(self):
33 |         self.logger.info(f'Hello {self.name}')
34 |
35 |         request = Request(url='http://www.baidu.com/')
36 |         response = self.download(request)
37 |
38 |         title = response.re.findall('<title>(.*?)</title>')
39 |         self.logger.info(f'title: {title}')
40 |
41 |
42 | if __name__ == '__main__':
43 |     run_spider(${SpiderName}Spider)
44 |
-------------------------------------------------------------------------------- /magical/templates/sync_spider/spiders/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/4/22 上午12:08
7 | -------------------------------------------------
8 | Change Activity: 2021/4/22 上午12:08
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 |     pass
16 |
17 |
18 | if __name__ == '__main__':
19 |     main()
20 |
-------------------------------------------------------------------------------- /magical/templates/sync_spider/spiders/__init__.py.tmpl: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: ${create_time}
7 | -------------------------------------------------
8 | Change Activity: ${create_time}
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 |     pass
16 |
17 |
18 | if __name__ == '__main__':
19 |     main()
20 |
-------------------------------------------------------------------------------- /magical/utils.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: goods_utils.py
6 | Time: 2021/4/20 上午12:31
7 | -------------------------------------------------
8 | Change Activity: 2021/4/20 上午12:31
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import time
14 | import datetime
15 |
16 | from importlib import import_module
17 | from decimal import Decimal, ROUND_HALF_UP
18 |
19 |
20 | def log_path(project_path):
21 |     """Resolve the logs directory.
22 |
23 |     Args:
24 |         project_path: an absolute path inside the project
25 |     Returns: the log directory path
26 |     """
27 |     s_path = os.path.basename(os.path.abspath(project_path))
28 |
29 |     if s_path == 'spiders':
30 |         return os.path.join(os.path.dirname(project_path), 'logs')
31 |
32 |     else:
33 |         return log_path(os.path.dirname(project_path))
34 |
35 |
36 | # import a .py module by dotted path
37 | def load_files(path):
38 |     return import_module(path)
39 |
40 |
41 | # load a class or function out of a module by dotted path
42 | def load_objects(path):
43 |     try:
44 |         dot = path.rindex('.')
45 |     except ValueError:
46 |         raise ValueError("Error loading object '%s': not a full path" % path)
47 |
48 |     module, name = path[:dot], path[dot + 1:]
49 |     mod = import_module(module)
50 |
51 |     try:
52 |         obj = getattr(mod, name)
53 |     except AttributeError:
54 |         raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
55 |
56 |     return obj
57 |
58 |
59 | def round_half_up(digit, n=2):
60 |     return Decimal(str(digit)).quantize(Decimal('0.' + '0' * n), rounding=ROUND_HALF_UP)
61 |
62 |
63 | def get_fmt_time(fmt="%Y-%m-%d %H:%M:%S", timestamp=None):
64 |     if timestamp:
65 |         return time.strftime(fmt, time.localtime(int(timestamp)))
66 |     return datetime.datetime.now().strftime(fmt)
67 |
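68 | # Usage sketch (both helpers are used throughout the framework, e.g. by
69 | # SPIDER_INIT_HANDLER and the middleware managers):
70 | #   excel_mod = load_files('spiders.common.excel')     # -> a module object
71 | #   fn = load_objects('magical.utils.log_path')        # -> an object inside a module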
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
1 | redis==3.5.3
2 | pandas==1.2.3
3 | PyExecJS==1.5.1
4 | opencv_python==4.5.1.48
5 | requests==2.25.1
6 | PyMySQL==0.9.3
7 | urllib3==1.26.4
8 | rsa==4.7.2
9 | SQLAlchemy==1.4.5
10 | xlwt==1.3.0
11 | psycopg2==2.7.7
12 | lxml==4.6.3
13 | DBUtils==1.3
14 | pycryptodome==3.10.1
15 |
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: setup.py
6 | Time: 2021/4/21 下午9:49
7 | -------------------------------------------------
8 | Change Activity: 2021/4/21 下午9:49
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | from setuptools import setup, find_packages
15 |
16 |
17 | setup(
18 |     name='magical',
19 |     version='1.1.0',
20 |     description='A lightweight crawler framework modeled on scrapy',
21 |     author='magical developers',
22 |     maintainer='qinjiahu',
23 |     maintainer_email='qinless@qinless.com',
24 |     license='BSD',
25 |
26 |     packages=find_packages(exclude=(
27 |         'examples', 'examples.*', 'public', 'public.*', 'test', 'test.*', '.gitee', '.gitee.*',
28 |         'spiders', 'spiders.*', 'logs', 'logs.*'
29 |     )),
30 |
31 |     package_data={
32 |         '': ['*.py.tmpl', '*.json']
33 |     },
34 |
35 |     include_package_data=True,
36 |     zip_safe=False,
37 |
38 |     classifiers=[
39 |         'Framework :: Crawler',
40 |         'Environment :: Console',
41 |         'Programming Language :: Python :: 3.6',
42 |         'Programming Language :: Python :: Implementation :: CPython',
43 |         'Programming Language :: Python :: Implementation :: PyPy',
44 |         'Topic :: Software Development :: Libraries :: Application Frameworks',
45 |         'Topic :: Software Development :: Libraries :: Python Modules',
46 |     ],
47 |     python_requires='>=3.6',
48 |     install_requires=[]
49 | )
50 |
-------------------------------------------------------------------------------- /spiders/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py.py
6 | Time: 2021/7/1 上午11:38
7 | -------------------------------------------------
8 | Change Activity: 2021/7/1 上午11:38
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 |     pass
16 |
17 |
18 | if __name__ == '__main__':
19 |     main()
20 |
-------------------------------------------------------------------------------- /spiders/common/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.py 6 | Time: 2021/7/1 上午11:38 7 | ------------------------------------------------- 8 | Change Activity: 2021/7/1 上午11:38 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | def main(): 15 | pass 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /spiders/common/excel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: excel.py 6 | Time: 2021/7/1 上午11:39 7 | ------------------------------------------------- 8 | Change Activity: 2021/7/1 上午11:39 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import xlwt 13 | 14 | 15 | def excel_style(): 16 | # 为样式创建字体 17 | font = xlwt.Font() 18 | # 设置字体名字对应系统内字体 19 | font.name = u'微软雅黑' 20 | font.height = 240 21 | 22 | alignment = xlwt.Alignment() 23 | # 设置水平居中 24 | alignment.horz = xlwt.Alignment.HORZ_CENTER 25 | # 设置垂直居中 26 | alignment.vert = xlwt.Alignment.VERT_CENTER 27 | 28 | borders = xlwt.Borders() # Create borders 29 | # 添加边框-虚线边框 30 | borders.left = xlwt.Borders.THIN 31 | borders.right = xlwt.Borders.THIN 32 | borders.top = xlwt.Borders.THIN 33 | borders.bottom = xlwt.Borders.THIN 34 | # 边框上色 35 | borders.left_colour = 23 36 | borders.right_colour = 23 37 | borders.top_colour = 23 38 | borders.bottom_colour = 23 39 | 40 | # 初始化样式 41 | style = xlwt.XFStyle() 42 | # 为样式设置字体 43 | style.font = font 44 | # 对齐方式设置 45 | style.alignment = alignment 46 | style.borders = borders 47 | 48 | return style 49 | 50 | 51 | def write_excel(data, headers, name, path_name): 52 | workbook = xlwt.Workbook() 53 | sheet = workbook.add_sheet(name) 54 | 55 | style = excel_style() 56 | 57 | num = 1 58 | for k, v in headers.items(): 59 | if k.startswith('$'): 60 | continue 61 | sheet.col(num).width = 100 * 50 62 | sheet.write(0, num, v, style) 63 | num += 1 64 | 65 | col = 0 66 | for n in range(0, len(data)): 67 | num = 1 68 | 69 | pattern = xlwt.Pattern() 70 | if n % 2 == 0: 71 | pattern.pattern = xlwt.Pattern.SOLID_PATTERN 72 | pattern.pattern_fore_colour = 22 73 | else: 74 | pattern.pattern = xlwt.Pattern.SOLID_PATTERN 75 | pattern.pattern_fore_colour = 1 76 | 77 | style.pattern = pattern 78 | 79 | sheet.row(col).height_mismatch = True 80 | sheet.row(col).height = 30 * 20 81 | 82 | # 根据文件头来获取数据 83 | for key in headers.keys(): 84 | item = data[n][key] 85 | sheet.write(n + 1, num, item, style) 86 | num += 1 87 | 88 | # for k, v in data[n].items(): 89 | # sheet.write(n + 1, num, v, style) 90 | # num += 1 91 | 92 | col += 1 93 | 94 | sheet.row(col).height_mismatch = True 95 | sheet.row(col).height = 30 * 20 96 | 97 | workbook.save(f'{path_name}') 98 | print(f'{name}.xls 写入成功') 99 | 100 | 101 | if __name__ == '__main__': 102 | data_list = [ 103 | {'desc': 'desc1', 'name': 'name1', 'plat': 'plat1'}, 104 | {'desc': 'desc2', 'name': 'name2', 'plat': 'plat2'}, 105 | {'desc': 'desc3', 'name': 'name3', 'plat': 'plat3'}, 106 | {'desc': 'desc4', 'name': 'name4', 'plat': 'plat4'}, 107 | {'desc': 'desc5', 'name': 'name5', 'plat': 'plat5'}, 108 | ] 109 | 110 | title = {'desc': '描述', 'name': '店铺名称', 'plat': '渠道'} 111 | excel_name = 'test' 112 | write_excel(data_list, title, 
excel_name, excel_name) 113 | -------------------------------------------------------------------------------- /spiders/common/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: proxy.py 6 | Time: 2021/7/1 上午11:39 7 | ------------------------------------------------- 8 | Change Activity: 2021/7/1 上午11:39 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | 14 | class GetProxy(object): 15 | 16 | # spider 是爬虫实例对象 17 | def __init__(self, spider): 18 | self.logger = spider.logger 19 | 20 | def get_proxy(self): 21 | self.logger.info('获取一条代理Ip') 22 | return None 23 | -------------------------------------------------------------------------------- /spiders/common/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: settings.py 6 | Time: 2021/7/1 上午11:39 7 | ------------------------------------------------- 8 | Change Activity: 2021/7/1 上午11:39 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | # 统一初始化,爬虫其他工具类,模块 13 | SPIDER_INIT_HANDLER = 'spiders.common.spider_init.SpiderInit' 14 | 15 | EXCEL = 'spiders.common.excel' 16 | PROXY_HANDLER = 'spiders.common.proxy.GetProxy' 17 | 18 | REDIS_CONFIG = { 19 | 'host': '127.0.0.1', 20 | 'port': '6379', 21 | 'db': '0', 22 | 'decode_responses': True 23 | } 24 | 25 | -------------------------------------------------------------------------------- /spiders/common/spider_init.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: spider_init.py 6 | Time: 2021/7/31 下午5:07 7 | ------------------------------------------------- 8 | Change Activity: 2021/7/31 下午5:07 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | from magical.sync_spider import load_files 13 | 14 | 15 | class SpiderInit(object): 16 | def __init__(self, spider): 17 | self.settings = spider.settings 18 | 19 | spider.excel = load_files(self.settings['EXCEL']) 20 | -------------------------------------------------------------------------------- /spiders/test_douban/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: __init__.py.py 6 | Time: 2021/7/13 上午11:29 7 | ------------------------------------------------- 8 | Change Activity: 2021/7/13 上午11:29 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import os 13 | from magical.cmdline import generate_spider_project, generate_spider_file 14 | 15 | 16 | def main(): 17 | project_path = os.path.dirname(os.path.abspath(__file__)) 18 | spider_name = 'douban_spider' 19 | 20 | generate_spider_project('sync_spider', project_path, spider_name) 21 | # generate_spider_file('sync_spider', project_path, spider_name) 22 | 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /spiders/test_douban/base_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 
------------------------------------------------- 4 | Author: qinLess 5 | File: base_spider.py 6 | Time: 2021/13/13 11:30:06 7 | ------------------------------------------------- 8 | Change Activity: 2021/13/13 11:30:06 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | 13 | from magical.sync_spider.common.base_spider import BaseSpider 14 | 15 | 16 | class DoubanSpiderBaseSpider(BaseSpider): 17 | def __init__(self, spider): 18 | super().__init__(spider) 19 | -------------------------------------------------------------------------------- /spiders/test_douban/middleware.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: middleware.py 6 | Time: 2021/13/13 11:30:06 7 | ------------------------------------------------- 8 | Change Activity: 2021/13/13 11:30:06 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | import random 13 | import time 14 | 15 | import requests 16 | 17 | from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware 18 | 19 | 20 | # ------------------------------------------------default middleware------------------------------------------------ 21 | 22 | 23 | class HeadersMiddleware(DownloaderMiddleware): 24 | """请求头处理中间件""" 25 | 26 | def __init__(self, spider): 27 | super().__init__(spider) 28 | 29 | def process_request(self, request): 30 | request.headers.update({'Connection': 'close'}) 31 | return request 32 | 33 | 34 | class ProxyMiddleware(DownloaderMiddleware): 35 | """代理IP中间件""" 36 | 37 | def __init__(self, spider): 38 | super().__init__(spider) 39 | 40 | self.proxy.proxy_handler(num=1) 41 | 42 | def process_request(self, request): 43 | request.meta['proxy'] = self.proxy.get_proxy() 44 | return request 45 | 46 | def process_response(self, request, response): 47 | return response 48 | 49 | def process_exception(self, request, exception): 50 | self.logger.error(f'ProxyMiddleware.process_exception: {exception}, request: {request}', exc_info=True) 51 | 52 | if isinstance(exception, ( 53 | requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout, 54 | requests.exceptions.ReadTimeout, requests.exceptions.Timeout)): 55 | self.logger.error(f'ProxyMiddleware - 请求异常重试 - request: {request}') 56 | time.sleep(random.randint(3, 5)) 57 | self.proxy.proxy_handler(request, num=1) 58 | return self._retry(request) 59 | 60 | return exception 61 | 62 | 63 | class RequestErrorMiddleware(DownloaderMiddleware): 64 | """请求异常中间件""" 65 | 66 | def __init__(self, spider): 67 | super().__init__(spider) 68 | 69 | def process_exception(self, request, exception): 70 | self.logger.error(f'RequestErrorMiddleware.process_exception: {exception}, request: {request}', exc_info=True) 71 | 72 | if isinstance(exception, ( 73 | requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout, 74 | requests.exceptions.ReadTimeout, requests.exceptions.Timeout)): 75 | self.logger.error(f'RequestErrorMiddleware - 请求异常重试 - request: {request}') 76 | time.sleep(random.randint(3, 5)) 77 | return self._retry(request) 78 | 79 | elif isinstance(exception, requests.exceptions.HTTPError): 80 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.HTTPError - request: {request}') 81 | return None 82 | 83 | elif isinstance(exception, requests.exceptions.ChunkedEncodingError): 84 | self.logger.error(f'RequestErrorMiddleware - 
requests.exceptions.ChunkedEncodingError - request: {request}') 85 | return None 86 | 87 | elif isinstance(exception, requests.exceptions.SSLError): 88 | self.logger.error(f'RequestErrorMiddleware - requests.exceptions.SSLError - request: {request}') 89 | return None 90 | 91 | return exception 92 | 93 | 94 | # -------------------------------------------------spider middleware------------------------------------------------- 95 | 96 | 97 | class DoubanSpiderMiddleware(DownloaderMiddleware): 98 | 99 | def __init__(self, spider): 100 | super().__init__(spider) 101 | 102 | def process_request(self, request): 103 | return request 104 | 105 | def process_response(self, request, response): 106 | if not request.use_middleware: 107 | return response 108 | 109 | return response 110 | 111 | def process_exception(self, request, exception): 112 | self.logger.error(f'DoubanSpiderMiddleware.process_exception: {exception}, request: {request}') 113 | return exception 114 | -------------------------------------------------------------------------------- /spiders/test_douban/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | Author: qinLess 5 | File: settings.py 6 | Time: 2021/13/13 11:30:06 7 | ------------------------------------------------- 8 | Change Activity: 2021/13/13 11:30:06 9 | ------------------------------------------------- 10 | Desc: 11 | """ 12 | from magical.utils import log_path 13 | 14 | # project settings 15 | 16 | 17 | # ------------------------------------------------------------------------------------------------------------------- 18 | 19 | # 项目名称 20 | PROJECT_NAME = 'test_douban' 21 | 22 | # logger 路径 23 | LOGGER_PATH = log_path(__file__) 24 | 25 | # 重试次数 26 | RETRY_COUNT = 10 27 | 28 | # 管道中间件,可配置多个 29 | # PIPELINE_MIDDLEWARE_PATH = { 30 | # "spiders.test_douban.pipeline.DoubanSpiderPipeline": 10 31 | # } 32 | 33 | # 下载中间件,可配置多个 34 | DOWNLOAD_MIDDLEWARE_PATH = { 35 | # "spiders.test_douban.middleware.DuplicateMiddleware": 7, 36 | # "spiders.test_douban.middleware.HeadersMiddleware": 8, 37 | # "spiders.test_douban.middleware.ProxyMiddleware": 9, 38 | "spiders.test_douban.middleware.RequestErrorMiddleware": 10, 39 | "spiders.test_douban.middleware.DoubanSpiderMiddleware": 100 40 | } 41 | 42 | # 爬虫公共类,基类 43 | BASE_SPIDER_PATH = "spiders.test_douban.base_spider.DoubanSpiderBaseSpider" 44 | 45 | # user-agent 46 | UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \ 47 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36' 48 | 49 | # ------------------------------------------------------------------------------------------------------------------- 50 | 51 | 52 | # default settings 53 | 54 | # 下载中间件 55 | DOWNLOADER_PATH = "magical.sync_spider.middleware.download.downloader.Downloader" 56 | 57 | # 下载处理中间件 58 | DOWNLOAD_HANDLER_PATH = "magical.sync_spider.middleware.download.handler.DownloadHandler" 59 | 60 | # 下载调度器 61 | DOWNLOAD_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.download.manager.DownloadMiddlewareManager" 62 | 63 | # 下载中间件,可配置多个 64 | # DOWNLOAD_MIDDLEWARE_PATH = {} 65 | 66 | # ------------------------------------------------------------------------------------------------------------------- 67 | 68 | # 管道处理中间件 69 | # PIPELINE_HANDLER_PATH = "magical.sync_spider.middleware.pipeline.handler.PipelineHandler" 70 | 71 | # 管道调度器 72 | # PIPELINE_MIDDLEWARE_MANAGER_PATH = 
"magical.sync_spider.middleware.pipeline.manager.PipelineMiddlewareManager" 73 | 74 | # 管道中间件,可配置多个 75 | # PIPELINE_MIDDLEWARE_PATH = {} 76 | 77 | # ------------------------------------------------------------------------------------------------------------------- 78 | # 暂时不使用,存在问题 79 | # # 去重中间件 80 | # FILTER_DUPLICATE_HANDLER = "magical.sync_spider.middleware.duplicate.handler.DuplicateHandler" 81 | # 82 | # # 去重过滤器 83 | # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.bloom_filter.ScalableBloomFilter" 84 | # # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.expire_filter.ExpireFilter" 85 | # 86 | # # 去重队列,redis, memory = 内存 87 | # FILTER_QUEUE_TYPE = 'redis' 88 | # 89 | # # 去重是否 md5 加密 90 | # FILTER_USE_MD5 = False 91 | # 92 | # # 使用那个 redis 实例 去重,配置连接 name,默认为 red 93 | # FILTER_REDIS_NAME = 'red' 94 | # 95 | # # 去重初始容量 96 | # FILTER_INITIAL_CAPACITY = 100000000 97 | # 98 | # # 去重错误率 99 | # FILTER_ERROR_RATE = 0.00001 100 | 101 | # ------------------------------------------------------------------------------------------------------------------- 102 | 103 | # # rabbit mq 配置 104 | # MESSAGE_MQ_CONFIG = { 105 | # 'username': 'admin', 106 | # 'password': 'admin123', 107 | # 'host': '127.0.0.1', 108 | # 'port': 9999 109 | # } 110 | # 111 | # # rabbit mq 消费批次,每次消费 10 条 112 | # MESSAGE_MQ_PREFETCH_COUNT = 10 113 | # 114 | # # rabbit mq virtual host 115 | # MESSAGE_MQ_VIRTUAL_HOST = 'spider' 116 | # 117 | # # rabbit mq 操作类 118 | # MESSAGE_MQ_HANDLER = 'magical.sync_spider.extends_module.mqs.rabbit_mq.handler.RabbitMQHandler' 119 | 120 | # ------------------------------------------------------------------------------------------------------------------- 121 | 122 | # 爬虫公共类,基类 123 | # BASE_SPIDER_PATH = "magical.sync_spider.common.base_spider.BaseSpider" 124 | 125 | # 爬虫工具类 126 | SPIDER_UTIL_PATH = "magical.sync_spider.common.spider_util.SpiderUtil" 127 | 128 | # 代理IP中间件 129 | # redis IP 获取 130 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetRedisProxy' 131 | # # 芝麻代理 IP 132 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetZhiMaProxy' 133 | 134 | # 邮件 135 | EMAIL_HANDLER = 'magical.sync_spider.common.email_handler.EmailHandler' 136 | 137 | # post ger sql 操作类 138 | POST_GRE_SQL_HANDLER = 'magical.sync_spider.databases.post_gre_sql_pool.PostGreHandle' 139 | 140 | # mysql 操作类 141 | MYSQL_HANDLER = 'magical.sync_spider.databases.mysql_pool.MysqlHandler' 142 | 143 | # redis 操作类 144 | REDIS_HANDLER = 'magical.sync_spider.databases.red_pool.RedisHandler' 145 | 146 | # ------------------------------------------------------------------------------------------------------------------- 147 | 148 | # 初始化 代理 IP 数量 149 | PROXY_NUM = 5 150 | 151 | # 重试次数 152 | # RETRY_COUNT = 3 153 | 154 | # 包含一下状态吗,重试 155 | RETRY_STATUS_CODES = [500, 502, 503, 504, 400, 403, 408] 156 | 157 | # 忽略 ssl 验证 158 | REQUEST_VERIFY = False 159 | 160 | # 请求超时时间 161 | REQUEST_TIMEOUT = 30 162 | 163 | # 消费者线程数 164 | CONSUMER_THREAD_NUM = 10 165 | 166 | # ------------------------------------------------------------------------------------------------------------------- 167 | 168 | """ 169 | 数据库配置 170 | 171 | 单个数据库 172 | REDIS_CONFIG = { 173 | 'host': '', 174 | 'host': '', 175 | 'db': '', 176 | 'user': '', 177 | 'password': '', 178 | 'decode_responses': True 179 | } 180 | 使用: 181 | red 默认变量名称 182 | self.red.get('key1') 183 | spider.red.get('key1') 184 | 185 | 多个数据库 186 | REDIS_CONFIG = [ 187 | { 188 | 'name': 'name1', 189 | 'host': '', 190 | 'host': '', 191 | 'db': '', 
192 |         'user': '',
193 |         'password': '',
194 |         'decode_responses': True
195 |     },
196 |     {
197 |         'name': 'name2',
198 |         'host': '',
199 |         'port': '',
200 |         'db': '',
201 |         'user': '',
202 |         'password': '',
203 |         'decode_responses': True
204 |     },
205 | ]
206 | Usage:
207 |     self.name1.get('key1')
208 |     spider.name1.get('key1')
209 |
210 |     self.name2.get('key1')
211 |     spider.name2.get('key1')
212 | """
213 |
-------------------------------------------------------------------------------- /spiders/test_douban/spiders/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/13/13 11:30:06
7 | -------------------------------------------------
8 | Change Activity: 2021/13/13 11:30:06
9 | -------------------------------------------------
10 | Desc:
11 | """
12 |
13 |
14 | def main():
15 |     pass
16 |
17 |
18 | if __name__ == '__main__':
19 |     main()
20 |
-------------------------------------------------------------------------------- /spiders/test_douban/spiders/douban_spider.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: douban_spider.py
6 | Time: 2021/13/13 11:30:06
7 | -------------------------------------------------
8 | Change Activity: 2021/13/13 11:30:06
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import json
13 | import os
14 | import sys
15 |
16 | file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
17 | sys.path.append(file_path)
18 |
19 | from magical.sync_spider import SyncSpider, run_spider, load_files
20 |
21 |
22 | class DoubanSpiderSpider(SyncSpider):
23 |     name = 'douban_spider'
24 |     settings_path = 'spiders.test_douban.settings'
25 |
26 |     default_custom_setting = {}
27 |
28 |     def __init__(self, *args, **kwargs):
29 |         custom_setting = {}
30 |         kwargs.update(dict(custom_setting=custom_setting))
31 |         super().__init__(*args, **kwargs)
32 |
33 |         self.ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
34 |                   'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
35 |
36 |         self.excel = load_files(self.settings['EXCEL'])
37 |
38 |     def get_list(self, start=0, limit=100, tag='热门'):
39 |
40 |         self.logger.info(f'start: {start}, tag: {tag}')
41 |
42 |         headers = {
43 |             'Host': 'movie.douban.com',
44 |             'Referer': 'https://movie.douban.com/tv/',
45 |             'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
46 |             'sec-ch-ua-mobile': '?0',
47 |             'Sec-Fetch-Dest': 'empty',
48 |             'Sec-Fetch-Mode': 'cors',
49 |             'Sec-Fetch-Site': 'same-origin',
50 |             'User-Agent': self.ua,
51 |             'X-Requested-With': 'XMLHttpRequest'
52 |         }
53 |         params = {
54 |             'type': 'tv',
55 |             'tag': tag,
56 |             'sort': 'recommend',
57 |             'page_limit': limit,
58 |             'page_start': start
59 |         }
60 |         url = 'https://movie.douban.com/j/search_subjects'
61 |         response = self.download(url=url, headers=headers, params=params)
62 |         subjects = response.json().get('subjects', [])
63 |
64 |         if len(subjects) > 0:
65 |             self.red.sadd('dbList', *[json.dumps(i, ensure_ascii=False) for i in subjects])
66 |
67 |         if len(subjects) < limit:
68 |             return
69 |
70 |         else:
71 |             start += limit
72 |             return self.get_list(start=start, limit=limit, tag=tag)
73 |
74 |     def get_info(self, list_info):
75 |         info_url = list_info['url']
-------------------------------------------------------------------------------- /spiders/test_douban/spiders/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/13/13 11:30:06
7 | -------------------------------------------------
8 | Change Activity: 2021/13/13 11:30:06
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | 
13 | 
14 | def main():
15 |     pass
16 | 
17 | 
18 | if __name__ == '__main__':
19 |     main()
20 | 
-------------------------------------------------------------------------------- /spiders/test_douban/spiders/douban_spider.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: douban_spider.py
6 | Time: 2021/13/13 11:30:06
7 | -------------------------------------------------
8 | Change Activity: 2021/13/13 11:30:06
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import json
13 | import os
14 | import sys
15 | 
16 | file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
17 | sys.path.append(file_path)
18 | 
19 | from magical.sync_spider import SyncSpider, run_spider, load_files
20 | 
21 | 
22 | class DoubanSpiderSpider(SyncSpider):
23 |     name = 'douban_spider'
24 |     settings_path = 'spiders.test_douban.settings'
25 | 
26 |     default_custom_setting = {}
27 | 
28 |     def __init__(self, *args, **kwargs):
29 |         custom_setting = {}
30 |         kwargs.update(dict(custom_setting=custom_setting))
31 |         super().__init__(*args, **kwargs)
32 | 
33 |         self.ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
34 |                   'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
35 | 
36 |         self.excel = load_files(self.settings['EXCEL'])
37 | 
38 |     def get_list(self, start=0, limit=100, tag='热门'):
39 | 
40 |         self.logger.info(f'start: {start}, tag: {tag}')
41 | 
42 |         headers = {
43 |             'Host': 'movie.douban.com',
44 |             'Referer': 'https://movie.douban.com/tv/',
45 |             'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
46 |             'sec-ch-ua-mobile': '?0',
47 |             'Sec-Fetch-Dest': 'empty',
48 |             'Sec-Fetch-Mode': 'cors',
49 |             'Sec-Fetch-Site': 'same-origin',
50 |             'User-Agent': self.ua,
51 |             'X-Requested-With': 'XMLHttpRequest'
52 |         }
53 |         params = {
54 |             'type': 'tv',
55 |             'tag': tag,
56 |             'sort': 'recommend',
57 |             'page_limit': limit,
58 |             'page_start': start
59 |         }
60 |         url = 'https://movie.douban.com/j/search_subjects'
61 |         response = self.download(url=url, headers=headers, params=params)
62 |         subjects = response.json().get('subjects', [])
63 | 
64 |         if len(subjects) > 0:
65 |             self.red.sadd('dbList', *[json.dumps(i, ensure_ascii=False) for i in subjects])
66 | 
67 |         # a short page means the listing is exhausted; otherwise fetch the next page
68 |         if len(subjects) < limit:
69 |             return
70 |         else:
71 |             start += limit
72 |             return self.get_list(start=start, limit=limit, tag=tag)
73 | 
74 |     def get_info(self, list_info):
75 |         info_url = list_info['url']
76 |         headers = {
77 |             'Host': 'movie.douban.com',
78 |             'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
79 |             'User-Agent': self.ua,
80 |             'Upgrade-Insecure-Requests': '1'
81 |         }
82 | 
83 |         try:
84 |             response = self.download(url=info_url, headers=headers)
85 | 
86 |             # year, e.g. the "(2021)" suffix in the page title
87 |             year = response.re.findall(r'\((.*?)\)')
88 |             print('year: ', year)
89 | 
90 |             # director
91 |             dao_yan = response.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')
92 |             print('dao_yan: ', dao_yan)
93 | 
94 |             # screenwriter
95 |             bian_ju = response.xpath('//*[@id="info"]/span[2]/span[2]/a/text()')
96 |             print('bian_ju: ', bian_ju)
97 | 
98 |             # starring
99 |             zhu_yan = response.re.findall('<a href="/celebrity/.*?" rel="v:starring">(.*?)</a>')
100 |             print('zhu_yan: ', zhu_yan)
101 | 
102 |             # genre
103 |             lei_xing = response.re.findall('<span property="v:genre">(.*?)</span>')
104 |             print('lei_xing: ', lei_xing)
105 | 
106 |             # production country/region
107 |             di_qu = response.re.findall('制片国家/地区:</span> (.*?)<br/>')
108 |             print('di_qu: ', di_qu)
109 | 
110 |             # language
111 |             yu_yan = response.re.findall('语言:</span> (.*?)<br/>')
112 |             print('yu_yan: ', yu_yan)
113 | 
114 |             # first air date
115 |             shou_bo = response.xpath('//*[@id="info"]/span[10]/text()')
116 |             print('shou_bo: ', shou_bo)
117 | 
118 |             # episode count
119 |             ji_shu = response.re.findall('集数:</span> (.*?)<br/>')
120 |             print('ji_shu: ', ji_shu)
121 | 
122 |             # episode length
123 |             dan_ji = response.re.findall('单集片长:</span> (.*?)<br/>')
124 |             print('dan_ji: ', dan_ji)
125 | 
126 |             # overall douban rating
127 |             score = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')
128 |             print('score: ', score)
129 | 
130 |             # number of ratings
131 |             comment_num = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')
132 |             print('comment_num: ', comment_num)
133 | 
134 |             # short-review count
135 |             duan_ping_num = response.xpath('//*[@id="comments-section"]/div[1]/h2/span/a/text()')
136 |             print('duan_ping_num: ', duan_ping_num)
137 | 
138 |             # group-discussion count
139 | 
140 |             # synopsis
141 |             desc = response.xpath('//*[@id="link-report"]/span/text()')
142 |             print('desc: ', desc)
143 | 
144 |             # tags
145 |             tag = response.xpath('//*[@id="content"]/div[2]/div[2]/div[4]/div/a/text()')
146 |             print('tag: ', tag)
147 | 
148 |             # streaming platforms
149 | 
150 |             # currently watching
151 |             zai_kan = response.xpath('//*[@id="subject-others-interests"]/div/a[1]/text()')
152 |             print('zai_kan: ', zai_kan)
153 | 
154 |             # watched
155 |             kan_guo = response.xpath('//*[@id="subject-others-interests"]/div/a[2]/text()')
156 |             print('kan_guo: ', kan_guo)
157 | 
158 |             # want to watch
159 |             xiang_kan = response.xpath('//*[@id="subject-others-interests"]/div/a[3]/text()')
160 |             print('xiang_kan: ', xiang_kan)
161 | 
162 |             data = {
163 |                 'url': list_info['url'],
164 |                 'year': ', '.join(year),
165 |                 'dao_yan': ', '.join(dao_yan),
166 |                 'bian_ju': ', '.join(bian_ju),
167 |                 'zhu_yan': ', '.join(zhu_yan),
168 |                 'lei_xing': ', '.join(lei_xing),
169 |                 'di_qu': ', '.join(di_qu),
170 |                 'yu_yan': ', '.join(yu_yan),
171 |                 'shou_bo': ', '.join(shou_bo),
172 |                 'ji_shu': ', '.join(ji_shu),
173 |                 'dan_ji': ', '.join(dan_ji),
174 |                 'score': ', '.join(score),
175 |                 'comment_num': ', '.join(comment_num),
176 |                 'duan_ping_num': ', '.join(duan_ping_num),
177 |                 'desc': (', '.join(desc)).strip(),
178 |                 'tag': ', '.join(tag),
179 |                 'zai_kan': ', '.join(zai_kan),
180 |                 'kan_guo': ', '.join(kan_guo),
181 |                 'xiang_kan': ', '.join(xiang_kan),
182 |             }
183 | 
184 |             self.red.sadd('dbInfo', json.dumps(data, ensure_ascii=False))
185 | 
186 |         except Exception as e:
187 |             self.logger.error(f'error: {e}, list_info: {list_info}')
188 | 
189 |     def to_excel(self, items):
190 |         def handler(x):
191 |             # drop records that failed to parse a year or a director
192 |             if not x.get('year'):
193 |                 return False
194 | 
195 |             if not x.get('dao_yan'):
196 |                 return False
197 | 
198 |             return True
199 | 
200 |         data_list = list(filter(handler, map(json.loads, items)))
201 |         title = {
202 |             'url': '链接',
203 |             'year': '年份',
204 |             'dao_yan': '导演',
205 |             'bian_ju': '编剧',
206 |             'zhu_yan': '主演',
207 |             'lei_xing': '类型',
208 |             'di_qu': '制片国家/地区',
209 |             'yu_yan': '语言',
210 |             'shou_bo': '首播',
211 |             'ji_shu': '集数',
212 |             'dan_ji': '单集片长',
213 |             'score': '豆瓣评分(总分)',
214 |             'comment_num': '评价人数(总分)',
215 |             'duan_ping_num': '短评数',
216 |             'desc': '剧情简介',
217 |             'tag': '标签',
218 |             'zai_kan': '在看人数',
219 |             'kan_guo': '看过人数',
220 |             'xiang_kan': '想看人数',
221 |         }
222 |         excel_name = '豆瓣电影'
223 | 
224 |         self.excel.write_excel(data_list, title, excel_name, '../static/豆瓣电影.xls')
225 | 
226 |     def start_spider(self):
227 |         # run one stage at a time: get_list -> get_info -> to_excel
228 |         # self.get_list()
229 | 
230 |         # data_list = list(self.red.smembers('dbList'))
231 |         # for i in data_list:
232 |         #     self.get_info(json.loads(i))
233 | 
234 |         self.to_excel(list(self.red.smembers('dbInfo')))
235 | 
236 | 
237 | if __name__ == '__main__':
238 |     run_spider(DoubanSpiderSpider)
239 | 
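# Note on get_list above: it pages recursively, so one stack frame is held per
# page and a very long listing could approach Python's default recursion limit
# (about 1000 frames). A minimal iterative sketch of the same loop, reusing the
# class attributes (self.ua, self.download, self.red) available in the spider:
#
# def get_list(self, start=0, limit=100, tag='热门'):
#     url = 'https://movie.douban.com/j/search_subjects'
#     headers = {'Host': 'movie.douban.com', 'User-Agent': self.ua}
#     while True:
#         params = {'type': 'tv', 'tag': tag, 'sort': 'recommend',
#                   'page_limit': limit, 'page_start': start}
#         response = self.download(url=url, headers=headers, params=params)
#         subjects = response.json().get('subjects', [])
#         if subjects:
#             self.red.sadd('dbList', *[json.dumps(i, ensure_ascii=False) for i in subjects])
#         if len(subjects) < limit:
#             return
#         start += limit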
-------------------------------------------------------------------------------- /spiders/test_spider/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/7/1 11:39 AM
7 | -------------------------------------------------
8 | Change Activity: 2021/7/1 11:39 AM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | 
13 | import os
14 | from magical.cmdline import generate_spider_project, generate_spider_file
15 | 
16 | 
17 | def main():
18 |     project_path = os.path.dirname(os.path.abspath(__file__))
19 |     spider_name = 'test_common'
20 | 
21 |     # generate_spider_project('sync_spider', project_path, spider_name)
22 |     generate_spider_file('sync_spider', project_path, spider_name)
23 | 
24 | 
25 | if __name__ == '__main__':
26 |     main()
27 | 
-------------------------------------------------------------------------------- /spiders/test_spider/base_spider.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: base_spider.py
6 | Time: 2021/01/01 11:40:25
7 | -------------------------------------------------
8 | Change Activity: 2021/01/01 11:40:25
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | 
13 | from magical.sync_spider.common.base_spider import BaseSpider
14 | 
15 | 
16 | class TestExcelBaseSpider(BaseSpider):
17 |     def __init__(self, spider):
18 |         super().__init__(spider)
19 | 
-------------------------------------------------------------------------------- /spiders/test_spider/middleware.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: middleware.py
6 | Time: 2021/01/01 11:40:25
7 | -------------------------------------------------
8 | Change Activity: 2021/01/01 11:40:25
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import random
13 | import time
14 | 
15 | import requests
16 | 
17 | from magical.sync_spider.extends_module.base_module.downloader import DownloaderMiddleware
18 | 
19 | 
20 | # ------------------------------------------------default middleware------------------------------------------------
21 | 
22 | 
23 | class HeadersMiddleware(DownloaderMiddleware):
24 |     """Request-header middleware."""
25 | 
26 |     def __init__(self, spider):
27 |         super().__init__(spider)
28 | 
29 |     def process_request(self, request):
30 |         request.headers.update({'Connection': 'close'})
31 |         return request
32 | 
33 | 
34 | class ProxyMiddleware(DownloaderMiddleware):
35 |     """Proxy IP middleware."""
36 | 
37 |     def __init__(self, spider):
38 |         super().__init__(spider)
39 | 
40 |         self.proxy.proxy_handler(num=1)
41 | 
42 |     def process_request(self, request):
43 |         request.meta['proxy'] = self.proxy.get_proxy()
44 |         return request
45 | 
46 |     def process_response(self, request, response):
47 |         return response
48 | 
49 |     def process_exception(self, request, exception):
50 |         self.logger.error(f'ProxyMiddleware.process_exception: {exception}, request: {request}', exc_info=True)
51 | 
52 |         if isinstance(
53 |             exception,
54 |             (
55 |                 requests.exceptions.ConnectionError,
56 |                 requests.exceptions.ConnectTimeout,
57 |                 requests.exceptions.ReadTimeout,
58 |                 requests.exceptions.Timeout,
59 |             )
60 |         ):
61 |             self.logger.error(f'ProxyMiddleware - request failed, retrying - request: {request}')
62 |             time.sleep(random.randint(3, 5))
63 |             self.proxy.proxy_handler(request, num=1)
64 |             return self._retry(request)
65 | 
66 |         return exception
67 | 
68 | 
69 | class RequestErrorMiddleware(DownloaderMiddleware):
70 |     """Request-error middleware."""
71 | 
72 |     def __init__(self, spider):
73 |         super().__init__(spider)
74 | 
75 |     def process_exception(self, request, exception):
76 |         self.logger.error(f'RequestErrorMiddleware.process_exception: {exception}, request: {request}', exc_info=True)
77 | 
78 |         if isinstance(
79 |             exception,
80 |             (
81 |                 requests.exceptions.ConnectionError,
82 |                 requests.exceptions.ConnectTimeout,
83 |                 requests.exceptions.ReadTimeout,
84 |                 requests.exceptions.Timeout,
85 |             )
86 |         ):
87 |             self.logger.error(f'RequestErrorMiddleware - request failed, retrying - request: {request}')
88 |             time.sleep(random.randint(3, 5))
89 |             return self._retry(request)
90 | 
91 |         elif isinstance(exception, requests.exceptions.HTTPError):
92 |             self.logger.error(f'RequestErrorMiddleware - requests.exceptions.HTTPError - request: {request}')
93 |             return None
94 | 
95 |         elif isinstance(exception, requests.exceptions.ChunkedEncodingError):
96 |             self.logger.error(f'RequestErrorMiddleware - requests.exceptions.ChunkedEncodingError - request: {request}')
97 |             return None
98 | 
99 |         elif isinstance(exception, requests.exceptions.SSLError):
100 |             self.logger.error(f'RequestErrorMiddleware - requests.exceptions.SSLError - request: {request}')
101 |             return None
102 | 
103 |         return exception
104 | 
105 | 
106 | # -------------------------------------------------spider middleware-------------------------------------------------
107 | 
108 | 
109 | class TestExcelMiddleware(DownloaderMiddleware):
110 | 
111 |     def __init__(self, spider):
112 |         super().__init__(spider)
113 | 
114 |     def process_request(self, request):
115 |         return request
116 | 
117 |     def process_response(self, request, response):
118 |         if not request.use_middleware:
119 |             return response
120 | 
121 |         return response
122 | 
123 |     def process_exception(self, request, exception):
124 |         self.logger.error(f'TestExcelMiddleware.process_exception: {exception}, request: {request}')
125 |         return exception
126 | 
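# middleware.py above only covers download middlewares. Pipelines are wired up
# the same way through PIPELINE_MIDDLEWARE_PATH in the settings that follow,
# with a base class under magical.sync_spider.extends_module.base_module.pipeline.
# Its hook names are not visible in this dump, so the sketch below is an
# assumption modeled on the download-middleware style; the real base-class API
# may differ:
#
# from magical.sync_spider.extends_module.base_module.pipeline import PipelineMiddleware
#
#
# class TestExcelPipeline(PipelineMiddleware):  # hypothetical example class
#     def process_item(self, item):  # hook name assumed
#         # self.pipeline(data) reports True or False, so return a bool here
#         return bool(item)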
-------------------------------------------------------------------------------- /spiders/test_spider/settings.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: settings.py
6 | Time: 2021/01/01 11:40:25
7 | -------------------------------------------------
8 | Change Activity: 2021/01/01 11:40:25
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | from magical.utils import log_path
13 | 
14 | # project settings
15 | 
16 | 
17 | # -------------------------------------------------------------------------------------------------------------------
18 | 
19 | # project name
20 | PROJECT_NAME = 'test_spider'
21 | 
22 | # logger path
23 | LOGGER_PATH = log_path(__file__)
24 | 
25 | # retry count
26 | RETRY_COUNT = 10
27 | 
28 | # pipeline middlewares; multiple entries allowed (value = priority)
29 | # PIPELINE_MIDDLEWARE_PATH = {
30 | #     "spiders.test_spider.pipeline.TestExcelPipeline": 10
31 | # }
32 | 
33 | # download middlewares; multiple entries allowed (value = priority)
34 | DOWNLOAD_MIDDLEWARE_PATH = {
35 |     # "spiders.test_spider.middleware.DuplicateMiddleware": 7,
36 |     # "spiders.test_spider.middleware.HeadersMiddleware": 8,
37 |     # "spiders.test_spider.middleware.ProxyMiddleware": 9,
38 |     "spiders.test_spider.middleware.RequestErrorMiddleware": 10,
39 |     "spiders.test_spider.middleware.TestExcelMiddleware": 100
40 | }
41 | 
42 | # spider base class shared by this project's spiders
43 | BASE_SPIDER_PATH = "spiders.test_spider.base_spider.TestExcelBaseSpider"
44 | 
45 | # user-agent
46 | UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' \
47 |      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
48 | 
49 | # -------------------------------------------------------------------------------------------------------------------
50 | 
51 | 
52 | # default settings
53 | 
54 | # downloader
55 | DOWNLOADER_PATH = "magical.sync_spider.middleware.download.downloader.Downloader"
56 | 
57 | # download handler
58 | DOWNLOAD_HANDLER_PATH = "magical.sync_spider.middleware.download.handler.DownloadHandler"
59 | 
60 | # download middleware manager
61 | DOWNLOAD_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.download.manager.DownloadMiddlewareManager"
62 | 
63 | # download middlewares; multiple entries allowed
64 | # DOWNLOAD_MIDDLEWARE_PATH = {}
65 | 
66 | # -------------------------------------------------------------------------------------------------------------------
67 | 
68 | # pipeline handler
69 | # PIPELINE_HANDLER_PATH = "magical.sync_spider.middleware.pipeline.handler.PipelineHandler"
70 | 
71 | # pipeline middleware manager
72 | # PIPELINE_MIDDLEWARE_MANAGER_PATH = "magical.sync_spider.middleware.pipeline.manager.PipelineMiddlewareManager"
73 | 
74 | # pipeline middlewares; multiple entries allowed
75 | # PIPELINE_MIDDLEWARE_PATH = {}
76 | 
77 | # -------------------------------------------------------------------------------------------------------------------
78 | # not used for now; known issues
79 | # # dedup handler
80 | # FILTER_DUPLICATE_HANDLER = "magical.sync_spider.middleware.duplicate.handler.DuplicateHandler"
81 | #
82 | # # dedup filter
83 | # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.bloom_filter.ScalableBloomFilter"
84 | # # FILTER_METHOD_MANAGER = "magical.sync_spider.middleware.duplicate.expire_filter.ExpireFilter"
85 | #
86 | # # dedup storage backend: redis, or memory (in-process)
87 | # FILTER_QUEUE_TYPE = 'redis'
88 | #
89 | # # whether to md5-hash keys before deduplication
90 | # FILTER_USE_MD5 = False
91 | #
92 | # # which redis connection to deduplicate with (config name; defaults to red)
93 | # FILTER_REDIS_NAME = 'red'
94 | #
95 | # # initial dedup capacity
96 | # FILTER_INITIAL_CAPACITY = 100000000
97 | #
98 | # # dedup error rate
99 | # FILTER_ERROR_RATE = 0.00001
100 | 
101 | # -------------------------------------------------------------------------------------------------------------------
102 | 
103 | # # rabbit mq config
104 | # MESSAGE_MQ_CONFIG = {
105 | #     'username': 'admin',
106 | #     'password': 'admin123',
107 | #     'host': '127.0.0.1',
108 | #     'port': 9999
109 | # }
110 | #
111 | # # rabbit mq prefetch: consume 10 messages per batch
112 | # MESSAGE_MQ_PREFETCH_COUNT = 10
113 | #
114 | # # rabbit mq virtual host
115 | # MESSAGE_MQ_VIRTUAL_HOST = 'spider'
116 | #
117 | # # rabbit mq handler class
118 | # MESSAGE_MQ_HANDLER = 'magical.sync_spider.extends_module.mqs.rabbit_mq.handler.RabbitMQHandler'
119 | 
120 | # -------------------------------------------------------------------------------------------------------------------
121 | 
122 | # spider base class (framework default)
123 | # BASE_SPIDER_PATH = "magical.sync_spider.common.base_spider.BaseSpider"
124 | 
125 | # spider utility class
126 | SPIDER_UTIL_PATH = "magical.sync_spider.common.spider_util.SpiderUtil"
127 | 
128 | # proxy-IP handlers
129 | # fetch proxies from redis
130 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetRedisProxy'
131 | # # ZhiMa proxy provider
132 | # PROXY_HANDLER = 'magical.sync_spider.common.proxy_handler.GetZhiMaProxy'
133 | 
134 | # email handler
135 | EMAIL_HANDLER = 'magical.sync_spider.common.email_handler.EmailHandler'
136 | 
137 | # PostgreSQL handler class
138 | POST_GRE_SQL_HANDLER = 'magical.sync_spider.databases.post_gre_sql_pool.PostGreHandle'
139 | 
140 | # MySQL handler class
141 | MYSQL_HANDLER = 'magical.sync_spider.databases.mysql_pool.MysqlHandler'
142 | 
143 | # Redis handler class
144 | REDIS_HANDLER = 'magical.sync_spider.databases.red_pool.RedisHandler'
145 | 
146 | # -------------------------------------------------------------------------------------------------------------------
147 | 
148 | # number of proxy IPs to fetch at startup
149 | PROXY_NUM = 5
150 | 
151 | # retry count (framework default)
152 | # RETRY_COUNT = 3
153 | 
154 | # retry when the response status code is one of the following
155 | RETRY_STATUS_CODES = [500, 502, 503, 504, 400, 403, 408]
156 | 
157 | # skip SSL certificate verification
158 | REQUEST_VERIFY = False
159 | 
160 | # request timeout (seconds)
161 | REQUEST_TIMEOUT = 30
162 | 
163 | # consumer thread count
164 | CONSUMER_THREAD_NUM = 10
165 | 
166 | # -------------------------------------------------------------------------------------------------------------------
167 | 
168 | """
169 | Database configuration (redis shown as the example)
170 | 
171 | Single database:
172 | REDIS_CONFIG = {
173 |     'host': '',
174 |     'port': '',
175 |     'db': '',
176 |     'user': '',
177 |     'password': '',
178 |     'decode_responses': True
179 | }
180 | Usage:
181 |     red is the default attribute name
182 |     self.red.get('key1')
183 |     spider.red.get('key1')
184 | 
185 | Multiple databases:
186 | REDIS_CONFIG = [
187 |     {
188 |         'name': 'name1',
189 |         'host': '',
190 |         'port': '',
191 |         'db': '',
192 |         'user': '',
193 |         'password': '',
194 |         'decode_responses': True
195 |     },
196 |     {
197 |         'name': 'name2',
198 |         'host': '',
199 |         'port': '',
200 |         'db': '',
201 |         'user': '',
202 |         'password': '',
203 |         'decode_responses': True
204 |     },
205 | ]
206 | Usage:
207 |     self.name1.get('key1')
208 |     spider.name1.get('key1')
209 | 
210 |     self.name2.get('key1')
211 |     spider.name2.get('key1')
212 | """
213 | 
-------------------------------------------------------------------------------- /spiders/test_spider/spiders/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/01/01 11:40:25
7 | -------------------------------------------------
8 | Change Activity: 2021/01/01 11:40:25
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | 
13 | 
14 | def main():
15 |     pass
16 | 
17 | 
18 | if __name__ == '__main__':
19 |     main()
20 | 
-------------------------------------------------------------------------------- /spiders/test_spider/spiders/test_common.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: test_common.py
6 | Time: 2021/31/31 17:34:58
7 | -------------------------------------------------
8 | Change Activity: 2021/31/31 17:34:58
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import sys
14 | 
15 | file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
16 | sys.path.append(file_path)
17 | 
18 | from magical.sync_spider import SyncSpider, run_spider
19 | 
20 | 
21 | class TestCommonSpider(SyncSpider):
22 |     name = 'test_common'
23 |     settings_path = 'spiders.test_spider.settings'
24 | 
25 |     default_custom_setting = {}
26 | 
27 |     def __init__(self, *args, **kwargs):
28 |         custom_setting = {}
29 |         kwargs.update(dict(custom_setting=custom_setting))
30 |         super().__init__(*args, **kwargs)
31 | 
32 |     def start_spider(self):
33 |         print(self.excel)
34 | 
35 | 
36 | if __name__ == '__main__':
37 |     run_spider(TestCommonSpider)
38 | 
-------------------------------------------------------------------------------- /spiders/test_spider/spiders/test_excel.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: test_excel.py
6 | Time: 2021/01/01 11:40:25
7 | -------------------------------------------------
8 | Change Activity: 2021/01/01 11:40:25
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import sys
14 | 
15 | file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
16 | sys.path.append(file_path)
17 | 
18 | from magical.sync_spider import SyncSpider, load_files, run_spider
19 | 
20 | 
21 | class TestExcelSpider(SyncSpider):
22 |     name = 'test_excel'
23 |     settings_path = 'spiders.test_spider.settings'
24 | 
25 |     default_custom_setting = {}
26 | 
27 |     def __init__(self, *args, **kwargs):
28 |         custom_setting = {}
29 |         kwargs.update(dict(custom_setting=custom_setting))
30 |         super().__init__(*args, **kwargs)
31 | 
32 |         self.excel = load_files(self.settings['EXCEL'])
33 | 
34 |     def start_spider(self):
35 |         data_list = [
36 |             {'desc': 'desc1', 'name': 'name1', 'plat': 'plat1'},
37 |             {'desc': 'desc2', 'name': 'name2', 'plat': 'plat2'},
38 |             {'desc': 'desc3', 'name': 'name3', 'plat': 'plat3'},
39 |             {'desc': 'desc4', 'name': 'name4', 'plat': 'plat4'},
40 |             {'desc': 'desc5', 'name': 'name5', 'plat': 'plat5'},
41 |         ]
42 | 
43 |         title = {'desc': '描述', 'name': '店铺名称', 'plat': '渠道'}
44 |         excel_name = 'test'
45 |         excel_file_path = '../static/test.xls'
46 |         self.excel.write_excel(data_list, title, excel_name, excel_file_path)
47 | 
48 | 
49 | if __name__ == '__main__':
50 |     run_spider(TestExcelSpider)
51 | 
-------------------------------------------------------------------------------- /spiders/test_spider/spiders/test_proxy.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: test_proxy.py
6 | Time: 2021/01/01 11:40:08
7 | -------------------------------------------------
8 | Change Activity: 2021/01/01 11:40:08
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | import os
13 | import sys
14 | 
15 | file_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
16 | sys.path.append(file_path)
17 | 
18 | from magical.sync_spider import SyncSpider, Request, run_spider
19 | 
20 | 
21 | class TestProxySpider(SyncSpider):
22 |     name = 'test_proxy'
23 |     settings_path = 'spiders.test_spider.settings'
24 | 
25 |     default_custom_setting = {}
26 | 
27 |     def __init__(self, *args, **kwargs):
28 |         custom_setting = {}
29 |         kwargs.update(dict(custom_setting=custom_setting))
30 |         super().__init__(*args, **kwargs)
31 | 
32 |     def start_spider(self):
33 |         print(self.proxy.get_proxy())
34 | 
35 |         # keyword-style download: url/params/data/headers are left as placeholders here
36 |         self.download(
37 |             url='',
38 |             params={},
39 |             method='POST',
40 |             data={},
41 |             headers={},
42 |             meta={
43 |                 'proxy': self.proxy.get_proxy()
44 |             }
45 |         )
46 | 
47 |         # Request-object style download
48 |         request = Request(
49 |             url='',
50 |             params={},
51 |             method='POST',
52 |             data={},
53 |             headers={},
54 |             meta={
55 |                 'proxy': self.proxy.get_proxy()
56 |             }
57 |         )
58 |         self.download(request)
59 | 
60 | 
61 | if __name__ == '__main__':
62 |     run_spider(TestProxySpider)
63 | 
-------------------------------------------------------------------------------- /spiders/test_spider/static/__init__.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | Author: qinLess
5 | File: __init__.py
6 | Time: 2021/7/1 11:48 AM
7 | -------------------------------------------------
8 | Change Activity: 2021/7/1 11:48 AM
9 | -------------------------------------------------
10 | Desc:
11 | """
12 | 
13 | 
14 | def main():
15 |     pass
16 | 
17 | 
18 | if __name__ == '__main__':
19 |     main()
20 | 
-------------------------------------------------------------------------------- /spiders/test_spider/static/test.xls: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/qinLess/magical/4a6d38e55b3c8396c10712927028db4329faf888/spiders/test_spider/static/test.xls
--------------------------------------------------------------------------------