├── src ├── lrabbit_spider │ ├── __init__.py │ ├── __main__.py │ ├── buffer │ │ ├── __init__.py │ │ ├── item_buffer.py │ │ └── request_buffer.py │ ├── network │ │ ├── user_agent.py │ │ └── request.py │ ├── constants.py │ ├── utils │ │ ├── tools.py │ │ ├── js │ │ │ └── intercept.js │ │ ├── log.py │ │ └── webdriver.py │ ├── deque │ │ └── __init__.py │ ├── setting.py │ └── db │ │ └── redisdb.py └── lrabbit_scrapy │ ├── android │ ├── __init__.py │ ├── sslbypass.py │ └── sslpass.js │ ├── asynico_utils │ └── __init__.py │ ├── __init__.py │ ├── __main__.py │ ├── common_utils │ ├── __init__.py │ ├── redis_helper.py │ ├── all_in_one.py │ ├── config_helper.py │ ├── network_helper.py │ ├── print_log_helper.py │ └── mysql_helper.py │ ├── config.py │ ├── all_excepiton │ └── __init__.py │ ├── asy_crawl.py │ ├── command.py │ ├── template_crawl.py │ ├── spider.py │ └── asynico_basespider.py ├── .gitignore ├── workspace.code-workspace ├── pyproject.toml ├── test ├── test_scrapy │ ├── test_log.py │ └── test_request_buffer.py └── test_spider.py ├── setup.cfg ├── LICENSE ├── LICENSE.rst ├── setup.py └── README.md /src/lrabbit_spider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_spider/__main__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/android/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_spider/buffer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_spider/buffer/item_buffer.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_spider/network/user_agent.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/asynico_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_spider/constants.py: -------------------------------------------------------------------------------- 1 | REQUEST_REPEAT = "request 已存在" -------------------------------------------------------------------------------- /src/lrabbit_scrapy/__init__.py: -------------------------------------------------------------------------------- 1 | from .spider import LrabbitSpider 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__/ 3 | env 4 | .ini 5 | dist/* 6 | .egg-info 7 | -------------------------------------------------------------------------------- /workspace.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 
5 | } 6 | ], 7 | "settings": {} 8 | } -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/__main__.py: -------------------------------------------------------------------------------- 1 | from .command import run 2 | import sys 3 | 4 | 5 | if __name__ == '__main__': 6 | args = sys.argv[1:] 7 | run(*args) 8 | -------------------------------------------------------------------------------- /test/test_scrapy/test_log.py: -------------------------------------------------------------------------------- 1 | 2 | from lrabbit_spider.utils.log import log 3 | 4 | 5 | log.info("test") 6 | log.debug("test") 7 | log.error("test") 8 | log.critical("test") 9 | log.warning("test") 10 | log.error(Exception("test")) 11 | log.warning(['error']) -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/2 15:27 4 | @Author : lrabbit 5 | @FileName: __init__.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/config.py: -------------------------------------------------------------------------------- 1 | # [server] 2 | # db_user = root 3 | # db_password = password 4 | # db_database = database_name 5 | # db_host = 127.0.0.1 6 | # ; better not to commit this file into git 7 | # workers_num = 10 8 | # [test] 9 | # db_user = root 10 | # db_password = password 11 | # db_database = database_name 12 | # db_host = 127.0.0.1 13 | # workers_num = 10 14 | -------------------------------------------------------------------------------- /test/test_scrapy/test_request_buffer.py: -------------------------------------------------------------------------------- 1 | from lrabbit_spider.buffer.request_buffer import RequestsBuffer 2 | from lrabbit_spider.network.request import Request 3 | import time 4 | 5 | request_buffer = RequestsBuffer("test_request_buffer") 6 | 7 | request_buffer.start() 8 | 9 | 10 | while True: 11 | 12 | request_buffer.put_request(Request(url="https://www.baidu.com")) 13 | time.sleep(1) 14 | 15 | -------------------------------------------------------------------------------- /src/lrabbit_spider/utils/tools.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | def delay_time(sleep_time=60): 4 | time.sleep(sleep_time) 5 | 6 | 7 | class Singleton(object): 8 | def __init__(self,cls): 9 | self._cls = cls 10 | self._instance = {} 11 | 12 | def __call__(self,*args,**kwargs): 13 | if self._cls not in self._instance: 14 | self._instance[self._cls] = self._cls(*args,**kwargs) 15 | return self._instance[self._cls] 16 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/all_excepiton/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/22 13:41 4 | @Author : lrabbit 5 | 
@FileName: spider.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | 10 | 11 | class Excepiton403(Exception): 12 | def __init__(self): 13 | self.__name = "exception403" 14 | 15 | 16 | class Exception404(Exception): 17 | pass 18 | 19 | 20 | class Exception500(Exception): 21 | pass 22 | 23 | 24 | class ExceptionFileFieldNameError(Exception): 25 | pass 26 | 27 | 28 | if __name__ == '__main__': 29 | print(type(Excepiton403()).__name__) 30 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = lrabbit_spider 3 | version = 2.0.8 4 | author = lrabbit 5 | author_email = 709343607@qq.com 6 | description = this is a small spider,you can easy running. When you often need to crawl a single site, you can reduce some repeated code every time, using this small framework you can quickly crawl data into a file or database. 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/litter-rabbit/lrabbit_scrapy 10 | project_urls = 11 | Bug Tracker = https://github.com/litter-rabbit/lrabbit_scrapy/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: MIT License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = src 20 | packages = find: 21 | python_requires = >=3.6 22 | 23 | [options.packages.find] 24 | where = src 25 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/redis_helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/18 10:14 4 | @Author : lrabbit 5 | @FileName: redis_helper.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | import redis 10 | from lrabbit_scrapy.common_utils.config_helper import get_redis_config, get_config_path 11 | 12 | 13 | class RedisClient: 14 | 15 | def __init__(self, db=0, config_path_env=None, env='test'): 16 | config_path = get_config_path(config_path_env) 17 | redis_config = get_redis_config(config_path, env) 18 | self.redis_executor = redis.StrictRedis(host=redis_config.REDIS_HOST, port=redis_config.REDIS_PORT, 19 | password=redis_config.REDIS_PASSWORD, 20 | db=db,decode_responses=True) 21 | 22 | 23 | if __name__ == '__main__': 24 | redis_client = RedisClient() 25 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/android/sslbypass.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | import os 3 | from lrabbit_scrapy.common_utils.print_log_helper import LogUtils 4 | from pathlib import Path 5 | import subprocess 6 | 7 | 8 | def start_frida_server(servername): 9 | server_path = f'/data/local/tmp/{servername}' 10 | killserver = "adb shell su -c killall -9 " + servername 11 | os.system(killserver) 12 | subprocess.Popen( 13 | ["adb", "shell", "su", "-c", server_path]) 14 | 15 | 16 | def sslbypass(server_name='15.0.0'): 17 | try: 18 | start_frida_server(server_name) 19 | except Exception as e: 20 | traceback.print_exc() 21 | LogUtils.log_error("please check frida-server name or this path is in /data/local/tmp?") 22 | exit(0) 23 | current_parent_path = Path(__file__).parent 24 | frida_path = os.path.join(current_parent_path, 'sslpass.js') 25 | frida_hook_cmd = f"frida -FU -l 
{frida_path} --no-pause" 26 | os.system(frida_hook_cmd) 27 | 28 | 29 | if __name__ == '__main__': 30 | sslbypass() 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
-------------------------------------------------------------------------------- /src/lrabbit_spider/deque/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from typing import Callable, Tuple, Union,Any,List,Optional 4 | 5 | 6 | 7 | class Dedup: 8 | 9 | BloomFilter = 1 10 | MemoryFilter = 2 11 | ExpireFilter = 3 12 | def __init__(self,filter_type:int=BloomFilter,to_md5:bool = True,**kwargs): 13 | pass 14 | def __repr__(self): 15 | pass 16 | 17 | def _deal_datas(self,datas): 18 | pass 19 | 20 | def add(self,datas:Union[List[Any],Any],skip_check:bool = False) -> Union[List[Any],Any]: 21 | pass 22 | def get(self,datas:Union[List[Any],Any]) -> Union[List[Any],Any]: 23 | pass 24 | 25 | def filter_exist_data( 26 | self, 27 | datas: List[Any], 28 | *, 29 | datas_fingerprints: Optional[List] = None, 30 | callback: Callable[[Any], None] = None 31 | ) -> Union[Tuple[List[Any], List[Any]], List[Any]]: 32 | pass 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="lrabbit_scrapy", 8 | version="2.0.8", 9 | author="lrabbit", 10 | author_email="709343607@qq.com", 11 | description="A small spider framework that is easy to run. When you often need to crawl a single site, it saves you from rewriting the same boilerplate code every time; with this small framework you can quickly crawl data into a file or a database.", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/litter-rabbit/lrabbit_scrapy", 15 | project_urls={ 16 | "Bug Tracker": "https://github.com/litter-rabbit/lrabbit_scrapy/issues", 17 | }, 18 | classifiers=[ 19 | "Programming Language :: Python :: 3", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | ], 23 | 24 | install_requires=[ 25 | "parsel == 1.6.0", 26 | "requests >= 2.26.0", 27 | "PyMySQL >= 0.9.3", 28 | "redis >= 3.0.0", 29 | "frida >= 15.0.0", 30 | "frida-tools >= 10.4.1" 31 | ], 32 | packages=setuptools.find_packages(where="src"), 33 | package_dir={"": "src"}, 34 | package_data={ 35 | # include the *.js files bundled with the packages (e.g. sslpass.js): 36 | "": ["*.js"], 37 | }, 38 | include_package_data=True, 39 | python_requires=">=3.6.8", 40 | 41 | ) 42 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/asy_crawl.py: -------------------------------------------------------------------------------- 1 | from lrabbit_scrapy.asynico_basespider import BaseSpider 2 | import sqlalchemy as sa 3 | 4 | 5 | class Spider(BaseSpider): 6 | # setup 7 | is_open_mysql = False 8 | is_drop_tables = False 9 | # reset all tasks and files; this may delete all data files 10 | reset_task_list = False 11 | 12 | """ 13 | do not name any other method or attribute starting with 'file' or 'table' 14 | """ 15 | # datastore 16 | table_table1 = [ 17 | sa.Column('val', sa.String(255)), 18 | ] 19 | 20 | # file_store 21 | file_blogPost = [ 22 | 'id',
'title', 'datetime', 'content' 23 | ] 24 | 25 | def __init__(self, spider_name): 26 | super(Spider, self).__init__(spider_name) 27 | 28 | async def worker(self, task): 29 | """ 30 | 31 | write your worker method here 32 | 33 | :param task: 34 | :return: 35 | """ 36 | """ 37 | mysql worker example 38 | """ 39 | # await self.insert_one(self.tables['table1'].insert().values(val=str(task))) 40 | # res = await self.query(self.tables['table1'].select()) 41 | # res = await res.fetchall() 42 | 43 | """ 44 | to see how this works, uncomment the code above 45 | """ 46 | url = f"http://www.lrabbit.life/post_detail/?id={task}" 47 | 48 | data = {"id": task, "datetime": "1997", "title": "lrabbit", "content": "hello"} 49 | if data: 50 | self.all_files['blogPost'].write(data) 51 | 52 | async def create_tasks(self): 53 | return [i for i in range(100)] 54 | 55 | 56 | if __name__ == '__main__': 57 | s = Spider(__file__) 58 | s.run() 59 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/all_in_one.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/18 10:22 4 | @Author : lrabbit 5 | @FileName: all_in_one.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | import os 10 | import datetime 11 | import csv 12 | from lrabbit_scrapy.all_excepiton import ExceptionFileFieldNameError 13 | 14 | 15 | def get_time_format_now(option=1): 16 | if option == 1: 17 | return datetime.datetime.now().strftime("%Y-%m-%d") 18 | else: 19 | return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 20 | 21 | 22 | class FileStore: 23 | 24 | def __init__(self, file_path: str, filed_name: list): 25 | self.file_path = file_path 26 | self.file_name = os.path.splitext(file_path) 27 | self.filed_name = filed_name 28 | 29 | def write(self, d: dict): 30 | """ 31 | :param d: dict type 32 | :return: 33 | """ 34 | if list(d.keys()) != self.filed_name: 35 | raise ExceptionFileFieldNameError() 36 | 37 | with open(self.file_path, 'a', encoding='utf8', newline='') as f: 38 | dict_write = csv.DictWriter(f, fieldnames=self.filed_name) 39 | dict_write.writerow(d) 40 | 41 | def write_many(self, rows: list): 42 | if rows and list(rows[0].keys()) != self.filed_name: 43 | raise ExceptionFileFieldNameError() 44 | with open(self.file_path, 'a', encoding='utf8', newline='') as f: 45 | dict_write = csv.DictWriter(f, fieldnames=self.filed_name) 46 | dict_write.writerows(rows) 47 | 48 | 49 | if __name__ == '__main__': 50 | blog_file = FileStore(r"D:\PythonWorkSpace\lrabbit_scrapy\test\blogPost.csv", ["title"]) 51 | d = {"title": "1"} 52 | blog_file.write(d) 53 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/config_helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/18 10:27 4 | @Author : lrabbit 5 | @FileName: config_helper.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | 10 | from configparser import ConfigParser 11 | import os 12 | 13 | 14 | class MysqlConfigClass: 15 | MYSQL_USER = None 16 | MYSQL_PASSWORD = None 17 | MYSQL_DATABASE = None 18 | MYSQL_HOST = None 19 | MYSQL_PORT = 3306 20 | 21 | 22 | class RedisConfigClass: 23 | REDIS_PASSWORD = None 24 | REDIS_DATABASE = None 25 | REDIS_HOST = None 26 | REDIS_PORT = 6379 27 | 28 | 29 | def get_config(config_path=None, env='test'): 30 | """ 31 | 32 | :param config_path: 33 
| :return: 34 | """ 35 | config = ConfigParser() 36 | if not config_path: 37 | pwd = os.path.dirname(__file__) 38 | config_path = os.path.join(pwd, 'crawl.ini') 39 | config.read(config_path) 40 | if os.getenv("ENV") == 'server': 41 | env = os.getenv("ENV") 42 | config = config[env] 43 | return config 44 | 45 | 46 | def get_mysql_config(config_path, env='test') -> MysqlConfigClass: 47 | if not config_path: 48 | raise Exception("无效的文件路径") 49 | config = get_config(config_path, env) 50 | mysqlconfig = MysqlConfigClass() 51 | for k, v in config.items(): 52 | setattr(mysqlconfig, k.upper(), v) 53 | return mysqlconfig 54 | 55 | 56 | def get_redis_config(config_path, env='test') -> RedisConfigClass: 57 | if not config_path: 58 | raise Exception("无效的文件路径") 59 | config = get_config(config_path, env) 60 | redisconfig = RedisConfigClass() 61 | for k, v in config.items(): 62 | setattr(redisconfig, k.upper(), v) 63 | return redisconfig 64 | 65 | 66 | def get_config_path(config_path_env=None): 67 | if not config_path_env: 68 | config_path_env = "config_path" 69 | config_path = os.environ.get(config_path_env) 70 | if not config_path: 71 | raise Exception(f"请设置环境变量{config_path_env}为ini配置文件的绝对路径") 72 | return config_path 73 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/command.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from lrabbit_scrapy.common_utils.print_log_helper import LogUtils 4 | from lrabbit_scrapy.android.sslbypass import sslbypass 5 | 6 | base_dir = Path(__file__).resolve().parent 7 | 8 | 9 | def newspider(spider_name): 10 | copy_new_name_file(spider_name, f'{spider_name}.py', 'asy_crawl.py') 11 | copy_new_name_file(spider_name, f'{spider_name}.ini', 'config.py', is_config=True) 12 | LogUtils.log_finish('创建项目成功') 13 | 14 | 15 | def new_template_spider(spider_name): 16 | copy_new_name_file(spider_name, f'{spider_name}.py', 'template_crawl.py') 17 | LogUtils.log_finish('创建项目成功') 18 | 19 | 20 | def copy_new_name_file(spider_name, new_name, src_name, is_config=False): 21 | dst_path = os.path.abspath(os.getcwd()) 22 | if not os.path.exists(os.path.join(dst_path, spider_name)): 23 | os.mkdir(os.path.join(dst_path, spider_name)) 24 | dst_path = os.path.join(dst_path, spider_name) 25 | dst_file = os.path.join(dst_path, new_name) 26 | src_file = os.path.join(base_dir, src_name) 27 | if os.path.exists(dst_file): 28 | raise Exception(f'please remove your file {dst_file}') 29 | f2 = open(dst_file, 'a') 30 | with open(src_file, 'r') as f: 31 | for line in f.readlines(): 32 | if is_config: 33 | line = line.replace("# ", "") 34 | f2.write(line) 35 | f2.close() 36 | 37 | 38 | def run(*args): 39 | argv = args[0] 40 | if argv == 'new_scrapy': 41 | spider_name = args[1] 42 | print("opts", spider_name) 43 | new_template_spider(spider_name) 44 | elif argv == 'sslpass': 45 | if len(args) >= 2: 46 | server_name = args[1] 47 | print("firda server name") 48 | sslbypass(server_name) 49 | else: 50 | sslbypass() 51 | elif argv == 'asy_new_scrapy': 52 | spider_name = args[1] 53 | print("opts", spider_name) 54 | newspider(spider_name) 55 | else: 56 | print("options: new_scarpy or sslpass or asy_new_scrapy ") 57 | 58 | 59 | if __name__ == '__main__': 60 | import sys 61 | argv = sys.argv[1:] 62 | run(*argv) 63 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/network_helper.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/18 10:21 4 | @Author : lrabbit 5 | @FileName: network_helper.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | 10 | import urllib3 11 | import requests 12 | from lrabbit_scrapy.all_excepiton import Excepiton403, Exception404, Exception500 13 | 14 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 15 | 16 | 17 | class RequestSession: 18 | 19 | def __init__(self, proxies=None, timeout=15, headers=None): 20 | self.proxies = proxies 21 | self.timeout = timeout 22 | self.session = requests.session() 23 | self.session.headers = headers 24 | 25 | def send_request(self, method='GET', url=None, headers=None, data=None) -> requests.Response: 26 | 27 | if method == 'GET': 28 | res = self.session.get(url, proxies=self.proxies, headers=headers, verify=False, timeout=self.timeout) 29 | else: 30 | if isinstance(data, dict): 31 | res = self.session.post(url, json=data, proxies=self.proxies, headers=headers, verify=False, 32 | timeout=self.timeout) 33 | elif isinstance(data, str): 34 | res = self.session.post(url, data=data, proxies=self.proxies, headers=headers, verify=False, 35 | timeout=self.timeout) 36 | else: 37 | raise Exception("no data to post") 38 | return self.deal_res(res) 39 | 40 | def download_file_by_url(self, out_file_path, url, headers=None): 41 | res = self.session.get(url=url, headers=headers, proxies=self.proxies, verify=False, stream=True, 42 | ) 43 | with open(out_file_path, 'wb') as f: 44 | for chunk in res.iter_content(1024): 45 | if chunk: 46 | f.write(chunk) 47 | return out_file_path 48 | 49 | def deal_res(self, res): 50 | if res.status_code == 403: 51 | raise Excepiton403() 52 | elif res.status_code == 404: 53 | raise Exception404() 54 | elif res.status_code == 500: 55 | raise Exception500() 56 | return res 57 | 58 | 59 | if __name__ == '__main__': 60 | session = RequestSession() 61 | -------------------------------------------------------------------------------- /test/test_spider.py: -------------------------------------------------------------------------------- 1 | from lrabbit_scrapy.spider import LrabbitSpider 2 | from lrabbit_scrapy.common_utils.network_helper import RequestSession 3 | from lrabbit_scrapy.common_utils.print_log_helper import LogUtils 4 | from lrabbit_scrapy.common_utils.all_in_one import FileStore 5 | import os 6 | from lrabbit_scrapy.common_utils.mysql_helper import MysqlClient 7 | from parsel import Selector 8 | 9 | 10 | class Spider(LrabbitSpider): 11 | """ 12 | spider_name : lrabbit blog spider 13 | """ 14 | # unique spider name 15 | spider_name = "lrabbit_blog" 16 | # max thread worker numbers 17 | max_thread_num = 2 18 | # open a mysql connection for every thread; if max_thread_num exceeds 10 and your code runs mysql queries, you need to enable this config 19 | thread_mysql_open = True 20 | # reset the whole task list; every program restart will re-init the task list 21 | reset_task_config = False 22 | # loop init_task_list; when all tasks are finished and you want to run them again, enable it 23 | loop_task_config = False 24 | # remove-confirm option: controls whether a confirm prompt is shown when the task list is initialized 25 | remove_confirm_config = False 26 | # config_path_name: the environment variable name; on linux you need to execute: export config_path="crawl.ini" 27 | config_env_name = "config_path" 28 | # redis db_num 29 | redis_db_config = 0 30 | # debug log; enable traceback logging 31 | debug_config = False 32 | 33 | def 
__init__(self): 34 | super().__init__() 35 | self.session = RequestSession() 36 | self.proxy_session = RequestSession(proxies=None) 37 | csv_path = os.path.join(os.path.abspath(os.getcwd()), f"{self.spider_name}.csv") 38 | self.field_names = ['id', 'title', 'datetime'] 39 | self.blog_file = FileStore(file_path=csv_path, filed_name=self.field_names) 40 | 41 | def worker(self, *args): 42 | task = args[0] 43 | mysql_client: MysqlClient 44 | if len(args) == 2: 45 | mysql_client = args[1] 46 | # mysql_client.execute("") 47 | res = self.session.send_request(method='GET', url=f'http://www.lrabbit.life/post_detail/?id={task}') 48 | selector = Selector(res.text) 49 | title = selector.css(".detail-title h1::text").get() 50 | datetime = selector.css(".detail-info span::text").get() 51 | if title: 52 | post_data = {"id": task, "title": title, 'datetime': datetime} 53 | self.blog_file.write(post_data) 54 | # when you succes get content update redis stat 55 | self.update_stat_redis() 56 | LogUtils.log_finish(task) 57 | 58 | def init_task_list(self): 59 | 60 | # you can get init task from mysql 61 | # res = self.mysql_client.query("select id from rookie limit 100 ") 62 | # return [task['id'] for task in res] 63 | return [i for i in range(100)] 64 | 65 | 66 | if __name__ == '__main__': 67 | spider = Spider() 68 | spider.run() 69 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/template_crawl.py: -------------------------------------------------------------------------------- 1 | from lrabbit_scrapy.spider import LrabbitSpider 2 | from lrabbit_scrapy.common_utils.network_helper import RequestSession 3 | from lrabbit_scrapy.common_utils.print_log_helper import LogUtils 4 | from lrabbit_scrapy.common_utils.all_in_one import FileStore 5 | import os 6 | from lrabbit_scrapy.common_utils.mysql_helper import MysqlClient 7 | from parsel import Selector 8 | 9 | 10 | class Spider(LrabbitSpider): 11 | """ 12 | spider_name : lrabbit blog spider 13 | """ 14 | # unique spider name 15 | spider_name = "lrabbit_blog" 16 | # max thread worker numbers 17 | max_thread_num = 2 18 | # is open for every thread a mysql connection,if your max_thread_num overpass 10 and in code need mysql query ,you need open this config 19 | thread_mysql_open = True 20 | # reset all task_list,every restart program will init task list 21 | reset_task_config = True 22 | # open loop init_task_list ,when your task is all fnish,and you want again ,you can open it 23 | loop_task_config = True 24 | # remove config option,if open it,then confirm option when you init task 25 | remove_confirm_config = True 26 | # config_path_name, this is env name ,is this code ,you need in linux to execute: export config_path="crawl.ini" 27 | config_env_name = "config_path" 28 | # redis db_num 29 | redis_db_config = 0 30 | # debug log ,open tracback log 31 | debug_config = False 32 | 33 | def __init__(self): 34 | super().__init__() 35 | self.session = RequestSession() 36 | self.proxy_session = RequestSession(proxies=None) 37 | csv_path = os.path.join(os.path.abspath(os.getcwd()), f"{self.spider_name}.csv") 38 | self.field_names = ['id', 'title', 'datetime'] 39 | self.blog_file = FileStore(file_path=csv_path, filed_name=self.field_names) 40 | 41 | def worker(self, *args): 42 | task = args[0] 43 | mysql_client: MysqlClient 44 | if len(args) == 2: 45 | mysql_client = args[1] 46 | mysql_client.execute("select id from rookie limit 100") 47 | # mysql_client.execute("") 48 | url = 
f'http://www.lrabbit.life/post_detail/?id={task}' 49 | LogUtils.log_running(url) 50 | res = self.session.send_request(method='GET', url=url) 51 | selector = Selector(res.text) 52 | title = selector.css(".detail-title h1::text").get() 53 | datetime = selector.css(".detail-info span::text").get() 54 | if title: 55 | post_data = {"id": task, "title": title, 'datetime': datetime} 56 | self.blog_file.write(post_data) 57 | # when you succes get content update redis stat 58 | self.update_stat_redis() 59 | LogUtils.log_finish(task) 60 | 61 | def init_task_list(self): 62 | 63 | # you can get init task from mysql 64 | res = self.mysql_client.query("select id from rookie limit 100 ") 65 | return [task['id'] for task in res] 66 | # return [i for i in range(100)] 67 | 68 | 69 | if __name__ == '__main__': 70 | spider = Spider() 71 | spider.run() 72 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/print_log_helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/18 10:14 4 | @Author : lrabbit 5 | @FileName: redis_helper.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | 10 | import datetime 11 | import os 12 | 13 | 14 | class TermColor: 15 | ATTRIBUTES = dict( 16 | list(zip([ 17 | 'bold', 18 | 'dark', 19 | '', 20 | 'underline', 21 | 'blink', 22 | '', 23 | 'reverse', 24 | 'concealed' 25 | ], 26 | list(range(1, 9)) 27 | )) 28 | ) 29 | del ATTRIBUTES[''] 30 | 31 | HIGHLIGHTS = dict( 32 | list(zip([ 33 | 'on_grey', 34 | 'on_red', 35 | 'on_green', 36 | 'on_yellow', 37 | 'on_blue', 38 | 'on_magenta', 39 | 'on_cyan', 40 | 'on_white' 41 | ], 42 | list(range(40, 48)) 43 | )) 44 | ) 45 | 46 | COLORS = dict( 47 | list(zip([ 48 | 'grey', 49 | 'red', 50 | 'green', 51 | 'yellow', 52 | 'blue', 53 | 'magenta', 54 | 'cyan', 55 | 'white', 56 | ], 57 | list(range(30, 38)) 58 | )) 59 | ) 60 | 61 | RESET = '\033[0m' 62 | 63 | @staticmethod 64 | def colored(text, color=None, on_color=None, attrs=None): 65 | 66 | if os.getenv('ANSI_COLORS_DISABLED') is None: 67 | fmt_str = '\033[%dm%s' 68 | if color is not None: 69 | text = fmt_str % (TermColor.COLORS[color], text) 70 | 71 | if on_color is not None: 72 | text = fmt_str % (TermColor.HIGHLIGHTS[on_color], text) 73 | 74 | if attrs is not None: 75 | for attr in attrs: 76 | text = fmt_str % (TermColor.ATTRIBUTES[attr], text) 77 | 78 | text += TermColor.RESET 79 | return text 80 | 81 | 82 | class CommonUtils: 83 | 84 | def __init__(self): 85 | pass 86 | 87 | @staticmethod 88 | def fix_str_args(args): 89 | return list(map(lambda x: str(x).strip(), args)) 90 | 91 | @staticmethod 92 | def get_format_time(for_mat='%Y-%m-%d %H:%M:%S'): 93 | return TermColor.colored(datetime.datetime.now().strftime(for_mat), 'yellow') 94 | 95 | @staticmethod 96 | def space_join_line_arg(*args): 97 | return ' '.join(args) + '\n' 98 | 99 | 100 | class LogUtils: 101 | 102 | def __init__(self): 103 | pass 104 | 105 | @staticmethod 106 | def log_now_time_str(): 107 | # fix buffer cache 108 | # sys.stdout.buffer.write() 109 | print(CommonUtils.get_format_time()) 110 | 111 | @staticmethod 112 | def log_str(color_str, args): 113 | args = CommonUtils.fix_str_args(args) 114 | text = ' '.join(args) 115 | text = color_str + ' ' + text + '\n' 116 | # fix buffer cache 117 | # sys.stdout.buffer.write(text.encode('utf8')) 118 | print(text,end='') 119 | 120 | @staticmethod 121 | def log_info(*args): 122 | color_str = 
TermColor.colored('[*INFO*]', 'cyan') 123 | LogUtils.log_str(color_str, args) 124 | 125 | @staticmethod 126 | def log_running(*args): 127 | color_str = TermColor.colored('[*RUNNING*]', 'yellow') 128 | LogUtils.log_str(color_str, args) 129 | 130 | @staticmethod 131 | def log_finish(*args): 132 | color_str = TermColor.colored('[*FINISH*]', 'green') 133 | LogUtils.log_str(color_str, args) 134 | 135 | @staticmethod 136 | def log_error(*args): 137 | color_str = TermColor.colored('[*ERROR*]', 'red') 138 | LogUtils.log_str(color_str, args) 139 | 140 | @staticmethod 141 | def log_to_file(file_path, line): 142 | """ 143 | 144 | :param file_path: log file path 145 | :param line: a str type 146 | :return: 147 | """ 148 | with open(file_path, 'a', encoding='utf8') as f: 149 | line = CommonUtils.space_join_line_arg(LogUtils.get_format_time(), line) 150 | f.write(line) 151 | -------------------------------------------------------------------------------- /src/lrabbit_spider/buffer/request_buffer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | from threading import Thread 6 | from lrabbit_spider.utils.log import log 7 | import lrabbit_spider.utils.tools as tools 8 | import collections 9 | import lrabbit_spider.setting as setting 10 | from lrabbit_spider.network.request import Request 11 | from lrabbit_spider.deque import Dedup 12 | import lrabbit_spider.constants as constants 13 | from lrabbit_spider.db.redisdb import RedisDB 14 | 15 | 16 | MAX_URL_COUNT=1000 17 | class RequestsBuffer(Thread): 18 | dedup : Dedup= None 19 | 20 | def __init__(self,redis_key): 21 | if not hasattr(self,"_request_deque"): 22 | 23 | super(RequestsBuffer,self).__init__() 24 | self._thread_stop=False 25 | self._is_adding_to_db = False 26 | self._requests_deque = collections.deque() 27 | self._del_requests_deque = collections.deque() 28 | self._db = RedisDB() 29 | self._table_request = setting.TAB_REQUSETS.format(redis_key=redis_key) 30 | self.table_failed_request =setting.TAB_FAILED_REQUSETS 31 | if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE: 32 | self.__class__.dedup = Dedup(name=redis_key,to_md5=False,**setting.REQUEST_FILTER_SETTING) 33 | 34 | 35 | def run(self): 36 | self._thread_stop = False 37 | while not self._thread_stop: 38 | try: 39 | self.__add_request_to_db() 40 | except Exception as e: 41 | log.exception(e) 42 | tools.delay_time(1) 43 | 44 | def put_request(self,request): 45 | self._requests_deque.append(request) 46 | if self.get_requests_count() > MAX_URL_COUNT: 47 | self.flush() 48 | 49 | 50 | pass 51 | def put_del_request(self,request): 52 | self._del_requests_deque.append(request) 53 | 54 | 55 | pass 56 | def put_failed_request(self,request,table=None): 57 | try: 58 | request_dict = request.to_dict 59 | self._db.zadd(table or self.table_failed_request,request_dict,request.priority) 60 | except Exception as e: 61 | log.exception(e) 62 | 63 | def flush(self): 64 | try: 65 | self.__add_request_to_db() 66 | except Exception as e: 67 | log.exception(e) 68 | 69 | def get_requests_count(self): 70 | return len(self._requests_deque) 71 | pass 72 | def is_adding_to_db(self): 73 | return self._is_adding_to_db 74 | pass 75 | def __add_request_to_db(self): 76 | request_list = [] 77 | prioritys = [] 78 | callbacks = [] 79 | 80 | while self._requests_deque: 81 | request:Request = self._requests_deque.popleft() 82 | self.is_adding_to_db = True 83 | if callable(request): 84 | callbacks.append(request) 85 | priority = request.fingerprint 86 | if ( 87 | 
request.filter_repeat 88 | and setting.REQUEST_FILTER_ENABLE 89 | and not self.__class__.dedup.add(request.fingerprint) 90 | 91 | ): 92 | log.debug(constants.REQUEST_REPEAT+f" URL = {request.url}") 93 | continue 94 | else: 95 | request_list.append(str(request.to_dict)) 96 | prioritys.append(priority) 97 | if len(request_list)>MAX_URL_COUNT: 98 | self._db.zadd(self._table_request,request_list,prioritys) 99 | request_list = [] 100 | prioritys = [] 101 | 102 | if request_list: 103 | self._db.zadd(self._table_request,request_list,prioritys) 104 | 105 | for callback in callbacks: 106 | try: 107 | callback() 108 | except Exception as e: 109 | log.exception(e) 110 | if self._del_requests_deque: 111 | request_done_list = [] 112 | while self._del_requests_deque: 113 | request_done_list.append(self._del_requests_deque.popleft()) 114 | request_done_list = list(set(request_done_list)-set(request_list)) 115 | if request_done_list: 116 | self._db.zrem(self._table_request,request_done_list) 117 | 118 | self._is_adding_to_db = False 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | lrabbit_scrapy 2 | ===== 3 | 4 | A small spider framework that is easy to run. You don't have to rewrite the same boilerplate code every time; with this small framework you can quickly crawl data into a file or a database. 5 | 6 | 7 | Requirements 8 | ---------- 9 | python >=3.6.8 10 | 11 | Installing 12 | ---------- 13 | 14 | $ pip3 install lrabbit_scrapy 15 | 16 | quick start 17 | ---------------- 18 | 19 | 20 | * python3 -m lrabbit_scrapy new_scrapy blog 21 | * this creates a directory named blog; the generated file looks like this 22 | ```python 23 | from lrabbit_scrapy.spider import LrabbitSpider 24 | from lrabbit_scrapy.common_utils.network_helper import RequestSession 25 | from lrabbit_scrapy.common_utils.print_log_helper import LogUtils 26 | from lrabbit_scrapy.common_utils.all_in_one import FileStore 27 | import os 28 | from lrabbit_scrapy.common_utils.mysql_helper import MysqlClient 29 | from parsel import Selector 30 | 31 | 32 | class Spider(LrabbitSpider): 33 | """ 34 | spider_name : lrabbit blog spider 35 | """ 36 | # unique spider name 37 | spider_name = "lrabbit_blog" 38 | # max thread worker numbers 39 | max_thread_num = 2 40 | # open a mysql connection for every thread; if max_thread_num exceeds 10 and your code runs mysql queries, you need to enable this config 41 | thread_mysql_open = True 42 | # reset the whole task list; every program restart will re-init the task list 43 | reset_task_config = False 44 | # loop init_task_list; when all tasks are finished and you want to run them again, enable it 45 | loop_task_config = False 46 | # remove-confirm option: controls whether a confirm prompt is shown when the task list is initialized 47 | remove_confirm_config = False 48 | # config_path_name: the environment variable name; on linux you need to execute: export config_path="crawl.ini" 49 | config_env_name = "config_path" 50 | # redis db_num 51 | redis_db_config = 0 52 | # debug log; enable traceback logging 53 | debug_config = False 54 | 55 | def __init__(self): 56 | super().__init__() 57 | self.session = RequestSession() 58 | self.proxy_session = RequestSession(proxies=None) 59 | csv_path = os.path.join(os.path.abspath(os.getcwd()), f"{self.spider_name}.csv") 60 | self.field_names = ['id', 'title', 'datetime'] 61 | self.blog_file = FileStore(file_path=csv_path, filed_name=self.field_names) 62 | 63 | def worker(self, *args): 
64 | task = args[0] 65 | mysql_client: MysqlClient 66 | if len(args) == 2: 67 | mysql_client = args[1] 68 | # mysql_client.execute("") 69 | res = self.session.send_request(method='GET', url=f'http://www.lrabbit.life/post_detail/?id={task}') 70 | selector = Selector(res.text) 71 | title = selector.css(".detail-title h1::text").get() 72 | datetime = selector.css(".detail-info span::text").get() 73 | if title: 74 | post_data = {"id": task, "title": title, 'datetime': datetime} 75 | self.blog_file.write(post_data) 76 | # when you succes get content update redis stat 77 | self.update_stat_redis() 78 | LogUtils.log_finish(task) 79 | 80 | def init_task_list(self): 81 | 82 | # you can get init task from mysql 83 | # res = self.mysql_client.query("select id from rookie limit 100 ") 84 | # return [task['id'] for task in res] 85 | return [i for i in range(100)] 86 | 87 | 88 | if __name__ == '__main__': 89 | spider = Spider() 90 | spider.run() 91 | 92 | ``` 93 | 94 | * set config.ini and config env variable 95 | * create crawl.ini,for example this file path is /root/crawl.ini 96 | ```ini 97 | [server] 98 | mysql_user = root 99 | mysql_password = 123456 100 | mysql_database = test 101 | mysql_host = 192.168.1.1 102 | redis_user = lrabbit 103 | redis_host = 192.168.1.1 104 | redis_port = 6379 105 | redis_password = 123456 106 | 107 | [test] 108 | mysql_user = root 109 | mysql_password = 123456 110 | mysql_database = test 111 | mysql_host = 192.168.1.1 112 | redis_user = lrabbit 113 | redis_host = 192.168.1.1 114 | redis_port = 6379 115 | redis_password = 123456 116 | ``` 117 | * set config env 118 | * windows power shell 119 | * $env:config_path = "/root/crawl.ini" 120 | * linux 121 | * export config_path="/root/crawl.ini" 122 | 123 | * python3 blog_spider.py 124 | 125 | ## other function 126 | * python3 blog_spider.py stat 127 | * show task stat 128 | * python3 -m lrabbit-scrapy sslpass 129 | * pass android ssl 130 | 131 | - author: https://www.lrabbit.life/ 132 | 133 | -------------------------------------------------------------------------------- /src/lrabbit_spider/network/request.py: -------------------------------------------------------------------------------- 1 | 2 | import lrabbit_spider.network.user_agent as user_agent 3 | import lrabbit_spider.setting as setting 4 | import requests 5 | from lrabbit_spider.utils. 
6 | from requests.adapters import HTTPAdapter 7 | class Request(object): 8 | 9 | webdriver_pool:WebD 10 | __REQUEST_ATTRS__ = { 11 | # 'method', 'url', 必须传递 不加入**kwargs中 12 | "params", 13 | "data", 14 | "headers", 15 | "cookies", 16 | "files", 17 | "auth", 18 | "timeout", 19 | "allow_redirects", 20 | "proxies", 21 | "hooks", 22 | "stream", 23 | "verify", 24 | "cert", 25 | "json", 26 | } 27 | def __init__( 28 | self, 29 | url="", 30 | retry_times=0, 31 | priority=300, 32 | parser_name=None, 33 | callback=None, 34 | filter_repeat=True, 35 | auto_request=True, 36 | request_sync=False, 37 | use_session=None, 38 | random_user_agent=True, 39 | download_midware=None, 40 | is_abandoned=False, 41 | render=False, 42 | render_time=0, 43 | **kwargs, 44 | ): 45 | """ 46 | @summary: Request参数 47 | --------- 48 | 框架参数 49 | @param url: 待抓取url 50 | @param retry_times: 当前重试次数 51 | @param priority: 优先级 越小越优先 默认300 52 | @param parser_name: 回调函数所在的类名 默认为当前类 53 | @param callback: 回调函数 可以是函数 也可是函数名(如想跨类回调时,parser_name指定那个类名,callback指定那个类想回调的方法名即可) 54 | @param filter_repeat: 是否需要去重 (True/False) 当setting中的REQUEST_FILTER_ENABLE设置为True时该参数生效 默认True 55 | @param auto_request: 是否需要自动请求下载网页 默认是。设置为False时返回的response为空,需要自己去请求网页 56 | @param request_sync: 是否同步请求下载网页,默认异步。如果该请求url过期时间快,可设置为True,相当于yield的reqeust会立即响应,而不是去排队 57 | @param use_session: 是否使用session方式 58 | @param random_user_agent: 是否随机User-Agent (True/False) 当setting中的RANDOM_HEADERS设置为True时该参数生效 默认True 59 | @param download_midware: 下载中间件。默认为parser中的download_midware 60 | @param is_abandoned: 当发生异常时是否放弃重试 True/False. 默认False 61 | @param render: 是否用浏览器渲染 62 | @param render_time: 渲染时长,即打开网页等待指定时间后再获取源码 63 | -- 64 | 以下参数与requests参数使用方式一致 65 | @param method: 请求方式,如POST或GET,默认根据data值是否为空来判断 66 | @param params: 请求参数 67 | @param data: 请求body 68 | @param json: 请求json字符串,同 json.dumps(data) 69 | @param headers: 70 | @param cookies: 字典 或 CookieJar 对象 71 | @param files: 72 | @param auth: 73 | @param timeout: (浮点或元组)等待服务器数据的超时限制,是一个浮点数,或是一个(connect timeout, read timeout) 元组 74 | @param allow_redirects : Boolean. 
True 表示允许跟踪 POST/PUT/DELETE 方法的重定向 75 | @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"} 76 | @param verify: 为 True 时将会验证 SSL 证书 77 | @param stream: 如果为 False,将会立即下载响应内容 78 | @param cert: 79 | -- 80 | @param **kwargs: 其他值: 如 Request(item=item) 则item可直接用 request.item 取出 81 | --------- 82 | @result: 83 | """ 84 | 85 | self.url = url 86 | self.retry_times = retry_times 87 | self.priority = priority 88 | self.parser_name = parser_name 89 | self.callback = callback 90 | self.filter_repeat = filter_repeat 91 | self.auto_request = auto_request 92 | self.request_sync = request_sync 93 | self.use_session = use_session 94 | self.random_user_agent = random_user_agent 95 | self.download_midware = download_midware 96 | self.is_abandoned = is_abandoned 97 | self.render = render 98 | self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0) 99 | 100 | self.requests_kwargs = {} 101 | for key, value in kwargs.items(): 102 | if key in self.__class__.__REQUEST_ATTRS__: # 取requests参数 103 | self.requests_kwargs[key] = value 104 | 105 | self.__dict__[key] = value 106 | 107 | 108 | def __repr__(self): 109 | pass 110 | 111 | def __setattr__(self,key,value): 112 | self.__dict__[key] = value 113 | if key in self.__class__.__REQUEST_ATTRS__: 114 | self.requests_kwargs[key] = value 115 | 116 | def __lt__(self,other): 117 | return self.priority{ 10 | handler.next(config); 11 | } 12 | , 13 | onError: (err,handler)=>{ 14 | handler.next(err) 15 | } 16 | , 17 | onResponse: (response,handler)=>{ 18 | var url = response.config.url; 19 | if (window.__urlRegexes.length > 0) { 20 | for (const regex of window.__urlRegexes) { 21 | var re = new RegExp(regex, "g"); 22 | if (re.exec(url)) { 23 | window.__ajaxData[regex] = { 24 | request: { 25 | 'url': response.config.xhr.config.url, 26 | 'data': response.config.xhr.config.body, 27 | 'headers': response.config.xhr.config.headers 28 | }, 29 | response: { 30 | 'url': response.config.url, 31 | 'headers': response.headers, 32 | 'content': response.response, 33 | 'status_code': response.status 34 | } 35 | }; 36 | } 37 | } 38 | } 39 | handler.next(response) 40 | } 41 | }) 42 | -------------------------------------------------------------------------------- /src/lrabbit_spider/setting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """爬虫配置文件""" 3 | import os 4 | 5 | # redis 表名 6 | # 任务表模版 7 | TAB_REQUSETS = "{redis_key}:z_requsets" 8 | # 任务失败模板 9 | TAB_FAILED_REQUSETS = "{redis_key}:z_failed_requsets" 10 | # 数据保存失败模板 11 | TAB_FAILED_ITEMS = "{redis_key}:s_failed_items" 12 | # 爬虫状态表模版 13 | TAB_SPIDER_STATUS = "{redis_key}:z_spider_status" 14 | # 爬虫时间记录表 15 | TAB_SPIDER_TIME = "{redis_key}:h_spider_time" 16 | # 用户池 17 | TAB_USER_POOL = "{redis_key}:h_{user_type}_pool" 18 | 19 | # MYSQL 20 | MYSQL_IP = os.getenv("MYSQL_IP") 21 | MYSQL_PORT = int(os.getenv("MYSQL_PORT", 3306)) 22 | MYSQL_DB = os.getenv("MYSQL_DB") 23 | MYSQL_USER_NAME = os.getenv("MYSQL_USER_NAME") 24 | MYSQL_USER_PASS = os.getenv("MYSQL_USER_PASS") 25 | 26 | # MONGODB 27 | MONGO_IP = os.getenv("MONGO_IP", "localhost") 28 | MONGO_PORT = int(os.getenv("MONGO_PORT", 27017)) 29 | MONGO_DB = os.getenv("MONGO_DB") 30 | MONGO_USER_NAME = os.getenv("MONGO_USER_NAME") 31 | MONGO_USER_PASS = os.getenv("MONGO_USER_PASS") 32 | 33 | # REDIS 34 | # ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"] 35 | REDISDB_IP_PORTS = os.getenv("REDISDB_IP_PORTS") 36 | REDISDB_USER_PASS = os.getenv("REDISDB_USER_PASS") 37 | 
REDISDB_DB = int(os.getenv("REDISDB_DB", 0)) 38 | # 适用于redis哨兵模式 39 | REDISDB_SERVICE_NAME = os.getenv("REDISDB_SERVICE_NAME") 40 | 41 | # 数据入库的pipeline,可自定义,默认MysqlPipeline 42 | ITEM_PIPELINES = [ 43 | "feapder.pipelines.mysql_pipeline.MysqlPipeline", 44 | # "feapder.pipelines.mongo_pipeline.MongoPipeline", 45 | ] 46 | EXPORT_DATA_MAX_FAILED_TIMES = 10 # 导出数据时最大的失败次数,包括保存和更新,超过这个次数报警 47 | EXPORT_DATA_MAX_RETRY_TIMES = 10 # 导出数据时最大的重试次数,包括保存和更新,超过这个次数则放弃重试 48 | 49 | # 爬虫相关 50 | # COLLECTOR 51 | COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔 52 | COLLECTOR_TASK_COUNT = 10 # 每次获取任务数量 53 | 54 | # SPIDER 55 | SPIDER_THREAD_COUNT = 1 # 爬虫并发数 56 | SPIDER_SLEEP_TIME = ( 57 | 0 # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数,包含2和5 58 | ) 59 | SPIDER_TASK_COUNT = 1 # 每个parser从内存队列中获取任务的数量 60 | SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数 61 | SPIDER_AUTO_START_REQUESTS = ( 62 | True # 是否主动执行添加 设置为False 需要手动调用start_monitor_task,适用于多进程情况下 63 | ) 64 | KEEP_ALIVE = False # 爬虫是否常驻 65 | 66 | # 浏览器渲染 67 | WEBDRIVER = dict( 68 | pool_size=1, # 浏览器的数量 69 | load_images=True, # 是否加载图片 70 | user_agent=None, # 字符串 或 无参函数,返回值为user_agent 71 | proxy=None, # xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址 72 | headless=False, # 是否为无头浏览器 73 | driver_type="CHROME", # CHROME、PHANTOMJS、FIREFOX 74 | timeout=30, # 请求超时时间 75 | window_size=(1024, 800), # 窗口大小 76 | executable_path=None, # 浏览器路径,默认为默认路径 77 | render_time=0, # 渲染时长,即打开网页等待指定时间后再获取源码 78 | custom_argument=["--ignore-certificate-errors"], # 自定义浏览器渲染参数 79 | xhr_url_regexes=None, # 拦截xhr接口,支持正则,数组类型 80 | ) 81 | 82 | # 爬虫启动时,重新抓取失败的requests 83 | RETRY_FAILED_REQUESTS = False 84 | # 保存失败的request 85 | SAVE_FAILED_REQUEST = True 86 | # request防丢机制。(指定的REQUEST_LOST_TIMEOUT时间内request还没做完,会重新下发 重做) 87 | REQUEST_LOST_TIMEOUT = 600 # 10分钟 88 | # request网络请求超时时间 89 | REQUEST_TIMEOUT = 22 # 等待服务器响应的超时时间,浮点数,或(connect timeout, read timeout)元组 90 | 91 | # 下载缓存 利用redis缓存,但由于内存大小限制,所以建议仅供开发调试代码时使用,防止每次debug都需要网络请求 92 | RESPONSE_CACHED_ENABLE = False # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True 93 | RESPONSE_CACHED_EXPIRE_TIME = 3600 # 缓存时间 秒 94 | RESPONSE_CACHED_USED = False # 是否使用缓存 补采数据时可设置为True 95 | 96 | # redis 存放item与request的根目录 97 | REDIS_KEY = "" 98 | # 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则; 常用于清空任务队列,否则重启时会断点续爬 99 | DELETE_KEYS = [] 100 | 101 | # 设置代理 102 | PROXY_EXTRACT_API = None # 代理提取API ,返回的代理分割符为\r\n 103 | PROXY_ENABLE = True 104 | 105 | # 随机headers 106 | RANDOM_HEADERS = True 107 | # UserAgent类型 支持 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari','mobile' 若不指定则随机类型 108 | USER_AGENT_TYPE = "chrome" 109 | # 默认使用的浏览器头 RANDOM_HEADERS=True时不生效 110 | DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36" 111 | # requests 使用session 112 | USE_SESSION = False 113 | 114 | # 去重 115 | ITEM_FILTER_ENABLE = False # item 去重 116 | ITEM_FILTER_SETTING = dict( 117 | filter_type=1 # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3 118 | ) 119 | REQUEST_FILTER_ENABLE = False # request 去重 120 | REQUEST_FILTER_SETTING = dict( 121 | filter_type=3, # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3 122 | expire_time=2592000, # 过期时间1个月 123 | ) 124 | 125 | # 报警 支持钉钉、企业微信、邮件 126 | # 钉钉报警 127 | DINGDING_WARNING_URL = "" # 钉钉机器人api 128 | DINGDING_WARNING_PHONE = "" # 报警人 支持列表,可指定多个 129 | DINGDING_WARNING_ALL = False # 是否提示所有人, 默认为False 130 | # 邮件报警 131 | EMAIL_SENDER = "" # 发件人 132 | EMAIL_PASSWORD = "" # 授权码 133 | EMAIL_RECEIVER = "" # 收件人 支持列表,可指定多个 134 | 
EMAIL_SMTPSERVER = "smtp.163.com" # 邮件服务器 默认为163邮箱 135 | # 企业微信报警 136 | WECHAT_WARNING_URL = "" # 企业微信机器人api 137 | WECHAT_WARNING_PHONE = "" # 报警人 将会在群内@此人, 支持列表,可指定多人 138 | WECHAT_WARNING_ALL = False # 是否提示所有人, 默认为False 139 | # 时间间隔 140 | WARNING_INTERVAL = 3600 # 相同报警的报警时间间隔,防止刷屏; 0表示不去重 141 | WARNING_LEVEL = "DEBUG" # 报警级别, DEBUG / ERROR 142 | WARNING_FAILED_COUNT = 1000 # 任务失败数 超过WARNING_FAILED_COUNT则报警 143 | 144 | LOG_NAME = os.path.basename(os.getcwd()) 145 | LOG_PATH = "log/%s.log" % LOG_NAME # log存储路径 146 | LOG_LEVEL = "DEBUG" 147 | LOG_COLOR = True # 是否带有颜色 148 | LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台 149 | LOG_IS_WRITE_TO_FILE = False # 是否写文件 150 | LOG_MODE = "w" # 写文件的模式 151 | LOG_MAX_BYTES = 10 * 1024 * 1024 # 每个日志文件的最大字节数 152 | LOG_BACKUP_COUNT = 20 # 日志文件保留数量 153 | LOG_ENCODING = "utf8" # 日志文件编码 154 | OTHERS_LOG_LEVAL = "ERROR" # 第三方库的log等级 155 | 156 | # 打点监控 influxdb 配置 157 | INFLUXDB_HOST = os.getenv("INFLUXDB_HOST", "localhost") 158 | INFLUXDB_PORT = int(os.getenv("INFLUXDB_PORT", 8086)) 159 | INFLUXDB_UDP_PORT = int(os.getenv("INFLUXDB_UDP_PORT", 8089)) 160 | INFLUXDB_USER = os.getenv("INFLUXDB_USER") 161 | INFLUXDB_PASSWORD = os.getenv("INFLUXDB_PASSWORD") 162 | INFLUXDB_DATABASE = os.getenv("INFLUXDB_DB") 163 | # 监控数据存储的表名,爬虫管理系统上会以task_id命名 164 | INFLUXDB_MEASUREMENT = "task_" + os.getenv("TASK_ID") if os.getenv("TASK_ID") else None 165 | # 打点监控其他参数,若这里也配置了influxdb的参数, 则会覆盖外面的配置 166 | METRICS_OTHER_ARGS = dict(retention_policy_duration="180d", emit_interval=60) 167 | 168 | ############# 导入用户自定义的setting ############# 169 | try: 170 | from setting import * 171 | 172 | # 兼容老版本的配置 173 | KEEP_ALIVE = not AUTO_STOP_WHEN_SPIDER_DONE 174 | except: 175 | pass 176 | -------------------------------------------------------------------------------- /src/lrabbit_spider/db/redisdb.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import lrabbit_spider.setting as setting 4 | from redis.connection import Encoder as _Encoder 5 | from redis.exceptions import ConnectionError, TimeoutError 6 | from redis.exceptions import DataError 7 | from redis.sentinel import Sentinel 8 | from rediscluster import RedisCluster 9 | from redis._compat import unicode,long,basestring 10 | 11 | from lrabbit_spider.utils.log import log 12 | 13 | 14 | import redis 15 | class Encoder(_Encoder): 16 | def encode(self, value): 17 | "Return a bytestring or bytes-like representation of the value" 18 | if isinstance(value, (bytes, memoryview)): 19 | return value 20 | # elif isinstance(value, bool): 21 | # # special case bool since it is a subclass of int 22 | # raise DataError( 23 | # "Invalid input of type: 'bool'. Convert to a " 24 | # "bytes, string, int or float first." 25 | # ) 26 | elif isinstance(value, float): 27 | value = repr(value).encode() 28 | elif isinstance(value, (int, long)): 29 | # python 2 repr() on longs is '123L', so use str() instead 30 | value = str(value).encode() 31 | elif isinstance(value, (list, dict, tuple)): 32 | value = unicode(value) 33 | elif not isinstance(value, basestring): 34 | # a value we don't know how to deal with. throw an error 35 | typename = type(value).__name__ 36 | raise DataError( 37 | "Invalid input of type: '%s'. Convert to a " 38 | "bytes, string, int or float first." 
% typename 39 | ) 40 | if isinstance(value, unicode): 41 | value = value.encode(self.encoding, self.encoding_errors) 42 | return value 43 | 44 | 45 | redis.connection.Encoder = Encoder 46 | class RedisDB: 47 | def __init__( 48 | self, 49 | ip_ports=None, 50 | db=None, 51 | user_pass=None, 52 | url=None, 53 | decode_responses=True, 54 | service_name=None, 55 | max_connections=32, 56 | **kwargs, 57 | ): 58 | """ 59 | redis的封装 60 | Args: 61 | ip_ports: ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"] 62 | db: 63 | user_pass: 64 | url: 65 | decode_responses: 66 | service_name: 适用于redis哨兵模式 67 | """ 68 | 69 | # 可能会改setting中的值,所以此处不能直接赋值为默认值,需要后加载赋值 70 | if ip_ports is None: 71 | ip_ports = setting.REDISDB_IP_PORTS 72 | if db is None: 73 | db = setting.REDISDB_DB 74 | if user_pass is None: 75 | user_pass = setting.REDISDB_USER_PASS 76 | if service_name is None: 77 | service_name = setting.REDISDB_SERVICE_NAME 78 | 79 | self._is_redis_cluster = False 80 | 81 | self.__redis = None 82 | self._url = url 83 | self._ip_ports = ip_ports 84 | self._db = db 85 | self._user_pass = user_pass 86 | self._decode_responses = decode_responses 87 | self._service_name = service_name 88 | self._max_connections = max_connections 89 | self._kwargs = kwargs 90 | self.get_connect() 91 | 92 | def get_connect(self): 93 | pass 94 | def __repr__(self): 95 | pass 96 | 97 | @property 98 | def _redis(self): 99 | pass 100 | @_redis.setter 101 | def _redis(self,val): 102 | pass 103 | 104 | 105 | @classmethod 106 | def from_url(cls,url): 107 | pass 108 | 109 | def sadd(self,table,values): 110 | pass 111 | 112 | def sget(self,table,count=1,is_pop=True): 113 | pass 114 | 115 | def srem(self,table,values): 116 | pass 117 | def sget_count(self,tables): 118 | pass 119 | 120 | def sdelete(slef,table): 121 | pass 122 | 123 | def sismember(self,table,key): 124 | pass 125 | 126 | def zadd(self,table,values,priority=0): 127 | pass 128 | def zget(self,table,count=1,is_pop=True): 129 | pass 130 | def zremrangebyscore(self,table,priority_min,priority_max): 131 | pass 132 | def zrangebysocre(selt,tale,priority_min,priority_mmax,count=None,is_pop=True): 133 | pass 134 | def zrangebyscore_increase_socre(self,table,priority_min,priority_max,icrease_score,count=None): 135 | pass 136 | def zrangebyscore_set_score(selfmtable,priority_min,priority_max,score,count=None): 137 | pass 138 | def zincrby(self,table,amount,value): 139 | pass 140 | 141 | def zget_count(self,table,priority_min=None,prioirty_max=None): 142 | pass 143 | 144 | def zrem(self,table,values): 145 | pass 146 | 147 | def zexiste(self,table,values): 148 | pass 149 | def lpush(self,table,values): 150 | pass 151 | 152 | def lpop(self,table,count=1): 153 | pass 154 | def rpoplpush(self,from_table,to_table=None): 155 | pass 156 | 157 | def lget_count(self,table): 158 | pass 159 | 160 | def lrem(self,table,value,num=0): 161 | pass 162 | def lrange(self,table,start=0,end=-1): 163 | pass 164 | 165 | def hset(self,table,value): 166 | pass 167 | 168 | def hset_batch(self,table,datas): 169 | pass 170 | 171 | def hincry(self,table,key,increment): 172 | pass 173 | 174 | def hget(self,table,key,is_pop=False): 175 | pass 176 | 177 | def hgetall(selfmtable): 178 | pss 179 | 180 | def hexistes(sefl,table,key): 181 | pass 182 | 183 | def hdel(self,table,*keys): 184 | pass 185 | 186 | def hget_count(self,table): 187 | pass 188 | 189 | def hkeys(self,table): 190 | pass 191 | 192 | def setbit(self,table,offset,values): 193 | pass 194 | 195 | def getbit(self,table,offsets): 196 | 
pass 197 | 198 | def bitcount(self,table): 199 | pass 200 | 201 | def strset(self,table,values,**kwargs): 202 | pass 203 | 204 | def str_incrby(self,table,values): 205 | pass 206 | 207 | def strget(self,table): 208 | pass 209 | 210 | def strlen(self,table): 211 | pass 212 | 213 | def getkeys(self,regex): 214 | pass 215 | 216 | def set_expire(self,key,seconds): 217 | pass 218 | 219 | def get_expire(self,key): 220 | pass 221 | 222 | def clear(self,table): 223 | pass 224 | 225 | def get_redis_obj(self): 226 | pass 227 | 228 | def _reconnect(self): 229 | pass 230 | 231 | def __getattr__(self,name): 232 | pass 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | -------------------------------------------------------------------------------- /src/lrabbit_spider/utils/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging.handlers import BaseRotatingHandler 3 | import os 4 | import sys 5 | import lrabbit_spider.setting as setting 6 | from better_exceptions import format_exception 7 | import loguru 8 | 9 | LOG_FORMAT = "%(threadName)s|%(asctime)s|%(filename)s|%(funcName)s|line:%(lineno)d|%(levelname)s| %(message)s" 10 | 11 | class InterceptHandler(logging.Handler): 12 | def emit(self, record): 13 | # Retrieve context where the logging call occurred, this happens to be in the 6th frame upward 14 | logger_opt = loguru.logger.opt(depth=6, exception=record.exc_info) 15 | logger_opt.log(record.levelname, record.getMessage()) 16 | 17 | class RotatingFileHandler(BaseRotatingHandler): 18 | def __init__( 19 | self, filename, mode="a", max_bytes=0, backup_count=0, encoding=None, delay=0 20 | ): 21 | BaseRotatingHandler.__init__(self, filename, mode, encoding, delay) 22 | self.max_bytes = max_bytes 23 | self.backup_count = backup_count 24 | self.placeholder = str(len(str(backup_count))) 25 | 26 | def doRollover(self): 27 | if self.stream: 28 | self.stream.close() 29 | self.stream = None 30 | if self.backup_count > 0: 31 | for i in range(self.backup_count - 1, 0, -1): 32 | sfn = ("%0" + self.placeholder + "d.") % i # '%2d.'%i -> 02 33 | sfn = sfn.join(self.baseFilename.split(".")) 34 | # sfn = "%d_%s" % (i, self.baseFilename) 35 | # dfn = "%d_%s" % (i + 1, self.baseFilename) 36 | dfn = ("%0" + self.placeholder + "d.") % (i + 1) 37 | dfn = dfn.join(self.baseFilename.split(".")) 38 | if os.path.exists(sfn): 39 | # print "%s -> %s" % (sfn, dfn) 40 | if os.path.exists(dfn): 41 | os.remove(dfn) 42 | os.rename(sfn, dfn) 43 | dfn = (("%0" + self.placeholder + "d.") % 1).join( 44 | self.baseFilename.split(".") 45 | ) 46 | if os.path.exists(dfn): 47 | os.remove(dfn) 48 | # Issue 18940: A file may not have been created if delay is True. 49 | if os.path.exists(self.baseFilename): 50 | os.rename(self.baseFilename, dfn) 51 | if not self.delay: 52 | self.stream = self._open() 53 | 54 | def shouldRollover(self, record): 55 | 56 | if self.stream is None: # delay was set... 57 | self.stream = self._open() 58 | if self.max_bytes > 0: # are we rolling over?
59 | msg = "%s\n" % self.format(record) 60 | self.stream.seek(0, 2) # due to non-posix-compliant Windows feature 61 | if self.stream.tell() + len(msg) >= self.max_bytes: 62 | return 1 63 | return 0 64 | 65 | 66 | def get_logger( 67 | 68 | name=None, 69 | path=None, 70 | log_level=None, 71 | is_write_to_console=None, 72 | is_write_to_file=None, 73 | color=None, 74 | mode=None, 75 | max_bytes=None, 76 | backup_count=None, 77 | encoding=None, 78 | is_print_default_exception=True 79 | 80 | ): 81 | name = name or setting.LOG_NAME 82 | path = path or setting.LOG_PATH 83 | log_level = log_level or setting.LOG_LEVEL 84 | is_write_to_console = (is_write_to_console 85 | if is_write_to_console is not None 86 | else setting.LOG_IS_WRITE_TO_CONSOLE 87 | ) 88 | color = color if color is not None else setting.LOG_COLOR 89 | mode = mode or setting.LOG_MODE 90 | max_bytes = max_bytes or setting.LOG_MAX_BYTES 91 | backup_count = backup_count or setting.LOG_BACKUP_COUNT 92 | encoding = encoding or setting.LOG_ENCODING 93 | is_write_to_file = (is_write_to_file if is_write_to_file is not None else getattr(setting, "LOG_IS_WRITE_TO_FILE", bool(path)))  # assumption: default to file output whenever a log path is configured 94 | 95 | name = name.split(os.sep)[-1].split(".")[0] 96 | 97 | logger = logging.getLogger() 98 | logger.setLevel(log_level) 99 | formatter = logging.Formatter(LOG_FORMAT) 100 | if is_print_default_exception: 101 | formatter.formatException = lambda exce_info : format_exception(*exce_info) 102 | 103 | if is_write_to_file:  # the file handler is gated on is_write_to_file, not is_write_to_console 104 | if path and not os.path.exists(os.path.dirname(path)): 105 | os.makedirs(os.path.dirname(path)) 106 | 107 | rf_handler = RotatingFileHandler( 108 | path, 109 | mode=mode, 110 | max_bytes=max_bytes, 111 | backup_count = backup_count, 112 | encoding = encoding 113 | ) 114 | rf_handler.setFormatter(formatter) 115 | logger.addHandler(rf_handler) 116 | if color and is_write_to_console: 117 | loguru_handler = InterceptHandler() 118 | loguru_handler.setFormatter(formatter) 119 | logger.addHandler(loguru_handler) 120 | elif is_write_to_console: 121 | stream_handler = logging.StreamHandler() 122 | stream_handler.setFormatter(formatter) 123 | stream_handler.stream=sys.stdout 124 | logger.addHandler(stream_handler) 125 | 126 | _handler_list = [] 127 | _handler_name_list = [] 128 | for _handler in logger.handlers: 129 | if str(_handler) not in _handler_name_list: 130 | _handler_name_list.append(str(_handler))  # deduplicate by the same key used in the membership check above 131 | _handler_list.append(_handler) 132 | logger.handlers = _handler_list 133 | 134 | return logger 135 | 136 | 137 | STOP_LOGS = [ 138 | # ES 139 | "urllib3.response", 140 | "urllib3.connection", 141 | "elasticsearch.trace", 142 | "requests.packages.urllib3.util", 143 | "requests.packages.urllib3.util.retry", 144 | "urllib3.util", 145 | "requests.packages.urllib3.response", 146 | "requests.packages.urllib3.contrib.pyopenssl", 147 | "requests.packages", 148 | "urllib3.util.retry", 149 | "requests.packages.urllib3.contrib", 150 | "requests.packages.urllib3.connectionpool", 151 | "requests.packages.urllib3.poolmanager", 152 | "urllib3.connectionpool", 153 | "requests.packages.urllib3.connection", 154 | "elasticsearch", 155 | "log_request_fail", 156 | # requests 157 | "requests", 158 | "selenium.webdriver.remote.remote_connection", 159 | "selenium.webdriver.remote", 160 | "selenium.webdriver", 161 | "selenium", 162 | # markdown 163 | "MARKDOWN", 164 | "build_extension", 165 | # newspaper 166 | "calculate_area", 167 | "largest_image_url", 168 | "newspaper.images", 169 | "newspaper", 170 | "Importing", 171 | "PIL", 172 | ] 173 | 174 | # quiet noisy third-party loggers; the level name is read from setting.OTHERS_LOG_LEVAL 175 | for STOP_LOG in STOP_LOGS: 176 | log_level = getattr(logging, setting.OTHERS_LOG_LEVAL) 177 |
logging.getLogger(STOP_LOG).setLevel(log_level) 178 | 179 | 180 | 181 | class Log: 182 | log = None 183 | 184 | def __getattr__(self,name): 185 | if self.__class__.log is None: 186 | self.__class__.log = get_logger() 187 | return getattr(self.__class__.log,name) 188 | @property 189 | def debug(self): 190 | return self.__getattr__("debug")  # goes through __getattr__ so the logger is created lazily on first use 191 | 192 | 193 | @property 194 | def info(self): 195 | return self.__getattr__("info") 196 | 197 | 198 | @property 199 | def warning(self): 200 | return self.__getattr__("warning") 201 | 202 | 203 | @property 204 | def exception(self): 205 | return self.__getattr__("exception") 206 | 207 | 208 | @property 209 | def error(self): 210 | return self.__getattr__("error") 211 | 212 | @property 213 | def critical(self): 214 | return self.__getattr__("critical") 215 | 216 | 217 | 218 | 219 | 220 | 221 | log = Log() -------------------------------------------------------------------------------- /src/lrabbit_spider/utils/webdriver.py: -------------------------------------------------------------------------------- 1 | 2 | from concurrent.futures import thread 3 | import os 4 | import queue 5 | from re import I 6 | import threading 7 | from selenium import webdriver 8 | from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver 9 | from lrabbit_spider.utils.log import log 10 | from lrabbit_spider.utils.tools import Singleton 11 | DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36" 12 | 13 | class XhrRequest: 14 | def __init__(self,url,data,headers): 15 | self.url = url 16 | self.data = data 17 | self.headers = headers 18 | 19 | class XhrResponse: 20 | def __init__(self,request,url,headers,content,status_code): 21 | self.request = request 22 | self.url = url 23 | self.headers = headers 24 | self.content = content 25 | self.status_code = status_code 26 | 27 | 28 | 29 | 30 | class WebDriver(RemoteWebDriver): 31 | CHROME = "CHROME" 32 | def __init__( 33 | self, 34 | load_images=True, 35 | user_agent=None, 36 | proxy=None, 37 | headless=False, 38 | driver_type=CHROME, 39 | timeout=16, 40 | window_size=(1024, 800), 41 | executable_path=None, 42 | custom_argument=None, 43 | xhr_url_regexes: list = None, 44 | **kwargs, 45 | ): 46 | """ 47 | webdriver wrapper; supports chrome, phantomjs and firefox 48 | Args: 49 | load_images: whether to load images 50 | user_agent: a string, or a zero-argument callable that returns the user agent 51 | proxy: xxx.xxx.xxx.xxx:xxxx, or a zero-argument callable that returns the proxy address 52 | headless: whether to run in headless mode 53 | driver_type: CHROME, PHANTOMJS or FIREFOX 54 | timeout: request timeout 55 | window_size: # window size 56 | executable_path: browser binary path; defaults to the system default 57 | xhr_url_regexes: xhr urls to intercept; regex patterns, list type 58 | **kwargs: 59 | """ 60 | self._load_images = load_images 61 | self._user_agent = user_agent or DEFAULT_USERAGENT 62 | self._proxy = proxy 63 | self._headless = headless 64 | self._timeout = timeout 65 | self._window_size = window_size 66 | self._executable_path = executable_path 67 | self._custom_argument = custom_argument 68 | self._xhr_url_regexes = xhr_url_regexes 69 | 70 | if self._xhr_url_regexes and driver_type !=WebDriver.CHROME: 71 | raise Exception("xhr url interception is only supported by chrome") 72 | 73 | if driver_type == WebDriver.CHROME: 74 | self.driver= self.chrome_driver() 75 | 76 | def __enter__(self): 77 | return self 78 | 79 | def __exit__(self,exc_type,exc_val,exc_tb): 80 | if exc_val: 81 | log.error(exc_val) 82 | self.quit() 83 | return True 84 | 85 | def get_driver(self): 86 | return self.driver 87 | 88 | def chrome_driver(self):
chrome_options = webdriver.ChromeOptions() 90 | chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) 91 | chrome_options.add_experimental_option("useAutomationExtension", False) 92 | # required when running inside docker 93 | chrome_options.add_argument("--no-sandbox") 94 | 95 | if self._proxy: 96 | chrome_options.add_argument( 97 | "--proxy-server={}".format( 98 | self._proxy() if callable(self._proxy) else self._proxy 99 | ) 100 | ) 101 | 102 | if self._user_agent: 103 | chrome_options.add_argument( 104 | "user-agent={}".format( 105 | self._user_agent() if callable(self._user_agent) else self._user_agent 106 | ) 107 | ) 108 | if not self._load_images:  # a prefs value of 2 blocks images, so only set it when image loading is disabled 109 | chrome_options.add_experimental_option( 110 | "prefs", {"profile.managed_default_content_settings.images": 2} 111 | ) 112 | if self._headless:  # add the headless switches only when headless mode is requested 113 | chrome_options.add_argument("--headless") 114 | chrome_options.add_argument("--disable-gpu") 115 | if self._window_size: 116 | chrome_options.add_argument( 117 | "--window-size={},{}".format(self._window_size[0],self._window_size[1]) 118 | ) 119 | 120 | if self._executable_path: 121 | driver = webdriver.Chrome(options=chrome_options,executable_path=self._executable_path) 122 | else: 123 | driver = webdriver.Chrome(options=chrome_options) 124 | 125 | with open(os.path.join(os.path.dirname(__file__),"./js/stealth.min.js")) as f: 126 | js = f.read() 127 | driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js}) 128 | 129 | if self._xhr_url_regexes: 130 | assert isinstance(self._xhr_url_regexes,list) 131 | with open( 132 | os.path.join(os.path.dirname(__file__),"./js/intercept.js") 133 | 134 | ) as f: 135 | js = f.read() 136 | driver.execute_cdp_cmd( 137 | "Page.addScriptToEvaluateOnNewDocument", {"source": js} 138 | ) 139 | js = f"window.__urlRegexes = {self._xhr_url_regexes}" 140 | driver.execute_cdp_cmd( 141 | "Page.addScriptToEvaluateOnNewDocument", {"source": js} 142 | ) 143 | return driver 144 | 145 | 146 | 147 | @property 148 | def cookies(self): 149 | cookies_json = {} 150 | for cookie in self.driver.get_cookies(): 151 | cookies_json[cookie["name"]] = cookie['value'] 152 | return cookies_json 153 | 154 | @cookies.setter 155 | def cookies(self,val): 156 | for key,value in val.items(): 157 | self.driver.add_cookie({"name":key,"value":value}) 158 | 159 | pass 160 | 161 | @property 162 | def user_agent(self): 163 | return self.driver.execute_script("return navigator.userAgent;") 164 | 165 | 166 | def xhr_response(self,xhr_url_regex): 167 | data = self.driver.execute_script( 168 | f'return window.__ajaxData["{xhr_url_regex}"]' 169 | ) 170 | if not data: 171 | return None 172 | 173 | request = XhrRequest(**data["request"]) 174 | response = XhrResponse(request, **data["response"]) 175 | return response 176 | def xhr_text(self,xhr_url_regex): 177 | response = self.xhr_response(xhr_url_regex) 178 | if not response: 179 | return None 180 | return response.content 181 | pass 182 | 183 | def xhr_json(self,xhr_url_regex): 184 | pass 185 | 186 | def __getattr__(self,xhr_url_regex): 187 | pass 188 | 189 | 190 | 191 | @Singleton 192 | class WebDriverPool: 193 | def __init__(self,pool_size=5,**kwargs): 194 | 195 | self.queue = queue.Queue(maxsize=pool_size) 196 | self.kwargs=kwargs 197 | self.lock = threading.RLock() 198 | self.driver_count = 0 199 | pass 200 | @property 201 | def is_full(self): 202 | return self.driver_count >=self.queue.maxsize 203 | pass 204 | 205 | def get(self,user_agent,proxy): 206 | if not self.is_full: 207 | with self.lock: 208 | kwargs = self.kwargs.copy() 209
| if user_agent: 210 | kwargs["user_agent"] = user_agent 211 | if proxy: 212 | kwargs["proxy"] = proxy 213 | driver = WebDriver(**kwargs) 214 | self.queue.put(driver) 215 | self.driver_count +=1 216 | driver = self.queue.get() 217 | return driver 218 | 219 | def remove(self,driver): 220 | driver.quit() 221 | self.driver_count-=1 222 | 223 | def close(self): 224 | while not self.queue.empty(): 225 | driver = self.queue.get() 226 | driver.quit() 227 | self.driver_count -=1 228 | 229 | 230 | 231 | 232 | 233 | def put(self,driver): 234 | self.queue.put(driver)  # return the driver to the pool for reuse; the remove()/close() implementations above stay in effect 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/18 11:41 4 | @Author : lrabbit 5 | @FileName: spider.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | import traceback 10 | from lrabbit_scrapy.common_utils.mysql_helper import MysqlClient 11 | from lrabbit_scrapy.common_utils.redis_helper import RedisClient 12 | from threading import Thread 13 | from lrabbit_scrapy.common_utils.all_in_one import get_time_format_now 14 | from lrabbit_scrapy.common_utils.print_log_helper import LogUtils 15 | 16 | 17 | class LrabbitSpider: 18 | 19 | def __init__(self): 20 | 21 | self._init_config() 22 | self.mysql_client = MysqlClient(config_path_env=self.config_env_name) 23 | self.redis_client = RedisClient(config_path_env=self.config_env_name, db=self.redis_db_config) 24 | spider_task_key = f'list:{self.__getattribute__("spider_name")}' 25 | self.spider_name = self.__getattribute__("spider_name") 26 | self.task_list_redis_key = spider_task_key 27 | self.success_count_all_key = f"success:count:{self.spider_name}" 28 | self.success_count_day_key = f"success:count:{self.spider_name}:{get_time_format_now()}" 29 | self.fail_count_all_key = f"fail:count:{self.spider_name}" 30 | self.fail_count_day_key = f"fail:count:{self.spider_name}:{get_time_format_now()}" 31 | self.thread_task_list = [] 32 | self.task_list = [] 33 | 34 | def _init_config(self): 35 | try: 36 | self.__getattribute__("thread_mysql_open") 37 | except: 38 | self.thread_mysql_open = False 39 | try: 40 | self.__getattribute__("max_thread_num") 41 | except: 42 | self.max_thread_num = 10 43 | try: 44 | self.__getattribute__("reset_task_config") 45 | except: 46 | self.reset_task_config = False 47 | try: 48 | self.__getattribute__("loop_task_config") 49 | except: 50 | self.loop_task_config = False 51 | try: 52 | self.__getattribute__("remove_confirm_config") 53 | except: 54 | self.remove_confirm_config = False 55 | try: 56 | self.__getattribute__("config_env_name") 57 | except: 58 | self.config_env_name = "config_path" 59 | try: 60 | self.__getattribute__("redis_db_config") 61 | except: 62 | self.redis_db_config = 0 63 | try: 64 | self.__getattribute__("debug_config") 65 | except: 66 | self.debug_config = True 67 | 68 | def _send_task_redis(self, task_list): 69 | for task in task_list: 70 | LogUtils.log_info("new task", task) 71 | self.redis_client.redis_executor.sadd(self.task_list_redis_key, task) 72 | 73 | def update_stat_redis(self): 74 | """ 75 | success:count_all success:count:spider_name 76 | success:count:day success:count:spider_name:2021-11-11 77 | :return: 78 | """ 79 | day = get_time_format_now() 80 | self.success_count_day_key = f"success:count:{self.spider_name}:{day}"
81 | self.redis_client.redis_executor.incr(self.success_count_all_key) 82 | self.redis_client.redis_executor.incr(self.success_count_day_key) 83 | 84 | def _init_task_list(self): 85 | 86 | if self.reset_task_config or not self.redis_client.redis_executor.exists(self.task_list_redis_key): 87 | 88 | LogUtils.log_info("init task list") 89 | generate_task_list_callback = self.__getattribute__("init_task_list") 90 | if self.redis_client.redis_executor.exists(self.task_list_redis_key): 91 | LogUtils.log_info("already exists", self.task_list_redis_key, "task list", "count", 92 | self.redis_client.redis_executor.scard(self.task_list_redis_key)) 93 | try: 94 | remove_confirm_config = self.__getattribute__("remove_confirm_config") 95 | if not remove_confirm_config: 96 | option = input("please input y to delete task list and add new task") 97 | if option != 'y': 98 | exit(-1) 99 | except AttributeError as e: 100 | option = input("please input y to delete task list and add new task") 101 | if option != 'y': 102 | exit(-1) 103 | except Exception as e: 104 | pass 105 | self.redis_client.redis_executor.delete(self.task_list_redis_key) 106 | 107 | generate_task_all = generate_task_list_callback() 108 | count = self.redis_client.redis_executor.scard(self.task_list_redis_key) 109 | if count >= 1: 110 | LogUtils.log_info("already init task") 111 | return 112 | if len(generate_task_all) < 10: 113 | for item in generate_task_all: 114 | LogUtils.log_info("new task", item) 115 | self.redis_client.redis_executor.sadd(self.task_list_redis_key, item) 116 | else: 117 | thread_num = 10 118 | step = len(generate_task_all) // thread_num 119 | send_thread_list = [] 120 | for i in range(thread_num): 121 | if i == thread_num - 1: 122 | t = Thread(target=self._send_task_redis, args=(generate_task_all[i * step:],)) 123 | else: 124 | t = Thread(target=self._send_task_redis, args=(generate_task_all[(i * step):(i + 1) * step],)) 125 | t.start() 126 | send_thread_list.append(t) 127 | for t in send_thread_list: 128 | t.join() 129 | LogUtils.log_finish("init task list success") 130 | 131 | task_count = self.redis_client.redis_executor.scard(self.task_list_redis_key) 132 | LogUtils.log_info("current task count", task_count) 133 | try: 134 | remove_confirm_config = self.__getattribute__("remove_confirm_config") 135 | if not remove_confirm_config: 136 | option = input("please input y to continue") 137 | if option != 'y': 138 | exit(-1) 139 | except AttributeError as e: 140 | option = input("please input y to continue") 141 | if option != 'y': 142 | exit(-1) 143 | except Exception as e: 144 | pass 145 | 146 | def _run(self): 147 | self._init_task_list() 148 | try: 149 | worker_callback = self.__getattribute__("worker") 150 | except Exception as e: 151 | LogUtils.log_error("you not define worker function") 152 | exit(-1) 153 | 154 | def self_loop_call_back(): 155 | while True: 156 | task = self.redis_client.redis_executor.spop(self.task_list_redis_key) 157 | if not task: 158 | break 159 | try: 160 | self.task_list.append(task) 161 | if self.thread_mysql_open: 162 | _mysql_client = MysqlClient(config_path_env=self.config_env_name) 163 | worker_callback(task, _mysql_client) 164 | else: 165 | worker_callback(task) 166 | except Exception as e: 167 | name_exception = type(e).__name__.lower() 168 | if self.debug_config: 169 | traceback.print_exc() 170 | else: 171 | LogUtils.log_error(task, name_exception, e.__getattribute__('args')) 172 | self.redis_client.redis_executor.sadd( 173 | 
f"list:error:count:{self.spider_name}:{name_exception}:{get_time_format_now()}", 174 | task) 175 | self.fail_count_day_key = f"fail:count:{self.spider_name}:{get_time_format_now()}" 176 | self.redis_client.redis_executor.incr(self.fail_count_day_key) 177 | self.redis_client.redis_executor.incr(self.fail_count_all_key) 178 | try: 179 | self.task_list.remove(task) 180 | except Exception as e: 181 | pass 182 | 183 | self.task_list = [] 184 | self.process_list = [] 185 | 186 | for _ in range(self.max_thread_num): 187 | t = Thread(target=self_loop_call_back, args=()) 188 | t.start() 189 | self.process_list.append(t) 190 | for t in self.process_list: 191 | t.join() 192 | 193 | def _menu(self): 194 | import sys 195 | options = sys.argv[1:] 196 | if len(options) > 0: 197 | if options[0] == 'stat': 198 | LogUtils.log_info("remain t ask list", self.redis_client.redis_executor.scard(self.task_list_redis_key)) 199 | print("\n") 200 | day = get_time_format_now() 201 | self.success_count_day_key = f"success:count:{self.spider_name}:{day}" 202 | LogUtils.log_finish("today success count", 203 | self.redis_client.redis_executor.get(self.success_count_day_key)) 204 | LogUtils.log_error("today fail count", self.redis_client.redis_executor.get(self.fail_count_day_key)) 205 | LogUtils.log_finish("all success count", 206 | self.redis_client.redis_executor.get(self.success_count_all_key)) 207 | LogUtils.log_error("all fail count", self.redis_client.redis_executor.get(self.fail_count_all_key)) 208 | print("\n") 209 | LogUtils.log_error("404 status_code count", 210 | self.redis_client.redis_executor.scard( 211 | f"list:error:count:{self.spider_name}:exception404:{get_time_format_now()}")) 212 | LogUtils.log_error("403 status_code count", 213 | self.redis_client.redis_executor.scard( 214 | f"list:error:count:{self.spider_name}:exception403:{get_time_format_now()}")) 215 | LogUtils.log_error("500 status_code count", 216 | self.redis_client.redis_executor.scard( 217 | f"list:error:count:{self.spider_name}:exception500:{get_time_format_now()}")) 218 | else: 219 | LogUtils.log_error(" you can add stat option ,check scrapy stat") 220 | exit(-1) 221 | 222 | def run(self): 223 | while True: 224 | self._menu() 225 | try: 226 | self._run() 227 | if self.loop_task_config: 228 | continue 229 | break 230 | except KeyboardInterrupt as e: 231 | # when you keyboard break,need give this task back 232 | while True: 233 | if len(self.task_list) == 0: 234 | break 235 | task = self.task_list.pop() 236 | if not task: 237 | break 238 | self.redis_client.redis_executor.sadd(self.task_list_redis_key, task) 239 | LogUtils.log_now_time_str() 240 | LogUtils.log_finish("all finish") 241 | 242 | 243 | if __name__ == '__main__': 244 | spider = LrabbitSpider() 245 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/mysql_helper.py: -------------------------------------------------------------------------------- 1 | import time 2 | import traceback 3 | import pymysql.cursors 4 | import pymysql 5 | import queue 6 | import threading 7 | import logging 8 | from lrabbit_scrapy.common_utils.config_helper import get_mysql_config, get_config_path 9 | 10 | 11 | class MysqlClient(object): 12 | 13 | def __init__(self, database=None, config_path_env=None, env='test'): 14 | config_path = get_config_path(config_path_env) 15 | mysql_config = get_mysql_config(config_path, env) 16 | host = mysql_config.MYSQL_HOST 17 | if not database: 18 | database = mysql_config.MYSQL_DATABASE 19 | user = 
mysql_config.MYSQL_USER 20 | password = mysql_config.MYSQL_PASSWORD 21 | port = mysql_config.MYSQL_PORT 22 | max_idle_time = 7 * 3600 23 | connect_timeout = 10 24 | time_zone = "+0:00" 25 | charset = "utf8mb4" 26 | sql_mode = "TRADITIONAL" 27 | self.host = host 28 | self.database = database 29 | self.max_idle_time = float(max_idle_time) 30 | args = dict(use_unicode=True, charset=charset, 31 | database=database, 32 | init_command=('SET time_zone = "%s"' % time_zone), 33 | cursorclass=pymysql.cursors.DictCursor, 34 | connect_timeout=connect_timeout, sql_mode=sql_mode) 35 | if user is not None: 36 | args["user"] = user 37 | if password is not None: 38 | args["passwd"] = password 39 | # We accept a path to a MySQL socket file or a host(:port) string 40 | if "/" in host: 41 | args["unix_socket"] = host 42 | else: 43 | self.socket = None 44 | pair = host.split(":") 45 | if len(pair) == 2: 46 | args["host"] = pair[0] 47 | args["port"] = int(pair[1]) 48 | else: 49 | args["host"] = host 50 | args["port"] = 3306 51 | if port: 52 | args['port'] = port 53 | 54 | self._db = None 55 | self._db_args = args 56 | self._last_use_time = time.time() 57 | try: 58 | self.reconnect() 59 | except Exception: 60 | logging.error("Cannot connect to MySQL on %s", self.host, 61 | exc_info=True) 62 | 63 | def _ensure_connected(self): 64 | if (self._db is None or 65 | (time.time() - self._last_use_time > self.max_idle_time)): 66 | self.reconnect() 67 | self._last_use_time = time.time() 68 | 69 | def _cursor(self): 70 | self._ensure_connected() 71 | return self._db.cursor() 72 | 73 | def __del__(self): 74 | self.close() 75 | 76 | def close(self): 77 | """Closes this database connection.""" 78 | if getattr(self, "_db", None) is not None: 79 | self._db.close() 80 | self._db = None 81 | 82 | def reconnect(self): 83 | """Closes the existing database connection and re-opens it.""" 84 | self.close() 85 | self._db = pymysql.connect(**self._db_args) 86 | self._db.autocommit(True) 87 | 88 | def query(self, query, *parameters, **kwparameters): 89 | """Returns a row list for the given query and parameters.""" 90 | cursor = self._cursor() 91 | try: 92 | cursor.execute(query, kwparameters or parameters) 93 | result = cursor.fetchall() 94 | return result 95 | finally: 96 | cursor.close() 97 | 98 | def get(self, query, *parameters, **kwparameters): 99 | """Returns the (singular) row returned by the given query. 
100 | """ 101 | cursor = self._cursor() 102 | try: 103 | cursor.execute(query, kwparameters or parameters) 104 | return cursor.fetchone() 105 | finally: 106 | cursor.close() 107 | 108 | def execute(self, query, *parameters, **kwparameters): 109 | """Executes the given query, returning the lastrowid from the query.""" 110 | cursor = self._cursor() 111 | try: 112 | cursor.execute(query, kwparameters or parameters) 113 | return cursor.lastrowid 114 | except Exception as e: 115 | if e.args[0] == 1062: 116 | pass 117 | else: 118 | traceback.print_exc() 119 | raise e 120 | finally: 121 | cursor.close() 122 | 123 | insert = execute 124 | 125 | ## =============== high level method for table =================== 126 | 127 | def table_has(self, table_name, field, value): 128 | if isinstance(value, str): 129 | value = value.encode('utf8') 130 | sql = 'SELECT %s FROM %s WHERE %s="%s"' % ( 131 | field, 132 | table_name, 133 | field, 134 | value) 135 | d = self.get(sql) 136 | return d 137 | 138 | def table_insert(self, table_name, item): 139 | '''item is a dict : key is mysql table field''' 140 | fields = list(item.keys()) 141 | values = list(item.values()) 142 | fieldstr = ','.join(fields) 143 | valstr = ','.join(['%s'] * len(item)) 144 | for i in range(len(values)): 145 | if isinstance(values[i], str): 146 | values[i] = values[i].encode('utf8') 147 | sql = 'INSERT INTO %s (%s) VALUES(%s)' % (table_name, fieldstr, valstr) 148 | try: 149 | last_id = self.execute(sql, *values) 150 | return last_id 151 | except Exception as e: 152 | if e.args[0] == 1062: 153 | # just skip duplicated item 154 | pass 155 | else: 156 | traceback.print_exc() 157 | print('sql:', sql) 158 | print('item:') 159 | for i in range(len(fields)): 160 | vs = str(values[i]) 161 | if len(vs) > 300: 162 | print(fields[i], ' : ', len(vs), type(values[i])) 163 | else: 164 | print(fields[i], ' : ', vs, type(values[i])) 165 | raise e 166 | 167 | def table_update(self, table_name, updates, 168 | field_where, value_where): 169 | '''updates is a dict of {field_update:value_update}''' 170 | upsets = [] 171 | values = [] 172 | for k, v in updates.items(): 173 | s = '%s=%%s' % k 174 | upsets.append(s) 175 | values.append(v) 176 | upsets = ','.join(upsets) 177 | sql = 'UPDATE %s SET %s WHERE %s="%s"' % ( 178 | table_name, 179 | upsets, 180 | field_where, value_where, 181 | ) 182 | self.execute(sql, *(values)) 183 | 184 | 185 | logger = logging.Logger(name="mysql connect") 186 | 187 | 188 | class Connection(pymysql.connections.Connection): 189 | _pool = None 190 | _reusable_expection = (pymysql.err.ProgrammingError, pymysql.err.IntegrityError, pymysql.err.NotSupportedError) 191 | 192 | def __init__(self, *args, **kwargs): 193 | pymysql.connections.Connection.__init__(self, *args, **kwargs) 194 | self.args = args 195 | self.kwargs = kwargs 196 | 197 | def __exit__(self, exc, value, traceback): 198 | 199 | pymysql.connections.Connection.__exit__(self, exc, value, traceback) 200 | if self._pool: 201 | if not exc or exc in self._reusable_expection: 202 | '''reusable connection''' 203 | self._pool.put_connection(self) 204 | else: 205 | '''no reusable connection, close it and create a new one then put it to the pool''' 206 | self._pool.put_connection(self._recreate(*self.args, **self.kwargs)) 207 | self._pool = None 208 | try: 209 | self.close() 210 | logger.warning("Close not reusable connection from pool(%s) caused by %s", self._pool.name, value) 211 | except Exception: 212 | pass 213 | 214 | def _recreate(self, *args, **kwargs): 215 | conn = 
Connection(*args, **kwargs) 216 | logger.debug('Create new connection due to pool(%s) lacking', self._pool.name) 217 | return conn 218 | 219 | def close(self): 220 | 221 | if self._pool: 222 | self._pool.put_connection(self) 223 | else: 224 | pymysql.connections.Connection.close(self) 225 | 226 | def execute_query(self, query, args=(), dictcursor=False, return_one=False, exec_many=False): 227 | 228 | with self: 229 | cur = self.cursor() if not dictcursor else self.cursor(pymysql.cursors.DictCursor) 230 | try: 231 | if exec_many: 232 | cur.executemany(query, args) 233 | else: 234 | cur.execute(query, args) 235 | except Exception: 236 | raise 237 | # if no record match the query, return () if return_one==False, else return None 238 | return cur.fetchone() if return_one else cur.fetchall() 239 | 240 | 241 | class ConnectionPool: 242 | _HARD_LIMIT = 200 243 | _THREAD_LOCAL = threading.local() 244 | _THREAD_LOCAL.retry_counter = 0 # a counter used for debug get_connection() method 245 | 246 | def __init__(self, size=10, name=None, *args, **kwargs): 247 | self._pool = queue.Queue(self._HARD_LIMIT) 248 | self._size = size if 0 < size < self._HARD_LIMIT else self._HARD_LIMIT 249 | self.name = name if name else '-'.join( 250 | [kwargs.get('host', 'localhost'), str(kwargs.get('port', 3306)), 251 | kwargs.get('user', ''), kwargs.get('database', '')]) 252 | for _ in range(self._size): 253 | conn = Connection(*args, **kwargs) 254 | conn._pool = self 255 | self._pool.put(conn) 256 | 257 | def get_connection(self, timeout=1, retry_num=1) -> Connection: 258 | """ 259 | timeout: timeout of get a connection from pool, should be a int(0 means return or raise immediately) 260 | retry_num: how many times will retry to get a connection 261 | """ 262 | try: 263 | conn = self._pool.get(timeout=timeout) if timeout > 0 else self._pool.get_nowait() 264 | logger.debug('Get connection from pool(%s)', self.name) 265 | return conn 266 | except queue.Empty: 267 | if not hasattr(self._THREAD_LOCAL, 'retry_counter'): 268 | self._THREAD_LOCAL.retry_counter = 0 269 | if retry_num > 0: 270 | self._THREAD_LOCAL.retry_counter += 1 271 | logger.debug('Retry get connection from pool(%s), the %d times', self.name, 272 | self._THREAD_LOCAL.retry_counter) 273 | retry_num -= 1 274 | return self.get_connection(timeout, retry_num) 275 | else: 276 | total_times = self._THREAD_LOCAL.retry_counter + 1 277 | self._THREAD_LOCAL.retry_counter = 0 278 | raise GetConnectionFromPoolError("can't get connection from pool({}) within {}*{} second(s)".format( 279 | self.name, timeout, total_times)) 280 | 281 | def put_connection(self, conn): 282 | if not conn._pool: 283 | conn._pool = self 284 | conn.cursor().close() 285 | try: 286 | self._pool.put_nowait(conn) 287 | logger.debug("Put connection back to pool(%s)", self.name) 288 | except queue.Full: 289 | logger.warning("Put connection to pool(%s) error, pool is full, size:%d", self.name, self.size()) 290 | 291 | def size(self): 292 | return self._pool.qsize() 293 | 294 | 295 | class GetConnectionFromPoolError(Exception): 296 | """Exception related can't get connection from pool within timeout seconds.""" 297 | 298 | 299 | class MysqlConnectionPool: 300 | 301 | def __init__(self, database=None, config_path_env=None, env='test'): 302 | config_path = get_config_path(config_path_env) 303 | mysql_config = get_mysql_config(config_path, env) 304 | host = mysql_config.MYSQL_HOST 305 | if not database: 306 | database = mysql_config.MYSQL_DATABASE 307 | user = mysql_config.MYSQL_USER 308 | password = 
mysql_config.MYSQL_PASSWORD 309 | port = mysql_config.MYSQL_PORT 310 | max_idle_time = 7 * 3600 311 | connect_timeout = 10 312 | time_zone = "+0:00" 313 | charset = "utf8mb4" 314 | sql_mode = "TRADITIONAL" 315 | self.host = host 316 | self.database = database 317 | self.max_idle_time = float(max_idle_time) 318 | args = dict(use_unicode=True, charset=charset, 319 | database=database, 320 | init_command=('SET time_zone = "%s"' % time_zone), 321 | cursorclass=pymysql.cursors.DictCursor, 322 | connect_timeout=connect_timeout, sql_mode=sql_mode) 323 | if user is not None: 324 | args["user"] = user 325 | if password is not None: 326 | args["passwd"] = password 327 | # We accept a path to a MySQL socket file or a host(:port) string 328 | if "/" in host: 329 | args["unix_socket"] = host 330 | else: 331 | self.socket = None 332 | pair = host.split(":") 333 | if len(pair) == 2: 334 | args["host"] = pair[0] 335 | args["port"] = int(pair[1]) 336 | else: 337 | args["host"] = host 338 | args["port"] = 3306 339 | if port: 340 | args['port'] = port 341 | self._args = args 342 | self.pool = ConnectionPool(size=10, **self._args) 343 | 344 | def execute_query(self, sql): 345 | conn = self.pool.get_connection() 346 | with conn: 347 | res = conn.execute_query(sql) 348 | return res 349 | 350 | 351 | if __name__ == '__main__': 352 | pass 353 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/asynico_basespider.py: -------------------------------------------------------------------------------- 1 | import asyncio.queues 2 | import asyncio 3 | import time 4 | import datetime 5 | import sys 6 | import sqlalchemy as sa 7 | from sqlalchemy import MetaData, Table, Column, Integer 8 | from aiomysql.sa import create_engine as aio_create_engine 9 | from sqlalchemy import create_engine 10 | from configparser import ConfigParser 11 | from typing import Dict 12 | import inspect 13 | import csv 14 | import os 15 | import traceback 16 | 17 | pwd_dir = os.path.abspath(os.getcwd()) 18 | metadata = MetaData() 19 | 20 | config = None 21 | last_exc = None 22 | 23 | 24 | class TermColor: 25 | ATTRIBUTES = dict( 26 | list(zip([ 27 | 'bold', 28 | 'dark', 29 | '', 30 | 'underline', 31 | 'blink', 32 | '', 33 | 'reverse', 34 | 'concealed' 35 | ], 36 | list(range(1, 9)) 37 | )) 38 | ) 39 | del ATTRIBUTES[''] 40 | 41 | HIGHLIGHTS = dict( 42 | list(zip([ 43 | 'on_grey', 44 | 'on_red', 45 | 'on_green', 46 | 'on_yellow', 47 | 'on_blue', 48 | 'on_magenta', 49 | 'on_cyan', 50 | 'on_white' 51 | ], 52 | list(range(40, 48)) 53 | )) 54 | ) 55 | 56 | COLORS = dict( 57 | list(zip([ 58 | 'grey', 59 | 'red', 60 | 'green', 61 | 'yellow', 62 | 'blue', 63 | 'magenta', 64 | 'cyan', 65 | 'white', 66 | ], 67 | list(range(30, 38)) 68 | )) 69 | ) 70 | 71 | RESET = '\033[0m' 72 | 73 | @staticmethod 74 | def colored(text, color=None, on_color=None, attrs=None): 75 | 76 | if os.getenv('ANSI_COLORS_DISABLED') is None: 77 | fmt_str = '\033[%dm%s' 78 | if color is not None: 79 | text = fmt_str % (TermColor.COLORS[color], text) 80 | 81 | if on_color is not None: 82 | text = fmt_str % (TermColor.HIGHLIGHTS[on_color], text) 83 | 84 | if attrs is not None: 85 | for attr in attrs: 86 | text = fmt_str % (TermColor.ATTRIBUTES[attr], text) 87 | 88 | text += TermColor.RESET 89 | return text 90 | 91 | 92 | class CommonUtils: 93 | 94 | def __init__(self): 95 | pass 96 | 97 | @staticmethod 98 | def fix_str_args(args): 99 | return list(map(lambda x: str(x).strip(), args)) 100 | 101 | @staticmethod 102 | def 
get_format_time(for_mat='%Y-%m-%d %H:%M:%S'): 103 | return TermColor.colored(datetime.datetime.now().strftime(for_mat), 'yellow').encode('utf8') 104 | 105 | @staticmethod 106 | def space_join_line_arg(*args): 107 | return ' '.join(args) + '\n' 108 | 109 | 110 | class LogUtils: 111 | 112 | def __init__(self): 113 | pass 114 | 115 | @staticmethod 116 | def log_now_time_str(): 117 | sys.stdout.buffer.write(CommonUtils.get_format_time()) 118 | 119 | @staticmethod 120 | def log_str(color_str, args): 121 | args = CommonUtils.fix_str_args(args) 122 | text = ' '.join(args) 123 | text = color_str + ' ' + text + '\n' 124 | sys.stdout.buffer.write(text.encode('utf8')) 125 | 126 | @staticmethod 127 | def log_info(*args): 128 | color_str = TermColor.colored('[*INFO*]', 'cyan') 129 | LogUtils.log_str(color_str, args) 130 | 131 | @staticmethod 132 | def log_running(*args): 133 | color_str = TermColor.colored('[*RUNNING*]', 'yellow') 134 | LogUtils.log_str(color_str, args) 135 | 136 | @staticmethod 137 | def log_finish(*args): 138 | color_str = TermColor.colored('*FINISH*', 'green') 139 | LogUtils.log_str(color_str, args) 140 | 141 | @staticmethod 142 | def log_error(*args): 143 | color_str = TermColor.colored('[*ERROR*]', 'red') 144 | LogUtils.log_str(color_str, args) 145 | 146 | @staticmethod 147 | def log_to_file(file_path, line): 148 | """ 149 | 150 | :param file_path: log file path 151 | :param line: a str type 152 | :return: 153 | """ 154 | with open(file_path, 'a', encoding='utf8') as f: 155 | line = CommonUtils.space_join_line_arg(LogUtils.get_format_time(), line) 156 | f.write(line) 157 | 158 | 159 | class DbUtils(LogUtils): 160 | """ 161 | all_tables is store all tables sqlalchemy metadata 162 | """ 163 | all_tables: Dict[str, sa.Table] = {} 164 | 165 | def __init__(self, spider_name): 166 | spider_name = spider_name.split(".")[0] 167 | super(DbUtils, self).__init__() 168 | global config 169 | config = ConfigParser() 170 | config_path = os.path.join(pwd_dir, f'{spider_name}.ini') 171 | config.read(config_path) 172 | env = os.getenv('ENV', 'test') 173 | config = config[env] 174 | self.engine = None 175 | 176 | async def init_engine(self): 177 | """ 178 | init connect pymysql,create tables by sqlalchemy 179 | :return: 180 | """ 181 | self.engine = await aio_create_engine( 182 | user=config['db_user'], 183 | password=config['db_password'], 184 | port=3306, 185 | host=config['db_host'], 186 | db=config['db_database'], 187 | autocommit=True 188 | ) 189 | engine = create_engine( 190 | f'mysql+pymysql://{config["db_user"]}:{config["db_password"]}@{config["db_host"]}/{config["db_database"]}', 191 | echo=True, 192 | 193 | ) 194 | metadata.bind = engine 195 | try: 196 | if self.__getattribute__('is_drop_tables'): 197 | yes = input("请输入yes,确认删除表数据") 198 | if yes != 'yes': 199 | exit(1) 200 | metadata.drop_all() 201 | LogUtils.log_finish('已清空表') 202 | except Exception as e: 203 | LogUtils.log_info("not found is_drop_tables") 204 | self._generate_tables() 205 | metadata.create_all(engine) 206 | 207 | def _generate_tables(self): 208 | """ 209 | get all attrs and check name ,if start_with "table" ,get it values to generate columns. 
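e.g. (illustrative attribute, not from the original code) table_user = [sa.Column('name', sa.String(50))] becomes a table named "user" with an auto-increment "id" column added in front.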
210 | 211 | :return: 212 | """ 213 | for k, v in inspect.getmembers(self): 214 | if k.startswith('table'): 215 | table_name = k.split('_')[-1] 216 | try: 217 | tbl = Table(table_name, metadata, Column('id', Integer, primary_key=True), *v) 218 | except Exception as e: 219 | pass 220 | self.all_tables[table_name] = tbl 221 | LogUtils.log_finish(table_name, '创建完成') 222 | 223 | @asyncio.coroutine 224 | def insert_one(self, sql): 225 | """ 226 | :param sql: this sql you can sqlalchemy api generate a sql or you can just write origin sql 227 | :return: 228 | """ 229 | with (yield from self.engine) as conn: 230 | yield from conn.execute(sql) 231 | 232 | @asyncio.coroutine 233 | def query(self, sql): 234 | with (yield from self.engine) as conn: 235 | res = yield from conn.execute(sql) 236 | res = yield from res.fetchall() 237 | return res 238 | 239 | @asyncio.coroutine 240 | def execute(self, sql): 241 | with (yield from self.engine) as conn: 242 | yield from conn.execute(sql) 243 | 244 | 245 | class FileStore: 246 | 247 | def __init__(self, file_name, headers, reset_task_list): 248 | self.file_name = f'{file_name}.csv' 249 | self.headers = headers 250 | self.reset_task_list = reset_task_list 251 | self.write_headers() 252 | 253 | def write_headers(self): 254 | """ 255 | write headers 256 | :return: 257 | """ 258 | if not os.path.exists(os.path.join(pwd_dir, self.file_name)) or self.reset_task_list: 259 | with open(os.path.join(pwd_dir, self.file_name), 'w', encoding='utf8', newline='') as f: 260 | dict_write = csv.DictWriter(f, fieldnames=self.headers) 261 | dict_write.writeheader() 262 | 263 | def write(self, d): 264 | """ 265 | 266 | :param d: dict type 267 | :return: 268 | """ 269 | with open(os.path.join(pwd_dir, self.file_name), 'a', encoding='utf8', newline='') as f: 270 | dict_write = csv.DictWriter(f, fieldnames=self.headers) 271 | dict_write.writerow(d) 272 | 273 | 274 | class WriteUtil(LogUtils): 275 | """ 276 | all_files : store all FileStore object,you can pass a name to write data to different files 277 | """ 278 | all_files: Dict[str, FileStore] = {} 279 | 280 | def __init__(self): 281 | 282 | super(WriteUtil, self).__init__() 283 | self._generate_files() 284 | 285 | def _generate_files(self): 286 | try: 287 | reset_task_list = self.__getattribute__('reset_task_list') 288 | except Exception as e: 289 | self.log_info('not found reset_task_list option ') 290 | return 291 | for k, v in inspect.getmembers(self): 292 | if k.startswith('file'): 293 | file_name = k.split("_")[-1] 294 | self.all_files[file_name] = FileStore(file_name, v, reset_task_list) 295 | LogUtils.log_finish(f'创建{file_name}存储文件成功') 296 | 297 | 298 | class BaseSpider(DbUtils, WriteUtil): 299 | """ 300 | spider_name:default is filename 301 | """ 302 | 303 | def __init__(self, spider_name: str): 304 | # in windows spider_name maybe is .\asy_crawl.py ,so we need trim it 305 | if spider_name.startswith(".\\"): 306 | spider_name = spider_name.replace(".\\", "") 307 | spider_name = spider_name.split('.')[0] 308 | DbUtils.__init__(self, spider_name) 309 | WriteUtil.__init__(self) 310 | self.task_queue = asyncio.queues.Queue() 311 | self.task_list = [] 312 | self.redis = None 313 | self.db = None 314 | self.spider_name = spider_name 315 | self.start_time = None 316 | self.finish_file_name = None 317 | self.all_file_name = None 318 | self.init_file_name(spider_name) 319 | self.config = ConfigParser() 320 | config_path = os.path.join(pwd_dir, f'{spider_name}.ini') 321 | self.config.read(config_path) 322 | env = os.getenv('ENV', 
'test') 323 | self.config = self.config[env] 324 | 325 | def init_file_name(self, spider_name): 326 | """ 327 | 328 | :param spider_name:generate task_list files 329 | :return: 330 | """ 331 | spider_name = spider_name.split('.')[0] 332 | self.finish_file_name = f'{spider_name}_finish.log' 333 | self.all_file_name = f'{spider_name}_all.log' 334 | 335 | def get_tasks_list_by_file(self): 336 | """ 337 | read from all.log and finish.log and get not finish task 338 | :return: 339 | """ 340 | finish_set = set() 341 | all_set = set() 342 | if os.path.exists(os.path.join(pwd_dir, self.all_file_name)): 343 | with open(self.all_file_name, 'r', encoding='utf8') as f: 344 | for line in f.readlines(): 345 | all_set.add(line) 346 | if os.path.exists(os.path.join(pwd_dir, self.finish_file_name)): 347 | with open(self.finish_file_name, 'r', encoding='utf8') as f: 348 | for line in f.readlines(): 349 | finish_set.add(line) 350 | return list(all_set.difference(finish_set)) 351 | 352 | async def _generate_task(self): 353 | """ 354 | init task_list data 355 | :return: 356 | """ 357 | try: 358 | reset_task_list = self.__getattribute__('reset_task_list') 359 | except Exception as e: 360 | LogUtils.log_info("not found reset_task_list") 361 | return 362 | if not os.path.exists(os.path.join(pwd_dir, self.all_file_name)) or reset_task_list: 363 | if os.path.exists(os.path.join(pwd_dir, self.all_file_name)): 364 | os.remove(os.path.join(pwd_dir, self.all_file_name)) 365 | if os.path.exists(os.path.join(pwd_dir, self.finish_file_name)): 366 | os.remove(os.path.join(pwd_dir, self.finish_file_name)) 367 | try: 368 | generate_callback = self.__getattribute__('create_tasks') 369 | except Exception as e: 370 | LogUtils.log_info("not found create_tasks") 371 | return 372 | task_lists = await generate_callback() 373 | task_lists = list(set(task_lists)) 374 | for task in task_lists: 375 | with open(os.path.join(pwd_dir, self.all_file_name), 'a', encoding='utf8') as f: 376 | f.write(str(task)) 377 | f.write('\n') 378 | else: 379 | task_lists = self.get_tasks_list_by_file() 380 | for task in task_lists: 381 | self.task_queue.put_nowait(str(task).strip()) 382 | for _ in range(1): 383 | self.task_queue.put_nowait(None) 384 | 385 | async def base_worker(self): 386 | """ 387 | every woker is in while, get task form task_queue 388 | :return: 389 | """ 390 | try: 391 | worker_callback = self.__getattribute__('worker') 392 | except Exception as e: 393 | LogUtils.log_info("not found worker") 394 | while True: 395 | global last_exc 396 | task = await self.task_queue.get() 397 | if task: 398 | self.log_running(task) 399 | try: 400 | await worker_callback(task) 401 | with open(self.finish_file_name, 'a', encoding='utf8') as f: 402 | f.write(str(task).strip()) 403 | f.write('\n') 404 | self.log_finish(task) 405 | except Exception as e: 406 | if str(traceback.format_exc()) != last_exc: 407 | with open("error.log", 'a', encoding='utf8') as f: 408 | f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) 409 | f.write('\n') 410 | f.write(traceback.format_exc()) 411 | last_exc = str(traceback.format_exc()) 412 | self.task_queue.task_done() 413 | else: 414 | print("正在等待最后的任务完成,默认等待5秒") 415 | await asyncio.sleep(5) 416 | total_time = time.monotonic() - self.started_time 417 | for task in self.task_list: 418 | task.cancel() 419 | print(f"除去等待时间,本次代码运行花费时间为: {total_time - 2}") 420 | exit(0) 421 | 422 | async def wait_all_task(self): 423 | """ 424 | wait all worker is finished 425 | :return: 426 | """ 427 | await self.task_queue.join() 428 
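# Usage sketch: how a concrete spider is expected to drive this class (the class name,
# ini file name and URLs below are illustrative, not part of the project):
#
#     class MySpider(BaseSpider):
#         reset_task_list = False   # keep the existing <spider_name>_all.log / _finish.log task files
#         is_open_mysql = False     # run() only calls init_engine() when this is truthy
#
#         async def create_tasks(self):
#             return ["https://example.com/page/1", "https://example.com/page/2"]
#
#         async def worker(self, task):
#             print("crawling", task)
#
#     # my_spider.ini must provide a [test] (or $ENV) section with workers_num,
#     # plus the db_* options when is_open_mysql is set
#     MySpider("my_spider.py").run()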
| 429 | def run(self): 430 | """ 431 | main function,scrapy start with it 432 | :return: 433 | """ 434 | loop = asyncio.get_event_loop() 435 | is_open_mysql = self.__getattribute__("is_open_mysql") 436 | if is_open_mysql: 437 | loop.run_until_complete(self.init_engine()) 438 | loop.run_until_complete(self._generate_task()) 439 | self.task_list = [] 440 | for i in range(int(self.config['workers_num'])): 441 | task = loop.create_task(self.base_worker()) 442 | self.task_list.append(task) 443 | self.started_time = time.monotonic() 444 | loop.run_until_complete(self.wait_all_task()) 445 | 446 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/android/sslpass.js: -------------------------------------------------------------------------------- 1 | setTimeout(function () { 2 | Java.perform(function () { 3 | console.log(''); 4 | console.log('======'); 5 | console.log('[#] Android Bypass for various Certificate Pinning methods [#]'); 6 | console.log('======'); 7 | 8 | 9 | var X509TrustManager = Java.use('javax.net.ssl.X509TrustManager'); 10 | var SSLContext = Java.use('javax.net.ssl.SSLContext'); 11 | 12 | // TrustManager (Android < 7) // 13 | //////////////////////////////// 14 | var TrustManager = Java.registerClass({ 15 | // Implement a custom TrustManager 16 | name: 'dev.asd.test.TrustManager', 17 | implements: [X509TrustManager], 18 | methods: { 19 | checkClientTrusted: function (chain, authType) { 20 | }, 21 | checkServerTrusted: function (chain, authType) { 22 | }, 23 | getAcceptedIssuers: function () { 24 | return []; 25 | } 26 | } 27 | }); 28 | // Prepare the TrustManager array to pass to SSLContext.init() 29 | var TrustManagers = [TrustManager.$new()]; 30 | // Get a handle on the init() on the SSLContext class 31 | var SSLContext_init = SSLContext.init.overload( 32 | '[Ljavax.net.ssl.KeyManager;', '[Ljavax.net.ssl.TrustManager;', 'java.security.SecureRandom'); 33 | try { 34 | // Override the init method, specifying the custom TrustManager 35 | SSLContext_init.implementation = function (keyManager, trustManager, secureRandom) { 36 | console.log('[+] Bypassing Trustmanager (Android < 7) request'); 37 | SSLContext_init.call(this, keyManager, TrustManagers, secureRandom); 38 | }; 39 | } catch (err) { 40 | console.log('[-] TrustManager (Android < 7) pinner not found'); 41 | //console.log(err); 42 | } 43 | 44 | 45 | // OkHTTPv3 (quadruple bypass) // 46 | // ///////////////////////////////// 47 | try { 48 | // Bypass OkHTTPv3 {1} 49 | var okhttp3_Activity_1 = Java.use('okhttp3.CertificatePinner'); 50 | okhttp3_Activity_1.check.overload('java.lang.String', 'java.util.List').implementation = function (a, b) { 51 | console.log('[+] Bypassing OkHTTPv3 {1}: ' + a); 52 | 53 | 54 | }; 55 | } catch (err) { 56 | console.log('[-] OkHTTPv3 {1} pinner not found'); 57 | //console.log(err); 58 | } 59 | try { 60 | // Bypass OkHTTPv3 {2} 61 | // This method of CertificatePinner.check could be found in some old Android app 62 | var okhttp3_Activity_2 = Java.use('okhttp3.CertificatePinner'); 63 | okhttp3_Activity_2.check.overload('java.lang.String', 'java.security.cert.Certificate').implementation = function (a, b) { 64 | console.log('[+] Bypassing OkHTTPv3 {2}: ' + a); 65 | return true; 66 | }; 67 | } catch (err) { 68 | console.log('[-] OkHTTPv3 {2} pinner not found'); 69 | //console.log(err); 70 | } 71 | try { 72 | // Bypass OkHTTPv3 {3} 73 | var okhttp3_Activity_3 = Java.use('okhttp3.CertificatePinner'); 74 | 
okhttp3_Activity_3.check.overload('java.lang.String', '[Ljava.security.cert.Certificate;').implementation = function (a, b) { 75 | console.log('[+] Bypassing OkHTTPv3 {3}: ' + a); 76 | return true; 77 | }; 78 | } catch (err) { 79 | console.log('[-] OkHTTPv3 {3} pinner not found'); 80 | //console.log(err); 81 | } 82 | try { 83 | // Bypass OkHTTPv3 {4} 84 | var okhttp3_Activity_4 = Java.use('okhttp3.CertificatePinner'); 85 | okhttp3_Activity_4['check$okhttp'].implementation = function (a, b) { 86 | console.log('[+] Bypassing OkHTTPv3 {4}: ' + a); 87 | 88 | 89 | }; 90 | } catch (err) { 91 | console.log('[-] OkHTTPv3 {4} pinner not found'); 92 | //console.log(err); 93 | } 94 | 95 | 96 | //Trustkit (triple bypass) // 97 | //////////////////////////// 98 | try { 99 | // Bypass Trustkit {1} 100 | var trustkit_Activity_1 = Java.use('com.datatheorem.android.trustkit.pinning.OkHostnameVerifier'); 101 | trustkit_Activity_1.verify.overload('java.lang.String', 'javax.net.ssl.SSLSession').implementation = function (a, b) { 102 | console.log('[+] Bypassing Trustkit {1}: ' + a); 103 | return true; 104 | }; 105 | } catch (err) { 106 | console.log('[-] Trustkit {1} pinner not found'); 107 | //console.log(err); 108 | } 109 | try { 110 | // Bypass Trustkit {2} 111 | var trustkit_Activity_2 = Java.use('com.datatheorem.android.trustkit.pinning.OkHostnameVerifier'); 112 | trustkit_Activity_2.verify.overload('java.lang.String', 'java.security.cert.X509Certificate').implementation = function (a, b) { 113 | console.log('[+] Bypassing Trustkit {2}: ' + a); 114 | return true; 115 | }; 116 | } catch (err) { 117 | console.log('[-] Trustkit {2} pinner not found'); 118 | //console.log(err); 119 | } 120 | try { 121 | // Bypass Trustkit {3} 122 | var trustkit_PinningTrustManager = Java.use('com.datatheorem.android.trustkit.pinning.PinningTrustManager'); 123 | trustkit_PinningTrustManager.checkServerTrusted.implementation = function () { 124 | console.log('[+] Bypassing Trustkit {3}'); 125 | }; 126 | } catch (err) { 127 | console.log('[-] Trustkit {3} pinner not found'); 128 | //console.log(err); 129 | } 130 | 131 | try { 132 | var TrustManagerImpl = Java.use('com.android.org.conscrypt.TrustManagerImpl'); 133 | TrustManagerImpl.verifyChain.implementation = function (untrustedChain, trustAnchorChain, host, clientAuth, ocspData, tlsSctData) { 134 | console.log('[+] Bypassing TrustManagerImpl (Android > 7): ' + host); 135 | return untrustedChain; 136 | }; 137 | } catch (err) { 138 | console.log('[-] TrustManagerImpl (Android > 7) pinner not found'); 139 | //console.log(err); 140 | } 141 | 142 | 143 | // Appcelerator Titanium // 144 | /////////////////////////// 145 | try { 146 | var appcelerator_PinningTrustManager = Java.use('appcelerator.https.PinningTrustManager'); 147 | appcelerator_PinningTrustManager.checkServerTrusted.implementation = function () { 148 | console.log('[+] Bypassing Appcelerator PinningTrustManager'); 149 | }; 150 | } catch (err) { 151 | console.log('[-] Appcelerator PinningTrustManager pinner not found'); 152 | //console.log(err); 153 | } 154 | 155 | 156 | // OpenSSLSocketImpl Conscrypt // 157 | ///////////////////////////////// 158 | try { 159 | var OpenSSLSocketImpl = Java.use('com.android.org.conscrypt.OpenSSLSocketImpl'); 160 | OpenSSLSocketImpl.verifyCertificateChain.implementation = function (certRefs, JavaObject, authMethod) { 161 | console.log('[+] Bypassing OpenSSLSocketImpl Conscrypt'); 162 | }; 163 | } catch (err) { 164 | console.log('[-] OpenSSLSocketImpl Conscrypt pinner not found'); 165 | 
//console.log(err); 166 | } 167 | 168 | 169 | // OpenSSLEngineSocketImpl Conscrypt // 170 | /////////////////////////////////////// 171 | try { 172 | var OpenSSLEngineSocketImpl_Activity = Java.use('com.android.org.conscrypt.OpenSSLEngineSocketImpl'); 173 | OpenSSLSocketImpl_Activity.verifyCertificateChain.overload('[Ljava.lang.Long;', 'java.lang.String').implementation = function (a, b) { 174 | console.log('[+] Bypassing OpenSSLEngineSocketImpl Conscrypt: ' + b); 175 | }; 176 | } catch (err) { 177 | console.log('[-] OpenSSLEngineSocketImpl Conscrypt pinner not found'); 178 | //console.log(err); 179 | } 180 | 181 | 182 | // OpenSSLSocketImpl Apache Harmony // 183 | ////////////////////////////////////// 184 | try { 185 | var OpenSSLSocketImpl_Harmony = Java.use('org.apache.harmony.xnet.provider.jsse.OpenSSLSocketImpl'); 186 | OpenSSLSocketImpl_Harmony.verifyCertificateChain.implementation = function (asn1DerEncodedCertificateChain, authMethod) { 187 | console.log('[+] Bypassing OpenSSLSocketImpl Apache Harmony'); 188 | }; 189 | } catch (err) { 190 | console.log('[-] OpenSSLSocketImpl Apache Harmony pinner not found'); 191 | //console.log(err); 192 | } 193 | 194 | 195 | // PhoneGap sslCertificateChecker (https://github.com/EddyVerbruggen/SSLCertificateChecker-PhoneGap-Plugin) // 196 | ////////////////////////////////////////////////////////////////////////////////////////////////////////////// 197 | try { 198 | var phonegap_Activity = Java.use('nl.xservices.plugins.sslCertificateChecker'); 199 | phonegap_Activity.execute.overload('java.lang.String', 'org.json.JSONArray', 'org.apache.cordova.CallbackContext').implementation = function (a, b, c) { 200 | console.log('[+] Bypassing PhoneGap sslCertificateChecker: ' + a); 201 | return true; 202 | }; 203 | } catch (err) { 204 | console.log('[-] PhoneGap sslCertificateChecker pinner not found'); 205 | //console.log(err); 206 | } 207 | 208 | 209 | //IBM MobileFirst pinTrustedCertificatePublicKey (double bypass) // 210 | //////////////////////////////////////////////////////////////////// 211 | try { 212 | // Bypass IBM MobileFirst {1} 213 | var WLClient_Activity_1 = Java.use('com.worklight.wlclient.api.WLClient'); 214 | WLClient_Activity_1.getInstance().pinTrustedCertificatePublicKey.overload('java.lang.String').implementation = function (cert) { 215 | console.log('[+] Bypassing IBM MobileFirst pinTrustedCertificatePublicKey {1}: ' + cert); 216 | return; 217 | }; 218 | } catch (err) { 219 | console.log('[-] IBM MobileFirst pinTrustedCertificatePublicKey {1} pinner not found'); 220 | //console.log(err); 221 | } 222 | try { 223 | // Bypass IBM MobileFirst {2} 224 | var WLClient_Activity_2 = Java.use('com.worklight.wlclient.api.WLClient'); 225 | WLClient_Activity_2.getInstance().pinTrustedCertificatePublicKey.overload('[Ljava.lang.String;').implementation = function (cert) { 226 | console.log('[+] Bypassing IBM MobileFirst pinTrustedCertificatePublicKey {2}: ' + cert); 227 | return; 228 | }; 229 | } catch (err) { 230 | console.log('[-] IBM MobileFirst pinTrustedCertificatePublicKey {2} pinner not found'); 231 | //console.log(err); 232 | } 233 | 234 | 235 | // IBM WorkLight (ancestor of MobileFirst) HostNameVerifierWithCertificatePinning (quadruple bypass) // 236 | /////////////////////////////////////////////////////////////////////////////////////////////////////// 237 | try { 238 | // Bypass IBM WorkLight {1} 239 | var worklight_Activity_1 = Java.use('com.worklight.wlclient.certificatepinning.HostNameVerifierWithCertificatePinning'); 240 | 
worklight_Activity_1.verify.overload('java.lang.String', 'javax.net.ssl.SSLSocket').implementation = function (a, b) { 241 | console.log('[+] Bypassing IBM WorkLight HostNameVerifierWithCertificatePinning {1}: ' + a); 242 | return; 243 | }; 244 | } catch (err) { 245 | console.log('[-] IBM WorkLight HostNameVerifierWithCertificatePinning {1} pinner not found'); 246 | //console.log(err); 247 | } 248 | try { 249 | // Bypass IBM WorkLight {2} 250 | var worklight_Activity_2 = Java.use('com.worklight.wlclient.certificatepinning.HostNameVerifierWithCertificatePinning'); 251 | worklight_Activity_2.verify.overload('java.lang.String', 'java.security.cert.X509Certificate').implementation = function (a, b) { 252 | console.log('[+] Bypassing IBM WorkLight HostNameVerifierWithCertificatePinning {2}: ' + a); 253 | return; 254 | }; 255 | } catch (err) { 256 | console.log('[-] IBM WorkLight HostNameVerifierWithCertificatePinning {2} pinner not found'); 257 | //console.log(err); 258 | } 259 | try { 260 | // Bypass IBM WorkLight {3} 261 | var worklight_Activity_3 = Java.use('com.worklight.wlclient.certificatepinning.HostNameVerifierWithCertificatePinning'); 262 | worklight_Activity_3.verify.overload('java.lang.String', '[Ljava.lang.String;', '[Ljava.lang.String;').implementation = function (a, b, c) { 263 | console.log('[+] Bypassing IBM WorkLight HostNameVerifierWithCertificatePinning {3}: ' + a); 264 | return; 265 | }; 266 | } catch (err) { 267 | console.log('[-] IBM WorkLight HostNameVerifierWithCertificatePinning {3} pinner not found'); 268 | //console.log(err); 269 | } 270 | try { 271 | // Bypass IBM WorkLight {4} 272 | var worklight_Activity_4 = Java.use('com.worklight.wlclient.certificatepinning.HostNameVerifierWithCertificatePinning'); 273 | worklight_Activity_4.verify.overload('java.lang.String', 'javax.net.ssl.SSLSession').implementation = function (a, b) { 274 | console.log('[+] Bypassing IBM WorkLight HostNameVerifierWithCertificatePinning {4}: ' + a); 275 | return true; 276 | }; 277 | } catch (err) { 278 | console.log('[-] IBM WorkLight HostNameVerifierWithCertificatePinning {4} pinner not found'); 279 | //console.log(err); 280 | } 281 | 282 | 283 | //Conscrypt CertPinManager // 284 | ////////////////////////////// 285 | try { 286 | var conscrypt_CertPinManager_Activity = Java.use('com.android.org.conscrypt.CertPinManager'); 287 | conscrypt_CertPinManager_Activity.isChainValid.overload('java.lang.String', 'java.util.List').implementation = function (a, b) { 288 | console.log('[+] Bypassing Conscrypt CertPinManager: ' + a); 289 | return true; 290 | }; 291 | } catch (err) { 292 | console.log('[-] Conscrypt CertPinManager pinner not found'); 293 | //console.log(err); 294 | } 295 | 296 | 297 | // CWAC-Netsecurity (unofficial back-port pinner for Android<4.2) CertPinManager // 298 | /////////////////////////////////////////////////////////////////////////////////// 299 | try { 300 | var cwac_CertPinManager_Activity = Java.use('com.commonsware.cwac.netsecurity.conscrypt.CertPinManager'); 301 | cwac_CertPinManager_Activity.isChainValid.overload('java.lang.String', 'java.util.List').implementation = function (a, b) { 302 | console.log('[+] Bypassing CWAC-Netsecurity CertPinManager: ' + a); 303 | return true; 304 | }; 305 | } catch (err) { 306 | console.log('[-] CWAC-Netsecurity CertPinManager pinner not found'); 307 | //console.log(err); 308 | } 309 | 310 | 311 | // Worklight Androidgap WLCertificatePinningPlugin // 312 | ///////////////////////////////////////////////////// 313 | try { 314 | var
androidgap_WLCertificatePinningPlugin_Activity = Java.use('com.worklight.androidgap.plugin.WLCertificatePinningPlugin'); 315 | androidgap_WLCertificatePinningPlugin_Activity.execute.overload('java.lang.String', 'org.json.JSONArray', 'org.apache.cordova.CallbackContext').implementation = function (a, b, c) { 316 | console.log('[+] Bypassing Worklight Androidgap WLCertificatePinningPlugin: ' + a); 317 | return true; 318 | }; 319 | } catch (err) { 320 | console.log('[-] Worklight Androidgap WLCertificatePinningPlugin pinner not found'); 321 | //console.log(err); 322 | } 323 | 324 | 325 | // Netty FingerprintTrustManagerFactory // 326 | ////////////////////////////////////////// 327 | try { 328 | var netty_FingerprintTrustManagerFactory = Java.use('io.netty.handler.ssl.util.FingerprintTrustManagerFactory'); 329 | //NOTE: sometimes this below implementation could be useful 330 | //var netty_FingerprintTrustManagerFactory = Java.use('org.jboss.netty.handler.ssl.util.FingerprintTrustManagerFactory'); 331 | netty_FingerprintTrustManagerFactory.checkTrusted.implementation = function (type, chain) { 332 | console.log('[+] Bypassing Netty FingerprintTrustManagerFactory'); 333 | }; 334 | } catch (err) { 335 | console.log('[-] Netty FingerprintTrustManagerFactory pinner not found'); 336 | //console.log(err); 337 | } 338 | 339 | 340 | //Squareup CertificatePinner [OkHTTP