├── src ├── lrabbit_spider │ ├── __init__.py │ ├── __main__.py │ ├── buffer │ │ ├── __init__.py │ │ ├── item_buffer.py │ │ └── request_buffer.py │ ├── network │ │ ├── user_agent.py │ │ └── request.py │ ├── constants.py │ ├── utils │ │ ├── tools.py │ │ ├── js │ │ │ └── intercept.js │ │ ├── log.py │ │ └── webdriver.py │ ├── deque │ │ └── __init__.py │ ├── setting.py │ └── db │ │ └── redisdb.py └── lrabbit_scrapy │ ├── android │ ├── __init__.py │ ├── sslbypass.py │ └── sslpass.js │ ├── asynico_utils │ └── __init__.py │ ├── __init__.py │ ├── __main__.py │ ├── common_utils │ ├── __init__.py │ ├── redis_helper.py │ ├── all_in_one.py │ ├── config_helper.py │ ├── network_helper.py │ ├── print_log_helper.py │ └── mysql_helper.py │ ├── config.py │ ├── all_excepiton │ └── __init__.py │ ├── asy_crawl.py │ ├── command.py │ ├── template_crawl.py │ ├── spider.py │ └── asynico_basespider.py ├── .gitignore ├── workspace.code-workspace ├── pyproject.toml ├── test ├── test_scrapy │ ├── test_log.py │ └── test_request_buffer.py └── test_spider.py ├── setup.cfg ├── LICENSE ├── LICENSE.rst ├── setup.py └── README.md /src/lrabbit_spider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_spider/__main__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/android/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_spider/buffer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_spider/buffer/item_buffer.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_spider/network/user_agent.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/asynico_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/lrabbit_spider/constants.py: -------------------------------------------------------------------------------- 1 | REQUEST_REPEAT = "request 已存在" -------------------------------------------------------------------------------- /src/lrabbit_scrapy/__init__.py: -------------------------------------------------------------------------------- 1 | from .spider import LrabbitSpider 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__/ 3 | env 4 | .ini 5 | dist/* 6 | .egg-info 7 | -------------------------------------------------------------------------------- /workspace.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 
5 | } 6 | ], 7 | "settings": {} 8 | } -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/__main__.py: -------------------------------------------------------------------------------- 1 | from .command import run 2 | import sys 3 | 4 | 5 | if __name__ == '__main__': 6 | args = sys.argv[1:] 7 | run(*args) 8 | -------------------------------------------------------------------------------- /test/test_scrapy/test_log.py: -------------------------------------------------------------------------------- 1 | 2 | from lrabbit_spider.utils.log import log 3 | 4 | 5 | log.info("test") 6 | log.debug("test") 7 | log.error("test") 8 | log.critical("test") 9 | log.warning("test") 10 | log.error(Exception("test")) 11 | log.warning(['error']) -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/2 15:27 4 | @Author : lrabbit 5 | @FileName: __init__.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/config.py: -------------------------------------------------------------------------------- 1 | # [server] 2 | # db_user = root 3 | # db_password = password 4 | # db_database = database_name 5 | # db_host = 127.0.0.1 6 | # ; better not to commit this file into git 7 | # workers_num = 10 8 | # [test] 9 | # db_user = root 10 | # db_password = password 11 | # db_database = database_name 12 | # db_host = 127.0.0.1 13 | # workers_num = 10 14 | -------------------------------------------------------------------------------- /test/test_scrapy/test_request_buffer.py: -------------------------------------------------------------------------------- 1 | from lrabbit_spider.buffer.request_buffer import RequestsBuffer 2 | from lrabbit_spider.network.request import Request 3 | import time 4 | 5 | request_buffer = RequestsBuffer("test_request_buffer") 6 | 7 | request_buffer.start() 8 | 9 | 10 | while True: 11 | 12 | request_buffer.put_request(Request(url="https://www.baidu.com")) 13 | time.sleep(1) 14 | 15 | -------------------------------------------------------------------------------- /src/lrabbit_spider/utils/tools.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | def delay_time(sleep_time=60): 4 | time.sleep(sleep_time) 5 | 6 | 7 | class Singleton(object): 8 | def __init__(self,cls): 9 | self._cls = cls 10 | self._instance = {} 11 | 12 | def __call__(self,*args,**kwargs): 13 | if self._cls not in self._instance: 14 | self._instance[self._cls] = self._cls(*args,**kwargs) 15 | return self._instance[self._cls] 16 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/all_excepiton/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/22 13:41 4 | @Author : lrabbit 5 | 
@FileName: spider.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | 10 | 11 | class Excepiton403(Exception): 12 | def __init__(self): 13 | self.__name = "exception403" 14 | 15 | 16 | class Exception404(Exception): 17 | pass 18 | 19 | 20 | class Exception500(Exception): 21 | pass 22 | 23 | 24 | class ExceptionFileFieldNameError(Exception): 25 | pass 26 | 27 | 28 | if __name__ == '__main__': 29 | print(type(Excepiton403()).__name__) 30 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = lrabbit_spider 3 | version = 2.0.8 4 | author = lrabbit 5 | author_email = 709343607@qq.com 6 | description = this is a small spider,you can easy running. When you often need to crawl a single site, you can reduce some repeated code every time, using this small framework you can quickly crawl data into a file or database. 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/litter-rabbit/lrabbit_scrapy 10 | project_urls = 11 | Bug Tracker = https://github.com/litter-rabbit/lrabbit_scrapy/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: MIT License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = src 20 | packages = find: 21 | python_requires = >=3.6 22 | 23 | [options.packages.find] 24 | where = src 25 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/redis_helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/18 10:14 4 | @Author : lrabbit 5 | @FileName: redis_helper.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | import redis 10 | from lrabbit_scrapy.common_utils.config_helper import get_redis_config, get_config_path 11 | 12 | 13 | class RedisClient: 14 | 15 | def __init__(self, db=0, config_path_env=None, env='test'): 16 | config_path = get_config_path(config_path_env) 17 | redis_config = get_redis_config(config_path, env) 18 | self.redis_executor = redis.StrictRedis(host=redis_config.REDIS_HOST, port=redis_config.REDIS_PORT, 19 | password=redis_config.REDIS_PASSWORD, 20 | db=db,decode_responses=True) 21 | 22 | 23 | if __name__ == '__main__': 24 | redis_client = RedisClient() 25 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/android/sslbypass.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | import os 3 | from lrabbit_scrapy.common_utils.print_log_helper import LogUtils 4 | from pathlib import Path 5 | import subprocess 6 | 7 | 8 | def start_frida_server(servername): 9 | server_path = f'/data/local/tmp/{servername}' 10 | killserver = "adb shell su -c killall -9 " + servername 11 | os.system(killserver) 12 | subprocess.Popen( 13 | ["adb", "shell", "su", "-c", server_path]) 14 | 15 | 16 | def sslbypass(server_name='15.0.0'): 17 | try: 18 | start_frida_server(server_name) 19 | except Exception as e: 20 | traceback.print_exc() 21 | LogUtils.log_error("please check frida-server name or this path is in /data/local/tmp?") 22 | exit(0) 23 | current_parent_path = Path(__file__).parent 24 | frida_path = os.path.join(current_parent_path, 'sslpass.js') 25 | frida_hook_cmd = f"frida -FU -l 
{frida_path} --no-pause" 26 | os.system(frida_hook_cmd) 27 | 28 | 29 | if __name__ == '__main__': 30 | sslbypass() 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
-------------------------------------------------------------------------------- /src/lrabbit_spider/deque/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from typing import Callable, Tuple, Union,Any,List,Optional 4 | 5 | 6 | 7 | class Dedup: 8 | 9 | BloomFilter = 1 10 | MemoryFilter = 2 11 | ExpireFilter = 3 12 | def __init__(self,filter_type:int=BloomFilter,to_md5:bool = True,**kwargs): 13 | pass 14 | def __repr__(self): 15 | pass 16 | 17 | def _deal_datas(self,datas): 18 | pass 19 | 20 | def add(self,datas:Union[List[Any],Any],skip_check:bool = False) -> Union[List[Any],Any]: 21 | pass 22 | def get(self,datas:Union[List[Any],Any]) -> Union[List[Any],Any]: 23 | pass 24 | 25 | def filter_exist_data( 26 | self, 27 | datas: List[Any], 28 | *, 29 | datas_fingerprints: Optional[List] = None, 30 | callback: Callable[[Any], None] = None 31 | ) -> Union[Tuple[List[Any], List[Any]], List[Any]]: 32 | pass 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="lrabbit_scrapy", 8 | version="2.0.8", 9 | author="lrabbit", 10 | author_email="709343607@qq.com", 11 | description="A small spider framework that is easy to run. When you often need to crawl a single site, it saves you from rewriting the same boilerplate code every time; with this small framework you can quickly crawl data into a file or a database.", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/litter-rabbit/lrabbit_scrapy", 15 | project_urls={ 16 | "Bug Tracker": "https://github.com/litter-rabbit/lrabbit_scrapy/issues", 17 | }, 18 | classifiers=[ 19 | "Programming Language :: Python :: 3", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | ], 23 | 24 | install_requires=[ 25 | "parsel == 1.6.0", 26 | "requests >= 2.26.0", 27 | "PyMySQL >= 0.9.3", 28 | "redis >= 3.0.0", 29 | "frida >= 15.0.0", 30 | "frida-tools >= 10.4.1" 31 | ], 32 | packages=setuptools.find_packages(where="src"), 33 | package_dir={"": "src"}, 34 | package_data={ 35 | # include the *.js files bundled with the packages (e.g. sslpass.js): 36 | "": ["*.js"], 37 | }, 38 | include_package_data=True, 39 | python_requires=">=3.6.8", 40 | 41 | ) 42 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/asy_crawl.py: -------------------------------------------------------------------------------- 1 | from lrabbit_scrapy.asynico_basespider import BaseSpider 2 | import sqlalchemy as sa 3 | 4 | 5 | class Spider(BaseSpider): 6 | # setup 7 | is_open_mysql = False 8 | is_drop_tables = False 9 | # reset all tasks and files; this may delete all data files 10 | reset_task_list = False 11 | 12 | """ 13 | do not name any other method or attribute starting with 'file' or 'table' 14 | """ 15 | # datastore 16 | table_table1 = [ 17 | sa.Column('val', sa.String(255)), 18 | ] 19 | 20 | # file_store 21 | file_blogPost = [ 22 | 'id',
'title', 'datetime', 'content' 23 | ] 24 | 25 | def __init__(self, spider_name): 26 | super(Spider, self).__init__(spider_name) 27 | 28 | async def worker(self, task): 29 | """ 30 | 31 | write your worker method here 32 | 33 | :param task: 34 | :return: 35 | """ 36 | """ 37 | mysql worker example 38 | """ 39 | # await self.insert_one(self.tables['table1'].insert().values(val=str(task))) 40 | # res = await self.query(self.tables['table1'].select()) 41 | # res = await res.fetchall() 42 | 43 | """ 44 | to see how this works, uncomment the code above 45 | """ 46 | url = f"http://www.lrabbit.life/post_detail/?id={task}" 47 | 48 | data = {"id": task, "datetime": "1997", "title": "lrabbit", "content": "hello"} 49 | if data: 50 | self.all_files['blogPost'].write(data) 51 | 52 | async def create_tasks(self): 53 | return [i for i in range(100)] 54 | 55 | 56 | if __name__ == '__main__': 57 | s = Spider(__file__) 58 | s.run() 59 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/all_in_one.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/18 10:22 4 | @Author : lrabbit 5 | @FileName: all_in_one.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | import os 10 | import datetime 11 | import csv 12 | from lrabbit_scrapy.all_excepiton import ExceptionFileFieldNameError 13 | 14 | 15 | def get_time_format_now(option=1): 16 | if option == 1: 17 | return datetime.datetime.now().strftime("%Y-%m-%d") 18 | else: 19 | return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 20 | 21 | 22 | class FileStore: 23 | 24 | def __init__(self, file_path: str, filed_name: list): 25 | self.file_path = file_path 26 | self.file_name = os.path.splitext(file_path) 27 | self.filed_name = filed_name 28 | 29 | def write(self, d: dict): 30 | """ 31 | :param d: dict type 32 | :return: 33 | """ 34 | if list(d.keys()) != self.filed_name: 35 | raise ExceptionFileFieldNameError() 36 | 37 | with open(self.file_path, 'a', encoding='utf8', newline='') as f: 38 | dict_write = csv.DictWriter(f, fieldnames=self.filed_name) 39 | dict_write.writerow(d) 40 | 41 | def write_many(self, rows: list): 42 | if rows and list(rows[0].keys()) != self.filed_name: 43 | raise ExceptionFileFieldNameError() 44 | with open(self.file_path, 'a', encoding='utf8', newline='') as f: 45 | dict_write = csv.DictWriter(f, fieldnames=self.filed_name) 46 | dict_write.writerows(rows) 47 | 48 | 49 | if __name__ == '__main__': 50 | blog_file = FileStore(r"D:\PythonWorkSpace\lrabbit_scrapy\test\blogPost.csv", ["title"]) 51 | d = {"title": "1"} 52 | blog_file.write(d) 53 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/config_helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/18 10:27 4 | @Author : lrabbit 5 | @FileName: config_helper.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | 10 | from configparser import ConfigParser 11 | import os 12 | 13 | 14 | class MysqlConfigClass: 15 | MYSQL_USER = None 16 | MYSQL_PASSWORD = None 17 | MYSQL_DATABASE = None 18 | MYSQL_HOST = None 19 | MYSQL_PORT = 3306 20 | 21 | 22 | class RedisConfigClass: 23 | REDIS_PASSWORD = None 24 | REDIS_DATABASE = None 25 | REDIS_HOST = None 26 | REDIS_PORT = 6379 27 | 28 | 29 | def get_config(config_path=None, env='test'): 30 | """ 31 | 32 | :param config_path: 33 
| :return: 34 | """ 35 | config = ConfigParser() 36 | if not config_path: 37 | pwd = os.path.dirname(__file__) 38 | config_path = os.path.join(pwd, 'crawl.ini') 39 | config.read(config_path) 40 | if os.getenv("ENV") == 'server': 41 | env = os.getenv("ENV") 42 | config = config[env] 43 | return config 44 | 45 | 46 | def get_mysql_config(config_path, env='test') -> MysqlConfigClass: 47 | if not config_path: 48 | raise Exception("无效的文件路径") 49 | config = get_config(config_path, env) 50 | mysqlconfig = MysqlConfigClass() 51 | for k, v in config.items(): 52 | setattr(mysqlconfig, k.upper(), v) 53 | return mysqlconfig 54 | 55 | 56 | def get_redis_config(config_path, env='test') -> RedisConfigClass: 57 | if not config_path: 58 | raise Exception("无效的文件路径") 59 | config = get_config(config_path, env) 60 | redisconfig = RedisConfigClass() 61 | for k, v in config.items(): 62 | setattr(redisconfig, k.upper(), v) 63 | return redisconfig 64 | 65 | 66 | def get_config_path(config_path_env=None): 67 | if not config_path_env: 68 | config_path_env = "config_path" 69 | config_path = os.environ.get(config_path_env) 70 | if not config_path: 71 | raise Exception(f"请设置环境变量{config_path_env}为ini配置文件的绝对路径") 72 | return config_path 73 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/command.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from lrabbit_scrapy.common_utils.print_log_helper import LogUtils 4 | from lrabbit_scrapy.android.sslbypass import sslbypass 5 | 6 | base_dir = Path(__file__).resolve().parent 7 | 8 | 9 | def newspider(spider_name): 10 | copy_new_name_file(spider_name, f'{spider_name}.py', 'asy_crawl.py') 11 | copy_new_name_file(spider_name, f'{spider_name}.ini', 'config.py', is_config=True) 12 | LogUtils.log_finish('创建项目成功') 13 | 14 | 15 | def new_template_spider(spider_name): 16 | copy_new_name_file(spider_name, f'{spider_name}.py', 'template_crawl.py') 17 | LogUtils.log_finish('创建项目成功') 18 | 19 | 20 | def copy_new_name_file(spider_name, new_name, src_name, is_config=False): 21 | dst_path = os.path.abspath(os.getcwd()) 22 | if not os.path.exists(os.path.join(dst_path, spider_name)): 23 | os.mkdir(os.path.join(dst_path, spider_name)) 24 | dst_path = os.path.join(dst_path, spider_name) 25 | dst_file = os.path.join(dst_path, new_name) 26 | src_file = os.path.join(base_dir, src_name) 27 | if os.path.exists(dst_file): 28 | raise Exception(f'please remove your file {dst_file}') 29 | f2 = open(dst_file, 'a') 30 | with open(src_file, 'r') as f: 31 | for line in f.readlines(): 32 | if is_config: 33 | line = line.replace("# ", "") 34 | f2.write(line) 35 | f2.close() 36 | 37 | 38 | def run(*args): 39 | argv = args[0] 40 | if argv == 'new_scrapy': 41 | spider_name = args[1] 42 | print("opts", spider_name) 43 | new_template_spider(spider_name) 44 | elif argv == 'sslpass': 45 | if len(args) >= 2: 46 | server_name = args[1] 47 | print("firda server name") 48 | sslbypass(server_name) 49 | else: 50 | sslbypass() 51 | elif argv == 'asy_new_scrapy': 52 | spider_name = args[1] 53 | print("opts", spider_name) 54 | newspider(spider_name) 55 | else: 56 | print("options: new_scarpy or sslpass or asy_new_scrapy ") 57 | 58 | 59 | if __name__ == '__main__': 60 | import sys 61 | argv = sys.argv[1:] 62 | run(*argv) 63 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/network_helper.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/18 10:21 4 | @Author : lrabbit 5 | @FileName: network_helper.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | 10 | import urllib3 11 | import requests 12 | from lrabbit_scrapy.all_excepiton import Excepiton403, Exception404, Exception500 13 | 14 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 15 | 16 | 17 | class RequestSession: 18 | 19 | def __init__(self, proxies=None, timeout=15, headers=None): 20 | self.proxies = proxies 21 | self.timeout = timeout 22 | self.session = requests.session() 23 | self.session.headers = headers 24 | 25 | def send_request(self, method='GET', url=None, headers=None, data=None) -> requests.Response: 26 | 27 | if method == 'GET': 28 | res = self.session.get(url, proxies=self.proxies, headers=headers, verify=False, timeout=self.timeout) 29 | else: 30 | if isinstance(data, dict): 31 | res = self.session.post(url, json=data, proxies=self.proxies, headers=headers, verify=False, 32 | timeout=self.timeout) 33 | elif isinstance(data, str): 34 | res = self.session.post(url, data=data, proxies=self.proxies, headers=headers, verify=False, 35 | timeout=self.timeout) 36 | else: 37 | raise Exception("no data to post") 38 | return self.deal_res(res) 39 | 40 | def download_file_by_url(self, out_file_path, url, headers=None): 41 | res = self.session.get(url=url, headers=headers, proxies=self.proxies, verify=False, stream=True, 42 | ) 43 | with open(out_file_path, 'wb') as f: 44 | for chunk in res.iter_content(1024): 45 | if chunk: 46 | f.write(chunk) 47 | return out_file_path 48 | 49 | def deal_res(self, res): 50 | if res.status_code == 403: 51 | raise Excepiton403() 52 | elif res.status_code == 404: 53 | raise Exception404() 54 | elif res.status_code == 500: 55 | raise Exception500() 56 | return res 57 | 58 | 59 | if __name__ == '__main__': 60 | session = RequestSession() 61 | -------------------------------------------------------------------------------- /test/test_spider.py: -------------------------------------------------------------------------------- 1 | from lrabbit_scrapy.spider import LrabbitSpider 2 | from lrabbit_scrapy.common_utils.network_helper import RequestSession 3 | from lrabbit_scrapy.common_utils.print_log_helper import LogUtils 4 | from lrabbit_scrapy.common_utils.all_in_one import FileStore 5 | import os 6 | from lrabbit_scrapy.common_utils.mysql_helper import MysqlClient 7 | from parsel import Selector 8 | 9 | 10 | class Spider(LrabbitSpider): 11 | """ 12 | spider_name : lrabbit blog spider 13 | """ 14 | # unique spider name 15 | spider_name = "lrabbit_blog" 16 | # max thread worker numbers 17 | max_thread_num = 2 18 | # open a mysql connection for every thread; if max_thread_num exceeds 10 and your code runs mysql queries, you need to enable this config 19 | thread_mysql_open = True 20 | # reset the whole task list; every program restart will re-init the task list 21 | reset_task_config = False 22 | # loop init_task_list; when all tasks are finished and you want to run them again, enable it 23 | loop_task_config = False 24 | # remove-confirm option: controls whether a confirm prompt is shown when the task list is initialized 25 | remove_confirm_config = False 26 | # config_path_name: the environment variable name; on linux you need to execute: export config_path="crawl.ini" 27 | config_env_name = "config_path" 28 | # redis db_num 29 | redis_db_config = 0 30 | # debug log; enable traceback logging 31 | debug_config = False 32 | 33 | def 
__init__(self): 34 | super().__init__() 35 | self.session = RequestSession() 36 | self.proxy_session = RequestSession(proxies=None) 37 | csv_path = os.path.join(os.path.abspath(os.getcwd()), f"{self.spider_name}.csv") 38 | self.field_names = ['id', 'title', 'datetime'] 39 | self.blog_file = FileStore(file_path=csv_path, filed_name=self.field_names) 40 | 41 | def worker(self, *args): 42 | task = args[0] 43 | mysql_client: MysqlClient 44 | if len(args) == 2: 45 | mysql_client = args[1] 46 | # mysql_client.execute("") 47 | res = self.session.send_request(method='GET', url=f'http://www.lrabbit.life/post_detail/?id={task}') 48 | selector = Selector(res.text) 49 | title = selector.css(".detail-title h1::text").get() 50 | datetime = selector.css(".detail-info span::text").get() 51 | if title: 52 | post_data = {"id": task, "title": title, 'datetime': datetime} 53 | self.blog_file.write(post_data) 54 | # when you succes get content update redis stat 55 | self.update_stat_redis() 56 | LogUtils.log_finish(task) 57 | 58 | def init_task_list(self): 59 | 60 | # you can get init task from mysql 61 | # res = self.mysql_client.query("select id from rookie limit 100 ") 62 | # return [task['id'] for task in res] 63 | return [i for i in range(100)] 64 | 65 | 66 | if __name__ == '__main__': 67 | spider = Spider() 68 | spider.run() 69 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/template_crawl.py: -------------------------------------------------------------------------------- 1 | from lrabbit_scrapy.spider import LrabbitSpider 2 | from lrabbit_scrapy.common_utils.network_helper import RequestSession 3 | from lrabbit_scrapy.common_utils.print_log_helper import LogUtils 4 | from lrabbit_scrapy.common_utils.all_in_one import FileStore 5 | import os 6 | from lrabbit_scrapy.common_utils.mysql_helper import MysqlClient 7 | from parsel import Selector 8 | 9 | 10 | class Spider(LrabbitSpider): 11 | """ 12 | spider_name : lrabbit blog spider 13 | """ 14 | # unique spider name 15 | spider_name = "lrabbit_blog" 16 | # max thread worker numbers 17 | max_thread_num = 2 18 | # is open for every thread a mysql connection,if your max_thread_num overpass 10 and in code need mysql query ,you need open this config 19 | thread_mysql_open = True 20 | # reset all task_list,every restart program will init task list 21 | reset_task_config = True 22 | # open loop init_task_list ,when your task is all fnish,and you want again ,you can open it 23 | loop_task_config = True 24 | # remove config option,if open it,then confirm option when you init task 25 | remove_confirm_config = True 26 | # config_path_name, this is env name ,is this code ,you need in linux to execute: export config_path="crawl.ini" 27 | config_env_name = "config_path" 28 | # redis db_num 29 | redis_db_config = 0 30 | # debug log ,open tracback log 31 | debug_config = False 32 | 33 | def __init__(self): 34 | super().__init__() 35 | self.session = RequestSession() 36 | self.proxy_session = RequestSession(proxies=None) 37 | csv_path = os.path.join(os.path.abspath(os.getcwd()), f"{self.spider_name}.csv") 38 | self.field_names = ['id', 'title', 'datetime'] 39 | self.blog_file = FileStore(file_path=csv_path, filed_name=self.field_names) 40 | 41 | def worker(self, *args): 42 | task = args[0] 43 | mysql_client: MysqlClient 44 | if len(args) == 2: 45 | mysql_client = args[1] 46 | mysql_client.execute("select id from rookie limit 100") 47 | # mysql_client.execute("") 48 | url = 
f'http://www.lrabbit.life/post_detail/?id={task}' 49 | LogUtils.log_running(url) 50 | res = self.session.send_request(method='GET', url=url) 51 | selector = Selector(res.text) 52 | title = selector.css(".detail-title h1::text").get() 53 | datetime = selector.css(".detail-info span::text").get() 54 | if title: 55 | post_data = {"id": task, "title": title, 'datetime': datetime} 56 | self.blog_file.write(post_data) 57 | # when you succes get content update redis stat 58 | self.update_stat_redis() 59 | LogUtils.log_finish(task) 60 | 61 | def init_task_list(self): 62 | 63 | # you can get init task from mysql 64 | res = self.mysql_client.query("select id from rookie limit 100 ") 65 | return [task['id'] for task in res] 66 | # return [i for i in range(100)] 67 | 68 | 69 | if __name__ == '__main__': 70 | spider = Spider() 71 | spider.run() 72 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/print_log_helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/18 10:14 4 | @Author : lrabbit 5 | @FileName: redis_helper.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | 10 | import datetime 11 | import os 12 | 13 | 14 | class TermColor: 15 | ATTRIBUTES = dict( 16 | list(zip([ 17 | 'bold', 18 | 'dark', 19 | '', 20 | 'underline', 21 | 'blink', 22 | '', 23 | 'reverse', 24 | 'concealed' 25 | ], 26 | list(range(1, 9)) 27 | )) 28 | ) 29 | del ATTRIBUTES[''] 30 | 31 | HIGHLIGHTS = dict( 32 | list(zip([ 33 | 'on_grey', 34 | 'on_red', 35 | 'on_green', 36 | 'on_yellow', 37 | 'on_blue', 38 | 'on_magenta', 39 | 'on_cyan', 40 | 'on_white' 41 | ], 42 | list(range(40, 48)) 43 | )) 44 | ) 45 | 46 | COLORS = dict( 47 | list(zip([ 48 | 'grey', 49 | 'red', 50 | 'green', 51 | 'yellow', 52 | 'blue', 53 | 'magenta', 54 | 'cyan', 55 | 'white', 56 | ], 57 | list(range(30, 38)) 58 | )) 59 | ) 60 | 61 | RESET = '\033[0m' 62 | 63 | @staticmethod 64 | def colored(text, color=None, on_color=None, attrs=None): 65 | 66 | if os.getenv('ANSI_COLORS_DISABLED') is None: 67 | fmt_str = '\033[%dm%s' 68 | if color is not None: 69 | text = fmt_str % (TermColor.COLORS[color], text) 70 | 71 | if on_color is not None: 72 | text = fmt_str % (TermColor.HIGHLIGHTS[on_color], text) 73 | 74 | if attrs is not None: 75 | for attr in attrs: 76 | text = fmt_str % (TermColor.ATTRIBUTES[attr], text) 77 | 78 | text += TermColor.RESET 79 | return text 80 | 81 | 82 | class CommonUtils: 83 | 84 | def __init__(self): 85 | pass 86 | 87 | @staticmethod 88 | def fix_str_args(args): 89 | return list(map(lambda x: str(x).strip(), args)) 90 | 91 | @staticmethod 92 | def get_format_time(for_mat='%Y-%m-%d %H:%M:%S'): 93 | return TermColor.colored(datetime.datetime.now().strftime(for_mat), 'yellow') 94 | 95 | @staticmethod 96 | def space_join_line_arg(*args): 97 | return ' '.join(args) + '\n' 98 | 99 | 100 | class LogUtils: 101 | 102 | def __init__(self): 103 | pass 104 | 105 | @staticmethod 106 | def log_now_time_str(): 107 | # fix buffer cache 108 | # sys.stdout.buffer.write() 109 | print(CommonUtils.get_format_time()) 110 | 111 | @staticmethod 112 | def log_str(color_str, args): 113 | args = CommonUtils.fix_str_args(args) 114 | text = ' '.join(args) 115 | text = color_str + ' ' + text + '\n' 116 | # fix buffer cache 117 | # sys.stdout.buffer.write(text.encode('utf8')) 118 | print(text,end='') 119 | 120 | @staticmethod 121 | def log_info(*args): 122 | color_str = 
TermColor.colored('[*INFO*]', 'cyan') 123 | LogUtils.log_str(color_str, args) 124 | 125 | @staticmethod 126 | def log_running(*args): 127 | color_str = TermColor.colored('[*RUNNING*]', 'yellow') 128 | LogUtils.log_str(color_str, args) 129 | 130 | @staticmethod 131 | def log_finish(*args): 132 | color_str = TermColor.colored('[*FINISH*]', 'green') 133 | LogUtils.log_str(color_str, args) 134 | 135 | @staticmethod 136 | def log_error(*args): 137 | color_str = TermColor.colored('[*ERROR*]', 'red') 138 | LogUtils.log_str(color_str, args) 139 | 140 | @staticmethod 141 | def log_to_file(file_path, line): 142 | """ 143 | 144 | :param file_path: log file path 145 | :param line: a str type 146 | :return: 147 | """ 148 | with open(file_path, 'a', encoding='utf8') as f: 149 | line = CommonUtils.space_join_line_arg(LogUtils.get_format_time(), line) 150 | f.write(line) 151 | -------------------------------------------------------------------------------- /src/lrabbit_spider/buffer/request_buffer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | from threading import Thread 6 | from lrabbit_spider.utils.log import log 7 | import lrabbit_spider.utils.tools as tools 8 | import collections 9 | import lrabbit_spider.setting as setting 10 | from lrabbit_spider.network.request import Request 11 | from lrabbit_spider.deque import Dedup 12 | import lrabbit_spider.constants as constants 13 | from lrabbit_spider.db.redisdb import RedisDB 14 | 15 | 16 | MAX_URL_COUNT=1000 17 | class RequestsBuffer(Thread): 18 | dedup : Dedup= None 19 | 20 | def __init__(self,redis_key): 21 | if not hasattr(self,"_request_deque"): 22 | 23 | super(RequestsBuffer,self).__init__() 24 | self._thread_stop=False 25 | self._is_adding_to_db = False 26 | self._requests_deque = collections.deque() 27 | self._del_requests_deque = collections.deque() 28 | self._db = RedisDB() 29 | self._table_request = setting.TAB_REQUSETS.format(redis_key=redis_key) 30 | self.table_failed_request =setting.TAB_FAILED_REQUSETS 31 | if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE: 32 | self.__class__.dedup = Dedup(name=redis_key,to_md5=False,**setting.REQUEST_FILTER_SETTING) 33 | 34 | 35 | def run(self): 36 | self._thread_stop = False 37 | while not self._thread_stop: 38 | try: 39 | self.__add_request_to_db() 40 | except Exception as e: 41 | log.exception(e) 42 | tools.delay_time(1) 43 | 44 | def put_request(self,request): 45 | self._requests_deque.append(request) 46 | if self.get_requests_count() > MAX_URL_COUNT: 47 | self.flush() 48 | 49 | 50 | pass 51 | def put_del_request(self,request): 52 | self._del_requests_deque.append(request) 53 | 54 | 55 | pass 56 | def put_failed_request(self,request,table=None): 57 | try: 58 | request_dict = request.to_dict 59 | self._db.zadd(table or self.table_failed_request,request_dict,request.priority) 60 | except Exception as e: 61 | log.exception(e) 62 | 63 | def flush(self): 64 | try: 65 | self.__add_request_to_db() 66 | except Exception as e: 67 | log.exception(e) 68 | 69 | def get_requests_count(self): 70 | return len(self._requests_deque) 71 | pass 72 | def is_adding_to_db(self): 73 | return self._is_adding_to_db 74 | pass 75 | def __add_request_to_db(self): 76 | request_list = [] 77 | prioritys = [] 78 | callbacks = [] 79 | 80 | while self._requests_deque: 81 | request:Request = self._requests_deque.popleft() 82 | self.is_adding_to_db = True 83 | if callable(request): 84 | callbacks.append(request) 85 | priority = request.fingerprint 86 | if ( 87 | 
request.filter_repeat 88 | and setting.REQUEST_FILTER_ENABLE 89 | and not self.__class__.dedup.add(request.fingerprint) 90 | 91 | ): 92 | log.debug(constants.REQUEST_REPEAT+f" URL = {request.url}") 93 | continue 94 | else: 95 | request_list.append(str(request.to_dict)) 96 | prioritys.append(priority) 97 | if len(request_list)>MAX_URL_COUNT: 98 | self._db.zadd(self._table_request,request_list,prioritys) 99 | request_list = [] 100 | prioritys = [] 101 | 102 | if request_list: 103 | self._db.zadd(self._table_request,request_list,prioritys) 104 | 105 | for callback in callbacks: 106 | try: 107 | callback() 108 | except Exception as e: 109 | log.exception(e) 110 | if self._del_requests_deque: 111 | request_done_list = [] 112 | while self._del_requests_deque: 113 | request_done_list.append(self._del_requests_deque.popleft()) 114 | request_done_list = list(set(request_done_list)-set(request_list)) 115 | if request_done_list: 116 | self._db.zrem(self._table_request,request_done_list) 117 | 118 | self._is_adding_to_db = False 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | lrabbit_scrapy 2 | ===== 3 | 4 | A small spider framework that is easy to run. You don't have to rewrite the same boilerplate code every time; with this small framework you can quickly crawl data into a file or a database. 5 | 6 | 7 | Requirements 8 | ---------- 9 | python >=3.6.8 10 | 11 | Installing 12 | ---------- 13 | 14 | $ pip3 install lrabbit_scrapy 15 | 16 | quick start 17 | ---------------- 18 | 19 | 20 | * python3 -m lrabbit_scrapy new_scrapy blog 21 | * this creates a directory named blog; the generated file looks like this 22 | ```python 23 | from lrabbit_scrapy.spider import LrabbitSpider 24 | from lrabbit_scrapy.common_utils.network_helper import RequestSession 25 | from lrabbit_scrapy.common_utils.print_log_helper import LogUtils 26 | from lrabbit_scrapy.common_utils.all_in_one import FileStore 27 | import os 28 | from lrabbit_scrapy.common_utils.mysql_helper import MysqlClient 29 | from parsel import Selector 30 | 31 | 32 | class Spider(LrabbitSpider): 33 | """ 34 | spider_name : lrabbit blog spider 35 | """ 36 | # unique spider name 37 | spider_name = "lrabbit_blog" 38 | # max thread worker numbers 39 | max_thread_num = 2 40 | # open a mysql connection for every thread; if max_thread_num exceeds 10 and your code runs mysql queries, you need to enable this config 41 | thread_mysql_open = True 42 | # reset the whole task list; every program restart will re-init the task list 43 | reset_task_config = False 44 | # loop init_task_list; when all tasks are finished and you want to run them again, enable it 45 | loop_task_config = False 46 | # remove-confirm option: controls whether a confirm prompt is shown when the task list is initialized 47 | remove_confirm_config = False 48 | # config_path_name: the environment variable name; on linux you need to execute: export config_path="crawl.ini" 49 | config_env_name = "config_path" 50 | # redis db_num 51 | redis_db_config = 0 52 | # debug log; enable traceback logging 53 | debug_config = False 54 | 55 | def __init__(self): 56 | super().__init__() 57 | self.session = RequestSession() 58 | self.proxy_session = RequestSession(proxies=None) 59 | csv_path = os.path.join(os.path.abspath(os.getcwd()), f"{self.spider_name}.csv") 60 | self.field_names = ['id', 'title', 'datetime'] 61 | self.blog_file = FileStore(file_path=csv_path, filed_name=self.field_names) 62 | 63 | def worker(self, *args): 
64 | task = args[0] 65 | mysql_client: MysqlClient 66 | if len(args) == 2: 67 | mysql_client = args[1] 68 | # mysql_client.execute("") 69 | res = self.session.send_request(method='GET', url=f'http://www.lrabbit.life/post_detail/?id={task}') 70 | selector = Selector(res.text) 71 | title = selector.css(".detail-title h1::text").get() 72 | datetime = selector.css(".detail-info span::text").get() 73 | if title: 74 | post_data = {"id": task, "title": title, 'datetime': datetime} 75 | self.blog_file.write(post_data) 76 | # when you succes get content update redis stat 77 | self.update_stat_redis() 78 | LogUtils.log_finish(task) 79 | 80 | def init_task_list(self): 81 | 82 | # you can get init task from mysql 83 | # res = self.mysql_client.query("select id from rookie limit 100 ") 84 | # return [task['id'] for task in res] 85 | return [i for i in range(100)] 86 | 87 | 88 | if __name__ == '__main__': 89 | spider = Spider() 90 | spider.run() 91 | 92 | ``` 93 | 94 | * set config.ini and config env variable 95 | * create crawl.ini,for example this file path is /root/crawl.ini 96 | ```ini 97 | [server] 98 | mysql_user = root 99 | mysql_password = 123456 100 | mysql_database = test 101 | mysql_host = 192.168.1.1 102 | redis_user = lrabbit 103 | redis_host = 192.168.1.1 104 | redis_port = 6379 105 | redis_password = 123456 106 | 107 | [test] 108 | mysql_user = root 109 | mysql_password = 123456 110 | mysql_database = test 111 | mysql_host = 192.168.1.1 112 | redis_user = lrabbit 113 | redis_host = 192.168.1.1 114 | redis_port = 6379 115 | redis_password = 123456 116 | ``` 117 | * set config env 118 | * windows power shell 119 | * $env:config_path = "/root/crawl.ini" 120 | * linux 121 | * export config_path="/root/crawl.ini" 122 | 123 | * python3 blog_spider.py 124 | 125 | ## other function 126 | * python3 blog_spider.py stat 127 | * show task stat 128 | * python3 -m lrabbit-scrapy sslpass 129 | * pass android ssl 130 | 131 | - author: https://www.lrabbit.life/ 132 | 133 | -------------------------------------------------------------------------------- /src/lrabbit_spider/network/request.py: -------------------------------------------------------------------------------- 1 | 2 | import lrabbit_spider.network.user_agent as user_agent 3 | import lrabbit_spider.setting as setting 4 | import requests 5 | from lrabbit_spider.utils. 
6 | from requests.adapters import HTTPAdapter 7 | class Request(object): 8 | 9 | webdriver_pool:WebD 10 | __REQUEST_ATTRS__ = { 11 | # 'method', 'url', 必须传递 不加入**kwargs中 12 | "params", 13 | "data", 14 | "headers", 15 | "cookies", 16 | "files", 17 | "auth", 18 | "timeout", 19 | "allow_redirects", 20 | "proxies", 21 | "hooks", 22 | "stream", 23 | "verify", 24 | "cert", 25 | "json", 26 | } 27 | def __init__( 28 | self, 29 | url="", 30 | retry_times=0, 31 | priority=300, 32 | parser_name=None, 33 | callback=None, 34 | filter_repeat=True, 35 | auto_request=True, 36 | request_sync=False, 37 | use_session=None, 38 | random_user_agent=True, 39 | download_midware=None, 40 | is_abandoned=False, 41 | render=False, 42 | render_time=0, 43 | **kwargs, 44 | ): 45 | """ 46 | @summary: Request参数 47 | --------- 48 | 框架参数 49 | @param url: 待抓取url 50 | @param retry_times: 当前重试次数 51 | @param priority: 优先级 越小越优先 默认300 52 | @param parser_name: 回调函数所在的类名 默认为当前类 53 | @param callback: 回调函数 可以是函数 也可是函数名(如想跨类回调时,parser_name指定那个类名,callback指定那个类想回调的方法名即可) 54 | @param filter_repeat: 是否需要去重 (True/False) 当setting中的REQUEST_FILTER_ENABLE设置为True时该参数生效 默认True 55 | @param auto_request: 是否需要自动请求下载网页 默认是。设置为False时返回的response为空,需要自己去请求网页 56 | @param request_sync: 是否同步请求下载网页,默认异步。如果该请求url过期时间快,可设置为True,相当于yield的reqeust会立即响应,而不是去排队 57 | @param use_session: 是否使用session方式 58 | @param random_user_agent: 是否随机User-Agent (True/False) 当setting中的RANDOM_HEADERS设置为True时该参数生效 默认True 59 | @param download_midware: 下载中间件。默认为parser中的download_midware 60 | @param is_abandoned: 当发生异常时是否放弃重试 True/False. 默认False 61 | @param render: 是否用浏览器渲染 62 | @param render_time: 渲染时长,即打开网页等待指定时间后再获取源码 63 | -- 64 | 以下参数与requests参数使用方式一致 65 | @param method: 请求方式,如POST或GET,默认根据data值是否为空来判断 66 | @param params: 请求参数 67 | @param data: 请求body 68 | @param json: 请求json字符串,同 json.dumps(data) 69 | @param headers: 70 | @param cookies: 字典 或 CookieJar 对象 71 | @param files: 72 | @param auth: 73 | @param timeout: (浮点或元组)等待服务器数据的超时限制,是一个浮点数,或是一个(connect timeout, read timeout) 元组 74 | @param allow_redirects : Boolean. 
True 表示允许跟踪 POST/PUT/DELETE 方法的重定向 75 | @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"} 76 | @param verify: 为 True 时将会验证 SSL 证书 77 | @param stream: 如果为 False,将会立即下载响应内容 78 | @param cert: 79 | -- 80 | @param **kwargs: 其他值: 如 Request(item=item) 则item可直接用 request.item 取出 81 | --------- 82 | @result: 83 | """ 84 | 85 | self.url = url 86 | self.retry_times = retry_times 87 | self.priority = priority 88 | self.parser_name = parser_name 89 | self.callback = callback 90 | self.filter_repeat = filter_repeat 91 | self.auto_request = auto_request 92 | self.request_sync = request_sync 93 | self.use_session = use_session 94 | self.random_user_agent = random_user_agent 95 | self.download_midware = download_midware 96 | self.is_abandoned = is_abandoned 97 | self.render = render 98 | self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0) 99 | 100 | self.requests_kwargs = {} 101 | for key, value in kwargs.items(): 102 | if key in self.__class__.__REQUEST_ATTRS__: # 取requests参数 103 | self.requests_kwargs[key] = value 104 | 105 | self.__dict__[key] = value 106 | 107 | 108 | def __repr__(self): 109 | pass 110 | 111 | def __setattr__(self,key,value): 112 | self.__dict__[key] = value 113 | if key in self.__class__.__REQUEST_ATTRS__: 114 | self.requests_kwargs[key] = value 115 | 116 | def __lt__(self,other): 117 | return self.priority{ 10 | handler.next(config); 11 | } 12 | , 13 | onError: (err,handler)=>{ 14 | handler.next(err) 15 | } 16 | , 17 | onResponse: (response,handler)=>{ 18 | var url = response.config.url; 19 | if (window.__urlRegexes.length > 0) { 20 | for (const regex of window.__urlRegexes) { 21 | var re = new RegExp(regex, "g"); 22 | if (re.exec(url)) { 23 | window.__ajaxData[regex] = { 24 | request: { 25 | 'url': response.config.xhr.config.url, 26 | 'data': response.config.xhr.config.body, 27 | 'headers': response.config.xhr.config.headers 28 | }, 29 | response: { 30 | 'url': response.config.url, 31 | 'headers': response.headers, 32 | 'content': response.response, 33 | 'status_code': response.status 34 | } 35 | }; 36 | } 37 | } 38 | } 39 | handler.next(response) 40 | } 41 | }) 42 | -------------------------------------------------------------------------------- /src/lrabbit_spider/setting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """爬虫配置文件""" 3 | import os 4 | 5 | # redis 表名 6 | # 任务表模版 7 | TAB_REQUSETS = "{redis_key}:z_requsets" 8 | # 任务失败模板 9 | TAB_FAILED_REQUSETS = "{redis_key}:z_failed_requsets" 10 | # 数据保存失败模板 11 | TAB_FAILED_ITEMS = "{redis_key}:s_failed_items" 12 | # 爬虫状态表模版 13 | TAB_SPIDER_STATUS = "{redis_key}:z_spider_status" 14 | # 爬虫时间记录表 15 | TAB_SPIDER_TIME = "{redis_key}:h_spider_time" 16 | # 用户池 17 | TAB_USER_POOL = "{redis_key}:h_{user_type}_pool" 18 | 19 | # MYSQL 20 | MYSQL_IP = os.getenv("MYSQL_IP") 21 | MYSQL_PORT = int(os.getenv("MYSQL_PORT", 3306)) 22 | MYSQL_DB = os.getenv("MYSQL_DB") 23 | MYSQL_USER_NAME = os.getenv("MYSQL_USER_NAME") 24 | MYSQL_USER_PASS = os.getenv("MYSQL_USER_PASS") 25 | 26 | # MONGODB 27 | MONGO_IP = os.getenv("MONGO_IP", "localhost") 28 | MONGO_PORT = int(os.getenv("MONGO_PORT", 27017)) 29 | MONGO_DB = os.getenv("MONGO_DB") 30 | MONGO_USER_NAME = os.getenv("MONGO_USER_NAME") 31 | MONGO_USER_PASS = os.getenv("MONGO_USER_PASS") 32 | 33 | # REDIS 34 | # ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"] 35 | REDISDB_IP_PORTS = os.getenv("REDISDB_IP_PORTS") 36 | REDISDB_USER_PASS = os.getenv("REDISDB_USER_PASS") 37 | 
REDISDB_DB = int(os.getenv("REDISDB_DB", 0)) 38 | # 适用于redis哨兵模式 39 | REDISDB_SERVICE_NAME = os.getenv("REDISDB_SERVICE_NAME") 40 | 41 | # 数据入库的pipeline,可自定义,默认MysqlPipeline 42 | ITEM_PIPELINES = [ 43 | "feapder.pipelines.mysql_pipeline.MysqlPipeline", 44 | # "feapder.pipelines.mongo_pipeline.MongoPipeline", 45 | ] 46 | EXPORT_DATA_MAX_FAILED_TIMES = 10 # 导出数据时最大的失败次数,包括保存和更新,超过这个次数报警 47 | EXPORT_DATA_MAX_RETRY_TIMES = 10 # 导出数据时最大的重试次数,包括保存和更新,超过这个次数则放弃重试 48 | 49 | # 爬虫相关 50 | # COLLECTOR 51 | COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔 52 | COLLECTOR_TASK_COUNT = 10 # 每次获取任务数量 53 | 54 | # SPIDER 55 | SPIDER_THREAD_COUNT = 1 # 爬虫并发数 56 | SPIDER_SLEEP_TIME = ( 57 | 0 # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数,包含2和5 58 | ) 59 | SPIDER_TASK_COUNT = 1 # 每个parser从内存队列中获取任务的数量 60 | SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数 61 | SPIDER_AUTO_START_REQUESTS = ( 62 | True # 是否主动执行添加 设置为False 需要手动调用start_monitor_task,适用于多进程情况下 63 | ) 64 | KEEP_ALIVE = False # 爬虫是否常驻 65 | 66 | # 浏览器渲染 67 | WEBDRIVER = dict( 68 | pool_size=1, # 浏览器的数量 69 | load_images=True, # 是否加载图片 70 | user_agent=None, # 字符串 或 无参函数,返回值为user_agent 71 | proxy=None, # xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址 72 | headless=False, # 是否为无头浏览器 73 | driver_type="CHROME", # CHROME、PHANTOMJS、FIREFOX 74 | timeout=30, # 请求超时时间 75 | window_size=(1024, 800), # 窗口大小 76 | executable_path=None, # 浏览器路径,默认为默认路径 77 | render_time=0, # 渲染时长,即打开网页等待指定时间后再获取源码 78 | custom_argument=["--ignore-certificate-errors"], # 自定义浏览器渲染参数 79 | xhr_url_regexes=None, # 拦截xhr接口,支持正则,数组类型 80 | ) 81 | 82 | # 爬虫启动时,重新抓取失败的requests 83 | RETRY_FAILED_REQUESTS = False 84 | # 保存失败的request 85 | SAVE_FAILED_REQUEST = True 86 | # request防丢机制。(指定的REQUEST_LOST_TIMEOUT时间内request还没做完,会重新下发 重做) 87 | REQUEST_LOST_TIMEOUT = 600 # 10分钟 88 | # request网络请求超时时间 89 | REQUEST_TIMEOUT = 22 # 等待服务器响应的超时时间,浮点数,或(connect timeout, read timeout)元组 90 | 91 | # 下载缓存 利用redis缓存,但由于内存大小限制,所以建议仅供开发调试代码时使用,防止每次debug都需要网络请求 92 | RESPONSE_CACHED_ENABLE = False # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True 93 | RESPONSE_CACHED_EXPIRE_TIME = 3600 # 缓存时间 秒 94 | RESPONSE_CACHED_USED = False # 是否使用缓存 补采数据时可设置为True 95 | 96 | # redis 存放item与request的根目录 97 | REDIS_KEY = "" 98 | # 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则; 常用于清空任务队列,否则重启时会断点续爬 99 | DELETE_KEYS = [] 100 | 101 | # 设置代理 102 | PROXY_EXTRACT_API = None # 代理提取API ,返回的代理分割符为\r\n 103 | PROXY_ENABLE = True 104 | 105 | # 随机headers 106 | RANDOM_HEADERS = True 107 | # UserAgent类型 支持 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari','mobile' 若不指定则随机类型 108 | USER_AGENT_TYPE = "chrome" 109 | # 默认使用的浏览器头 RANDOM_HEADERS=True时不生效 110 | DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36" 111 | # requests 使用session 112 | USE_SESSION = False 113 | 114 | # 去重 115 | ITEM_FILTER_ENABLE = False # item 去重 116 | ITEM_FILTER_SETTING = dict( 117 | filter_type=1 # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3 118 | ) 119 | REQUEST_FILTER_ENABLE = False # request 去重 120 | REQUEST_FILTER_SETTING = dict( 121 | filter_type=3, # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3 122 | expire_time=2592000, # 过期时间1个月 123 | ) 124 | 125 | # 报警 支持钉钉、企业微信、邮件 126 | # 钉钉报警 127 | DINGDING_WARNING_URL = "" # 钉钉机器人api 128 | DINGDING_WARNING_PHONE = "" # 报警人 支持列表,可指定多个 129 | DINGDING_WARNING_ALL = False # 是否提示所有人, 默认为False 130 | # 邮件报警 131 | EMAIL_SENDER = "" # 发件人 132 | EMAIL_PASSWORD = "" # 授权码 133 | EMAIL_RECEIVER = "" # 收件人 支持列表,可指定多个 134 | 
EMAIL_SMTPSERVER = "smtp.163.com" # 邮件服务器 默认为163邮箱 135 | # 企业微信报警 136 | WECHAT_WARNING_URL = "" # 企业微信机器人api 137 | WECHAT_WARNING_PHONE = "" # 报警人 将会在群内@此人, 支持列表,可指定多人 138 | WECHAT_WARNING_ALL = False # 是否提示所有人, 默认为False 139 | # 时间间隔 140 | WARNING_INTERVAL = 3600 # 相同报警的报警时间间隔,防止刷屏; 0表示不去重 141 | WARNING_LEVEL = "DEBUG" # 报警级别, DEBUG / ERROR 142 | WARNING_FAILED_COUNT = 1000 # 任务失败数 超过WARNING_FAILED_COUNT则报警 143 | 144 | LOG_NAME = os.path.basename(os.getcwd()) 145 | LOG_PATH = "log/%s.log" % LOG_NAME # log存储路径 146 | LOG_LEVEL = "DEBUG" 147 | LOG_COLOR = True # 是否带有颜色 148 | LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台 149 | LOG_IS_WRITE_TO_FILE = False # 是否写文件 150 | LOG_MODE = "w" # 写文件的模式 151 | LOG_MAX_BYTES = 10 * 1024 * 1024 # 每个日志文件的最大字节数 152 | LOG_BACKUP_COUNT = 20 # 日志文件保留数量 153 | LOG_ENCODING = "utf8" # 日志文件编码 154 | OTHERS_LOG_LEVAL = "ERROR" # 第三方库的log等级 155 | 156 | # 打点监控 influxdb 配置 157 | INFLUXDB_HOST = os.getenv("INFLUXDB_HOST", "localhost") 158 | INFLUXDB_PORT = int(os.getenv("INFLUXDB_PORT", 8086)) 159 | INFLUXDB_UDP_PORT = int(os.getenv("INFLUXDB_UDP_PORT", 8089)) 160 | INFLUXDB_USER = os.getenv("INFLUXDB_USER") 161 | INFLUXDB_PASSWORD = os.getenv("INFLUXDB_PASSWORD") 162 | INFLUXDB_DATABASE = os.getenv("INFLUXDB_DB") 163 | # 监控数据存储的表名,爬虫管理系统上会以task_id命名 164 | INFLUXDB_MEASUREMENT = "task_" + os.getenv("TASK_ID") if os.getenv("TASK_ID") else None 165 | # 打点监控其他参数,若这里也配置了influxdb的参数, 则会覆盖外面的配置 166 | METRICS_OTHER_ARGS = dict(retention_policy_duration="180d", emit_interval=60) 167 | 168 | ############# 导入用户自定义的setting ############# 169 | try: 170 | from setting import * 171 | 172 | # 兼容老版本的配置 173 | KEEP_ALIVE = not AUTO_STOP_WHEN_SPIDER_DONE 174 | except: 175 | pass 176 | -------------------------------------------------------------------------------- /src/lrabbit_spider/db/redisdb.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import lrabbit_spider.setting as setting 4 | from redis.connection import Encoder as _Encoder 5 | from redis.exceptions import ConnectionError, TimeoutError 6 | from redis.exceptions import DataError 7 | from redis.sentinel import Sentinel 8 | from rediscluster import RedisCluster 9 | from redis._compat import unicode,long,basestring 10 | 11 | from lrabbit_spider.utils.log import log 12 | 13 | 14 | import redis 15 | class Encoder(_Encoder): 16 | def encode(self, value): 17 | "Return a bytestring or bytes-like representation of the value" 18 | if isinstance(value, (bytes, memoryview)): 19 | return value 20 | # elif isinstance(value, bool): 21 | # # special case bool since it is a subclass of int 22 | # raise DataError( 23 | # "Invalid input of type: 'bool'. Convert to a " 24 | # "bytes, string, int or float first." 25 | # ) 26 | elif isinstance(value, float): 27 | value = repr(value).encode() 28 | elif isinstance(value, (int, long)): 29 | # python 2 repr() on longs is '123L', so use str() instead 30 | value = str(value).encode() 31 | elif isinstance(value, (list, dict, tuple)): 32 | value = unicode(value) 33 | elif not isinstance(value, basestring): 34 | # a value we don't know how to deal with. throw an error 35 | typename = type(value).__name__ 36 | raise DataError( 37 | "Invalid input of type: '%s'. Convert to a " 38 | "bytes, string, int or float first." 
% typename 39 | ) 40 | if isinstance(value, unicode): 41 | value = value.encode(self.encoding, self.encoding_errors) 42 | return value 43 | 44 | 45 | redis.connection.Encoder = Encoder 46 | class RedisDB: 47 | def __init__( 48 | self, 49 | ip_ports=None, 50 | db=None, 51 | user_pass=None, 52 | url=None, 53 | decode_responses=True, 54 | service_name=None, 55 | max_connections=32, 56 | **kwargs, 57 | ): 58 | """ 59 | redis的封装 60 | Args: 61 | ip_ports: ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"] 62 | db: 63 | user_pass: 64 | url: 65 | decode_responses: 66 | service_name: 适用于redis哨兵模式 67 | """ 68 | 69 | # 可能会改setting中的值,所以此处不能直接赋值为默认值,需要后加载赋值 70 | if ip_ports is None: 71 | ip_ports = setting.REDISDB_IP_PORTS 72 | if db is None: 73 | db = setting.REDISDB_DB 74 | if user_pass is None: 75 | user_pass = setting.REDISDB_USER_PASS 76 | if service_name is None: 77 | service_name = setting.REDISDB_SERVICE_NAME 78 | 79 | self._is_redis_cluster = False 80 | 81 | self.__redis = None 82 | self._url = url 83 | self._ip_ports = ip_ports 84 | self._db = db 85 | self._user_pass = user_pass 86 | self._decode_responses = decode_responses 87 | self._service_name = service_name 88 | self._max_connections = max_connections 89 | self._kwargs = kwargs 90 | self.get_connect() 91 | 92 | def get_connect(self): 93 | pass 94 | def __repr__(self): 95 | pass 96 | 97 | @property 98 | def _redis(self): 99 | pass 100 | @_redis.setter 101 | def _redis(self,val): 102 | pass 103 | 104 | 105 | @classmethod 106 | def from_url(cls,url): 107 | pass 108 | 109 | def sadd(self,table,values): 110 | pass 111 | 112 | def sget(self,table,count=1,is_pop=True): 113 | pass 114 | 115 | def srem(self,table,values): 116 | pass 117 | def sget_count(self,tables): 118 | pass 119 | 120 | def sdelete(slef,table): 121 | pass 122 | 123 | def sismember(self,table,key): 124 | pass 125 | 126 | def zadd(self,table,values,priority=0): 127 | pass 128 | def zget(self,table,count=1,is_pop=True): 129 | pass 130 | def zremrangebyscore(self,table,priority_min,priority_max): 131 | pass 132 | def zrangebysocre(selt,tale,priority_min,priority_mmax,count=None,is_pop=True): 133 | pass 134 | def zrangebyscore_increase_socre(self,table,priority_min,priority_max,icrease_score,count=None): 135 | pass 136 | def zrangebyscore_set_score(selfmtable,priority_min,priority_max,score,count=None): 137 | pass 138 | def zincrby(self,table,amount,value): 139 | pass 140 | 141 | def zget_count(self,table,priority_min=None,prioirty_max=None): 142 | pass 143 | 144 | def zrem(self,table,values): 145 | pass 146 | 147 | def zexiste(self,table,values): 148 | pass 149 | def lpush(self,table,values): 150 | pass 151 | 152 | def lpop(self,table,count=1): 153 | pass 154 | def rpoplpush(self,from_table,to_table=None): 155 | pass 156 | 157 | def lget_count(self,table): 158 | pass 159 | 160 | def lrem(self,table,value,num=0): 161 | pass 162 | def lrange(self,table,start=0,end=-1): 163 | pass 164 | 165 | def hset(self,table,value): 166 | pass 167 | 168 | def hset_batch(self,table,datas): 169 | pass 170 | 171 | def hincry(self,table,key,increment): 172 | pass 173 | 174 | def hget(self,table,key,is_pop=False): 175 | pass 176 | 177 | def hgetall(selfmtable): 178 | pss 179 | 180 | def hexistes(sefl,table,key): 181 | pass 182 | 183 | def hdel(self,table,*keys): 184 | pass 185 | 186 | def hget_count(self,table): 187 | pass 188 | 189 | def hkeys(self,table): 190 | pass 191 | 192 | def setbit(self,table,offset,values): 193 | pass 194 | 195 | def getbit(self,table,offsets): 196 | 
pass 197 | 198 | def bitcount(self,table): 199 | pass 200 | 201 | def strset(self,table,values,**kwargs): 202 | pass 203 | 204 | def str_incrby(self,table,values): 205 | pass 206 | 207 | def strget(self,table): 208 | pass 209 | 210 | def strlen(self,table): 211 | pass 212 | 213 | def getkeys(self,regex): 214 | pass 215 | 216 | def set_expire(self,key,seconds): 217 | pass 218 | 219 | def get_expire(self,key): 220 | pass 221 | 222 | def clear(self,table): 223 | pass 224 | 225 | def get_redis_obj(self): 226 | pass 227 | 228 | def _reconnect(self): 229 | pass 230 | 231 | def __getattr__(self,name): 232 | pass 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | -------------------------------------------------------------------------------- /src/lrabbit_spider/utils/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging.handlers import BaseRotatingHandler 3 | import os 4 | import sys 5 | import lrabbit_spider.setting as setting 6 | from better_exceptions import format_exception 7 | import loguru 8 | 9 | LOG_FORMAT = "%(threadName)s|%(asctime)s|%(filename)s|%(funcName)s|line:%(lineno)d|%(levelname)s| %(message)s" 10 | 11 | class InterceptHandler(logging.Handler): 12 | def emit(self, record): 13 | # Retrieve context where the logging call occurred, this happens to be in the 6th frame upward 14 | logger_opt = loguru.logger.opt(depth=6, exception=record.exc_info) 15 | logger_opt.log(record.levelname, record.getMessage()) 16 | 17 | class RotatingFileHandler(BaseRotatingHandler): 18 | def __init__( 19 | self, filename, mode="a", max_bytes=0, backup_count=0, encoding=None, delay=0 20 | ): 21 | BaseRotatingHandler.__init__(self, filename, mode, encoding, delay) 22 | self.max_bytes = max_bytes 23 | self.backup_count = backup_count 24 | self.placeholder = str(len(str(backup_count))) 25 | 26 | def doRollover(self): 27 | if self.stream: 28 | self.stream.close() 29 | self.stream = None 30 | if self.backup_count > 0: 31 | for i in range(self.backup_count - 1, 0, -1): 32 | sfn = ("%0" + self.placeholder + "d.") % i # '%2d.'%i -> 02 33 | sfn = sfn.join(self.baseFilename.split(".")) 34 | # sfn = "%d_%s" % (i, self.baseFilename) 35 | # dfn = "%d_%s" % (i + 1, self.baseFilename) 36 | dfn = ("%0" + self.placeholder + "d.") % (i + 1) 37 | dfn = dfn.join(self.baseFilename.split(".")) 38 | if os.path.exists(sfn): 39 | # print "%s -> %s" % (sfn, dfn) 40 | if os.path.exists(dfn): 41 | os.remove(dfn) 42 | os.rename(sfn, dfn) 43 | dfn = (("%0" + self.placeholder + "d.") % 1).join( 44 | self.baseFilename.split(".") 45 | ) 46 | if os.path.exists(dfn): 47 | os.remove(dfn) 48 | # Issue 18940: A file may not have been created if delay is True. 49 | if os.path.exists(self.baseFilename): 50 | os.rename(self.baseFilename, dfn) 51 | if not self.delay: 52 | self.stream = self._open() 53 | 54 | def shouldRollover(self, record): 55 | 56 | if self.stream is None: # delay was set... 57 | self.stream = self._open() 58 | if self.max_bytes > 0: # are we rolling over?
59 | msg = "%s\n" % self.format(record) 60 | self.stream.seek(0, 2) # due to non-posix-compliant Windows feature 61 | if self.stream.tell() + len(msg) >= self.max_bytes: 62 | return 1 63 | return 0 64 | 65 | 66 | def get_logger( 67 | 68 | name=None, 69 | path=None, 70 | log_level=None, 71 | is_write_to_console=None, 72 | is_write_to_file=None, 73 | color=None, 74 | mode=None, 75 | max_bytes=None, 76 | backup_count=None, 77 | encoding=None, 78 | is_print_default_exception=True 79 | 80 | ): 81 | name = name or setting.LOG_NAME 82 | path = path or setting.LOG_PATH 83 | log_level = log_level or setting.LOG_LEVEL 84 | is_write_to_console = (is_write_to_console 85 | if is_write_to_console is not None 86 | else setting.LOG_IS_WRITE_TO_CONSOLE 87 | ) 88 | color = color if color is not None else setting.LOG_COLOR 89 | mode = mode or setting.LOG_MODE 90 | max_bytes = max_bytes or setting.LOG_MAX_BYTES 91 | backup_count = backup_count or setting.LOG_BACKUP_COUNT 92 | encoding = encoding or setting.LOG_ENCODING 93 | is_write_to_file = (is_write_to_file if is_write_to_file is not None else getattr(setting, "LOG_IS_WRITE_TO_FILE", bool(path)))  # assumption: default to file output whenever a log path is configured 94 | 95 | name = name.split(os.sep)[-1].split(".")[0] 96 | 97 | logger = logging.getLogger() 98 | logger.setLevel(log_level) 99 | formatter = logging.Formatter(LOG_FORMAT) 100 | if is_print_default_exception: 101 | formatter.formatException = lambda exce_info : format_exception(*exce_info) 102 | 103 | if is_write_to_file:  # the file handler is gated on is_write_to_file, not is_write_to_console 104 | if path and not os.path.exists(os.path.dirname(path)): 105 | os.makedirs(os.path.dirname(path)) 106 | 107 | rf_handler = RotatingFileHandler( 108 | path, 109 | mode=mode, 110 | max_bytes=max_bytes, 111 | backup_count = backup_count, 112 | encoding = encoding 113 | ) 114 | rf_handler.setFormatter(formatter) 115 | logger.addHandler(rf_handler) 116 | if color and is_write_to_console: 117 | loguru_handler = InterceptHandler() 118 | loguru_handler.setFormatter(formatter) 119 | logger.addHandler(loguru_handler) 120 | elif is_write_to_console: 121 | stream_handler = logging.StreamHandler() 122 | stream_handler.setFormatter(formatter) 123 | stream_handler.stream=sys.stdout 124 | logger.addHandler(stream_handler) 125 | 126 | _handler_list = [] 127 | _handler_name_list = [] 128 | for _handler in logger.handlers: 129 | if str(_handler) not in _handler_name_list: 130 | _handler_name_list.append(str(_handler))  # deduplicate by the same key used in the membership check above 131 | _handler_list.append(_handler) 132 | logger.handlers = _handler_list 133 | 134 | return logger 135 | 136 | 137 | STOP_LOGS = [ 138 | # ES 139 | "urllib3.response", 140 | "urllib3.connection", 141 | "elasticsearch.trace", 142 | "requests.packages.urllib3.util", 143 | "requests.packages.urllib3.util.retry", 144 | "urllib3.util", 145 | "requests.packages.urllib3.response", 146 | "requests.packages.urllib3.contrib.pyopenssl", 147 | "requests.packages", 148 | "urllib3.util.retry", 149 | "requests.packages.urllib3.contrib", 150 | "requests.packages.urllib3.connectionpool", 151 | "requests.packages.urllib3.poolmanager", 152 | "urllib3.connectionpool", 153 | "requests.packages.urllib3.connection", 154 | "elasticsearch", 155 | "log_request_fail", 156 | # requests 157 | "requests", 158 | "selenium.webdriver.remote.remote_connection", 159 | "selenium.webdriver.remote", 160 | "selenium.webdriver", 161 | "selenium", 162 | # markdown 163 | "MARKDOWN", 164 | "build_extension", 165 | # newspaper 166 | "calculate_area", 167 | "largest_image_url", 168 | "newspaper.images", 169 | "newspaper", 170 | "Importing", 171 | "PIL", 172 | ] 173 | 174 | # quiet noisy third-party loggers; the level name is read from setting.OTHERS_LOG_LEVAL 175 | for STOP_LOG in STOP_LOGS: 176 | log_level = getattr(logging, setting.OTHERS_LOG_LEVAL) 177 |
logging.getLogger(STOP_LOG).setLevel(log_level) 178 | 179 | 180 | 181 | class Log: 182 | log = None 183 | 184 | def __getattr__(self,name): 185 | if self.__class__.log is None: 186 | self.__class__.log = get_logger() 187 | return getattr(self.__class__.log,name) 188 | @property 189 | def debug(self): 190 | return self.__getattr__("debug")  # goes through __getattr__ so the logger is created lazily on first use 191 | 192 | 193 | @property 194 | def info(self): 195 | return self.__getattr__("info") 196 | 197 | 198 | @property 199 | def warning(self): 200 | return self.__getattr__("warning") 201 | 202 | 203 | @property 204 | def exception(self): 205 | return self.__getattr__("exception") 206 | 207 | 208 | @property 209 | def error(self): 210 | return self.__getattr__("error") 211 | 212 | @property 213 | def critical(self): 214 | return self.__getattr__("critical") 215 | 216 | 217 | 218 | 219 | 220 | 221 | log = Log() -------------------------------------------------------------------------------- /src/lrabbit_spider/utils/webdriver.py: -------------------------------------------------------------------------------- 1 | 2 | from concurrent.futures import thread 3 | import os 4 | import queue 5 | from re import I 6 | import threading 7 | from selenium import webdriver 8 | from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver 9 | from lrabbit_spider.utils.log import log 10 | from lrabbit_spider.utils.tools import Singleton 11 | DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36" 12 | 13 | class XhrRequest: 14 | def __init__(self,url,data,headers): 15 | self.url = url 16 | self.data = data 17 | self.headers = headers 18 | 19 | class XhrResponse: 20 | def __init__(self,request,url,headers,content,status_code): 21 | self.request = request 22 | self.url = url 23 | self.headers = headers 24 | self.content = content 25 | self.status_code = status_code 26 | 27 | 28 | 29 | 30 | class WebDriver(RemoteWebDriver): 31 | CHROME = "CHROME" 32 | def __init__( 33 | self, 34 | load_images=True, 35 | user_agent=None, 36 | proxy=None, 37 | headless=False, 38 | driver_type=CHROME, 39 | timeout=16, 40 | window_size=(1024, 800), 41 | executable_path=None, 42 | custom_argument=None, 43 | xhr_url_regexes: list = None, 44 | **kwargs, 45 | ): 46 | """ 47 | webdriver wrapper; supports chrome, phantomjs and firefox 48 | Args: 49 | load_images: whether to load images 50 | user_agent: a string, or a zero-argument callable that returns the user agent 51 | proxy: xxx.xxx.xxx.xxx:xxxx, or a zero-argument callable that returns the proxy address 52 | headless: whether to run in headless mode 53 | driver_type: CHROME, PHANTOMJS or FIREFOX 54 | timeout: request timeout 55 | window_size: # window size 56 | executable_path: browser binary path; defaults to the system default 57 | xhr_url_regexes: xhr urls to intercept; regex patterns, list type 58 | **kwargs: 59 | """ 60 | self._load_images = load_images 61 | self._user_agent = user_agent or DEFAULT_USERAGENT 62 | self._proxy = proxy 63 | self._headless = headless 64 | self._timeout = timeout 65 | self._window_size = window_size 66 | self._executable_path = executable_path 67 | self._custom_argument = custom_argument 68 | self._xhr_url_regexes = xhr_url_regexes 69 | 70 | if self._xhr_url_regexes and driver_type !=WebDriver.CHROME: 71 | raise Exception("xhr url interception is only supported by chrome") 72 | 73 | if driver_type == WebDriver.CHROME: 74 | self.driver= self.chrome_driver() 75 | 76 | def __enter__(self): 77 | return self 78 | 79 | def __exit__(self,exc_type,exc_val,exc_tb): 80 | if exc_val: 81 | log.error(exc_val) 82 | self.quit() 83 | return True 84 | 85 | def get_driver(self): 86 | return self.driver 87 | 88 | def chrome_driver(self):
chrome_options = webdriver.ChromeOptions() 90 | chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) 91 | chrome_options.add_experimental_option("useAutomationExtension", False) 92 | # required when running inside docker 93 | chrome_options.add_argument("--no-sandbox") 94 | 95 | if self._proxy: 96 | chrome_options.add_argument( 97 | "--proxy-server={}".format( 98 | self._proxy() if callable(self._proxy) else self._proxy 99 | ) 100 | ) 101 | 102 | if self._user_agent: 103 | chrome_options.add_argument( 104 | "user-agent={}".format( 105 | self._user_agent() if callable(self._user_agent) else self._user_agent 106 | ) 107 | ) 108 | if not self._load_images:  # a prefs value of 2 blocks images, so only set it when image loading is disabled 109 | chrome_options.add_experimental_option( 110 | "prefs", {"profile.managed_default_content_settings.images": 2} 111 | ) 112 | if self._headless:  # add the headless switches only when headless mode is requested 113 | chrome_options.add_argument("--headless") 114 | chrome_options.add_argument("--disable-gpu") 115 | if self._window_size: 116 | chrome_options.add_argument( 117 | "--window-size={},{}".format(self._window_size[0],self._window_size[1]) 118 | ) 119 | 120 | if self._executable_path: 121 | driver = webdriver.Chrome(options=chrome_options,executable_path=self._executable_path) 122 | else: 123 | driver = webdriver.Chrome(options=chrome_options) 124 | 125 | with open(os.path.join(os.path.dirname(__file__),"./js/stealth.min.js")) as f: 126 | js = f.read() 127 | driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js}) 128 | 129 | if self._xhr_url_regexes: 130 | assert isinstance(self._xhr_url_regexes,list) 131 | with open( 132 | os.path.join(os.path.dirname(__file__),"./js/intercept.js") 133 | 134 | ) as f: 135 | js = f.read() 136 | driver.execute_cdp_cmd( 137 | "Page.addScriptToEvaluateOnNewDocument", {"source": js} 138 | ) 139 | js = f"window.__urlRegexes = {self._xhr_url_regexes}" 140 | driver.execute_cdp_cmd( 141 | "Page.addScriptToEvaluateOnNewDocument", {"source": js} 142 | ) 143 | return driver 144 | 145 | 146 | 147 | @property 148 | def cookies(self): 149 | cookies_json = {} 150 | for cookie in self.driver.get_cookies(): 151 | cookies_json[cookie["name"]] = cookie['value'] 152 | return cookies_json 153 | 154 | @cookies.setter 155 | def cookies(self,val): 156 | for key,value in val.items(): 157 | self.driver.add_cookie({"name":key,"value":value}) 158 | 159 | pass 160 | 161 | @property 162 | def user_agent(self): 163 | return self.driver.execute_script("return navigator.userAgent;") 164 | 165 | 166 | def xhr_response(self,xhr_url_regex): 167 | data = self.driver.execute_script( 168 | f'return window.__ajaxData["{xhr_url_regex}"]' 169 | ) 170 | if not data: 171 | return None 172 | 173 | request = XhrRequest(**data["request"]) 174 | response = XhrResponse(request, **data["response"]) 175 | return response 176 | def xhr_text(self,xhr_url_regex): 177 | response = self.xhr_response(xhr_url_regex) 178 | if not response: 179 | return None 180 | return response.content 181 | pass 182 | 183 | def xhr_json(self,xhr_url_regex): 184 | pass 185 | 186 | def __getattr__(self,xhr_url_regex): 187 | pass 188 | 189 | 190 | 191 | @Singleton 192 | class WebDriverPool: 193 | def __init__(self,pool_size=5,**kwargs): 194 | 195 | self.queue = queue.Queue(maxsize=pool_size) 196 | self.kwargs=kwargs 197 | self.lock = threading.RLock() 198 | self.driver_count = 0 199 | pass 200 | @property 201 | def is_full(self): 202 | return self.driver_count >=self.queue.maxsize 203 | pass 204 | 205 | def get(self,user_agent,proxy): 206 | if not self.is_full: 207 | with self.lock: 208 | kwargs = self.kwargs.copy() 209
| if user_agent: 210 | kwargs["user_agent"] = user_agent 211 | if proxy: 212 | kwargs["proxy"] = proxy 213 | driver = WebDriver(**kwargs) 214 | self.queue.put(driver) 215 | self.driver_count +=1 216 | driver = self.queue.get() 217 | return driver 218 | 219 | def remove(self,driver): 220 | driver.quit() 221 | self.driver_count-=1 222 | 223 | def close(self): 224 | while not self.queue.empty(): 225 | driver = self.queue.get() 226 | driver.quit() 227 | self.driver_count -=1 228 | 229 | 230 | 231 | 232 | 233 | def put(self,driver): 234 | self.queue.put(driver)  # return the driver to the pool for reuse; the remove()/close() implementations above stay in effect 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2021/11/18 11:41 4 | @Author : lrabbit 5 | @FileName: spider.py 6 | @Software: PyCharm 7 | @Blog : https://www.lrabbit.life 8 | """ 9 | import traceback 10 | from lrabbit_scrapy.common_utils.mysql_helper import MysqlClient 11 | from lrabbit_scrapy.common_utils.redis_helper import RedisClient 12 | from threading import Thread 13 | from lrabbit_scrapy.common_utils.all_in_one import get_time_format_now 14 | from lrabbit_scrapy.common_utils.print_log_helper import LogUtils 15 | 16 | 17 | class LrabbitSpider: 18 | 19 | def __init__(self): 20 | 21 | self._init_config() 22 | self.mysql_client = MysqlClient(config_path_env=self.config_env_name) 23 | self.redis_client = RedisClient(config_path_env=self.config_env_name, db=self.redis_db_config) 24 | spider_task_key = f'list:{self.__getattribute__("spider_name")}' 25 | self.spider_name = self.__getattribute__("spider_name") 26 | self.task_list_redis_key = spider_task_key 27 | self.success_count_all_key = f"success:count:{self.spider_name}" 28 | self.success_count_day_key = f"success:count:{self.spider_name}:{get_time_format_now()}" 29 | self.fail_count_all_key = f"fail:count:{self.spider_name}" 30 | self.fail_count_day_key = f"fail:count:{self.spider_name}:{get_time_format_now()}" 31 | self.thread_task_list = [] 32 | self.task_list = [] 33 | 34 | def _init_config(self): 35 | try: 36 | self.__getattribute__("thread_mysql_open") 37 | except: 38 | self.thread_mysql_open = False 39 | try: 40 | self.__getattribute__("max_thread_num") 41 | except: 42 | self.max_thread_num = 10 43 | try: 44 | self.__getattribute__("reset_task_config") 45 | except: 46 | self.reset_task_config = False 47 | try: 48 | self.__getattribute__("loop_task_config") 49 | except: 50 | self.loop_task_config = False 51 | try: 52 | self.__getattribute__("remove_confirm_config") 53 | except: 54 | self.remove_confirm_config = False 55 | try: 56 | self.__getattribute__("config_env_name") 57 | except: 58 | self.config_env_name = "config_path" 59 | try: 60 | self.__getattribute__("redis_db_config") 61 | except: 62 | self.redis_db_config = 0 63 | try: 64 | self.__getattribute__("debug_config") 65 | except: 66 | self.debug_config = True 67 | 68 | def _send_task_redis(self, task_list): 69 | for task in task_list: 70 | LogUtils.log_info("new task", task) 71 | self.redis_client.redis_executor.sadd(self.task_list_redis_key, task) 72 | 73 | def update_stat_redis(self): 74 | """ 75 | success:count_all success:count:spider_name 76 | success:count:day success:count:spider_name:2021-11-11 77 | :return: 78 | """ 79 | day = get_time_format_now() 80 | self.success_count_day_key = f"success:count:{self.spider_name}:{day}"
81 | self.redis_client.redis_executor.incr(self.success_count_all_key) 82 | self.redis_client.redis_executor.incr(self.success_count_day_key) 83 | 84 | def _init_task_list(self): 85 | 86 | if self.reset_task_config or not self.redis_client.redis_executor.exists(self.task_list_redis_key): 87 | 88 | LogUtils.log_info("init task list") 89 | generate_task_list_callback = self.__getattribute__("init_task_list") 90 | if self.redis_client.redis_executor.exists(self.task_list_redis_key): 91 | LogUtils.log_info("already exists", self.task_list_redis_key, "task list", "count", 92 | self.redis_client.redis_executor.scard(self.task_list_redis_key)) 93 | try: 94 | remove_confirm_config = self.__getattribute__("remove_confirm_config") 95 | if not remove_confirm_config: 96 | option = input("please input y to delete task list and add new task") 97 | if option != 'y': 98 | exit(-1) 99 | except AttributeError as e: 100 | option = input("please input y to delete task list and add new task") 101 | if option != 'y': 102 | exit(-1) 103 | except Exception as e: 104 | pass 105 | self.redis_client.redis_executor.delete(self.task_list_redis_key) 106 | 107 | generate_task_all = generate_task_list_callback() 108 | count = self.redis_client.redis_executor.scard(self.task_list_redis_key) 109 | if count >= 1: 110 | LogUtils.log_info("already init task") 111 | return 112 | if len(generate_task_all) < 10: 113 | for item in generate_task_all: 114 | LogUtils.log_info("new task", item) 115 | self.redis_client.redis_executor.sadd(self.task_list_redis_key, item) 116 | else: 117 | thread_num = 10 118 | step = len(generate_task_all) // thread_num 119 | send_thread_list = [] 120 | for i in range(thread_num): 121 | if i == thread_num - 1: 122 | t = Thread(target=self._send_task_redis, args=(generate_task_all[i * step:],)) 123 | else: 124 | t = Thread(target=self._send_task_redis, args=(generate_task_all[(i * step):(i + 1) * step],)) 125 | t.start() 126 | send_thread_list.append(t) 127 | for t in send_thread_list: 128 | t.join() 129 | LogUtils.log_finish("init task list success") 130 | 131 | task_count = self.redis_client.redis_executor.scard(self.task_list_redis_key) 132 | LogUtils.log_info("current task count", task_count) 133 | try: 134 | remove_confirm_config = self.__getattribute__("remove_confirm_config") 135 | if not remove_confirm_config: 136 | option = input("please input y to continue") 137 | if option != 'y': 138 | exit(-1) 139 | except AttributeError as e: 140 | option = input("please input y to continue") 141 | if option != 'y': 142 | exit(-1) 143 | except Exception as e: 144 | pass 145 | 146 | def _run(self): 147 | self._init_task_list() 148 | try: 149 | worker_callback = self.__getattribute__("worker") 150 | except Exception as e: 151 | LogUtils.log_error("you not define worker function") 152 | exit(-1) 153 | 154 | def self_loop_call_back(): 155 | while True: 156 | task = self.redis_client.redis_executor.spop(self.task_list_redis_key) 157 | if not task: 158 | break 159 | try: 160 | self.task_list.append(task) 161 | if self.thread_mysql_open: 162 | _mysql_client = MysqlClient(config_path_env=self.config_env_name) 163 | worker_callback(task, _mysql_client) 164 | else: 165 | worker_callback(task) 166 | except Exception as e: 167 | name_exception = type(e).__name__.lower() 168 | if self.debug_config: 169 | traceback.print_exc() 170 | else: 171 | LogUtils.log_error(task, name_exception, e.__getattribute__('args')) 172 | self.redis_client.redis_executor.sadd( 173 | 
f"list:error:count:{self.spider_name}:{name_exception}:{get_time_format_now()}", 174 | task) 175 | self.fail_count_day_key = f"fail:count:{self.spider_name}:{get_time_format_now()}" 176 | self.redis_client.redis_executor.incr(self.fail_count_day_key) 177 | self.redis_client.redis_executor.incr(self.fail_count_all_key) 178 | try: 179 | self.task_list.remove(task) 180 | except Exception as e: 181 | pass 182 | 183 | self.task_list = [] 184 | self.process_list = [] 185 | 186 | for _ in range(self.max_thread_num): 187 | t = Thread(target=self_loop_call_back, args=()) 188 | t.start() 189 | self.process_list.append(t) 190 | for t in self.process_list: 191 | t.join() 192 | 193 | def _menu(self): 194 | import sys 195 | options = sys.argv[1:] 196 | if len(options) > 0: 197 | if options[0] == 'stat': 198 | LogUtils.log_info("remain t ask list", self.redis_client.redis_executor.scard(self.task_list_redis_key)) 199 | print("\n") 200 | day = get_time_format_now() 201 | self.success_count_day_key = f"success:count:{self.spider_name}:{day}" 202 | LogUtils.log_finish("today success count", 203 | self.redis_client.redis_executor.get(self.success_count_day_key)) 204 | LogUtils.log_error("today fail count", self.redis_client.redis_executor.get(self.fail_count_day_key)) 205 | LogUtils.log_finish("all success count", 206 | self.redis_client.redis_executor.get(self.success_count_all_key)) 207 | LogUtils.log_error("all fail count", self.redis_client.redis_executor.get(self.fail_count_all_key)) 208 | print("\n") 209 | LogUtils.log_error("404 status_code count", 210 | self.redis_client.redis_executor.scard( 211 | f"list:error:count:{self.spider_name}:exception404:{get_time_format_now()}")) 212 | LogUtils.log_error("403 status_code count", 213 | self.redis_client.redis_executor.scard( 214 | f"list:error:count:{self.spider_name}:exception403:{get_time_format_now()}")) 215 | LogUtils.log_error("500 status_code count", 216 | self.redis_client.redis_executor.scard( 217 | f"list:error:count:{self.spider_name}:exception500:{get_time_format_now()}")) 218 | else: 219 | LogUtils.log_error(" you can add stat option ,check scrapy stat") 220 | exit(-1) 221 | 222 | def run(self): 223 | while True: 224 | self._menu() 225 | try: 226 | self._run() 227 | if self.loop_task_config: 228 | continue 229 | break 230 | except KeyboardInterrupt as e: 231 | # when you keyboard break,need give this task back 232 | while True: 233 | if len(self.task_list) == 0: 234 | break 235 | task = self.task_list.pop() 236 | if not task: 237 | break 238 | self.redis_client.redis_executor.sadd(self.task_list_redis_key, task) 239 | LogUtils.log_now_time_str() 240 | LogUtils.log_finish("all finish") 241 | 242 | 243 | if __name__ == '__main__': 244 | spider = LrabbitSpider() 245 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/common_utils/mysql_helper.py: -------------------------------------------------------------------------------- 1 | import time 2 | import traceback 3 | import pymysql.cursors 4 | import pymysql 5 | import queue 6 | import threading 7 | import logging 8 | from lrabbit_scrapy.common_utils.config_helper import get_mysql_config, get_config_path 9 | 10 | 11 | class MysqlClient(object): 12 | 13 | def __init__(self, database=None, config_path_env=None, env='test'): 14 | config_path = get_config_path(config_path_env) 15 | mysql_config = get_mysql_config(config_path, env) 16 | host = mysql_config.MYSQL_HOST 17 | if not database: 18 | database = mysql_config.MYSQL_DATABASE 19 | user = 
mysql_config.MYSQL_USER 20 | password = mysql_config.MYSQL_PASSWORD 21 | port = mysql_config.MYSQL_PORT 22 | max_idle_time = 7 * 3600 23 | connect_timeout = 10 24 | time_zone = "+0:00" 25 | charset = "utf8mb4" 26 | sql_mode = "TRADITIONAL" 27 | self.host = host 28 | self.database = database 29 | self.max_idle_time = float(max_idle_time) 30 | args = dict(use_unicode=True, charset=charset, 31 | database=database, 32 | init_command=('SET time_zone = "%s"' % time_zone), 33 | cursorclass=pymysql.cursors.DictCursor, 34 | connect_timeout=connect_timeout, sql_mode=sql_mode) 35 | if user is not None: 36 | args["user"] = user 37 | if password is not None: 38 | args["passwd"] = password 39 | # We accept a path to a MySQL socket file or a host(:port) string 40 | if "/" in host: 41 | args["unix_socket"] = host 42 | else: 43 | self.socket = None 44 | pair = host.split(":") 45 | if len(pair) == 2: 46 | args["host"] = pair[0] 47 | args["port"] = int(pair[1]) 48 | else: 49 | args["host"] = host 50 | args["port"] = 3306 51 | if port: 52 | args['port'] = port 53 | 54 | self._db = None 55 | self._db_args = args 56 | self._last_use_time = time.time() 57 | try: 58 | self.reconnect() 59 | except Exception: 60 | logging.error("Cannot connect to MySQL on %s", self.host, 61 | exc_info=True) 62 | 63 | def _ensure_connected(self): 64 | if (self._db is None or 65 | (time.time() - self._last_use_time > self.max_idle_time)): 66 | self.reconnect() 67 | self._last_use_time = time.time() 68 | 69 | def _cursor(self): 70 | self._ensure_connected() 71 | return self._db.cursor() 72 | 73 | def __del__(self): 74 | self.close() 75 | 76 | def close(self): 77 | """Closes this database connection.""" 78 | if getattr(self, "_db", None) is not None: 79 | self._db.close() 80 | self._db = None 81 | 82 | def reconnect(self): 83 | """Closes the existing database connection and re-opens it.""" 84 | self.close() 85 | self._db = pymysql.connect(**self._db_args) 86 | self._db.autocommit(True) 87 | 88 | def query(self, query, *parameters, **kwparameters): 89 | """Returns a row list for the given query and parameters.""" 90 | cursor = self._cursor() 91 | try: 92 | cursor.execute(query, kwparameters or parameters) 93 | result = cursor.fetchall() 94 | return result 95 | finally: 96 | cursor.close() 97 | 98 | def get(self, query, *parameters, **kwparameters): 99 | """Returns the (singular) row returned by the given query. 
100 | """ 101 | cursor = self._cursor() 102 | try: 103 | cursor.execute(query, kwparameters or parameters) 104 | return cursor.fetchone() 105 | finally: 106 | cursor.close() 107 | 108 | def execute(self, query, *parameters, **kwparameters): 109 | """Executes the given query, returning the lastrowid from the query.""" 110 | cursor = self._cursor() 111 | try: 112 | cursor.execute(query, kwparameters or parameters) 113 | return cursor.lastrowid 114 | except Exception as e: 115 | if e.args[0] == 1062: 116 | pass 117 | else: 118 | traceback.print_exc() 119 | raise e 120 | finally: 121 | cursor.close() 122 | 123 | insert = execute 124 | 125 | ## =============== high level method for table =================== 126 | 127 | def table_has(self, table_name, field, value): 128 | if isinstance(value, str): 129 | value = value.encode('utf8') 130 | sql = 'SELECT %s FROM %s WHERE %s="%s"' % ( 131 | field, 132 | table_name, 133 | field, 134 | value) 135 | d = self.get(sql) 136 | return d 137 | 138 | def table_insert(self, table_name, item): 139 | '''item is a dict : key is mysql table field''' 140 | fields = list(item.keys()) 141 | values = list(item.values()) 142 | fieldstr = ','.join(fields) 143 | valstr = ','.join(['%s'] * len(item)) 144 | for i in range(len(values)): 145 | if isinstance(values[i], str): 146 | values[i] = values[i].encode('utf8') 147 | sql = 'INSERT INTO %s (%s) VALUES(%s)' % (table_name, fieldstr, valstr) 148 | try: 149 | last_id = self.execute(sql, *values) 150 | return last_id 151 | except Exception as e: 152 | if e.args[0] == 1062: 153 | # just skip duplicated item 154 | pass 155 | else: 156 | traceback.print_exc() 157 | print('sql:', sql) 158 | print('item:') 159 | for i in range(len(fields)): 160 | vs = str(values[i]) 161 | if len(vs) > 300: 162 | print(fields[i], ' : ', len(vs), type(values[i])) 163 | else: 164 | print(fields[i], ' : ', vs, type(values[i])) 165 | raise e 166 | 167 | def table_update(self, table_name, updates, 168 | field_where, value_where): 169 | '''updates is a dict of {field_update:value_update}''' 170 | upsets = [] 171 | values = [] 172 | for k, v in updates.items(): 173 | s = '%s=%%s' % k 174 | upsets.append(s) 175 | values.append(v) 176 | upsets = ','.join(upsets) 177 | sql = 'UPDATE %s SET %s WHERE %s="%s"' % ( 178 | table_name, 179 | upsets, 180 | field_where, value_where, 181 | ) 182 | self.execute(sql, *(values)) 183 | 184 | 185 | logger = logging.Logger(name="mysql connect") 186 | 187 | 188 | class Connection(pymysql.connections.Connection): 189 | _pool = None 190 | _reusable_expection = (pymysql.err.ProgrammingError, pymysql.err.IntegrityError, pymysql.err.NotSupportedError) 191 | 192 | def __init__(self, *args, **kwargs): 193 | pymysql.connections.Connection.__init__(self, *args, **kwargs) 194 | self.args = args 195 | self.kwargs = kwargs 196 | 197 | def __exit__(self, exc, value, traceback): 198 | 199 | pymysql.connections.Connection.__exit__(self, exc, value, traceback) 200 | if self._pool: 201 | if not exc or exc in self._reusable_expection: 202 | '''reusable connection''' 203 | self._pool.put_connection(self) 204 | else: 205 | '''no reusable connection, close it and create a new one then put it to the pool''' 206 | self._pool.put_connection(self._recreate(*self.args, **self.kwargs)) 207 | self._pool = None 208 | try: 209 | self.close() 210 | logger.warning("Close not reusable connection from pool(%s) caused by %s", self._pool.name, value) 211 | except Exception: 212 | pass 213 | 214 | def _recreate(self, *args, **kwargs): 215 | conn = 
Connection(*args, **kwargs) 216 | logger.debug('Create new connection due to pool(%s) lacking', self._pool.name) 217 | return conn 218 | 219 | def close(self): 220 | 221 | if self._pool: 222 | self._pool.put_connection(self) 223 | else: 224 | pymysql.connections.Connection.close(self) 225 | 226 | def execute_query(self, query, args=(), dictcursor=False, return_one=False, exec_many=False): 227 | 228 | with self: 229 | cur = self.cursor() if not dictcursor else self.cursor(pymysql.cursors.DictCursor) 230 | try: 231 | if exec_many: 232 | cur.executemany(query, args) 233 | else: 234 | cur.execute(query, args) 235 | except Exception: 236 | raise 237 | # if no record match the query, return () if return_one==False, else return None 238 | return cur.fetchone() if return_one else cur.fetchall() 239 | 240 | 241 | class ConnectionPool: 242 | _HARD_LIMIT = 200 243 | _THREAD_LOCAL = threading.local() 244 | _THREAD_LOCAL.retry_counter = 0 # a counter used for debug get_connection() method 245 | 246 | def __init__(self, size=10, name=None, *args, **kwargs): 247 | self._pool = queue.Queue(self._HARD_LIMIT) 248 | self._size = size if 0 < size < self._HARD_LIMIT else self._HARD_LIMIT 249 | self.name = name if name else '-'.join( 250 | [kwargs.get('host', 'localhost'), str(kwargs.get('port', 3306)), 251 | kwargs.get('user', ''), kwargs.get('database', '')]) 252 | for _ in range(self._size): 253 | conn = Connection(*args, **kwargs) 254 | conn._pool = self 255 | self._pool.put(conn) 256 | 257 | def get_connection(self, timeout=1, retry_num=1) -> Connection: 258 | """ 259 | timeout: timeout of get a connection from pool, should be a int(0 means return or raise immediately) 260 | retry_num: how many times will retry to get a connection 261 | """ 262 | try: 263 | conn = self._pool.get(timeout=timeout) if timeout > 0 else self._pool.get_nowait() 264 | logger.debug('Get connection from pool(%s)', self.name) 265 | return conn 266 | except queue.Empty: 267 | if not hasattr(self._THREAD_LOCAL, 'retry_counter'): 268 | self._THREAD_LOCAL.retry_counter = 0 269 | if retry_num > 0: 270 | self._THREAD_LOCAL.retry_counter += 1 271 | logger.debug('Retry get connection from pool(%s), the %d times', self.name, 272 | self._THREAD_LOCAL.retry_counter) 273 | retry_num -= 1 274 | return self.get_connection(timeout, retry_num) 275 | else: 276 | total_times = self._THREAD_LOCAL.retry_counter + 1 277 | self._THREAD_LOCAL.retry_counter = 0 278 | raise GetConnectionFromPoolError("can't get connection from pool({}) within {}*{} second(s)".format( 279 | self.name, timeout, total_times)) 280 | 281 | def put_connection(self, conn): 282 | if not conn._pool: 283 | conn._pool = self 284 | conn.cursor().close() 285 | try: 286 | self._pool.put_nowait(conn) 287 | logger.debug("Put connection back to pool(%s)", self.name) 288 | except queue.Full: 289 | logger.warning("Put connection to pool(%s) error, pool is full, size:%d", self.name, self.size()) 290 | 291 | def size(self): 292 | return self._pool.qsize() 293 | 294 | 295 | class GetConnectionFromPoolError(Exception): 296 | """Exception related can't get connection from pool within timeout seconds.""" 297 | 298 | 299 | class MysqlConnectionPool: 300 | 301 | def __init__(self, database=None, config_path_env=None, env='test'): 302 | config_path = get_config_path(config_path_env) 303 | mysql_config = get_mysql_config(config_path, env) 304 | host = mysql_config.MYSQL_HOST 305 | if not database: 306 | database = mysql_config.MYSQL_DATABASE 307 | user = mysql_config.MYSQL_USER 308 | password = 
mysql_config.MYSQL_PASSWORD 309 | port = mysql_config.MYSQL_PORT 310 | max_idle_time = 7 * 3600 311 | connect_timeout = 10 312 | time_zone = "+0:00" 313 | charset = "utf8mb4" 314 | sql_mode = "TRADITIONAL" 315 | self.host = host 316 | self.database = database 317 | self.max_idle_time = float(max_idle_time) 318 | args = dict(use_unicode=True, charset=charset, 319 | database=database, 320 | init_command=('SET time_zone = "%s"' % time_zone), 321 | cursorclass=pymysql.cursors.DictCursor, 322 | connect_timeout=connect_timeout, sql_mode=sql_mode) 323 | if user is not None: 324 | args["user"] = user 325 | if password is not None: 326 | args["passwd"] = password 327 | # We accept a path to a MySQL socket file or a host(:port) string 328 | if "/" in host: 329 | args["unix_socket"] = host 330 | else: 331 | self.socket = None 332 | pair = host.split(":") 333 | if len(pair) == 2: 334 | args["host"] = pair[0] 335 | args["port"] = int(pair[1]) 336 | else: 337 | args["host"] = host 338 | args["port"] = 3306 339 | if port: 340 | args['port'] = port 341 | self._args = args 342 | self.pool = ConnectionPool(size=10, **self._args) 343 | 344 | def execute_query(self, sql): 345 | conn = self.pool.get_connection() 346 | with conn: 347 | res = conn.execute_query(sql) 348 | return res 349 | 350 | 351 | if __name__ == '__main__': 352 | pass 353 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/asynico_basespider.py: -------------------------------------------------------------------------------- 1 | import asyncio.queues 2 | import asyncio 3 | import time 4 | import datetime 5 | import sys 6 | import sqlalchemy as sa 7 | from sqlalchemy import MetaData, Table, Column, Integer 8 | from aiomysql.sa import create_engine as aio_create_engine 9 | from sqlalchemy import create_engine 10 | from configparser import ConfigParser 11 | from typing import Dict 12 | import inspect 13 | import csv 14 | import os 15 | import traceback 16 | 17 | pwd_dir = os.path.abspath(os.getcwd()) 18 | metadata = MetaData() 19 | 20 | config = None 21 | last_exc = None 22 | 23 | 24 | class TermColor: 25 | ATTRIBUTES = dict( 26 | list(zip([ 27 | 'bold', 28 | 'dark', 29 | '', 30 | 'underline', 31 | 'blink', 32 | '', 33 | 'reverse', 34 | 'concealed' 35 | ], 36 | list(range(1, 9)) 37 | )) 38 | ) 39 | del ATTRIBUTES[''] 40 | 41 | HIGHLIGHTS = dict( 42 | list(zip([ 43 | 'on_grey', 44 | 'on_red', 45 | 'on_green', 46 | 'on_yellow', 47 | 'on_blue', 48 | 'on_magenta', 49 | 'on_cyan', 50 | 'on_white' 51 | ], 52 | list(range(40, 48)) 53 | )) 54 | ) 55 | 56 | COLORS = dict( 57 | list(zip([ 58 | 'grey', 59 | 'red', 60 | 'green', 61 | 'yellow', 62 | 'blue', 63 | 'magenta', 64 | 'cyan', 65 | 'white', 66 | ], 67 | list(range(30, 38)) 68 | )) 69 | ) 70 | 71 | RESET = '\033[0m' 72 | 73 | @staticmethod 74 | def colored(text, color=None, on_color=None, attrs=None): 75 | 76 | if os.getenv('ANSI_COLORS_DISABLED') is None: 77 | fmt_str = '\033[%dm%s' 78 | if color is not None: 79 | text = fmt_str % (TermColor.COLORS[color], text) 80 | 81 | if on_color is not None: 82 | text = fmt_str % (TermColor.HIGHLIGHTS[on_color], text) 83 | 84 | if attrs is not None: 85 | for attr in attrs: 86 | text = fmt_str % (TermColor.ATTRIBUTES[attr], text) 87 | 88 | text += TermColor.RESET 89 | return text 90 | 91 | 92 | class CommonUtils: 93 | 94 | def __init__(self): 95 | pass 96 | 97 | @staticmethod 98 | def fix_str_args(args): 99 | return list(map(lambda x: str(x).strip(), args)) 100 | 101 | @staticmethod 102 | def 
get_format_time(for_mat='%Y-%m-%d %H:%M:%S'): 103 | return TermColor.colored(datetime.datetime.now().strftime(for_mat), 'yellow').encode('utf8') 104 | 105 | @staticmethod 106 | def space_join_line_arg(*args): 107 | return ' '.join(args) + '\n' 108 | 109 | 110 | class LogUtils: 111 | 112 | def __init__(self): 113 | pass 114 | 115 | @staticmethod 116 | def log_now_time_str(): 117 | sys.stdout.buffer.write(CommonUtils.get_format_time()) 118 | 119 | @staticmethod 120 | def log_str(color_str, args): 121 | args = CommonUtils.fix_str_args(args) 122 | text = ' '.join(args) 123 | text = color_str + ' ' + text + '\n' 124 | sys.stdout.buffer.write(text.encode('utf8')) 125 | 126 | @staticmethod 127 | def log_info(*args): 128 | color_str = TermColor.colored('[*INFO*]', 'cyan') 129 | LogUtils.log_str(color_str, args) 130 | 131 | @staticmethod 132 | def log_running(*args): 133 | color_str = TermColor.colored('[*RUNNING*]', 'yellow') 134 | LogUtils.log_str(color_str, args) 135 | 136 | @staticmethod 137 | def log_finish(*args): 138 | color_str = TermColor.colored('*FINISH*', 'green') 139 | LogUtils.log_str(color_str, args) 140 | 141 | @staticmethod 142 | def log_error(*args): 143 | color_str = TermColor.colored('[*ERROR*]', 'red') 144 | LogUtils.log_str(color_str, args) 145 | 146 | @staticmethod 147 | def log_to_file(file_path, line): 148 | """ 149 | 150 | :param file_path: log file path 151 | :param line: a str type 152 | :return: 153 | """ 154 | with open(file_path, 'a', encoding='utf8') as f: 155 | line = CommonUtils.space_join_line_arg(LogUtils.get_format_time(), line) 156 | f.write(line) 157 | 158 | 159 | class DbUtils(LogUtils): 160 | """ 161 | all_tables is store all tables sqlalchemy metadata 162 | """ 163 | all_tables: Dict[str, sa.Table] = {} 164 | 165 | def __init__(self, spider_name): 166 | spider_name = spider_name.split(".")[0] 167 | super(DbUtils, self).__init__() 168 | global config 169 | config = ConfigParser() 170 | config_path = os.path.join(pwd_dir, f'{spider_name}.ini') 171 | config.read(config_path) 172 | env = os.getenv('ENV', 'test') 173 | config = config[env] 174 | self.engine = None 175 | 176 | async def init_engine(self): 177 | """ 178 | init connect pymysql,create tables by sqlalchemy 179 | :return: 180 | """ 181 | self.engine = await aio_create_engine( 182 | user=config['db_user'], 183 | password=config['db_password'], 184 | port=3306, 185 | host=config['db_host'], 186 | db=config['db_database'], 187 | autocommit=True 188 | ) 189 | engine = create_engine( 190 | f'mysql+pymysql://{config["db_user"]}:{config["db_password"]}@{config["db_host"]}/{config["db_database"]}', 191 | echo=True, 192 | 193 | ) 194 | metadata.bind = engine 195 | try: 196 | if self.__getattribute__('is_drop_tables'): 197 | yes = input("请输入yes,确认删除表数据") 198 | if yes != 'yes': 199 | exit(1) 200 | metadata.drop_all() 201 | LogUtils.log_finish('已清空表') 202 | except Exception as e: 203 | LogUtils.log_info("not found is_drop_tables") 204 | self._generate_tables() 205 | metadata.create_all(engine) 206 | 207 | def _generate_tables(self): 208 | """ 209 | get all attrs and check name ,if start_with "table" ,get it values to generate columns. 
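e.g. (illustrative attribute, not from the original code) table_user = [sa.Column('name', sa.String(50))] becomes a table named "user" with an auto-increment "id" column added in front.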
210 | 211 | :return: 212 | """ 213 | for k, v in inspect.getmembers(self): 214 | if k.startswith('table'): 215 | table_name = k.split('_')[-1] 216 | try: 217 | tbl = Table(table_name, metadata, Column('id', Integer, primary_key=True), *v) 218 | except Exception as e: 219 | pass 220 | self.all_tables[table_name] = tbl 221 | LogUtils.log_finish(table_name, '创建完成') 222 | 223 | @asyncio.coroutine 224 | def insert_one(self, sql): 225 | """ 226 | :param sql: this sql you can sqlalchemy api generate a sql or you can just write origin sql 227 | :return: 228 | """ 229 | with (yield from self.engine) as conn: 230 | yield from conn.execute(sql) 231 | 232 | @asyncio.coroutine 233 | def query(self, sql): 234 | with (yield from self.engine) as conn: 235 | res = yield from conn.execute(sql) 236 | res = yield from res.fetchall() 237 | return res 238 | 239 | @asyncio.coroutine 240 | def execute(self, sql): 241 | with (yield from self.engine) as conn: 242 | yield from conn.execute(sql) 243 | 244 | 245 | class FileStore: 246 | 247 | def __init__(self, file_name, headers, reset_task_list): 248 | self.file_name = f'{file_name}.csv' 249 | self.headers = headers 250 | self.reset_task_list = reset_task_list 251 | self.write_headers() 252 | 253 | def write_headers(self): 254 | """ 255 | write headers 256 | :return: 257 | """ 258 | if not os.path.exists(os.path.join(pwd_dir, self.file_name)) or self.reset_task_list: 259 | with open(os.path.join(pwd_dir, self.file_name), 'w', encoding='utf8', newline='') as f: 260 | dict_write = csv.DictWriter(f, fieldnames=self.headers) 261 | dict_write.writeheader() 262 | 263 | def write(self, d): 264 | """ 265 | 266 | :param d: dict type 267 | :return: 268 | """ 269 | with open(os.path.join(pwd_dir, self.file_name), 'a', encoding='utf8', newline='') as f: 270 | dict_write = csv.DictWriter(f, fieldnames=self.headers) 271 | dict_write.writerow(d) 272 | 273 | 274 | class WriteUtil(LogUtils): 275 | """ 276 | all_files : store all FileStore object,you can pass a name to write data to different files 277 | """ 278 | all_files: Dict[str, FileStore] = {} 279 | 280 | def __init__(self): 281 | 282 | super(WriteUtil, self).__init__() 283 | self._generate_files() 284 | 285 | def _generate_files(self): 286 | try: 287 | reset_task_list = self.__getattribute__('reset_task_list') 288 | except Exception as e: 289 | self.log_info('not found reset_task_list option ') 290 | return 291 | for k, v in inspect.getmembers(self): 292 | if k.startswith('file'): 293 | file_name = k.split("_")[-1] 294 | self.all_files[file_name] = FileStore(file_name, v, reset_task_list) 295 | LogUtils.log_finish(f'创建{file_name}存储文件成功') 296 | 297 | 298 | class BaseSpider(DbUtils, WriteUtil): 299 | """ 300 | spider_name:default is filename 301 | """ 302 | 303 | def __init__(self, spider_name: str): 304 | # in windows spider_name maybe is .\asy_crawl.py ,so we need trim it 305 | if spider_name.startswith(".\\"): 306 | spider_name = spider_name.replace(".\\", "") 307 | spider_name = spider_name.split('.')[0] 308 | DbUtils.__init__(self, spider_name) 309 | WriteUtil.__init__(self) 310 | self.task_queue = asyncio.queues.Queue() 311 | self.task_list = [] 312 | self.redis = None 313 | self.db = None 314 | self.spider_name = spider_name 315 | self.start_time = None 316 | self.finish_file_name = None 317 | self.all_file_name = None 318 | self.init_file_name(spider_name) 319 | self.config = ConfigParser() 320 | config_path = os.path.join(pwd_dir, f'{spider_name}.ini') 321 | self.config.read(config_path) 322 | env = os.getenv('ENV', 
'test') 323 | self.config = self.config[env] 324 | 325 | def init_file_name(self, spider_name): 326 | """ 327 | 328 | :param spider_name:generate task_list files 329 | :return: 330 | """ 331 | spider_name = spider_name.split('.')[0] 332 | self.finish_file_name = f'{spider_name}_finish.log' 333 | self.all_file_name = f'{spider_name}_all.log' 334 | 335 | def get_tasks_list_by_file(self): 336 | """ 337 | read from all.log and finish.log and get not finish task 338 | :return: 339 | """ 340 | finish_set = set() 341 | all_set = set() 342 | if os.path.exists(os.path.join(pwd_dir, self.all_file_name)): 343 | with open(self.all_file_name, 'r', encoding='utf8') as f: 344 | for line in f.readlines(): 345 | all_set.add(line) 346 | if os.path.exists(os.path.join(pwd_dir, self.finish_file_name)): 347 | with open(self.finish_file_name, 'r', encoding='utf8') as f: 348 | for line in f.readlines(): 349 | finish_set.add(line) 350 | return list(all_set.difference(finish_set)) 351 | 352 | async def _generate_task(self): 353 | """ 354 | init task_list data 355 | :return: 356 | """ 357 | try: 358 | reset_task_list = self.__getattribute__('reset_task_list') 359 | except Exception as e: 360 | LogUtils.log_info("not found reset_task_list") 361 | return 362 | if not os.path.exists(os.path.join(pwd_dir, self.all_file_name)) or reset_task_list: 363 | if os.path.exists(os.path.join(pwd_dir, self.all_file_name)): 364 | os.remove(os.path.join(pwd_dir, self.all_file_name)) 365 | if os.path.exists(os.path.join(pwd_dir, self.finish_file_name)): 366 | os.remove(os.path.join(pwd_dir, self.finish_file_name)) 367 | try: 368 | generate_callback = self.__getattribute__('create_tasks') 369 | except Exception as e: 370 | LogUtils.log_info("not found create_tasks") 371 | return 372 | task_lists = await generate_callback() 373 | task_lists = list(set(task_lists)) 374 | for task in task_lists: 375 | with open(os.path.join(pwd_dir, self.all_file_name), 'a', encoding='utf8') as f: 376 | f.write(str(task)) 377 | f.write('\n') 378 | else: 379 | task_lists = self.get_tasks_list_by_file() 380 | for task in task_lists: 381 | self.task_queue.put_nowait(str(task).strip()) 382 | for _ in range(1): 383 | self.task_queue.put_nowait(None) 384 | 385 | async def base_worker(self): 386 | """ 387 | every woker is in while, get task form task_queue 388 | :return: 389 | """ 390 | try: 391 | worker_callback = self.__getattribute__('worker') 392 | except Exception as e: 393 | LogUtils.log_info("not found worker") 394 | while True: 395 | global last_exc 396 | task = await self.task_queue.get() 397 | if task: 398 | self.log_running(task) 399 | try: 400 | await worker_callback(task) 401 | with open(self.finish_file_name, 'a', encoding='utf8') as f: 402 | f.write(str(task).strip()) 403 | f.write('\n') 404 | self.log_finish(task) 405 | except Exception as e: 406 | if str(traceback.format_exc()) != last_exc: 407 | with open("error.log", 'a', encoding='utf8') as f: 408 | f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) 409 | f.write('\n') 410 | f.write(traceback.format_exc()) 411 | last_exc = str(traceback.format_exc()) 412 | self.task_queue.task_done() 413 | else: 414 | print("正在等待最后的任务完成,默认等待5秒") 415 | await asyncio.sleep(5) 416 | total_time = time.monotonic() - self.started_time 417 | for task in self.task_list: 418 | task.cancel() 419 | print(f"除去等待时间,本次代码运行花费时间为: {total_time - 2}") 420 | exit(0) 421 | 422 | async def wait_all_task(self): 423 | """ 424 | wait all worker is finished 425 | :return: 426 | """ 427 | await self.task_queue.join() 428 
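# Usage sketch: how a concrete spider is expected to drive this class (the class name,
# ini file name and URLs below are illustrative, not part of the project):
#
#     class MySpider(BaseSpider):
#         reset_task_list = False   # keep the existing <spider_name>_all.log / _finish.log task files
#         is_open_mysql = False     # run() only calls init_engine() when this is truthy
#
#         async def create_tasks(self):
#             return ["https://example.com/page/1", "https://example.com/page/2"]
#
#         async def worker(self, task):
#             print("crawling", task)
#
#     # my_spider.ini must provide a [test] (or $ENV) section with workers_num,
#     # plus the db_* options when is_open_mysql is set
#     MySpider("my_spider.py").run()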
| 429 | def run(self): 430 | """ 431 | main function,scrapy start with it 432 | :return: 433 | """ 434 | loop = asyncio.get_event_loop() 435 | is_open_mysql = self.__getattribute__("is_open_mysql") 436 | if is_open_mysql: 437 | loop.run_until_complete(self.init_engine()) 438 | loop.run_until_complete(self._generate_task()) 439 | self.task_list = [] 440 | for i in range(int(self.config['workers_num'])): 441 | task = loop.create_task(self.base_worker()) 442 | self.task_list.append(task) 443 | self.started_time = time.monotonic() 444 | loop.run_until_complete(self.wait_all_task()) 445 | 446 | -------------------------------------------------------------------------------- /src/lrabbit_scrapy/android/sslpass.js: -------------------------------------------------------------------------------- 1 | setTimeout(function () { 2 | Java.perform(function () { 3 | console.log(''); 4 | console.log('======'); 5 | console.log('[#] Android Bypass for various Certificate Pinning methods [#]'); 6 | console.log('======'); 7 | 8 | 9 | var X509TrustManager = Java.use('javax.net.ssl.X509TrustManager'); 10 | var SSLContext = Java.use('javax.net.ssl.SSLContext'); 11 | 12 | // TrustManager (Android < 7) // 13 | //////////////////////////////// 14 | var TrustManager = Java.registerClass({ 15 | // Implement a custom TrustManager 16 | name: 'dev.asd.test.TrustManager', 17 | implements: [X509TrustManager], 18 | methods: { 19 | checkClientTrusted: function (chain, authType) { 20 | }, 21 | checkServerTrusted: function (chain, authType) { 22 | }, 23 | getAcceptedIssuers: function () { 24 | return []; 25 | } 26 | } 27 | }); 28 | // Prepare the TrustManager array to pass to SSLContext.init() 29 | var TrustManagers = [TrustManager.$new()]; 30 | // Get a handle on the init() on the SSLContext class 31 | var SSLContext_init = SSLContext.init.overload( 32 | '[Ljavax.net.ssl.KeyManager;', '[Ljavax.net.ssl.TrustManager;', 'java.security.SecureRandom'); 33 | try { 34 | // Override the init method, specifying the custom TrustManager 35 | SSLContext_init.implementation = function (keyManager, trustManager, secureRandom) { 36 | console.log('[+] Bypassing Trustmanager (Android < 7) request'); 37 | SSLContext_init.call(this, keyManager, TrustManagers, secureRandom); 38 | }; 39 | } catch (err) { 40 | console.log('[-] TrustManager (Android < 7) pinner not found'); 41 | //console.log(err); 42 | } 43 | 44 | 45 | // OkHTTPv3 (quadruple bypass) // 46 | // ///////////////////////////////// 47 | try { 48 | // Bypass OkHTTPv3 {1} 49 | var okhttp3_Activity_1 = Java.use('okhttp3.CertificatePinner'); 50 | okhttp3_Activity_1.check.overload('java.lang.String', 'java.util.List').implementation = function (a, b) { 51 | console.log('[+] Bypassing OkHTTPv3 {1}: ' + a); 52 | 53 | 54 | }; 55 | } catch (err) { 56 | console.log('[-] OkHTTPv3 {1} pinner not found'); 57 | //console.log(err); 58 | } 59 | try { 60 | // Bypass OkHTTPv3 {2} 61 | // This method of CertificatePinner.check could be found in some old Android app 62 | var okhttp3_Activity_2 = Java.use('okhttp3.CertificatePinner'); 63 | okhttp3_Activity_2.check.overload('java.lang.String', 'java.security.cert.Certificate').implementation = function (a, b) { 64 | console.log('[+] Bypassing OkHTTPv3 {2}: ' + a); 65 | return true; 66 | }; 67 | } catch (err) { 68 | console.log('[-] OkHTTPv3 {2} pinner not found'); 69 | //console.log(err); 70 | } 71 | try { 72 | // Bypass OkHTTPv3 {3} 73 | var okhttp3_Activity_3 = Java.use('okhttp3.CertificatePinner'); 74 | 
okhttp3_Activity_3.check.overload('java.lang.String', '[Ljava.security.cert.Certificate;').implementation = function (a, b) { 75 | console.log('[+] Bypassing OkHTTPv3 {3}: ' + a); 76 | return true; 77 | }; 78 | } catch (err) { 79 | console.log('[-] OkHTTPv3 {3} pinner not found'); 80 | //console.log(err); 81 | } 82 | try { 83 | // Bypass OkHTTPv3 {4} 84 | var okhttp3_Activity_4 = Java.use('okhttp3.CertificatePinner'); 85 | okhttp3_Activity_4['check$okhttp'].implementation = function (a, b) { 86 | console.log('[+] Bypassing OkHTTPv3 {4}: ' + a); 87 | 88 | 89 | }; 90 | } catch (err) { 91 | console.log('[-] OkHTTPv3 {4} pinner not found'); 92 | //console.log(err); 93 | } 94 | 95 | 96 | //Trustkit (triple bypass) // 97 | //////////////////////////// 98 | try { 99 | // Bypass Trustkit {1} 100 | var trustkit_Activity_1 = Java.use('com.datatheorem.android.trustkit.pinning.OkHostnameVerifier'); 101 | trustkit_Activity_1.verify.overload('java.lang.String', 'javax.net.ssl.SSLSession').implementation = function (a, b) { 102 | console.log('[+] Bypassing Trustkit {1}: ' + a); 103 | return true; 104 | }; 105 | } catch (err) { 106 | console.log('[-] Trustkit {1} pinner not found'); 107 | //console.log(err); 108 | } 109 | try { 110 | // Bypass Trustkit {2} 111 | var trustkit_Activity_2 = Java.use('com.datatheorem.android.trustkit.pinning.OkHostnameVerifier'); 112 | trustkit_Activity_2.verify.overload('java.lang.String', 'java.security.cert.X509Certificate').implementation = function (a, b) { 113 | console.log('[+] Bypassing Trustkit {2}: ' + a); 114 | return true; 115 | }; 116 | } catch (err) { 117 | console.log('[-] Trustkit {2} pinner not found'); 118 | //console.log(err); 119 | } 120 | try { 121 | // Bypass Trustkit {3} 122 | var trustkit_PinningTrustManager = Java.use('com.datatheorem.android.trustkit.pinning.PinningTrustManager'); 123 | trustkit_PinningTrustManager.checkServerTrusted.implementation = function () { 124 | console.log('[+] Bypassing Trustkit {3}'); 125 | }; 126 | } catch (err) { 127 | console.log('[-] Trustkit {3} pinner not found'); 128 | //console.log(err); 129 | } 130 | 131 | try { 132 | var TrustManagerImpl = Java.use('com.android.org.conscrypt.TrustManagerImpl'); 133 | TrustManagerImpl.verifyChain.implementation = function (untrustedChain, trustAnchorChain, host, clientAuth, ocspData, tlsSctData) { 134 | console.log('[+] Bypassing TrustManagerImpl (Android > 7): ' + host); 135 | return untrustedChain; 136 | }; 137 | } catch (err) { 138 | console.log('[-] TrustManagerImpl (Android > 7) pinner not found'); 139 | //console.log(err); 140 | } 141 | 142 | 143 | // Appcelerator Titanium // 144 | /////////////////////////// 145 | try { 146 | var appcelerator_PinningTrustManager = Java.use('appcelerator.https.PinningTrustManager'); 147 | appcelerator_PinningTrustManager.checkServerTrusted.implementation = function () { 148 | console.log('[+] Bypassing Appcelerator PinningTrustManager'); 149 | }; 150 | } catch (err) { 151 | console.log('[-] Appcelerator PinningTrustManager pinner not found'); 152 | //console.log(err); 153 | } 154 | 155 | 156 | // OpenSSLSocketImpl Conscrypt // 157 | ///////////////////////////////// 158 | try { 159 | var OpenSSLSocketImpl = Java.use('com.android.org.conscrypt.OpenSSLSocketImpl'); 160 | OpenSSLSocketImpl.verifyCertificateChain.implementation = function (certRefs, JavaObject, authMethod) { 161 | console.log('[+] Bypassing OpenSSLSocketImpl Conscrypt'); 162 | }; 163 | } catch (err) { 164 | console.log('[-] OpenSSLSocketImpl Conscrypt pinner not found'); 165 | 
//console.log(err); 166 | } 167 | 168 | 169 | // OpenSSLEngineSocketImpl Conscrypt // 170 | /////////////////////////////////////// 171 | try { 172 | var OpenSSLEngineSocketImpl_Activity = Java.use('com.android.org.conscrypt.OpenSSLEngineSocketImpl'); 173 | OpenSSLSocketImpl_Activity.verifyCertificateChain.overload('[Ljava.lang.Long;', 'java.lang.String').implementation = function (a, b) { 174 | console.log('[+] Bypassing OpenSSLEngineSocketImpl Conscrypt: ' + b); 175 | }; 176 | } catch (err) { 177 | console.log('[-] OpenSSLEngineSocketImpl Conscrypt pinner not found'); 178 | //console.log(err); 179 | } 180 | 181 | 182 | // OpenSSLSocketImpl Apache Harmony // 183 | ////////////////////////////////////// 184 | try { 185 | var OpenSSLSocketImpl_Harmony = Java.use('org.apache.harmony.xnet.provider.jsse.OpenSSLSocketImpl'); 186 | OpenSSLSocketImpl_Harmony.verifyCertificateChain.implementation = function (asn1DerEncodedCertificateChain, authMethod) { 187 | console.log('[+] Bypassing OpenSSLSocketImpl Apache Harmony'); 188 | }; 189 | } catch (err) { 190 | console.log('[-] OpenSSLSocketImpl Apache Harmony pinner not found'); 191 | //console.log(err); 192 | } 193 | 194 | 195 | // PhoneGap sslCertificateChecker (https://github.com/EddyVerbruggen/SSLCertificateChecker-PhoneGap-Plugin) // 196 | ////////////////////////////////////////////////////////////////////////////////////////////////////////////// 197 | try { 198 | var phonegap_Activity = Java.use('nl.xservices.plugins.sslCertificateChecker'); 199 | phonegap_Activity.execute.overload('java.lang.String', 'org.json.JSONArray', 'org.apache.cordova.CallbackContext').implementation = function (a, b, c) { 200 | console.log('[+] Bypassing PhoneGap sslCertificateChecker: ' + a); 201 | return true; 202 | }; 203 | } catch (err) { 204 | console.log('[-] PhoneGap sslCertificateChecker pinner not found'); 205 | //console.log(err); 206 | } 207 | 208 | 209 | //IBM MobileFirst pinTrustedCertificatePublicKey (double bypass) // 210 | //////////////////////////////////////////////////////////////////// 211 | try { 212 | // Bypass IBM MobileFirst {1} 213 | var WLClient_Activity_1 = Java.use('com.worklight.wlclient.api.WLClient'); 214 | WLClient_Activity_1.getInstance().pinTrustedCertificatePublicKey.overload('java.lang.String').implementation = function (cert) { 215 | console.log('[+] Bypassing IBM MobileFirst pinTrustedCertificatePublicKey {1}: ' + cert); 216 | return; 217 | }; 218 | } catch (err) { 219 | console.log('[-] IBM MobileFirst pinTrustedCertificatePublicKey {1} pinner not found'); 220 | //console.log(err); 221 | } 222 | try { 223 | // Bypass IBM MobileFirst {2} 224 | var WLClient_Activity_2 = Java.use('com.worklight.wlclient.api.WLClient'); 225 | WLClient_Activity_2.getInstance().pinTrustedCertificatePublicKey.overload('[Ljava.lang.String;').implementation = function (cert) { 226 | console.log('[+] Bypassing IBM MobileFirst pinTrustedCertificatePublicKey {2}: ' + cert); 227 | return; 228 | }; 229 | } catch (err) { 230 | console.log('[-] IBM MobileFirst pinTrustedCertificatePublicKey {2} pinner not found'); 231 | //console.log(err); 232 | } 233 | 234 | 235 | // IBM WorkLight (ancestor of MobileFirst) HostNameVerifierWithCertificatePinning (quadruple bypass) // 236 | /////////////////////////////////////////////////////////////////////////////////////////////////////// 237 | try { 238 | // Bypass IBM WorkLight {1} 239 | var worklight_Activity_1 = Java.use('com.worklight.wlclient.certificatepinning.HostNameVerifierWithCertificatePinning'); 240 | 
worklight_Activity_1.verify.overload('java.lang.String', 'javax.net.ssl.SSLSocket').implementation = function (a, b) { 241 | console.log('[+] Bypassing IBM WorkLight HostNameVerifierWithCertificatePinning {1}: ' + a); 242 | return; 243 | }; 244 | } catch (err) { 245 | console.log('[-] IBM WorkLight HostNameVerifierWithCertificatePinning {1} pinner not found'); 246 | //console.log(err); 247 | } 248 | try { 249 | // Bypass IBM WorkLight {2} 250 | var worklight_Activity_2 = Java.use('com.worklight.wlclient.certificatepinning.HostNameVerifierWithCertificatePinning'); 251 | worklight_Activity_2.verify.overload('java.lang.String', 'java.security.cert.X509Certificate').implementation = function (a, b) { 252 | console.log('[+] Bypassing IBM WorkLight HostNameVerifierWithCertificatePinning {2}: ' + a); 253 | return; 254 | }; 255 | } catch (err) { 256 | console.log('[-] IBM WorkLight HostNameVerifierWithCertificatePinning {2} pinner not found'); 257 | //console.log(err); 258 | } 259 | try { 260 | // Bypass IBM WorkLight {3} 261 | var worklight_Activity_3 = Java.use('com.worklight.wlclient.certificatepinning.HostNameVerifierWithCertificatePinning'); 262 | worklight_Activity_3.verify.overload('java.lang.String', '[Ljava.lang.String;', '[Ljava.lang.String;').implementation = function (a, b, c) { 263 | console.log('[+] Bypassing IBM WorkLight HostNameVerifierWithCertificatePinning {3}: ' + a); 264 | return; 265 | }; 266 | } catch (err) { 267 | console.log('[-] IBM WorkLight HostNameVerifierWithCertificatePinning {3} pinner not found'); 268 | //console.log(err); 269 | } 270 | try { 271 | // Bypass IBM WorkLight {4} 272 | var worklight_Activity_4 = Java.use('com.worklight.wlclient.certificatepinning.HostNameVerifierWithCertificatePinning'); 273 | worklight_Activity_4.verify.overload('java.lang.String', 'javax.net.ssl.SSLSession').implementation = function (a, b) { 274 | console.log('[+] Bypassing IBM WorkLight HostNameVerifierWithCertificatePinning {4}: ' + a); 275 | return true; 276 | }; 277 | } catch (err) { 278 | console.log('[-] IBM WorkLight HostNameVerifierWithCertificatePinning {4} pinner not found'); 279 | //console.log(err); 280 | } 281 | 282 | 283 | //Conscrypt CertPinManager // 284 | ////////////////////////////// 285 | try { 286 | var conscrypt_CertPinManager_Activity = Java.use('com.android.org.conscrypt.CertPinManager'); 287 | conscrypt_CertPinManager_Activity.isChainValid.overload('java.lang.String', 'java.util.List').implementation = function (a, b) { 288 | console.log('[+] Bypassing Conscrypt CertPinManager: ' + a); 289 | return true; 290 | }; 291 | } catch (err) { 292 | console.log('[-] Conscrypt CertPinManager pinner not found'); 293 | //console.log(err); 294 | } 295 | 296 | 297 | // CWAC-Netsecurity (unofficial back-port pinner for Android<4.2) CertPinManager // 298 | /////////////////////////////////////////////////////////////////////////////////// 299 | try { 300 | var cwac_CertPinManager_Activity = Java.use('com.commonsware.cwac.netsecurity.conscrypt.CertPinManager'); 301 | cwac_CertPinManager_Activity.isChainValid.overload('java.lang.String', 'java.util.List').implementation = function (a, b) { 302 | console.log('[+] Bypassing CWAC-Netsecurity CertPinManager: ' + a); 303 | return true; 304 | }; 305 | } catch (err) { 306 | console.log('[-] CWAC-Netsecurity CertPinManager pinner not found'); 307 | //console.log(err); 308 | } 309 | 310 | 311 | // Worklight Androidgap WLCertificatePinningPlugin // 312 | ///////////////////////////////////////////////////// 313 | try { 314 | var
androidgap_WLCertificatePinningPlugin_Activity = Java.use('com.worklight.androidgap.plugin.WLCertificatePinningPlugin'); 315 | androidgap_WLCertificatePinningPlugin_Activity.execute.overload('java.lang.String', 'org.json.JSONArray', 'org.apache.cordova.CallbackContext').implementation = function (a, b, c) { 316 | console.log('[+] Bypassing Worklight Androidgap WLCertificatePinningPlugin: ' + a); 317 | return true; 318 | }; 319 | } catch (err) { 320 | console.log('[-] Worklight Androidgap WLCertificatePinningPlugin pinner not found'); 321 | //console.log(err); 322 | } 323 | 324 | 325 | // Netty FingerprintTrustManagerFactory // 326 | ////////////////////////////////////////// 327 | try { 328 | var netty_FingerprintTrustManagerFactory = Java.use('io.netty.handler.ssl.util.FingerprintTrustManagerFactory'); 329 | //NOTE: sometimes this below implementation could be useful 330 | //var netty_FingerprintTrustManagerFactory = Java.use('org.jboss.netty.handler.ssl.util.FingerprintTrustManagerFactory'); 331 | netty_FingerprintTrustManagerFactory.checkTrusted.implementation = function (type, chain) { 332 | console.log('[+] Bypassing Netty FingerprintTrustManagerFactory'); 333 | }; 334 | } catch (err) { 335 | console.log('[-] Netty FingerprintTrustManagerFactory pinner not found'); 336 | //console.log(err); 337 | } 338 | 339 | 340 | //Squareup CertificatePinner [OkHTTP