├── .gitignore ├── BaseFile ├── GetLocalFile.py ├── GetProxyIp.py ├── Logger.py ├── ReadConfig.py ├── UserAgent.py └── __init__.py ├── Common ├── CsvHelper.py ├── JsonHelper.py ├── KafkaHelper.py ├── MongoHelper.py ├── MySqlHelper.py ├── RedisHelper.py ├── RedisHelperLongConncet.py └── __init__.py ├── Config ├── HEADERS.py ├── KAFKA ├── MONGODB ├── MYSQL ├── PROXYIP ├── REDIS └── __init__.py ├── Data ├── cnblogs.json ├── img │ ├── 1540974405032.jpg │ ├── 1540974407587.jpg │ └── 1540974408414.jpg └── jianshu.csv ├── LICENSE ├── README.md └── Spider ├── __init__.py ├── request_html_demo_1.py ├── request_html_demo_2.py └── request_html_demo_3.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /BaseFile/GetLocalFile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: GetLocalFile.py 10 | @time: 2018/9/25 17:26 11 | @describe: 操作本地文件 12 | """ 13 | import sys 14 | import os 15 | 16 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 17 | sys.path.append("..") 18 | 19 | 20 | class GetLocalFile: 21 | """ 读取本地文件,返回:List """ 22 | 23 | @staticmethod 24 | def get_local_file(filename): 25 | with open(filename, "r", encoding="utf-8") as f: 26 | data = f.readlines() 27 | return [url[:-1] for url in data] 28 | -------------------------------------------------------------------------------- /BaseFile/GetProxyIp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: GetProxyIp.py 10 | @time: 2018/9/25 17:45 11 | @describe: 返回代理IP 12 | """ 13 | import sys 14 | import os 15 | import random 16 | import time 17 | 18 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 19 | sys.path.append("..") 20 | 21 | 22 | class GetProxyIp: 23 | """随机从文件中读取proxy""" 24 | 25 | @staticmethod 26 | def get_random_proxy(): 27 | while True: 28 | with open('../Config/PROXYIP', 'r') as f: 29 | proxies = f.readlines() 30 | if proxies: 31 | break 32 | else: 33 | time.sleep(1) 34 | proxy = random.choice(proxies).strip() 35 | return proxy 36 | 37 | """返回HTTP/HTTPS的代理IP,可根据代理IP类型更改""" 38 | 39 | def get_IP_Http(self): 40 | IP = self.get_random_proxy() 41 | proxies = { 42 | "http": IP, 43 | } 44 | return proxies 45 | 46 | def get_IP_Https(self): 47 | IP = self.get_random_proxy() 48 | proxies = { 49 | "https": IP, 50 | } 51 | return proxies 52 | 53 | 54 | if __name__ == '__main__': 55 | print(GetProxyIp().get_IP_Http()) 56 | -------------------------------------------------------------------------------- /BaseFile/Logger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @site: 9 | @software: PyCharm 10 | @file: logger.py 11 | @time: 2018/3/23 14:30 12 | @describe: log日志编写 13 | """ 14 | import logging 15 | import os 16 | import datetime 17 | 18 | FOREGROUND_WHITE = 0x0007 19 | FOREGROUND_BLUE = 0x01 # text color contains blue. 20 | FOREGROUND_GREEN = 0x02 # text color contains green. 21 | FOREGROUND_RED = 0x04 # text color contains red. 22 | FOREGROUND_YELLOW = FOREGROUND_RED | FOREGROUND_GREEN 23 | STD_OUTPUT_HANDLE = -11 24 | # 创建log文件夹 25 | log_dir = '../logs' 26 | if not os.path.exists(log_dir): 27 | os.mkdir(log_dir) 28 | print(log_dir) 29 | 30 | 31 | class Logger: 32 | def __init__(self, path, clevel=logging.DEBUG, Flevel=logging.DEBUG): 33 | fmt = logging.Formatter('[%(asctime)s] [%(levelname)s] %(message)s', '%Y-%m-%d %H:%M:%S') 34 | startTime = datetime.datetime.now().strftime('%Y-%m-%d') 35 | path = os.path.join(log_dir, str(startTime) + "-" + path) 36 | self.logger = logging.getLogger(path) 37 | self.logger.setLevel(logging.DEBUG) 38 | # 设置CMD日志 39 | sh = logging.StreamHandler() 40 | sh.setFormatter(fmt) 41 | sh.setLevel(clevel) 42 | # 设置文件日志 43 | fh = logging.FileHandler(path) 44 | fh.setFormatter(fmt) 45 | fh.setLevel(Flevel) 46 | self.logger.addHandler(sh) 47 | self.logger.addHandler(fh) 48 | 49 | def debug(self, message, color=FOREGROUND_BLUE): 50 | self.logger.debug(message) 51 | 52 | def info(self, message, color=FOREGROUND_GREEN): 53 | self.logger.info(message) 54 | 55 | def war(self, message, color=FOREGROUND_YELLOW): 56 | self.logger.warn(message) 57 | 58 | def error(self, message, color=FOREGROUND_RED): 59 | self.logger.error(message) 60 | 61 | def cri(self, message): 62 | self.logger.critical(message) 63 | 64 | 65 | if __name__ == '__main__': 66 | logyyx = Logger('test.log', logging.WARNING, logging.DEBUG) 67 | logyyx.debug('一个debug信息') 68 | logyyx.info('一个info信息') 69 | logyyx.war('一个warning信息') 70 | logyyx.error('一个error信息') 71 | logyyx.cri('一个致命critical信息') 72 | -------------------------------------------------------------------------------- /BaseFile/ReadConfig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: ReadConfig.py 10 | @time: 2018/9/25 19:51 11 | @describe: 读取各类配置文件 12 | """ 13 | import sys 14 | import os 15 | from yaml import load 16 | 17 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 18 | sys.path.append("..") 19 | 20 | 21 | class ReadConfig: 22 | """ 读取各类YAM配置文件""" 23 | 24 | @staticmethod 25 | def get_conf(path_name): 26 | config_path = os.path.join(os.path.dirname(__file__), path_name) 27 | with open(config_path) as f: 28 | cont = f.read() 29 | return load(cont) 30 | -------------------------------------------------------------------------------- /BaseFile/UserAgent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @site: 9 | @software: PyCharm 10 | @file: user_agent.py 11 | @time: 2018/8/13 10:47 12 | @describe: 浏览器请求头 13 | 因为请求头分手机版本和PC版本,版本不同,页面返回信息页有所不同,注意使用 14 | """ 15 | import random 16 | import os 17 | import sys 18 | 19 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 20 | sys.path.append("..") 21 | USER_AGENT_LIST = [ 22 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 23 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 24 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 25 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 26 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 27 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 28 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 29 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 30 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 31 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 32 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 33 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 34 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 35 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 36 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 37 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 38 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 39 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 40 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 41 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 42 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 43 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 44 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 45 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 46 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 47 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 48 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 49 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 50 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 51 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 52 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 53 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 54 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 55 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36", 56 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36", 57 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 58 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 59 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0", 60 | "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko", 61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0", 62 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)", 63 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)", 64 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 65 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 66 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 67 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11", 68 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11", 69 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 70 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)", 71 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)", 72 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 73 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)", 74 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)", 75 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)", 76 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)", 77 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)" 78 | ] 79 | USER_AGENT_PHONE_LIST = [ 80 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 81 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 82 | "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 83 | "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 84 | "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 85 | "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10", 86 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 87 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+", 88 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0", 89 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124", 90 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)", 91 | "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999" 92 | ] 93 | 94 | 95 | class UserAgent: 96 | """电脑端请求头""" 97 | 98 | @staticmethod 99 | def pc_agent(): 100 | return random.choice(USER_AGENT_LIST) 101 | 102 | """手机端请求头""" 103 | 104 | @staticmethod 105 | def phone_agent(): 106 | return random.choice(USER_AGENT_PHONE_LIST) 107 | -------------------------------------------------------------------------------- /BaseFile/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: __init__.py 10 | @time: 2018/9/26 18:34 11 | @describe: 12 | """ 13 | import sys 14 | import os 15 | 16 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 17 | sys.path.append("..") -------------------------------------------------------------------------------- /Common/CsvHelper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: CsvHelper.py 10 | @time: 2018/9/26 15:48 11 | @describe: csv 助手 12 | """ 13 | import logging 14 | import sys 15 | import os 16 | import csv 17 | 18 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 19 | sys.path.append("..") 20 | from BaseFile.Logger import Logger 21 | logger = Logger('csv.log', logging.WARNING, logging.DEBUG) 22 | 23 | 24 | class CsvHelper: 25 | @staticmethod 26 | def CsvConnection(fireName): 27 | # 存入csv文件 28 | out = open('../Data/%s' % fireName, 'a', newline='', encoding="utf-8") 29 | # 设定写入模式 30 | csv_write = csv.writer(out, dialect='excel') 31 | return csv_write 32 | 33 | """ 以追加方式写入csv文件,message 数据格式:List """ 34 | 35 | def csv_write(self, fireName, message): 36 | try: 37 | csv_write = self.CsvConnection(fireName) 38 | csv_write.writerow([msg for msg in message]) 39 | print("write successful!") 40 | except Exception as e: 41 | print("[csv write error]", e) 42 | logger.error("[csv write error]"+str(e)) 43 | 44 | """ 读取csv文件,返回List """ 45 | 46 | @staticmethod 47 | def csv_read(fireName): 48 | try: 49 | with open("../Data/%s" % fireName, "r", encoding="utf-8") as csvfile: 50 | reader2 = csv.reader(csvfile) 51 | return [x for x in reader2] 52 | except Exception as e: 53 | print("[csv read error]", e) 54 | logger.error("[csv read error]" + str(e)) 55 | 56 | 57 | if __name__ == '__main__': 58 | message = "host", "1002", "1003" 59 | # 写入csv文件 60 | CsvHelper().csv_write("test.csv", message) 61 | # 读取csv文件 62 | print(CsvHelper().csv_read("test.csv")) 63 | -------------------------------------------------------------------------------- /Common/JsonHelper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: JsonHelper.py 10 | @time: 2018/10/31 15:04 11 | @describe: 读写json文件 12 | """ 13 | import logging 14 | import sys 15 | import os 16 | import json 17 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 18 | sys.path.append("..") 19 | from BaseFile.Logger import Logger 20 | logger = Logger('Json.log', logging.WARNING, logging.DEBUG) 21 | 22 | 23 | class JsonHelper: 24 | """ 以追加方式写入json文件,message 数据格式:List """ 25 | 26 | def json_write(self, fireName, message): 27 | try: 28 | out = open('../Data/%s' % fireName, 'a', encoding='utf-8') 29 | out.write(json.dumps(message) + "\n") 30 | print("write successful!") 31 | except Exception as e: 32 | print("[json write error]", e) 33 | logger.error("[json write error]"+str(e)) 34 | 35 | """读取json文件,原文件每行为一个独立json串,组合并不是一个正确的json格式""" 36 | 37 | def json_read(self, fireName): 38 | try: 39 | with open('../Data/%s' % fireName, 'r', encoding='utf-8') as f: 40 | msg = f.readlines() 41 | return [json.loads(data[:-1]) for data in msg] 42 | except Exception as e: 43 | print("[json read error]", e) 44 | logger.error("[json read error]"+str(e)) 45 | 46 | """读取json文件,只做查看,无返回值,需要返回值,使用上一个方法""" 47 | 48 | def json_watch(self, fireName): 49 | try: 50 | f = open('../Data/%s' % fireName, 'r', encoding='utf-8') 51 | for data in f: 52 | print("*" * 150) 53 | print(json.loads(data)) 54 | print("*" * 150, "\n") 55 | except Exception as e: 56 | print("[json watch error]", e) 57 | logger.error("[json watch error]"+str(e)) 58 | 59 | 60 | if __name__ == '__main__': 61 | # 读取json--操作返回数据 62 | str_json = JsonHelper().json_read("cnblogs.json") 63 | for i in str_json: 64 | print(i) 65 | print("*" * 100, "\n") 66 | # 读取json--查看 67 | JsonHelper().json_watch("cnblogs.json") 68 | -------------------------------------------------------------------------------- /Common/KafkaHelper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: KafkaHelper.py 10 | @time: 2018/9/26 14:17 11 | @describe: kafka 助手 12 | """ 13 | import logging 14 | import sys 15 | import os 16 | from pykafka import KafkaClient 17 | 18 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 19 | sys.path.append("..") 20 | from BaseFile.ReadConfig import ReadConfig as RC 21 | from BaseFile.Logger import Logger 22 | logger = Logger('kafka.log', logging.WARNING, logging.DEBUG) 23 | """ 读取哪个kafka 配置""" 24 | DBName = "kafka_demo1" 25 | settings = RC().get_conf("../Config/kafka")[DBName] # 获取Config-KAFKA 配置 26 | host = settings['host'] 27 | 28 | 29 | class KafkaHelper: 30 | """DBName 指定读取哪个配置文件""" 31 | client = KafkaClient(hosts=host) # 可接受多个client 32 | 33 | def __init__(self): 34 | self.topics = settings['topics'] 35 | self.zookeeper_connect = settings['zookeeper_connect'] 36 | self.topics = settings['topics'] 37 | self.client = self.client 38 | 39 | """ 配置连接 """ 40 | 41 | def KafkaConnectionPool(self): 42 | try: 43 | topic = self.client.topics[self.topics.encode('utf-8')] # 选择一个topic 44 | return topic 45 | except Exception as e: 46 | print('[kafka-connect] error', e) 47 | logger.error('[kafka-connect] error '+str(e)) 48 | 49 | """ 生成一条消息,并发送至kafka: partitionkey:分区名称;message:消息内容""" 50 | 51 | def producer_kafka(self, partitionKey, message): 52 | topic = self.KafkaConnectionPool() 53 | try: 54 | with topic.get_sync_producer() as producer: 55 | producer.produce(partition_key=partitionKey.encode('utf-8'), 56 | message=message.encode('utf-8')) 57 | print("successful send msg to kafka~~~~~") 58 | except Exception as e: 59 | print('[producer_kafka] error', e) 60 | logger.error('[producer_kafka] error '+str(e)) 61 | 62 | """ 从zookeeper消费 get_balanced_consumer""" 63 | 64 | def consumer_zookeeper(self): 65 | topic = self.KafkaConnectionPool() 66 | try: 67 | balanced_consumer = topic.get_balanced_consumer( 68 | consumer_group='demo'.encode('utf-8'), 69 | auto_commit_enable=True, # 设置为False的时候不需要添加consumer_group,直接连接topic即可取到消息 70 | zookeeper_connect=self.zookeeper_connect # 这里就是连接多个zk 71 | ) 72 | for message in balanced_consumer: 73 | if message is not None: 74 | print(message.offset, message.partition_key, str(message.value, encoding="utf-8")) 75 | except Exception as e: 76 | print("[consumer_zookeeper] error:", e) 77 | logger.error("[consumer_zookeeper] error "+str(e)) 78 | 79 | """ 从kafka消费 get_simple_consumer""" 80 | 81 | def consumer_kafka(self): 82 | topic = self.KafkaConnectionPool() 83 | try: 84 | # 从kafka消费 85 | kafka_consumer = topic.get_simple_consumer( 86 | consumer_group='demo'.encode("utf-8"), 87 | auto_commit_enable=True, 88 | consumer_id='demo'.encode("utf-8") 89 | ) 90 | for message in kafka_consumer: 91 | if message is not None: 92 | print(message.offset, message.partition_key, str(message.value, encoding="utf-8")) 93 | except Exception as e: 94 | print("[consumer_kafka] error", e) 95 | logger.error("[consumer_kafka] error "+str(e)) 96 | 97 | 98 | if __name__ == '__main__': 99 | # # print(KafkaHelper().consumer_kafka()) 100 | message = '{"demo3":"test02"}' 101 | KafkaHelper().producer_kafka('demo',message) 102 | -------------------------------------------------------------------------------- /Common/MongoHelper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: MongoHelper.py 10 | @time: 2018/9/26 11:21 11 | @describe: mongodb 助手 12 | http://www.runoob.com/mongodb/mongodb-connections.html 13 | """ 14 | import logging 15 | import sys 16 | import os 17 | from pymongo import MongoClient 18 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 19 | sys.path.append("..") 20 | from BaseFile.ReadConfig import ReadConfig as RC 21 | from BaseFile.Logger import Logger 22 | logger = Logger('mongodb.log', logging.WARNING, logging.DEBUG) 23 | 24 | 25 | class MongoHelper: 26 | def __init__(self, DBName): 27 | self.settings = RC().get_conf("../Config/MONGODB")[DBName] # 获取Config-MONGODB 配置 28 | self.host = self.settings['host'] 29 | self.port = self.settings['port'] 30 | self.user = self.settings['user'] 31 | self.passwd = self.settings['passwd'] 32 | self.dbname = self.settings['db'] 33 | self.table = self.settings['table'] 34 | self.conn = MongoClient(host=self.host, port=self.port) 35 | # 如果用户名密码存在则认证登录 36 | if self.user or self.passwd is not None: 37 | self.db_auth = self.conn.admin 38 | self.db_auth.authenticate(self.user, self.passwd) 39 | self.db = self.conn.get_database(self.dbname) 40 | self.collection = self.db.get_collection(self.table) 41 | 42 | def insert(self, item, collection_name=None): 43 | """ 44 | 插入数据,这里的数据可以是一个,也可以是多个 45 | :param item: 需要插入的数据 46 | :param collection_name: 可选,需要访问哪个集合 47 | :return: 48 | """ 49 | try: 50 | if collection_name is not None: 51 | collection = self.db.get_collection(self.db) 52 | collection.insert(item) 53 | else: 54 | self.collection.insert(item) 55 | except Exception as e: 56 | print("mongodb insert error!", e) 57 | logger.error("mongodb insert error! "+str(e)) 58 | finally: 59 | self.conn.close() 60 | 61 | def find(self, expression=None, collection_name=None): 62 | """ 63 | 进行简单查询,可以指定条件和集合 64 | :param expression: 查询条件,可以为空 65 | :param collection_name: 集合名称 66 | :return: 所有结果 67 | """ 68 | try: 69 | if collection_name is not None: 70 | collection = self.db.get_collection(self.db) 71 | if expression is None: 72 | return collection.find() 73 | else: 74 | return collection.find(expression) 75 | else: 76 | if expression is None: 77 | return self.collection.find() 78 | else: 79 | return self.collection.find(expression) 80 | except Exception as e: 81 | print("mongodb find error!", e) 82 | logger.error("mongodb find error! "+str(e)) 83 | finally: 84 | self.conn.close() 85 | 86 | def get_collection(self, collection_name=None): 87 | """ 88 | 很多时候单纯的查询不能够通过这个类封装的方法执行,这时候就可以直接获取到对应的collection进行操作 89 | :param collection_name: 集合名称 90 | :return: collection 91 | """ 92 | try: 93 | if collection_name is None: 94 | return self.collection 95 | else: 96 | return self.get_collection(collection_name) 97 | except Exception as e: 98 | print("mongodb get_collection error!", e) 99 | logger.error("mongodb get_collection error! "+str(e)) 100 | finally: 101 | self.conn.close() 102 | 103 | 104 | if __name__ == '__main__': 105 | db = MongoHelper("mongo_test") 106 | # item = {'addredd': 'zhangsan', 'index': '23'} 107 | # db.insert(item) # 插入 108 | for item in db.find(): # 查询 109 | print(item) 110 | -------------------------------------------------------------------------------- /Common/MySqlHelper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @site: 9 | @software: PyCharm 10 | @file: dbhelper.py 11 | @time: 2018/9/25 17:46 12 | @describe: 数据库操作助手 13 | http://www.runoob.com/mysql/mysql-tutorial.html 14 | """ 15 | import logging 16 | import sys 17 | import os 18 | import pymysql 19 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 20 | sys.path.append("..") 21 | from BaseFile.ReadConfig import ReadConfig as RC 22 | from BaseFile.Logger import Logger 23 | logger = Logger('mysql.log', logging.WARNING, logging.DEBUG) 24 | 25 | 26 | class MysqlHelper: 27 | """这个类也是读取 Config/MYSQL中的配置,自行修改代码进行操作""" 28 | 29 | def __init__(self, DBName): 30 | self.settings = RC().get_conf("../Config/MYSQL")[DBName] # 获取Config-MYSQL配置,设置MYSQL连接信息 31 | self.host = self.settings['host'] 32 | self.port = self.settings['port'] 33 | self.user = self.settings['user'] 34 | self.passwd = self.settings['passwd'] 35 | self.db = self.settings['db'] 36 | 37 | # 连接到mysql,不是连接到具体的数据库 38 | def connectMysql(self): 39 | conn = pymysql.connect(host=self.host, 40 | port=self.port, 41 | user=self.user, 42 | passwd=self.passwd, 43 | # db=self.db,不指定数据库名 44 | charset='utf8') # 要指定编码,否则中文可能乱码 45 | return conn 46 | 47 | # 连接到具体的数据库(settings中设置的MYSQL_DBNAME) 48 | def connectDatabase(self): 49 | conn = pymysql.connect(host=self.host, 50 | port=self.port, 51 | user=self.user, 52 | passwd=self.passwd, 53 | db=self.db, 54 | charset='utf8') # 要指定编码,否则中文可能乱码 55 | return conn 56 | 57 | # 创建数据库 58 | 59 | def createDatabase(self): 60 | """因为创建数据库直接修改 Config-MYSQL 中的配置 MYSQL_DBNAME 即可,所以就不要传sql语句了""" 61 | conn = self.connectMysql() # 连接数据库 62 | cur = conn.cursor() 63 | try: 64 | sql = "create database if not exists " + self.db 65 | cur.execute(sql) # 执行sql语句 66 | except Exception as e: 67 | print("Error createDatabase data!", e) 68 | logger.error("Error createDatabase data! "+str(e)) 69 | finally: 70 | cur.close() 71 | conn.close() 72 | 73 | # 创建表 74 | def createTable(self, sql): 75 | conn = self.connectDatabase() 76 | cur = conn.cursor() 77 | try: 78 | cur.execute(sql) 79 | except Exception as e: 80 | print("Error createTable data!", e) 81 | logger.error("Error createTable data! "+str(e)) 82 | finally: 83 | cur.close() 84 | conn.close() 85 | 86 | # 插入数据 87 | def insert(self, sql, *params): 88 | conn = self.connectDatabase() 89 | cur = conn.cursor() 90 | try: 91 | cur.execute(sql, params) 92 | conn.commit() 93 | except Exception as e: 94 | print("Error insert data!", e) 95 | logger.error("Error insert data! "+str(e)) 96 | finally: 97 | cur.close() 98 | conn.close() 99 | 100 | # 更新数据 101 | def update(self, sql, *params): 102 | conn = self.connectDatabase() 103 | cur = conn.cursor() 104 | try: 105 | cur.execute(sql, params) 106 | conn.commit() 107 | except Exception as e: 108 | print("Error update data!", e) 109 | logger.error("Error update data! "+str(e)) 110 | finally: 111 | cur.close() 112 | conn.close() 113 | 114 | # 删除数据 115 | def delete(self, sql, *params): 116 | conn = self.connectDatabase() 117 | cur = conn.cursor() 118 | try: 119 | cur.execute(sql, params) 120 | conn.commit() 121 | except Exception as e: 122 | print("Error delete data!", e) 123 | logger.error("Error delete data! "+str(e)) 124 | finally: 125 | cur.close() 126 | conn.close() 127 | 128 | # 查询数据 129 | def select(self, sql): 130 | conn = self.connectDatabase() 131 | cur = conn.cursor() 132 | try: 133 | cur.execute(sql) 134 | conn.commit() 135 | # 获取所有记录列表 136 | results = cur.fetchall() 137 | list_results = [] 138 | for i in range(len(results)): 139 | list_results.append(list(results[i])) 140 | return list_results 141 | except Exception as e: 142 | print("Error: unable to fecth data", e) 143 | logger.error("Error: unable to fecth data! "+str(e)) 144 | finally: 145 | cur.close() 146 | conn.close() 147 | 148 | 149 | '''测试DBHelper的类''' 150 | 151 | 152 | class TestDBHelper(): 153 | def __init__(self, DBNAME): 154 | self.dbHelper = DBHelper(DBNAME) 155 | 156 | # 测试创建数据库(settings配置文件中的MYSQL_DBNAME,直接修改settings配置文件即可) 157 | def testCreateDatebase(self): 158 | self.dbHelper.createDatabase() 159 | # 测试创建表 160 | 161 | def testCreateTable(self): 162 | sql = "create table testtable(id int primary key auto_increment,name varchar(50),url varchar(200))" 163 | self.dbHelper.createTable(sql) 164 | 165 | # 测试插入 166 | def testInsert(self, item): 167 | sql = "insert into testtable(name,url) values(%s,%s)" 168 | # params=("Ncepu_Etl","Ncepu_Etl") 169 | params = (item["name"], item["url"]) 170 | self.dbHelper.insert(sql, *params) # *表示拆分元组,调用insert(*params)会重组成元组 171 | 172 | def testUpdate(self): 173 | sql = "update testtable set name=%s,url=%s where id=%s" 174 | params = ("update", "update", "1") 175 | self.dbHelper.update(sql, *params) 176 | 177 | def testDelete(self): 178 | sql = "delete from testtable where id=%s" 179 | params = ("1") 180 | self.dbHelper.delete(sql, *params) 181 | 182 | def testSelect(self): 183 | sql = "select url from testtable limit 5" 184 | # params=("1") 185 | return self.dbHelper.select(sql) 186 | 187 | 188 | if __name__ == "__main__": 189 | testDBHelper = TestDBHelper('test01') 190 | # testDBHelper.testCreateDatebase() #执行测试创建数据库 191 | # testDBHelper.testCreateTable() #执行测试创建表 192 | # testDBHelper.testInsert() #执行测试插入数据 193 | # testDBHelper.testUpdate() #执行测试更新数据 194 | # testDBHelper.testDelete() #执行测试删除数据 195 | print(testDBHelper.testSelect()) # 执行测试查询数据 196 | -------------------------------------------------------------------------------- /Common/RedisHelper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: RedisHelper.py 10 | @time: 2018/9/26 10:14 11 | @describe: redis 操作助手 12 | 列出了常用操作,若要使用更多方法,可根据需求增加 13 | http://www.runoob.com/redis/redis-tutorial.html 14 | """ 15 | import logging 16 | import sys 17 | import os 18 | import redis 19 | import time 20 | 21 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 22 | sys.path.append("..") 23 | from BaseFile.ReadConfig import ReadConfig as RC 24 | from BaseFile.Logger import Logger 25 | logger = Logger('redisPool.log', logging.WARNING, logging.DEBUG) 26 | 27 | 28 | # Redis助手一 29 | class RedisHelper: 30 | """创建Redis连接,每次使用完,自动释放连接 31 | """ 32 | def __init__(self, DBName): 33 | self.settings = RC().get_conf("../Config/REDIS")[DBName] # 获取Config-REDIS 配置 34 | self.host = self.settings['host'] 35 | self.port = self.settings['port'] 36 | self.user = self.settings['user'] 37 | self.passwd = self.settings['passwd'] 38 | self.db = self.settings['db'] 39 | try: 40 | # # 建立 REDIS 连接池 41 | # self.pool = redis.ConnectionPool(host=self.host, port=self.port, db=self.db, password=self.passwd, 42 | # decode_responses=True, socket_timeout=300) 43 | # self.r = redis.Redis(connection_pool=self.pool) 44 | self.r = redis.Redis(host=self.host, port=self.port, db=self.db, password=self.passwd, 45 | decode_responses=True, socket_timeout=300) 46 | 47 | except Exception as e: 48 | print("REDIS CONTENT ERROR:", e) 49 | logger.error("REDIS CONTENT ERROR: "+str(e)) 50 | 51 | # 第二个参数listURL,必须传入list结构数据,插入到redis 52 | def redis_lpush(self, keyName, listUrl): 53 | try: 54 | i = 0 55 | for data in listUrl: 56 | i += 1 57 | self.r.lpush(keyName, data) 58 | print(i) 59 | print("successful push list!") 60 | except Exception as e: 61 | print('[redis_lpush] ERROR', e) 62 | logger.error('[redis_lpush] ERROR '+str(e)) 63 | 64 | # 检查key是否存在 65 | def redis_exists(self, keyName): 66 | try: 67 | return self.r.exists(keyName) 68 | except Exception as e: 69 | print('[redis_exists] ERROR', e) 70 | logger.error('[redis_exists] ERROR '+str(e)) 71 | 72 | # 以lpop方式取出元素,在keyName对应的列表的左侧获取第一个元素并在列表中移除 73 | def redis_lpop(self, keyName): 74 | try: 75 | url_list = self.r.lpop(keyName).decode() # 获取 76 | return url_list 77 | except Exception as e: 78 | print('[redis_pop] ERROR', e) 79 | logger.error('[redis_pop] ERROR '+str(e)) 80 | 81 | # 获取redis长度 82 | def redis_llen(self, keyName): 83 | try: 84 | length = self.r.llen(keyName) 85 | return length 86 | except Exception as e: 87 | print("[redis_llen] ERROR", e) 88 | logger.error("[redis_llen] ERROR "+str(e)) 89 | 90 | # 以lrange方式取出元素 91 | def redis_lrange(self, keyName, start, end): 92 | try: 93 | url_list = self.r.lrange(keyName, start, end) 94 | return url_list 95 | except Exception as e: 96 | print('[redis_lrange] ERROR', e) 97 | logger.error('[redis_lrange] ERROR '+str(e)) 98 | 99 | # 以rpop方式取出元素,在keyName对应的列表的右侧获取第一个元素并在列表中移除 100 | def redis_rpop(self, keyName): 101 | try: 102 | url_list = self.r.rpop(keyName).decode() 103 | return url_list 104 | except Exception as e: 105 | print('[redis_rpop] ERROR', e) 106 | logger.error('[redis_rpop] ERROR '+str(e)) 107 | 108 | # Set 是 String 类型的无序集合。集合成员是唯一的,这就意味着集合中不能出现重复的数据 109 | def redis_sadd(self, keyName, listUrl): 110 | try: 111 | i = 0 112 | for data in listUrl: 113 | i += 1 114 | self.r.sadd(keyName, data) 115 | print(i) 116 | print("successful sadd list!") 117 | except Exception as e: 118 | print('[redis_sadd] ERROR', e) 119 | logger.error('[redis_sadd] ERROR '+str(e)) 120 | 121 | # 移除Set并返回集合中的一个随机元素 122 | def redis_spop(self, keyName): 123 | try: 124 | url_list = self.r.spop(keyName).decode() 125 | return url_list 126 | except Exception as e: 127 | print('[redis_spop] ERROR', e) 128 | logger.error('[redis_spop] ERROR '+str(e)) 129 | 130 | # 移除Set并返回集合中所有成员 131 | def redis_smembers(self, keyName): 132 | try: 133 | url_list = self.r.smembers(keyName).decode() 134 | return url_list 135 | except Exception as e: 136 | print('[redis_smembers] ERROR', e) 137 | logger.error('[redis_smembers] ERROR '+str(e)) 138 | 139 | 140 | if __name__ == '__main__': 141 | r = RedisHelper('test01').redis_llen("test") 142 | print(r) 143 | -------------------------------------------------------------------------------- /Common/RedisHelperLongConncet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: RedisHelperLongConncet.py 10 | @time: 2018/11/9 11:13 11 | @describe: redis 操作助手 12 | 列出了常用操作,若要使用更多方法,可根据需求增加 13 | http://www.runoob.com/redis/redis-tutorial.html 14 | """ 15 | import logging 16 | import sys 17 | import os 18 | import redis 19 | import time 20 | 21 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 22 | sys.path.append("..") 23 | from BaseFile.ReadConfig import ReadConfig as RC 24 | from BaseFile.Logger import Logger 25 | logger = Logger('redisStatic.log', logging.WARNING, logging.DEBUG) 26 | 27 | """ 配置文件名字""" 28 | CONFIGNAME = "test01" 29 | 30 | 31 | # USE IT :from Common.RedisHelperLongConncet import RedisHelperConnect as RHC 32 | # CONNECT POOL: pool = redisConfig("test02").getConfig() in the last lines 33 | # 获取配置信息 34 | class redisConfig: 35 | """建立 redispool-不释放连接""" 36 | 37 | def __init__(self, DBName): 38 | self.DBName = DBName 39 | 40 | # 获取配置信息 41 | def getConfig(self): 42 | try: 43 | DBName = self.DBName 44 | settings = RC().get_conf("../Config/REDIS")[DBName] # 获取Config-REDIS 配置 45 | host = settings['host'] 46 | port = settings['port'] 47 | user = settings['user'] 48 | passwd = settings['passwd'] 49 | db = settings['db'] 50 | # 建立 REDIS 连接池 51 | pool = redis.ConnectionPool(host=host, port=port, db=db, password=passwd, decode_responses=True, 52 | socket_timeout=300) 53 | return pool 54 | except Exception as e: 55 | print("Redis Read config error!", e, "no config in REDIS.YAML!") 56 | logger.error("Redis Read config error! "+str(e) + " no config in REDIS.YAML!") 57 | 58 | 59 | """ 使用那个配置文件 """ 60 | pool = redisConfig(CONFIGNAME).getConfig() 61 | 62 | 63 | class RedisHelperConnect: 64 | """创建Redis连接,不释放连接 65 | redis.ConnectionPool:创建连接池 66 | """ 67 | try: 68 | r = redis.Redis(connection_pool=pool) 69 | except Exception as e: 70 | print("REDIS CONTENT ERROR:", e) 71 | logger.error("REDIS CONTENT ERROR:"+str(e)) 72 | 73 | # 第二个参数listURL,必须传入list结构数据,插入到redis 74 | def redis_lpush(self, keyName, listUrl): 75 | try: 76 | i = 0 77 | for data in listUrl: 78 | i += 1 79 | self.r.lpush(keyName, data) 80 | print(i) 81 | print("successful push list!") 82 | except Exception as e: 83 | print('[redis_lpush] ERROR', e) 84 | logger.error('[redis_lpush] ERROR '+str(e)) 85 | 86 | # 检查key是否存在 87 | def redis_exists(self, keyName): 88 | try: 89 | return self.r.exists(keyName) 90 | except Exception as e: 91 | print('[redis_exists] ERROR', e) 92 | logger.error('[redis_exists] ERROR '+str(e)) 93 | 94 | # 以lpop方式取出元素,在keyName对应的列表的左侧获取第一个元素并在列表中移除 95 | def redis_lpop(self, keyName): 96 | try: 97 | url_list = self.r.lpop(keyName).decode() # 获取 98 | return url_list 99 | except Exception as e: 100 | print('[redis_pop] ERROR', e) 101 | logger.error('[redis_pop] ERROR'+str(e)) 102 | 103 | # 获取redis长度 104 | def redis_llen(self, keyName): 105 | try: 106 | length = self.r.llen(keyName) 107 | return length 108 | except Exception as e: 109 | print("[redis_llen] ERROR", e) 110 | logger.error("[redis_llen] ERROR "+str(e)) 111 | 112 | # 以lrange方式取出元素 113 | def redis_lrange(self, keyName, start, end): 114 | try: 115 | url_list = self.r.lrange(keyName, start, end) 116 | return url_list 117 | except Exception as e: 118 | print('[redis_lrange] ERROR', e) 119 | logger.error('[redis_lrange] ERROR '+str(e)) 120 | 121 | # 以rpop方式取出元素,在keyName对应的列表的右侧获取第一个元素并在列表中移除 122 | def redis_rpop(self, keyName): 123 | try: 124 | url_list = self.r.rpop(keyName).decode() 125 | return url_list 126 | except Exception as e: 127 | print('[redis_rpop] ERROR', e) 128 | logger.error('[redis_rpop] ERROR '+str(e)) 129 | 130 | # Set 是 String 类型的无序集合。集合成员是唯一的,这就意味着集合中不能出现重复的数据 131 | def redis_sadd(self, keyName, listUrl): 132 | try: 133 | i = 0 134 | for data in listUrl: 135 | i += 1 136 | self.r.sadd(keyName, data) 137 | print(i) 138 | print("successful sadd list!") 139 | except Exception as e: 140 | print('[redis_sadd] ERROR', e) 141 | logger.error('[redis_sadd] ERROR '+str(e)) 142 | 143 | # 移除Set并返回集合中的一个随机元素 144 | def redis_spop(self, keyName): 145 | try: 146 | url_list = self.r.spop(keyName).decode() 147 | return url_list 148 | except Exception as e: 149 | print('[redis_spop] ERROR', e) 150 | logger.error('[redis_spop] ERROR '+str(e)) 151 | 152 | # 移除Set并返回集合中所有成员 153 | def redis_smembers(self, keyName): 154 | try: 155 | url_list = self.r.smembers(keyName).decode() 156 | return url_list 157 | except Exception as e: 158 | print('[redis_spop] ERROR', e) 159 | logger.error('[redis_spop] ERROR '+str(e)) 160 | 161 | 162 | if __name__ == '__main__': 163 | r = RedisHelperConnect().redis_llen("url") -------------------------------------------------------------------------------- /Common/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: __init__.py.py 10 | @time: 2018/9/25 20:02 11 | @describe: 12 | """ 13 | import sys 14 | import os 15 | 16 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 17 | sys.path.append("..") -------------------------------------------------------------------------------- /Config/HEADERS.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: HEADERS.py 10 | @time: 2018/9/26 17:31 11 | @describe: 请求头集合--爬虫请求头信息在此配置 12 | 'User-Agent': '%s' % UserAgent.pc_agent() 启用轮换浏览器请求头 13 | """ 14 | import os 15 | import sys 16 | 17 | sys.path.append(r'your_path') 18 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 19 | sys.path.append("..") 20 | from BaseFile.UserAgent import UserAgent 21 | 22 | HEADERS = { 23 | # 配置样例 24 | "heasers": { 25 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 26 | 'Accept-Encoding': 'gzip, deflate, br', 27 | 'Accept-Language': 'zh-CN,zh;q=0.9', 28 | 'Cache-Control': 'max-age=0', 29 | 'Connection': 'keep-alive', 30 | 'Cookie': 'GA1.2.151205434.1528702564; user_trace_token=20180611153613-1e11d7da-6d4a-11e8-9446-5254005c3644; LGUID=20180611153613-1e11da71-6d4a-11e8-9446-5254005c3644; JSESSIONID=ABAAABAAAGFABEFA887FF2126C2345351E1CF33022A085A; _gid=GA1.2.295504001.1536894927; LGSID=20180914111529-6ee84ad5-b7cc-11e8-b939-5254005c3644; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536894927; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_navigation; SEARCH_ID=f8b502632588469da5ea73ee9dd382a5; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536897145; LGRID=20180914115228-993585b5-b7d1-11e8-b939-5254005c3644', 31 | 'Host': 'www.lagou.com', 32 | # 'Referer': 'https://www.lagou.com/zhaopin/Java/?labelWords=label', 33 | 'Upgrade-Insecure-Requests': '1', 34 | 'User-Agent': '%s' % UserAgent.pc_agent()}, 35 | # 简书 36 | "headersJianShun": { 37 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 38 | 'Accept-Encoding': 'gzip, deflate, br', 39 | 'Accept-Language': 'zh-CN,zh;q=0.9', 40 | 'Cache-Control': 'max-age=0', 41 | 'Connection': 'keep-alive', 42 | 'Host': 'www.jianshu.com', 43 | 'Upgrade-Insecure-Requests': '1', 44 | 'User-Agent': '%s' % UserAgent.pc_agent()}, 45 | 46 | } 47 | 48 | if __name__ == '__main__': 49 | print(HEADERS['heasers']) 50 | -------------------------------------------------------------------------------- /Config/KAFKA: -------------------------------------------------------------------------------- 1 | kafka_test: 2 | host: 'localhost:9092;' 3 | topics: 'topic_demo' 4 | zookeeper_connect: 'zookeeper_demo:2181' 5 | 6 | 7 | -------------------------------------------------------------------------------- /Config/MONGODB: -------------------------------------------------------------------------------- 1 | mongo_test: 2 | host : 'localhost' 3 | port : 27017 4 | db : 'demo' 5 | table : 'demo' 6 | user : 7 | passwd : -------------------------------------------------------------------------------- /Config/MYSQL: -------------------------------------------------------------------------------- 1 | test01: 2 | host: localhost 3 | user: root 4 | passwd: root 5 | port: 3306 6 | db: test 7 | 8 | -------------------------------------------------------------------------------- /Config/PROXYIP: -------------------------------------------------------------------------------- 1 | 183.129.207.78:18118 2 | 58.250.23.210:1080 -------------------------------------------------------------------------------- /Config/REDIS: -------------------------------------------------------------------------------- 1 | test01: 2 | host: localhost 3 | port: 6379 4 | user: 5 | passwd: root 6 | db: 1 -------------------------------------------------------------------------------- /Config/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: __init__.py.py 10 | @time: 2018/9/25 17:25 11 | @describe: 12 | """ 13 | import sys 14 | import os 15 | 16 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 17 | sys.path.append("..") -------------------------------------------------------------------------------- /Data/cnblogs.json: -------------------------------------------------------------------------------- 1 | [{"Title": "\u9a6c\u4e912018\u5e74\u81f4\u80a1\u4e1c\u4fe1\uff1a\u6ca1\u6709\u5386\u53f2\u6027\u6311\u6218 \u4f55\u6765\u5386\u53f2\u6027\u673a\u9047\uff1f", "url": "https://news.cnblogs.com/n/610919/"}, {"Title": "\u82f9\u679c\u53d1\u5e03\u4f1a\u6700\u5168\u6c47\u603b\uff1a\u300c\u5168\u9762\u5c4f\u300diPad\u6027\u80fd\u65e0\u654c\uff0cMac\u7535\u8111\u4e1c\u5c71\u518d\u8d77", "url": "https://news.cnblogs.com/n/610932/"}, {"Title": "\u674e\u548f\u79bb\u4e16\uff01\u505a\u597d\u8fd9\u4e9b\u7b5b\u67e5\uff0c\u522b\u8ba9\u764c\u75c7\u62d6\u5230\u665a\u671f\u66f4\u91cd\u8981", "url": "https://news.cnblogs.com/n/610869/"}, {"Title": "\u764c\u7ec6\u80de\u5185\u53d1\u73b0\u81ea\u6740\u5f00\u5173 \u764c\u75c7\u6cbb\u7597\u6216\u5f00\u542f\u65b0\u65f6\u4ee3", "url": "https://news.cnblogs.com/n/610859/"}, {"Title": "\u5171\u4eab\u5355\u8f66\u51b2\u51fb\u6ce2\uff1a\u4e2d\u56fd\u201c\u81ea\u884c\u8f66\u7b2c\u4e00\u9547\u201d\u7684\u8870\u843d", "url": "https://news.cnblogs.com/n/610821/"}, {"Title": "\u764c\u75c7\u514d\u75ab\u7597\u6cd5\u51e0\u5e74\u5185\u6216\u6709\u66f4\u5927\u7a81\u7834", "url": "https://news.cnblogs.com/n/610811/"}, {"Title": "\u5357\u975e\u5a92\u4f53\u4eba\u4e13\u8bbf\u9a6c\u4e91\uff1a\u9a6c\u4e91\u65e0\u6bd4\u6fc0\u52b1\u4eba\u5fc3\u4f46\u53c8\u65e0\u6bd4\u8c26\u5351", "url": "https://news.cnblogs.com/n/610768/"}, {"Title": "\u9996\u4f4d\u661f\u9645\u8bbf\u5ba2\u201c\u5965\u964c\u964c\u201d\u5c0f\u884c\u661f\u98de\u4e22\u4e86\uff01", "url": "https://news.cnblogs.com/n/610725/"}, {"Title": "IBM\u62df\u65a5\u8d44340\u4ebf\u7f8e\u5143\u6536\u8d2d\u7ea2\u5e3d \u6ea2\u4ef763%", "url": "https://news.cnblogs.com/n/610708/"}, {"Title": "\u9996\u679a\u6c11\u8425\u8fd0\u8f7d\u706b\u7bad\u672a\u80fd\u5165\u8f68\uff0c\u4f60\u5e94\u8be5\u77e5\u9053\u7684\u66f4\u591a", "url": "https://news.cnblogs.com/n/610695/"}, {"Title": "\u591a\u4efb\u52a1\u53ef\u80fd\u4f1a\u6076\u5316\u8bb0\u5fc6", "url": "https://news.cnblogs.com/n/610688/"}, {"Title": "GitHub\u65b0\u4efbCEO\u8c08\u5fae\u8f6f\u6536\u8d2d\uff1a\u4fdd\u7559GitHub\u4ef7\u503c\u89c2", "url": "https://news.cnblogs.com/n/610626/"}, {"Title": "\u4e3a\u4ec0\u4e48\u5f88\u591a\u70ab\u9177\u7684\u4ea7\u54c1\u5e76\u6ca1\u6709\u6d41\u884c\u8d77\u6765\uff1f", "url": "https://news.cnblogs.com/n/610625/"}, {"Title": "\u201cRNG\u8f93\u4e86\u201d\uff0c\u4e3a\u4f55\u80fd\u5728\u793e\u4ea4\u5708\u5f15\u8d77\u5927\u9707\u52a8\uff1f", "url": "https://news.cnblogs.com/n/610584/"}, {"Title": "\u4eba\u7c7b\u7ec6\u80de\u53ef\u5236\u9020\u8ba1\u7b97\u673a\u82af\u7247\uff0c\u66f4\u5c0f\u8fd0\u884c\u901f\u5ea6\u66f4\u5feb\uff01", "url": "https://news.cnblogs.com/n/610552/"}, {"Title": "10\u5c81\u5973\u5b69\u5f00\u53d1\u4e16\u754c\u9996\u6b3eAI\u684c\u6e38\uff0c13\u5c81\u5c11\u5e74\u7528AI\u68c0\u6d4b\u80f0\u817a", "url": "https://news.cnblogs.com/n/610482/"}, {"Title": "\u673a\u5668\u4eba\u7a0b\u5e8f\u50cf\u4eba\u7c7b\u7a0b\u5e8f\u5458\u4e00\u6837\u4fee bug", "url": "https://news.cnblogs.com/n/610505/"}, {"Title": "\u4eba\u7c7b\u7caa\u4fbf\u91cc\u53d1\u73b0\u591a\u79cd\u5851\u6599\uff01\u767d\u8272\u6c61\u67d3\u7ec8\u8ba9\u4eba\u7c7b\u81ea\u98df\u5176\u679c", "url": "https://news.cnblogs.com/n/610466/"}] 2 | -------------------------------------------------------------------------------- /Data/img/1540974405032.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liangchengdeye/Requests_Html_Spider/3ea30f8f16f88e8f2934b411fa720c76ad054482/Data/img/1540974405032.jpg -------------------------------------------------------------------------------- /Data/img/1540974407587.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liangchengdeye/Requests_Html_Spider/3ea30f8f16f88e8f2934b411fa720c76ad054482/Data/img/1540974407587.jpg -------------------------------------------------------------------------------- /Data/img/1540974408414.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liangchengdeye/Requests_Html_Spider/3ea30f8f16f88e8f2934b411fa720c76ad054482/Data/img/1540974408414.jpg -------------------------------------------------------------------------------- /Data/jianshu.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Liangchengdeye/Requests_Html_Spider/3ea30f8f16f88e8f2934b411fa720c76ad054482/Data/jianshu.csv -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # requests升级版requests-html 爬虫编写及通用爬虫模块搭建 2 | *** 3 | #### 安装: pip install requests-html 4 | #### 中文文档:https://cncert.github.io/requests-html-doc-cn/#/ 5 | # 搭建常用通用爬虫各组件 6 | ## 简介: 7 | - 1、 爬虫模块编写,支持pyquery、xpath、JavaScript、beautifulsoup、正则等多种解析模式,使用请查看上面中文文档; 8 | - 2、 支持抓取各类日志保存,抓取日志、错误日志等各类日志信息; 9 | - 3、 抓取起始链接可来自于Redis,只需提供Redis-key信息,不用额外编写; 10 | - 4、 抓取信息持久化支持CSV、JSON、MYSQL、REDIS、KAFAKA、MONGODB等几大类常用持久化工具; 11 | - 5、 该框架主要是几大模块的组合,至于爬虫逻辑的实现,根据个人需求。 12 | ## 文件树: 13 | |-Requests_Html\_Spider          |--目录文件
14 |    |--BaseFile                               |--基础配置
15 |        |---GetLocalFile.py                   |--读取本地文件,如URL
16 |        |---GetProxyIp.py                      |--获取代理IP
17 |        |---Logger.py                            |--配置logging日志
18 |        |--- 19 | ReadConfig.py                    |--读取配置文件
20 |        |--- 21 | UserAgent.py                      |--轮换请求头
22 |    |--Common                                |--公共操作类
23 |        |---CsvHelper.py                       |--操作CSV文件
24 |        |---JsonHelper.py                      |--操作JSON文件
25 |        |---KafkaHelper.py                    |--操作KAFKA文件
26 |        |---MongoHelper.py                  |--操作MONGODB文件
27 |        |---MysqlHelper.py                    |--操作MYSQL文件
28 |        |---RedisHelper.py                    |--操作REDIS文件
29 |     |--Config                                   |--配置信息
30 |        |---HEADERS.py                        |--配置请求头
31 |        |---KAFKA                                  |--KAFKA配置
32 |        |---MONGODB                           |--MONGODB配置
33 |        |---MYSQL                                 |--MYSQL配置
34 |        |---PROXYIP                              |--代理IP配置
35 |        |---REDIS                                  |--REDIS配置
36 |     |--Data                                      |--文件存储目录
37 |     |--Logs                                      |--Logs日志存储目录
38 |     |--Spider                                    |--爬虫类
39 |        |---request\_html\_demo\_1.py   |--简书python爬虫教程抓取
40 |        |---request\_html\_demo\_2.py   |--爬取博客园新闻
41 |        |---request\_html\_demo\_3.py   |--爬取电脑高清壁纸库
42 | ## 说明:  本框架主要是爬虫基本常用模块组合,避免了日常爬虫编写中各类组件重新编写过程,同时结合requests—html使得编写更为简便,其中requests-html是requests的原作者专门针对爬虫编写的一个新模块,并在不断的跟新状态,[官方-github](https://github.com/kennethreitz/requests-html) 43 | #### Only Python 3.6 is supported. 44 | -------------------------------------------------------------------------------- /Spider/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: __init__.py.py 10 | @time: 2018/9/25 17:24 11 | @describe: 12 | """ 13 | import sys 14 | import os 15 | import BaseFile 16 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 17 | sys.path.append("..") -------------------------------------------------------------------------------- /Spider/request_html_demo_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: request_html_demo_1.py 10 | @time: 2018/9/25 17:34 11 | @describe: 使用requests-html爬虫模块抓取~简书爬虫页面 12 | 中文文档:https://cncert.github.io/requests-html-doc-cn/#/ 13 | """ 14 | import json 15 | import logging 16 | import sys 17 | import os 18 | import requests 19 | import time 20 | # 导入requests_html包 21 | from requests_html import HTMLSession 22 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '\\' + '..')) 23 | sys.path.append("..") 24 | from BaseFile.Logger import Logger 25 | # 导入读取列表文件 26 | from BaseFile.GetLocalFile import GetLocalFile as GF 27 | from BaseFile.GetProxyIp import GetProxyIp as GP 28 | from Config.HEADERS import HEADERS as HS 29 | from Common.CsvHelper import CsvHelper as CV 30 | from Common.JsonHelper import JsonHelper as JS 31 | 32 | logger = Logger('jianshu.log', logging.WARNING, logging.DEBUG) 33 | # 实例化requests_html 34 | session = HTMLSession() 35 | 36 | 37 | # 基础版--抓取简书 前五页 python 爬虫教程 38 | def get_jianshu(base_url): 39 | try: 40 | htmlSource = session.get(base_url, headers=HS['headersJianShun']) 41 | print("请求状态码:", htmlSource.status_code) 42 | liText = htmlSource.html.find("#list-container > ul > li > div > a") 43 | for i in liText: 44 | print(i.text) 45 | print([x for x in i.absolute_links][0]) 46 | except Exception as e: 47 | print("请求错误!", e) 48 | logger.error(e) 49 | 50 | 51 | # 代理IP版本 52 | def get_jianshu_ip(base_url): 53 | try: 54 | proxies = GP().get_IP() 55 | print(proxies) 56 | htmlSource = session.get(base_url, headers=HS['headersJianShun'], proxies=proxies, verify=False) 57 | print("请求状态码:", htmlSource.status_code) 58 | liText = htmlSource.html.find("#list-container > ul > li > div > a") 59 | for i in liText: 60 | print(i.text) 61 | print([x for x in i.absolute_links][0]) 62 | except Exception as e: 63 | print("请求错误!", e) 64 | logger.error(e) 65 | 66 | 67 | # 存入csv版本 68 | def get_jianshu_fire(base_url): 69 | try: 70 | htmlSource = session.get(base_url, headers=HS['headersJianShun']) 71 | print("请求状态码:", htmlSource.status_code) 72 | liText = htmlSource.html.find("#list-container > ul > li > div > a") 73 | message = "标题", "URL" 74 | CV().csv_write("jianshu.csv", message) 75 | for i in liText: 76 | title = i.text 77 | link = [x for x in i.absolute_links][0] 78 | print(title) 79 | print(link) 80 | # 写入csv的数据格式,逗号分割 81 | message = title, link 82 | CV().csv_write("jianshu.csv", message) 83 | except Exception as e: 84 | print("请求错误!", e) 85 | 86 | 87 | if __name__ == '__main__': 88 | base_url = "https://www.jianshu.com/c/a480500350e7?order_by=added_at&page=" 89 | # # 基础版 90 | # get_jianshu(base_url+str(1)) 91 | # # 代理IP版 92 | # get_jianshu_ip(base_url+str(2)) 93 | # # 存入csv版 94 | # get_jianshu_fire(base_url+str(3)) 95 | 96 | 97 | str4=[] 98 | # dict4="222" 99 | str4.append({"title":1111}) 100 | str4.append({"title":1111}) 101 | 102 | print(str4) 103 | '''{"001":[{"title": "1111"}, {"title": "1111"}, {"title": "1111"}],"002":[{"title":"2222"}]}''' 104 | message = (str4) 105 | JS().json_write("test.json", message) -------------------------------------------------------------------------------- /Spider/request_html_demo_2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: request_html_demo_2.py.py 10 | @time: 2018/10/31 15:34 11 | @describe: 获取博客园新闻 12 | https://news.cnblogs.com/n/recommend 13 | """ 14 | import sys 15 | import os 16 | 17 | from requests_html import HTMLSession 18 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 19 | sys.path.append("..") 20 | from Common.JsonHelper import JsonHelper as JS 21 | session = HTMLSession() 22 | 23 | 24 | def get_cnblog(url): 25 | r = session.get(url) 26 | # 通过CSS找到新闻标签 27 | news = r.html.find('h2.news_entry > a') 28 | json_list = [] 29 | for new in news: 30 | title = new.text 31 | liks = [x for x in new.absolute_links][0] 32 | print(title) # 获得新闻标题 33 | print(liks) # 获得新闻链接 34 | json_list.append({"Title": title, "url": liks}) 35 | JS().json_write("cnblogs.json", json_list) 36 | 37 | 38 | if __name__ == '__main__': 39 | url = "https://news.cnblogs.com/n/recommend" 40 | get_cnblog(url) -------------------------------------------------------------------------------- /Spider/request_html_demo_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | @version: v1.0 5 | @author: W_H_J 6 | @license: Apache Licence 7 | @contact: 415900617@qq.com 8 | @software: PyCharm 9 | @file: request_html_demo_3.py 10 | @time: 2018/10/31 16:18 11 | @describe: 最高清壁纸库-桌面下载 12 | requests_html 解析方式 13 | """ 14 | import logging 15 | import sys 16 | import os 17 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..')) 18 | sys.path.append("..") 19 | from BaseFile.Logger import Logger 20 | from requests_html import HTMLSession 21 | import requests 22 | import time 23 | logger = Logger('img.log', logging.WARNING, logging.DEBUG) 24 | session = HTMLSession() 25 | i = 0 26 | 27 | 28 | # 解析图片列表 29 | def get_girl_list(): 30 | # 返回一个 response 对象 31 | response = session.get('http://www.win4000.com/zt/xinggan.html') 32 | content = response.html.find('div.Left_bar', first=True) 33 | li_list = content.find('li') 34 | for li in li_list: 35 | url = li.find('a', first=True).attrs['href'] 36 | get_girl_detail(url) 37 | 38 | 39 | # 解析图片详细 40 | def get_girl_detail(url): 41 | # 返回一个 response 对象 42 | response = session.get(url) # 单位秒数 43 | content = response.html.find('div.scroll-img-cont', first=True) 44 | li_list = content.find('li') 45 | for li in li_list: 46 | img_url = li.find('img', first=True).attrs['data-original'] 47 | img_url = img_url[0:img_url.find('_')] + '.jpg' 48 | print(img_url) 49 | save_image(img_url) 50 | 51 | 52 | # 保持大图 53 | def save_image(img_url): 54 | try: 55 | global i 56 | i += 1 57 | print("=="*10 + ">>", i, "==>img") 58 | img_response = requests.get(img_url) 59 | t = int(round(time.time() * 1000)) # 毫秒级时间戳 60 | f = open('../Data/img/%d.jpg' % t, 'ab') # 存储图片,多媒体文件需要参数b(二进制文件) 61 | f.write(img_response.content) # 多媒体存储content 62 | f.close() 63 | except Exception as e: 64 | print("Downloads error:", e) 65 | logger.error(e) 66 | 67 | 68 | if __name__ == '__main__': 69 | print("Downloads img start, Please Don't close the window!") 70 | time.sleep(10) 71 | get_girl_list() 72 | print("Dolnloads img successful, Please to see /Data/img") 73 | a = input("please input q to close the windows!") 74 | if str(a) == 'q': 75 | os.close() --------------------------------------------------------------------------------