├── .gitignore
├── BaseFile
├── GetLocalFile.py
├── GetProxyIp.py
├── Logger.py
├── ReadConfig.py
├── UserAgent.py
└── __init__.py
├── Common
├── CsvHelper.py
├── JsonHelper.py
├── KafkaHelper.py
├── MongoHelper.py
├── MySqlHelper.py
├── RedisHelper.py
├── RedisHelperLongConncet.py
└── __init__.py
├── Config
├── HEADERS.py
├── KAFKA
├── MONGODB
├── MYSQL
├── PROXYIP
├── REDIS
└── __init__.py
├── Data
├── cnblogs.json
├── img
│ ├── 1540974405032.jpg
│ ├── 1540974407587.jpg
│ └── 1540974408414.jpg
└── jianshu.csv
├── LICENSE
├── README.md
└── Spider
├── __init__.py
├── request_html_demo_1.py
├── request_html_demo_2.py
└── request_html_demo_3.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/BaseFile/GetLocalFile.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: GetLocalFile.py
10 | @time: 2018/9/25 17:26
11 | @describe: 操作本地文件
12 | """
13 | import sys
14 | import os
15 |
16 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
17 | sys.path.append("..")
18 |
19 |
class GetLocalFile:
    """Read a local text file and return its lines as a list."""

    @staticmethod
    def get_local_file(filename):
        """Return the lines of ``filename`` without their line terminators.

        :param filename: path of a UTF-8 encoded text file.
        :return: list of str, one entry per line; blank lines become "".
        """
        with open(filename, "r", encoding="utf-8") as f:
            # Strip only the newline. Slicing off the last character would
            # truncate a final line that has no trailing newline.
            return [line.rstrip("\n") for line in f]
28 |
--------------------------------------------------------------------------------
/BaseFile/GetProxyIp.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: GetProxyIp.py
10 | @time: 2018/9/25 17:45
11 | @describe: 返回代理IP
12 | """
13 | import sys
14 | import os
15 | import random
16 | import time
17 |
18 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
19 | sys.path.append("..")
20 |
21 |
class GetProxyIp:
    """Pick a random proxy address from the ``../Config/PROXYIP`` file."""

    @staticmethod
    def get_random_proxy():
        """Return one randomly chosen, non-blank line of the proxy file.

        The file is re-read once per second until it holds at least one
        non-blank entry, so the call blocks while the list is empty.
        The path is resolved against the current working directory.

        :return: the chosen line with surrounding whitespace removed.
        """
        while True:
            with open('../Config/PROXYIP', 'r') as f:
                # Drop blank lines (e.g. a trailing newline at end of file)
                # so an empty string is never handed out as a proxy.
                proxies = [line.strip() for line in f if line.strip()]
            if proxies:
                return random.choice(proxies)
            time.sleep(1)

    # The two methods below wrap a random proxy in a dict keyed by scheme;
    # pick the one matching the type of the proxy entries.

    def get_IP_Http(self):
        """Return ``{"http": <random proxy>}``."""
        return {"http": self.get_random_proxy()}

    def get_IP_Https(self):
        """Return ``{"https": <random proxy>}``."""
        return {"https": self.get_random_proxy()}
52 |
53 |
# Manual check: print one randomly chosen proxy wrapped as {"http": ...}.
if __name__ == '__main__':
    print(GetProxyIp().get_IP_Http())
56 |
--------------------------------------------------------------------------------
/BaseFile/Logger.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @site:
9 | @software: PyCharm
10 | @file: logger.py
11 | @time: 2018/3/23 14:30
12 | @describe: log日志编写
13 | """
14 | import logging
15 | import os
16 | import datetime
17 |
# Colour flags accepted (but currently unused) by the Logger methods.
FOREGROUND_WHITE = 0x0007
FOREGROUND_BLUE = 0x01  # text color contains blue.
FOREGROUND_GREEN = 0x02  # text color contains green.
FOREGROUND_RED = 0x04  # text color contains red.
FOREGROUND_YELLOW = FOREGROUND_RED | FOREGROUND_GREEN
STD_OUTPUT_HANDLE = -11
# Create the log directory (relative to the current working directory).
log_dir = '../logs'
if not os.path.isdir(log_dir):
    # makedirs(exist_ok=True) tolerates another process creating the
    # directory between the check above and this call.
    os.makedirs(log_dir, exist_ok=True)
    print(log_dir)
29 |
30 |
class Logger:
    """Log to the console and to a dated file inside ``log_dir``."""

    def __init__(self, path, clevel=logging.DEBUG, Flevel=logging.DEBUG):
        """Create (or reuse) the logger for ``path``.

        :param path: log file name; it is prefixed with today's date
            (``YYYY-MM-DD-``) and placed in ``log_dir``.
        :param clevel: minimum level shown on the console.
        :param Flevel: minimum level written to the file.
        """
        fmt = logging.Formatter('[%(asctime)s] [%(levelname)s] %(message)s', '%Y-%m-%d %H:%M:%S')
        start_time = datetime.datetime.now().strftime('%Y-%m-%d')
        path = os.path.join(log_dir, start_time + "-" + path)
        self.logger = logging.getLogger(path)
        self.logger.setLevel(logging.DEBUG)
        # getLogger() returns the same object for the same name, so a second
        # Logger for this path would otherwise attach a second pair of
        # handlers and every message would be emitted twice.
        if self.logger.handlers:
            return
        # Console handler.
        sh = logging.StreamHandler()
        sh.setFormatter(fmt)
        sh.setLevel(clevel)
        # File handler; explicit UTF-8 so non-ASCII messages are written
        # correctly whatever the platform's default encoding is.
        fh = logging.FileHandler(path, encoding='utf-8')
        fh.setFormatter(fmt)
        fh.setLevel(Flevel)
        self.logger.addHandler(sh)
        self.logger.addHandler(fh)

    # The ``color`` parameters are accepted for compatibility and ignored.

    def debug(self, message, color=FOREGROUND_BLUE):
        """Log ``message`` at DEBUG level."""
        self.logger.debug(message)

    def info(self, message, color=FOREGROUND_GREEN):
        """Log ``message`` at INFO level."""
        self.logger.info(message)

    def war(self, message, color=FOREGROUND_YELLOW):
        """Log ``message`` at WARNING level."""
        # Logger.warn() is a deprecated alias; use warning().
        self.logger.warning(message)

    def error(self, message, color=FOREGROUND_RED):
        """Log ``message`` at ERROR level."""
        self.logger.error(message)

    def cri(self, message):
        """Log ``message`` at CRITICAL level."""
        self.logger.critical(message)
63 |
64 |
# Demo: console handler at WARNING, file handler at DEBUG, one message
# per severity level.
if __name__ == '__main__':
    logyyx = Logger('test.log', logging.WARNING, logging.DEBUG)
    logyyx.debug('一个debug信息')
    logyyx.info('一个info信息')
    logyyx.war('一个warning信息')
    logyyx.error('一个error信息')
    logyyx.cri('一个致命critical信息')
72 |
--------------------------------------------------------------------------------
/BaseFile/ReadConfig.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: ReadConfig.py
10 | @time: 2018/9/25 19:51
11 | @describe: 读取各类配置文件
12 | """
13 | import sys
14 | import os
15 | from yaml import load
16 |
17 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
18 | sys.path.append("..")
19 |
20 |
class ReadConfig:
    """Load YAML configuration files."""

    @staticmethod
    def get_conf(path_name):
        """Parse a YAML file and return the resulting Python object.

        :param path_name: path of the file, relative to the directory that
            contains this module.
        :return: whatever the YAML document describes (typically a dict).
        """
        # Local import keeps this block self-contained. safe_load() builds
        # only plain Python objects and, unlike a bare load(), does not need
        # the Loader argument that PyYAML 6 made mandatory.
        from yaml import safe_load

        config_path = os.path.join(os.path.dirname(__file__), path_name)
        with open(config_path, encoding="utf-8") as f:
            return safe_load(f)
30 |
--------------------------------------------------------------------------------
/BaseFile/UserAgent.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @site:
9 | @software: PyCharm
10 | @file: user_agent.py
11 | @time: 2018/8/13 10:47
12 | @describe: 浏览器请求头
13 | 因为请求头分手机版本和PC版本,版本不同,页面返回信息页有所不同,注意使用
14 | """
15 | import random
16 | import os
17 | import sys
18 |
19 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
20 | sys.path.append("..")
21 | USER_AGENT_LIST = [
22 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
23 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
24 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
25 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
26 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
27 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
28 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
29 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
30 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
31 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
32 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
33 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
34 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
35 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
36 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
37 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
38 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
39 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
40 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
41 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
42 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
43 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
44 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
45 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
46 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
47 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
48 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
49 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
50 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
51 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
52 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
53 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
54 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
55 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
56 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
57 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
58 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
59 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
60 | "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
62 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
63 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
64 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
65 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
66 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
67 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
68 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
69 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
70 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
71 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
72 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
73 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
74 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
75 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
76 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
77 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"
78 | ]
79 | USER_AGENT_PHONE_LIST = [
80 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
81 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
82 | "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
83 | "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
84 | "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
85 | "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
86 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
87 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
88 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
89 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
90 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
91 | "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
92 | ]
93 |
94 |
class UserAgent:
    """Random User-Agent strings for desktop and mobile browsers."""

    @staticmethod
    def pc_agent():
        """Return a random entry of ``USER_AGENT_LIST`` (desktop)."""
        desktop_agents = USER_AGENT_LIST
        return random.choice(desktop_agents)

    @staticmethod
    def phone_agent():
        """Return a random entry of ``USER_AGENT_PHONE_LIST`` (mobile)."""
        mobile_agents = USER_AGENT_PHONE_LIST
        return random.choice(mobile_agents)
107 |
--------------------------------------------------------------------------------
/BaseFile/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: __init__.py
10 | @time: 2018/9/26 18:34
11 | @describe:
12 | """
13 | import sys
14 | import os
15 |
16 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
17 | sys.path.append("..")
--------------------------------------------------------------------------------
/Common/CsvHelper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: CsvHelper.py
10 | @time: 2018/9/26 15:48
11 | @describe: csv 助手
12 | """
13 | import logging
14 | import sys
15 | import os
16 | import csv
17 |
18 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
19 | sys.path.append("..")
20 | from BaseFile.Logger import Logger
21 | logger = Logger('csv.log', logging.WARNING, logging.DEBUG)
22 |
23 |
class CsvHelper:
    """Append rows to, and read rows from, CSV files under ``../Data``.

    Paths are resolved against the current working directory.
    """

    @staticmethod
    def CsvConnection(fireName):
        """Open ``../Data/<fireName>`` for appending and return a csv writer.

        NOTE: the underlying file object is not exposed, so the caller
        cannot close it explicitly. Kept for backward compatibility;
        prefer :meth:`csv_write`, which closes the file.
        """
        out = open('../Data/%s' % fireName, 'a', newline='', encoding="utf-8")
        return csv.writer(out, dialect='excel')

    def csv_write(self, fireName, message):
        """Append ``message`` as one row to ``../Data/<fireName>``.

        :param fireName: file name inside ``../Data``.
        :param message: iterable of cell values.

        Errors are printed and logged instead of being raised.
        """
        try:
            # The context manager flushes and closes the file right away
            # instead of leaving that to garbage collection.
            with open('../Data/%s' % fireName, 'a', newline='', encoding="utf-8") as out:
                csv.writer(out, dialect='excel').writerow(list(message))
            print("write successful!")
        except Exception as e:
            print("[csv write error]", e)
            logger.error("[csv write error]"+str(e))

    @staticmethod
    def csv_read(fireName):
        """Return all rows of ``../Data/<fireName>`` as a list of lists.

        Returns ``None`` when reading fails; the error is printed and logged.
        """
        try:
            with open("../Data/%s" % fireName, "r", encoding="utf-8") as csvfile:
                return list(csv.reader(csvfile))
        except Exception as e:
            print("[csv read error]", e)
            logger.error("[csv read error]" + str(e))
55 |
56 |
# Demo: append one row to ../Data/test.csv, then print the whole file.
if __name__ == '__main__':
    message = "host", "1002", "1003"
    # Write to the csv file.
    CsvHelper().csv_write("test.csv", message)
    # Read the csv file back.
    print(CsvHelper().csv_read("test.csv"))
63 |
--------------------------------------------------------------------------------
/Common/JsonHelper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: JsonHelper.py
10 | @time: 2018/10/31 15:04
11 | @describe: 读写json文件
12 | """
13 | import logging
14 | import sys
15 | import os
16 | import json
17 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
18 | sys.path.append("..")
19 | from BaseFile.Logger import Logger
20 | logger = Logger('Json.log', logging.WARNING, logging.DEBUG)
21 |
22 |
class JsonHelper:
    """Read and write files holding one JSON document per line.

    Files live under ``../Data`` (relative to the current working
    directory). A file as a whole is therefore not a single valid JSON
    document. Errors are printed and logged instead of being raised.
    """

    def json_write(self, fireName, message):
        """Append ``message`` as one JSON line to ``../Data/<fireName>``.

        :param fireName: file name inside ``../Data``.
        :param message: any JSON-serialisable object.
        """
        try:
            # Close the file deterministically so the line is flushed.
            with open('../Data/%s' % fireName, 'a', encoding='utf-8') as out:
                out.write(json.dumps(message) + "\n")
            print("write successful!")
        except Exception as e:
            print("[json write error]", e)
            logger.error("[json write error]"+str(e))

    def json_read(self, fireName):
        """Return the decoded documents of ``../Data/<fireName>`` as a list.

        Blank lines are skipped. Returns ``None`` when reading or decoding
        fails.
        """
        try:
            with open('../Data/%s' % fireName, 'r', encoding='utf-8') as f:
                # json.loads ignores the trailing newline itself; chopping
                # the last character would corrupt a final line that has
                # no newline.
                return [json.loads(line) for line in f if line.strip()]
        except Exception as e:
            print("[json read error]", e)
            logger.error("[json read error]"+str(e))

    def json_watch(self, fireName):
        """Print every document of ``../Data/<fireName>``; returns nothing.

        Use :meth:`json_read` when the data itself is needed.
        """
        try:
            with open('../Data/%s' % fireName, 'r', encoding='utf-8') as f:
                for line in f:
                    if not line.strip():
                        continue
                    print("*" * 150)
                    print(json.loads(line))
                    print("*" * 150, "\n")
        except Exception as e:
            print("[json watch error]", e)
            logger.error("[json watch error]"+str(e))
58 |
59 |
# Demo against ../Data/cnblogs.json.
if __name__ == '__main__':
    # Read the file and work with the returned documents.
    str_json = JsonHelper().json_read("cnblogs.json")
    for i in str_json:
        print(i)
    print("*" * 100, "\n")
    # Read the file for viewing only.
    JsonHelper().json_watch("cnblogs.json")
68 |
--------------------------------------------------------------------------------
/Common/KafkaHelper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: KafkaHelper.py
10 | @time: 2018/9/26 14:17
11 | @describe: kafka 助手
12 | """
13 | import logging
14 | import sys
15 | import os
16 | from pykafka import KafkaClient
17 |
18 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
19 | sys.path.append("..")
20 | from BaseFile.ReadConfig import ReadConfig as RC
21 | from BaseFile.Logger import Logger
22 | logger = Logger('kafka.log', logging.WARNING, logging.DEBUG)
# Section of the Kafka config file to read.
DBName = "kafka_demo1"
# The config file is named "KAFKA" in upper case, like the MONGODB and MYSQL
# files; the lower-case spelling only resolves on case-insensitive
# filesystems.
settings = RC().get_conf("../Config/KAFKA")[DBName]
host = settings['host']
27 |
28 |
class KafkaHelper:
    """Produce to and consume from the topic named in the module ``settings``.

    Errors are printed and logged instead of being raised.
    """

    # Created once, when the class body is executed, and shared by all
    # instances. ``hosts`` may list several brokers.
    client = KafkaClient(hosts=host)

    def __init__(self):
        self.topics = settings['topics']
        self.zookeeper_connect = settings['zookeeper_connect']

    def KafkaConnectionPool(self):
        """Return the topic object for ``self.topics``.

        :return: the topic, or ``None`` when the lookup fails.
        """
        try:
            return self.client.topics[self.topics.encode('utf-8')]
        except Exception as e:
            print('[kafka-connect] error', e)
            logger.error('[kafka-connect] error '+str(e))
            return None

    def producer_kafka(self, partitionKey, message):
        """Send one message to the topic with a synchronous producer.

        :param partitionKey: partition key (str, encoded as UTF-8).
        :param message: message body (str, encoded as UTF-8).
        """
        topic = self.KafkaConnectionPool()
        if topic is None:
            # The lookup failure was already reported; dereferencing None
            # would only add a misleading second error.
            return
        try:
            with topic.get_sync_producer() as producer:
                producer.produce(partition_key=partitionKey.encode('utf-8'),
                                 message=message.encode('utf-8'))
            print("successful send msg to kafka~~~~~")
        except Exception as e:
            print('[producer_kafka] error', e)
            logger.error('[producer_kafka] error '+str(e))

    def consumer_zookeeper(self):
        """Consume via ``get_balanced_consumer`` and print each message."""
        topic = self.KafkaConnectionPool()
        if topic is None:
            return
        try:
            balanced_consumer = topic.get_balanced_consumer(
                consumer_group='demo'.encode('utf-8'),
                # Original author's note: with auto_commit_enable=False no
                # consumer_group is needed; connecting to the topic is enough.
                auto_commit_enable=True,
                zookeeper_connect=self.zookeeper_connect  # may list several zookeeper hosts
            )
            for message in balanced_consumer:
                if message is not None:
                    print(message.offset, message.partition_key, str(message.value, encoding="utf-8"))
        except Exception as e:
            print("[consumer_zookeeper] error:", e)
            logger.error("[consumer_zookeeper] error "+str(e))

    def consumer_kafka(self):
        """Consume via ``get_simple_consumer`` and print each message."""
        topic = self.KafkaConnectionPool()
        if topic is None:
            return
        try:
            kafka_consumer = topic.get_simple_consumer(
                consumer_group='demo'.encode("utf-8"),
                auto_commit_enable=True,
                consumer_id='demo'.encode("utf-8")
            )
            for message in kafka_consumer:
                if message is not None:
                    print(message.offset, message.partition_key, str(message.value, encoding="utf-8"))
        except Exception as e:
            print("[consumer_kafka] error", e)
            logger.error("[consumer_kafka] error "+str(e))
96 |
97 |
# Demo: send one JSON string to the configured topic with key 'demo'.
if __name__ == '__main__':
    # To consume instead: KafkaHelper().consumer_kafka()
    message = '{"demo3":"test02"}'
    KafkaHelper().producer_kafka('demo',message)
102 |
--------------------------------------------------------------------------------
/Common/MongoHelper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: MongoHelper.py
10 | @time: 2018/9/26 11:21
11 | @describe: mongodb 助手
12 | http://www.runoob.com/mongodb/mongodb-connections.html
13 | """
14 | import logging
15 | import sys
16 | import os
17 | from pymongo import MongoClient
18 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
19 | sys.path.append("..")
20 | from BaseFile.ReadConfig import ReadConfig as RC
21 | from BaseFile.Logger import Logger
22 | logger = Logger('mongodb.log', logging.WARNING, logging.DEBUG)
23 |
24 |
class MongoHelper:
    """MongoDB helper configured from one section of ``Config/MONGODB``.

    The client stays open for the lifetime of the helper, so cursors and
    collections returned by :meth:`find` and :meth:`get_collection` remain
    usable. Call :meth:`close` when finished. Errors inside the operations
    are printed and logged instead of being raised.
    """

    def __init__(self, DBName):
        """:param DBName: section of the MONGODB config file to use."""
        self.settings = RC().get_conf("../Config/MONGODB")[DBName]
        self.host = self.settings['host']
        self.port = self.settings['port']
        self.user = self.settings['user']
        self.passwd = self.settings['passwd']
        self.dbname = self.settings['db']
        self.table = self.settings['table']
        if self.user:
            # Authenticate against the admin database when a user is
            # configured. Credentials go to the client constructor because
            # Database.authenticate() no longer exists in PyMongo 4.
            self.conn = MongoClient(host=self.host, port=self.port,
                                    username=self.user, password=self.passwd,
                                    authSource='admin')
        else:
            self.conn = MongoClient(host=self.host, port=self.port)
        self.db_auth = self.conn.admin  # kept for backward compatibility
        self.db = self.conn.get_database(self.dbname)
        self.collection = self.db.get_collection(self.table)

    def _collection_for(self, collection_name):
        """Return the named collection, or the configured default for None."""
        if collection_name is None:
            return self.collection
        return self.db.get_collection(collection_name)

    def insert(self, item, collection_name=None):
        """Insert one document or several.

        :param item: a document (dict) or a list/tuple of documents.
        :param collection_name: optional collection to write to; defaults
            to the configured one.
        """
        try:
            collection = self._collection_for(collection_name)
            if isinstance(item, (list, tuple)):
                collection.insert_many(item)
            else:
                collection.insert_one(item)
        except Exception as e:
            print("mongodb insert error!", e)
            logger.error("mongodb insert error! "+str(e))

    def find(self, expression=None, collection_name=None):
        """Run a simple query.

        :param expression: filter document; ``None`` matches everything.
        :param collection_name: optional collection to query.
        :return: a cursor over the results, or ``None`` on error.
        """
        try:
            return self._collection_for(collection_name).find(expression)
        except Exception as e:
            print("mongodb find error!", e)
            logger.error("mongodb find error! "+str(e))

    def get_collection(self, collection_name=None):
        """Return a collection for operations this class does not wrap.

        :param collection_name: collection name; ``None`` selects the
            configured default.
        :return: the collection, or ``None`` on error.
        """
        try:
            return self._collection_for(collection_name)
        except Exception as e:
            print("mongodb get_collection error!", e)
            logger.error("mongodb get_collection error! "+str(e))

    def close(self):
        """Close the underlying client connection."""
        self.conn.close()
102 |
103 |
# Demo against the "mongo_test" config section.
if __name__ == '__main__':
    db = MongoHelper("mongo_test")
    # item = {'addredd': 'zhangsan', 'index': '23'}
    # db.insert(item)  # insert
    for item in db.find():  # query
        print(item)
110 |
--------------------------------------------------------------------------------
/Common/MySqlHelper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @site:
9 | @software: PyCharm
10 | @file: dbhelper.py
11 | @time: 2018/9/25 17:46
12 | @describe: 数据库操作助手
13 | http://www.runoob.com/mysql/mysql-tutorial.html
14 | """
15 | import logging
16 | import sys
17 | import os
18 | import pymysql
19 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
20 | sys.path.append("..")
21 | from BaseFile.ReadConfig import ReadConfig as RC
22 | from BaseFile.Logger import Logger
23 | logger = Logger('mysql.log', logging.WARNING, logging.DEBUG)
24 |
25 |
class MysqlHelper:
    """pymysql helper configured from one section of ``Config/MYSQL``.

    Every operation opens its own connection and closes it when done.
    Errors raised while executing a statement are printed and logged
    instead of being raised; connection errors still propagate.
    """

    def __init__(self, DBName):
        """:param DBName: section of the MYSQL config file to use."""
        self.settings = RC().get_conf("../Config/MYSQL")[DBName]
        self.host = self.settings['host']
        self.port = self.settings['port']
        self.user = self.settings['user']
        self.passwd = self.settings['passwd']
        self.db = self.settings['db']

    def _connect(self, use_db):
        """Open a connection, optionally selecting the configured database."""
        options = dict(host=self.host,
                       port=self.port,
                       user=self.user,
                       passwd=self.passwd,
                       charset='utf8')  # explicit charset avoids garbled non-ASCII text
        if use_db:
            options['db'] = self.db
        return pymysql.connect(**options)

    def connectMysql(self):
        """Connect to the server without selecting a database."""
        return self._connect(use_db=False)

    def connectDatabase(self):
        """Connect to the server and select the configured database."""
        return self._connect(use_db=True)

    def _run(self, conn, sql, params, label, commit):
        """Execute one statement on ``conn``, then close cursor and connection.

        :param params: argument tuple for the statement, or ``None``.
        :param label: prefix used when reporting an error.
        :param commit: whether to commit after a successful execute.
        """
        cur = conn.cursor()
        try:
            cur.execute(sql, params)
            if commit:
                conn.commit()
        except Exception as e:
            print(label, e)
            logger.error(label + " " + str(e))
        finally:
            cur.close()
            conn.close()

    def createDatabase(self):
        """Create the configured database if it does not exist.

        The name comes from the config file, so no SQL is passed in.
        """
        # Backtick-quote the identifier so names containing characters such
        # as '-' are accepted; embedded backticks are doubled.
        sql = "create database if not exists `%s`" % self.db.replace("`", "``")
        self._run(self.connectMysql(), sql, None, "Error createDatabase data!", commit=False)

    def createTable(self, sql):
        """Execute a ``create table`` statement in the configured database."""
        self._run(self.connectDatabase(), sql, None, "Error createTable data!", commit=False)

    def insert(self, sql, *params):
        """Execute an insert statement with ``params`` and commit."""
        self._run(self.connectDatabase(), sql, params, "Error insert data!", commit=True)

    def update(self, sql, *params):
        """Execute an update statement with ``params`` and commit."""
        self._run(self.connectDatabase(), sql, params, "Error update data!", commit=True)

    def delete(self, sql, *params):
        """Execute a delete statement with ``params`` and commit."""
        self._run(self.connectDatabase(), sql, params, "Error delete data!", commit=True)

    def select(self, sql, *params):
        """Run a query and return all rows.

        :param sql: query text, optionally with ``%s`` placeholders.
        :param params: values for the placeholders (optional), so callers
            need not build SQL by string concatenation.
        :return: list of rows, each a list; ``None`` on error.
        """
        conn = self.connectDatabase()
        cur = conn.cursor()
        try:
            # Pass None when no parameters are given so a literal '%' in
            # the query is not treated as a placeholder.
            cur.execute(sql, params or None)
            return [list(row) for row in cur.fetchall()]
        except Exception as e:
            print("Error: unable to fecth data", e)
            logger.error("Error: unable to fecth data! "+str(e))
        finally:
            cur.close()
            conn.close()
147 |
148 |
149 | '''测试DBHelper的类'''
150 |
151 |
class TestDBHelper():
    """Manual smoke tests for ``DBHelper``; each method exercises one operation
    against the ``testtable`` table."""

    def __init__(self, DBNAME):
        """:param DBNAME: configuration name handed straight to ``DBHelper``."""
        self.dbHelper = DBHelper(DBNAME)

    def testCreateDatebase(self):
        """Create the database.

        Per the original note, the database name is taken from the
        configuration file, so change it there.
        """
        self.dbHelper.createDatabase()

    def testCreateTable(self):
        """Create ``testtable`` (auto-increment id, name, url)."""
        sql = "create table testtable(id int primary key auto_increment,name varchar(50),url varchar(200))"
        self.dbHelper.createTable(sql)

    def testInsert(self, item):
        """Insert one row.

        :param item: mapping that provides the ``"name"`` and ``"url"`` values.
        """
        sql = "insert into testtable(name,url) values(%s,%s)"
        params = (item["name"], item["url"])
        # '*' spreads the tuple; insert(sql, *params) collects it back into one.
        self.dbHelper.insert(sql, *params)

    def testUpdate(self, row_id="1"):
        """Set name and url of row ``row_id`` to the literal ``"update"``."""
        sql = "update testtable set name=%s,url=%s where id=%s"
        params = ("update", "update", row_id)
        self.dbHelper.update(sql, *params)

    def testDelete(self, row_id="1"):
        """Delete the row whose id is ``row_id``."""
        sql = "delete from testtable where id=%s"
        # A one-element tuple needs the trailing comma; ("1") is just the
        # string "1", which star-unpacking would split character by character.
        params = (row_id,)
        self.dbHelper.delete(sql, *params)

    def testSelect(self):
        """Return the ``url`` column of at most five rows."""
        sql = "select url from testtable limit 5"
        return self.dbHelper.select(sql)
187 |
if __name__ == "__main__":
    # Manual smoke run against the 'test01' configuration.
    # Uncomment the step you want to exercise.
    db_smoke = TestDBHelper('test01')
    # db_smoke.testCreateDatebase()  # create the database
    # db_smoke.testCreateTable()  # create the table
    # db_smoke.testInsert()  # insert a row
    # db_smoke.testUpdate()  # update a row
    # db_smoke.testDelete()  # delete a row
    print(db_smoke.testSelect())  # query rows
196 |
--------------------------------------------------------------------------------
/Common/RedisHelper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: RedisHelper.py
10 | @time: 2018/9/26 10:14
11 | @describe: redis 操作助手
12 | 列出了常用操作,若要使用更多方法,可根据需求增加
13 | http://www.runoob.com/redis/redis-tutorial.html
14 | """
15 | import logging
16 | import sys
17 | import os
18 | import redis
19 | import time
20 |
21 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
22 | sys.path.append("..")
23 | from BaseFile.ReadConfig import ReadConfig as RC
24 | from BaseFile.Logger import Logger
25 | logger = Logger('redisPool.log', logging.WARNING, logging.DEBUG)
26 |
27 |
28 | # Redis助手一
class RedisHelper:
    """Convenience wrapper around a per-instance ``redis.Redis`` client.

    Connection settings come from the ``DBName`` section of ``../Config/REDIS``.
    The client is created with ``decode_responses=True``, so replies arrive as
    ``str``. Every ``redis_*`` method prints and logs failures instead of
    raising, and returns ``None`` when the command failed.
    """

    def __init__(self, DBName):
        """Read the settings for ``DBName`` and build the client.

        :param DBName: top-level key in ``../Config/REDIS`` (e.g. ``test01``).
        """
        self.settings = RC().get_conf("../Config/REDIS")[DBName]
        self.host = self.settings['host']
        self.port = self.settings['port']
        self.user = self.settings['user']  # kept as an attribute; not passed to the client
        self.passwd = self.settings['passwd']
        self.db = self.settings['db']
        try:
            self.r = redis.Redis(host=self.host, port=self.port, db=self.db, password=self.passwd,
                                 decode_responses=True, socket_timeout=300)
        except Exception as e:
            print("REDIS CONTENT ERROR:", e)
            logger.error("REDIS CONTENT ERROR: "+str(e))

    @staticmethod
    def _as_text(value):
        """Decode ``bytes`` to ``str``; return ``str``/``None`` unchanged."""
        if isinstance(value, bytes):
            return value.decode()
        return value

    def redis_lpush(self, keyName, listUrl):
        """LPUSH every item of ``listUrl`` onto the list ``keyName``.

        :param keyName: Redis key of the list.
        :param listUrl: iterable of values (pass a list, not a single string);
            a running count is printed after each push.
        """
        try:
            for count, data in enumerate(listUrl, 1):
                self.r.lpush(keyName, data)
                print(count)
            print("successful push list!")
        except Exception as e:
            print('[redis_lpush] ERROR', e)
            logger.error('[redis_lpush] ERROR '+str(e))

    def redis_exists(self, keyName):
        """Return the client's EXISTS reply for ``keyName``."""
        try:
            return self.r.exists(keyName)
        except Exception as e:
            print('[redis_exists] ERROR', e)
            logger.error('[redis_exists] ERROR '+str(e))

    def redis_lpop(self, keyName):
        """Remove and return the leftmost element of list ``keyName``.

        :return: the element as ``str``, or ``None`` when the list is empty
            or the command failed.
        """
        try:
            # Replies are already str (decode_responses=True); calling
            # .decode() on them raised after the element had been removed.
            return self._as_text(self.r.lpop(keyName))
        except Exception as e:
            print('[redis_pop] ERROR', e)
            logger.error('[redis_pop] ERROR '+str(e))

    def redis_llen(self, keyName):
        """Return the length of list ``keyName``."""
        try:
            return self.r.llen(keyName)
        except Exception as e:
            print("[redis_llen] ERROR", e)
            logger.error("[redis_llen] ERROR "+str(e))

    def redis_lrange(self, keyName, start, end):
        """Return the elements of list ``keyName`` between ``start`` and ``end``
        (LRANGE) without removing them."""
        try:
            return self.r.lrange(keyName, start, end)
        except Exception as e:
            print('[redis_lrange] ERROR', e)
            logger.error('[redis_lrange] ERROR '+str(e))

    def redis_rpop(self, keyName):
        """Remove and return the rightmost element of list ``keyName``.

        :return: the element as ``str``, or ``None`` when the list is empty
            or the command failed.
        """
        try:
            return self._as_text(self.r.rpop(keyName))
        except Exception as e:
            print('[redis_rpop] ERROR', e)
            logger.error('[redis_rpop] ERROR '+str(e))

    def redis_sadd(self, keyName, listUrl):
        """SADD every item of ``listUrl`` to the set ``keyName``.

        Sets hold unique members, so duplicates are stored once. A running
        count is printed after each item.
        """
        try:
            for count, data in enumerate(listUrl, 1):
                self.r.sadd(keyName, data)
                print(count)
            print("successful sadd list!")
        except Exception as e:
            print('[redis_sadd] ERROR', e)
            logger.error('[redis_sadd] ERROR '+str(e))

    def redis_spop(self, keyName):
        """Remove and return one random member of set ``keyName``.

        :return: the member as ``str``, or ``None`` when the set is empty or
            the command failed.
        """
        try:
            return self._as_text(self.r.spop(keyName))
        except Exception as e:
            print('[redis_spop] ERROR', e)
            logger.error('[redis_spop] ERROR '+str(e))

    def redis_smembers(self, keyName):
        """Return all members of set ``keyName`` (SMEMBERS does not remove them).

        :return: a ``set`` of ``str``, or ``None`` when the command failed.
        """
        try:
            # The reply is a set, which has no .decode(); decode per member.
            return {self._as_text(member) for member in self.r.smembers(keyName)}
        except Exception as e:
            print('[redis_smembers] ERROR', e)
            logger.error('[redis_smembers] ERROR '+str(e))
139 |
if __name__ == '__main__':
    # Manual smoke run: print the length of list "test" using the 'test01' settings.
    print(RedisHelper('test01').redis_llen("test"))
143 |
--------------------------------------------------------------------------------
/Common/RedisHelperLongConncet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: RedisHelperLongConncet.py
10 | @time: 2018/11/9 11:13
11 | @describe: redis 操作助手
12 | 列出了常用操作,若要使用更多方法,可根据需求增加
13 | http://www.runoob.com/redis/redis-tutorial.html
14 | """
15 | import logging
16 | import sys
17 | import os
18 | import redis
19 | import time
20 |
21 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
22 | sys.path.append("..")
23 | from BaseFile.ReadConfig import ReadConfig as RC
24 | from BaseFile.Logger import Logger
25 | logger = Logger('redisStatic.log', logging.WARNING, logging.DEBUG)
26 |
# Name of the Config/REDIS section used to build the module-level pool.
CONFIGNAME = "test01"
29 |
30 |
31 | # USE IT :from Common.RedisHelperLongConncet import RedisHelperConnect as RHC
32 | # CONNECT POOL: pool = redisConfig("test02").getConfig() in the last lines
33 | # 获取配置信息
class redisConfig:
    """Build a ``redis.ConnectionPool`` from one section of ``../Config/REDIS``."""

    def __init__(self, DBName):
        """:param DBName: top-level key in ``../Config/REDIS`` to read."""
        self.DBName = DBName

    def getConfig(self):
        """Create the connection pool for ``self.DBName``.

        :return: a ``redis.ConnectionPool`` (``decode_responses=True``,
            300 s socket timeout), or ``None`` when the settings cannot be
            read; in that case the error is printed and logged.
        """
        try:
            settings = RC().get_conf("../Config/REDIS")[self.DBName]
            # Only the keys the pool actually uses are required; the unused
            # 'user' entry is no longer looked up.
            return redis.ConnectionPool(host=settings['host'], port=settings['port'], db=settings['db'],
                                        password=settings['passwd'], decode_responses=True,
                                        socket_timeout=300)
        except Exception as e:
            print("Redis Read config error!", e, "no config in REDIS.YAML!")
            logger.error("Redis Read config error! "+str(e) + " no config in REDIS.YAML!")
57 |
58 |
# Connection pool shared by this module, built once at import time from the
# CONFIGNAME section. getConfig() yields None if the settings cannot be read.
pool = redisConfig(CONFIGNAME).getConfig()
61 |
62 |
class RedisHelperConnect:
    """Redis helper whose client is bound to the module-level connection pool.

    ``r`` is a class attribute, so every instance shares the same client. The
    pool is created with ``decode_responses=True``, so replies arrive as
    ``str``. Every ``redis_*`` method prints and logs failures instead of
    raising, and returns ``None`` when the command failed.
    """
    try:
        r = redis.Redis(connection_pool=pool)
    except Exception as e:
        print("REDIS CONTENT ERROR:", e)
        logger.error("REDIS CONTENT ERROR:"+str(e))

    @staticmethod
    def _as_text(value):
        """Decode ``bytes`` to ``str``; return ``str``/``None`` unchanged."""
        if isinstance(value, bytes):
            return value.decode()
        return value

    def redis_lpush(self, keyName, listUrl):
        """LPUSH every item of ``listUrl`` onto the list ``keyName``.

        :param keyName: Redis key of the list.
        :param listUrl: iterable of values (pass a list, not a single string);
            a running count is printed after each push.
        """
        try:
            for count, data in enumerate(listUrl, 1):
                self.r.lpush(keyName, data)
                print(count)
            print("successful push list!")
        except Exception as e:
            print('[redis_lpush] ERROR', e)
            logger.error('[redis_lpush] ERROR '+str(e))

    def redis_exists(self, keyName):
        """Return the client's EXISTS reply for ``keyName``."""
        try:
            return self.r.exists(keyName)
        except Exception as e:
            print('[redis_exists] ERROR', e)
            logger.error('[redis_exists] ERROR '+str(e))

    def redis_lpop(self, keyName):
        """Remove and return the leftmost element of list ``keyName``.

        :return: the element as ``str``, or ``None`` when the list is empty
            or the command failed.
        """
        try:
            # Replies are already str (decode_responses=True); calling
            # .decode() on them raised after the element had been removed.
            return self._as_text(self.r.lpop(keyName))
        except Exception as e:
            print('[redis_pop] ERROR', e)
            logger.error('[redis_pop] ERROR '+str(e))

    def redis_llen(self, keyName):
        """Return the length of list ``keyName``."""
        try:
            return self.r.llen(keyName)
        except Exception as e:
            print("[redis_llen] ERROR", e)
            logger.error("[redis_llen] ERROR "+str(e))

    def redis_lrange(self, keyName, start, end):
        """Return the elements of list ``keyName`` between ``start`` and ``end``
        (LRANGE) without removing them."""
        try:
            return self.r.lrange(keyName, start, end)
        except Exception as e:
            print('[redis_lrange] ERROR', e)
            logger.error('[redis_lrange] ERROR '+str(e))

    def redis_rpop(self, keyName):
        """Remove and return the rightmost element of list ``keyName``.

        :return: the element as ``str``, or ``None`` when the list is empty
            or the command failed.
        """
        try:
            return self._as_text(self.r.rpop(keyName))
        except Exception as e:
            print('[redis_rpop] ERROR', e)
            logger.error('[redis_rpop] ERROR '+str(e))

    def redis_sadd(self, keyName, listUrl):
        """SADD every item of ``listUrl`` to the set ``keyName``.

        Sets hold unique members, so duplicates are stored once. A running
        count is printed after each item.
        """
        try:
            for count, data in enumerate(listUrl, 1):
                self.r.sadd(keyName, data)
                print(count)
            print("successful sadd list!")
        except Exception as e:
            print('[redis_sadd] ERROR', e)
            logger.error('[redis_sadd] ERROR '+str(e))

    def redis_spop(self, keyName):
        """Remove and return one random member of set ``keyName``.

        :return: the member as ``str``, or ``None`` when the set is empty or
            the command failed.
        """
        try:
            return self._as_text(self.r.spop(keyName))
        except Exception as e:
            print('[redis_spop] ERROR', e)
            logger.error('[redis_spop] ERROR '+str(e))

    def redis_smembers(self, keyName):
        """Return all members of set ``keyName`` (SMEMBERS does not remove them).

        :return: a ``set`` of ``str``, or ``None`` when the command failed.
        """
        try:
            # The reply is a set, which has no .decode(); decode per member.
            return {self._as_text(member) for member in self.r.smembers(keyName)}
        except Exception as e:
            # Previously reported under the [redis_spop] label.
            print('[redis_smembers] ERROR', e)
            logger.error('[redis_smembers] ERROR '+str(e))
161 |
if __name__ == '__main__':
    # Manual smoke run: print the length of list "url" so the run has visible
    # output (the value used to be computed and discarded).
    r = RedisHelperConnect().redis_llen("url")
    print(r)
--------------------------------------------------------------------------------
/Common/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: __init__.py.py
10 | @time: 2018/9/25 20:02
11 | @describe:
12 | """
13 | import sys
14 | import os
15 |
16 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
17 | sys.path.append("..")
--------------------------------------------------------------------------------
/Config/HEADERS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: HEADERS.py
10 | @time: 2018/9/26 17:31
11 | @describe: 请求头集合--爬虫请求头信息在此配置
12 | 'User-Agent': '%s' % UserAgent.pc_agent() 启用轮换浏览器请求头
13 | """
14 | import os
15 | import sys
16 |
17 | sys.path.append(r'your_path')
18 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
19 | sys.path.append("..")
20 | from BaseFile.UserAgent import UserAgent
21 |
# Request-header presets, looked up by name.
# Each 'User-Agent' value is produced by UserAgent.pc_agent() once, when this
# module is imported; it is not re-evaluated on every lookup.
HEADERS = {
    # Sample configuration. The key is spelled "heasers"; it is a lookup key,
    # so it must be used with that exact spelling.
    "heasers": {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # NOTE(review): hard-coded cookie, presumably captured from a browser session; confirm it is still valid/needed.
        'Cookie': 'GA1.2.151205434.1528702564; user_trace_token=20180611153613-1e11d7da-6d4a-11e8-9446-5254005c3644; LGUID=20180611153613-1e11da71-6d4a-11e8-9446-5254005c3644; JSESSIONID=ABAAABAAAGFABEFA887FF2126C2345351E1CF33022A085A; _gid=GA1.2.295504001.1536894927; LGSID=20180914111529-6ee84ad5-b7cc-11e8-b939-5254005c3644; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536894927; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_navigation; SEARCH_ID=f8b502632588469da5ea73ee9dd382a5; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536897145; LGRID=20180914115228-993585b5-b7d1-11e8-b939-5254005c3644',
        'Host': 'www.lagou.com',
        # 'Referer': 'https://www.lagou.com/zhaopin/Java/?labelWords=label',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': '%s' % UserAgent.pc_agent()},
    # Jianshu (www.jianshu.com)
    "headersJianShun": {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.jianshu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': '%s' % UserAgent.pc_agent()},

}
47 |
# Manual check: print the sample header preset.
if __name__ == '__main__':
    print(HEADERS['heasers'])
50 |
--------------------------------------------------------------------------------
/Config/KAFKA:
--------------------------------------------------------------------------------
1 | kafka_test:
2 | host: 'localhost:9092;'
3 | topics: 'topic_demo'
4 | zookeeper_connect: 'zookeeper_demo:2181'
5 |
6 |
7 |
--------------------------------------------------------------------------------
/Config/MONGODB:
--------------------------------------------------------------------------------
1 | mongo_test:
2 | host : 'localhost'
3 | port : 27017
4 | db : 'demo'
5 | table : 'demo'
6 | user :
7 | passwd :
--------------------------------------------------------------------------------
/Config/MYSQL:
--------------------------------------------------------------------------------
1 | test01:
2 | host: localhost
3 | user: root
4 | passwd: root
5 | port: 3306
6 | db: test
7 |
8 |
--------------------------------------------------------------------------------
/Config/PROXYIP:
--------------------------------------------------------------------------------
1 | 183.129.207.78:18118
2 | 58.250.23.210:1080
--------------------------------------------------------------------------------
/Config/REDIS:
--------------------------------------------------------------------------------
1 | test01:
2 | host: localhost
3 | port: 6379
4 | user:
5 | passwd: root
6 | db: 1
--------------------------------------------------------------------------------
/Config/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: __init__.py.py
10 | @time: 2018/9/25 17:25
11 | @describe:
12 | """
13 | import sys
14 | import os
15 |
16 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
17 | sys.path.append("..")
--------------------------------------------------------------------------------
/Data/cnblogs.json:
--------------------------------------------------------------------------------
1 | [{"Title": "\u9a6c\u4e912018\u5e74\u81f4\u80a1\u4e1c\u4fe1\uff1a\u6ca1\u6709\u5386\u53f2\u6027\u6311\u6218 \u4f55\u6765\u5386\u53f2\u6027\u673a\u9047\uff1f", "url": "https://news.cnblogs.com/n/610919/"}, {"Title": "\u82f9\u679c\u53d1\u5e03\u4f1a\u6700\u5168\u6c47\u603b\uff1a\u300c\u5168\u9762\u5c4f\u300diPad\u6027\u80fd\u65e0\u654c\uff0cMac\u7535\u8111\u4e1c\u5c71\u518d\u8d77", "url": "https://news.cnblogs.com/n/610932/"}, {"Title": "\u674e\u548f\u79bb\u4e16\uff01\u505a\u597d\u8fd9\u4e9b\u7b5b\u67e5\uff0c\u522b\u8ba9\u764c\u75c7\u62d6\u5230\u665a\u671f\u66f4\u91cd\u8981", "url": "https://news.cnblogs.com/n/610869/"}, {"Title": "\u764c\u7ec6\u80de\u5185\u53d1\u73b0\u81ea\u6740\u5f00\u5173 \u764c\u75c7\u6cbb\u7597\u6216\u5f00\u542f\u65b0\u65f6\u4ee3", "url": "https://news.cnblogs.com/n/610859/"}, {"Title": "\u5171\u4eab\u5355\u8f66\u51b2\u51fb\u6ce2\uff1a\u4e2d\u56fd\u201c\u81ea\u884c\u8f66\u7b2c\u4e00\u9547\u201d\u7684\u8870\u843d", "url": "https://news.cnblogs.com/n/610821/"}, {"Title": "\u764c\u75c7\u514d\u75ab\u7597\u6cd5\u51e0\u5e74\u5185\u6216\u6709\u66f4\u5927\u7a81\u7834", "url": "https://news.cnblogs.com/n/610811/"}, {"Title": "\u5357\u975e\u5a92\u4f53\u4eba\u4e13\u8bbf\u9a6c\u4e91\uff1a\u9a6c\u4e91\u65e0\u6bd4\u6fc0\u52b1\u4eba\u5fc3\u4f46\u53c8\u65e0\u6bd4\u8c26\u5351", "url": "https://news.cnblogs.com/n/610768/"}, {"Title": "\u9996\u4f4d\u661f\u9645\u8bbf\u5ba2\u201c\u5965\u964c\u964c\u201d\u5c0f\u884c\u661f\u98de\u4e22\u4e86\uff01", "url": "https://news.cnblogs.com/n/610725/"}, {"Title": "IBM\u62df\u65a5\u8d44340\u4ebf\u7f8e\u5143\u6536\u8d2d\u7ea2\u5e3d \u6ea2\u4ef763%", "url": "https://news.cnblogs.com/n/610708/"}, {"Title": "\u9996\u679a\u6c11\u8425\u8fd0\u8f7d\u706b\u7bad\u672a\u80fd\u5165\u8f68\uff0c\u4f60\u5e94\u8be5\u77e5\u9053\u7684\u66f4\u591a", "url": "https://news.cnblogs.com/n/610695/"}, {"Title": "\u591a\u4efb\u52a1\u53ef\u80fd\u4f1a\u6076\u5316\u8bb0\u5fc6", "url": "https://news.cnblogs.com/n/610688/"}, {"Title": 
"GitHub\u65b0\u4efbCEO\u8c08\u5fae\u8f6f\u6536\u8d2d\uff1a\u4fdd\u7559GitHub\u4ef7\u503c\u89c2", "url": "https://news.cnblogs.com/n/610626/"}, {"Title": "\u4e3a\u4ec0\u4e48\u5f88\u591a\u70ab\u9177\u7684\u4ea7\u54c1\u5e76\u6ca1\u6709\u6d41\u884c\u8d77\u6765\uff1f", "url": "https://news.cnblogs.com/n/610625/"}, {"Title": "\u201cRNG\u8f93\u4e86\u201d\uff0c\u4e3a\u4f55\u80fd\u5728\u793e\u4ea4\u5708\u5f15\u8d77\u5927\u9707\u52a8\uff1f", "url": "https://news.cnblogs.com/n/610584/"}, {"Title": "\u4eba\u7c7b\u7ec6\u80de\u53ef\u5236\u9020\u8ba1\u7b97\u673a\u82af\u7247\uff0c\u66f4\u5c0f\u8fd0\u884c\u901f\u5ea6\u66f4\u5feb\uff01", "url": "https://news.cnblogs.com/n/610552/"}, {"Title": "10\u5c81\u5973\u5b69\u5f00\u53d1\u4e16\u754c\u9996\u6b3eAI\u684c\u6e38\uff0c13\u5c81\u5c11\u5e74\u7528AI\u68c0\u6d4b\u80f0\u817a", "url": "https://news.cnblogs.com/n/610482/"}, {"Title": "\u673a\u5668\u4eba\u7a0b\u5e8f\u50cf\u4eba\u7c7b\u7a0b\u5e8f\u5458\u4e00\u6837\u4fee bug", "url": "https://news.cnblogs.com/n/610505/"}, {"Title": "\u4eba\u7c7b\u7caa\u4fbf\u91cc\u53d1\u73b0\u591a\u79cd\u5851\u6599\uff01\u767d\u8272\u6c61\u67d3\u7ec8\u8ba9\u4eba\u7c7b\u81ea\u98df\u5176\u679c", "url": "https://news.cnblogs.com/n/610466/"}]
2 |
--------------------------------------------------------------------------------
/Data/img/1540974405032.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Liangchengdeye/Requests_Html_Spider/3ea30f8f16f88e8f2934b411fa720c76ad054482/Data/img/1540974405032.jpg
--------------------------------------------------------------------------------
/Data/img/1540974407587.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Liangchengdeye/Requests_Html_Spider/3ea30f8f16f88e8f2934b411fa720c76ad054482/Data/img/1540974407587.jpg
--------------------------------------------------------------------------------
/Data/img/1540974408414.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Liangchengdeye/Requests_Html_Spider/3ea30f8f16f88e8f2934b411fa720c76ad054482/Data/img/1540974408414.jpg
--------------------------------------------------------------------------------
/Data/jianshu.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Liangchengdeye/Requests_Html_Spider/3ea30f8f16f88e8f2934b411fa720c76ad054482/Data/jianshu.csv
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # requests升级版requests-html 爬虫编写及通用爬虫模块搭建
2 | ***
3 | #### 安装: pip install requests-html
4 | #### 中文文档:https://cncert.github.io/requests-html-doc-cn/#/
5 | # 搭建常用通用爬虫各组件
6 | ## 简介:
7 | - 1、 爬虫模块编写,支持pyquery、xpath、JavaScript、beautifulsoup、正则等多种解析模式,使用请查看上面中文文档;
8 | - 2、 支持抓取各类日志保存,抓取日志、错误日志等各类日志信息;
9 | - 3、 抓取起始链接可来自于Redis,只需提供Redis-key信息,不用额外编写;
10 | - 4、 抓取信息持久化支持CSV、JSON、MYSQL、REDIS、KAFKA、MONGODB等几大类常用持久化工具;
11 | - 5、 该框架主要是几大模块的组合,至于爬虫逻辑的实现,根据个人需求。
12 | ## 文件树:
13 | |-Requests_Html\_Spider |--目录文件
14 | |--BaseFile |--基础配置
15 | |---GetLocalFile.py |--读取本地文件,如URL
16 | |---GetProxyIp.py |--获取代理IP
17 | |---Logger.py |--配置logging日志
18 | |---ReadConfig.py |--读取配置文件
20 | |---UserAgent.py |--轮换请求头
22 | |--Common |--公共操作类
23 | |---CsvHelper.py |--操作CSV文件
24 | |---JsonHelper.py |--操作JSON文件
25 | |---KafkaHelper.py |--操作KAFKA文件
26 | |---MongoHelper.py |--操作MONGODB文件
27 | |---MySqlHelper.py |--操作MYSQL文件
28 | |---RedisHelper.py |--操作REDIS文件
29 | |--Config |--配置信息
30 | |---HEADERS.py |--配置请求头
31 | |---KAFKA |--KAFKA配置
32 | |---MONGODB |--MONGODB配置
33 | |---MYSQL |--MYSQL配置
34 | |---PROXYIP |--代理IP配置
35 | |---REDIS |--REDIS配置
36 | |--Data |--文件存储目录
37 | |--Logs |--Logs日志存储目录
38 | |--Spider |--爬虫类
39 | |---request\_html\_demo\_1.py |--简书python爬虫教程抓取
40 | |---request\_html\_demo\_2.py |--爬取博客园新闻
41 | |---request\_html\_demo\_3.py |--爬取电脑高清壁纸库
42 | ## 说明: 本框架主要是爬虫基本常用模块的组合,避免了日常爬虫编写中各类组件的重复编写,同时结合requests-html使得编写更为简便。其中requests-html是requests的原作者专门针对爬虫编写的一个新模块,目前仍在不断更新中,[官方-github](https://github.com/kennethreitz/requests-html)
43 | #### Only Python 3.6 is supported.
44 |
--------------------------------------------------------------------------------
/Spider/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: __init__.py.py
10 | @time: 2018/9/25 17:24
11 | @describe:
12 | """
import os
import sys

# Extend sys.path with the project root (parent of this package directory)
# *before* importing project packages, so that `import BaseFile` also resolves
# when this package is loaded from a working directory other than the root.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)))
sys.path.append("..")

import BaseFile  # noqa: E402  (must follow the sys.path setup above)
--------------------------------------------------------------------------------
/Spider/request_html_demo_1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: request_html_demo_1.py
10 | @time: 2018/9/25 17:34
11 | @describe: 使用requests-html爬虫模块抓取~简书爬虫页面
12 | 中文文档:https://cncert.github.io/requests-html-doc-cn/#/
13 | """
14 | import json
15 | import logging
16 | import sys
17 | import os
18 | import requests
19 | import time
20 | # 导入requests_html包
21 | from requests_html import HTMLSession
# Put the project root (parent of this script's directory) on sys.path so the
# BaseFile / Config / Common packages can be imported below. os.path.join keeps
# the path portable; a literal '\\' separator is only understood on Windows.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)))
sys.path.append("..")
24 | from BaseFile.Logger import Logger
25 | # 导入读取列表文件
26 | from BaseFile.GetLocalFile import GetLocalFile as GF
27 | from BaseFile.GetProxyIp import GetProxyIp as GP
28 | from Config.HEADERS import HEADERS as HS
29 | from Common.CsvHelper import CsvHelper as CV
30 | from Common.JsonHelper import JsonHelper as JS
31 |
# Module-level logger created with the name 'jianshu.log' and two logging levels.
# NOTE(review): the two level arguments are presumably the console and file
# thresholds -- confirm against BaseFile.Logger.
logger = Logger('jianshu.log', logging.WARNING, logging.DEBUG)
# Instantiate the requests_html session used by the fetch functions in this file
session = HTMLSession()
35 |
36 |
# Basic version -- scrape a Jianshu "python crawler tutorial" listing page
def get_jianshu(base_url):
    """Fetch one Jianshu listing page and print every article title and link.

    Args:
        base_url: Full URL of the listing page to request.

    Returns:
        None. Results are printed; any failure is printed and passed to the
        module logger instead of being raised.
    """
    try:
        htmlSource = session.get(base_url, headers=HS['headersJianShun'])
        print("请求状态码:", htmlSource.status_code)
        liText = htmlSource.html.find("#list-container > ul > li > div > a")
        for i in liText:
            print(i.text)
            # An anchor may expose no absolute link; skip it rather than let an
            # IndexError abort the remaining entries of the page.
            link = next(iter(i.absolute_links), None)
            if link is None:
                continue
            print(link)
    except Exception as e:
        print("请求错误!", e)
        logger.error(e)
50 |
# Proxy-IP version
def get_jianshu_ip(base_url):
    """Fetch one Jianshu listing page through a proxy and print titles and links.

    The proxy mapping comes from ``GetProxyIp().get_IP()`` and the request is
    sent with certificate verification disabled (``verify=False``).

    Args:
        base_url: Full URL of the listing page to request.

    Returns:
        None. Results are printed; any failure is printed and passed to the
        module logger instead of being raised.
    """
    try:
        proxies = GP().get_IP()
        print(proxies)
        htmlSource = session.get(base_url, headers=HS['headersJianShun'], proxies=proxies, verify=False)
        print("请求状态码:", htmlSource.status_code)
        liText = htmlSource.html.find("#list-container > ul > li > div > a")
        for i in liText:
            print(i.text)
            # An anchor may expose no absolute link; skip it rather than let an
            # IndexError abort the remaining entries of the page.
            link = next(iter(i.absolute_links), None)
            if link is None:
                continue
            print(link)
    except Exception as e:
        print("请求错误!", e)
        logger.error(e)
66 |
# CSV-export version
def get_jianshu_fire(base_url):
    """Fetch one Jianshu listing page and write title/link rows to jianshu.csv.

    A header row ("标题", "URL") is written first, then one row per article.

    Args:
        base_url: Full URL of the listing page to request.

    Returns:
        None. Rows are printed and written through CsvHelper; any failure is
        printed and passed to the module logger instead of being raised.
    """
    try:
        htmlSource = session.get(base_url, headers=HS['headersJianShun'])
        print("请求状态码:", htmlSource.status_code)
        liText = htmlSource.html.find("#list-container > ul > li > div > a")
        message = "标题", "URL"
        CV().csv_write("jianshu.csv", message)
        for i in liText:
            title = i.text
            # An anchor may expose no absolute link; skip it rather than let an
            # IndexError abort the remaining rows of the page.
            link = next(iter(i.absolute_links), None)
            if link is None:
                continue
            print(title)
            print(link)
            # One CSV row per article: (title, link)
            message = title, link
            CV().csv_write("jianshu.csv", message)
    except Exception as e:
        print("请求错误!", e)
        # Log the failure like the sibling fetch functions do
        logger.error(e)
86 |
if __name__ == '__main__':
    base_url = "https://www.jianshu.com/c/a480500350e7?order_by=added_at&page="
    # Example invocations (uncomment the variant to run):
    #   get_jianshu(base_url + str(1))        # basic version
    #   get_jianshu_ip(base_url + str(2))     # proxy-IP version
    #   get_jianshu_fire(base_url + str(3))   # CSV-export version

    # Exercise JsonHelper.json_write with a small list of records.
    # Shape of a richer payload, for reference:
    #   {"001": [{"title": "1111"}, {"title": "1111"}, {"title": "1111"}], "002": [{"title": "2222"}]}
    records = [{"title": 1111} for _ in range(2)]
    print(records)
    JS().json_write("test.json", records)
--------------------------------------------------------------------------------
/Spider/request_html_demo_2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: request_html_demo_2.py.py
10 | @time: 2018/10/31 15:34
11 | @describe: 获取博客园新闻
12 | https://news.cnblogs.com/n/recommend
13 | """
14 | import sys
15 | import os
16 |
17 | from requests_html import HTMLSession
18 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
19 | sys.path.append("..")
20 | from Common.JsonHelper import JsonHelper as JS
# requests_html session used by get_cnblog() for its page request
session = HTMLSession()
22 |
23 |
def get_cnblog(url):
    """Scrape news titles and links from a cnblogs news page and save them as JSON.

    Args:
        url: Address of the news listing page to fetch.

    Returns:
        None. Every title/link pair is printed, and the collected list of
        ``{"Title": ..., "url": ...}`` records is written to "cnblogs.json"
        through JsonHelper.
    """
    r = session.get(url)
    json_list = []
    # Locate the news headline anchors with a CSS selector
    for new in r.html.find('h2.news_entry > a'):
        title = new.text
        # Skip an anchor without an absolute link instead of raising IndexError
        # and losing every record collected so far.
        link = next(iter(new.absolute_links), None)
        if link is None:
            continue
        print(title)  # news title
        print(link)  # news link
        json_list.append({"Title": title, "url": link})
    JS().json_write("cnblogs.json", json_list)
36 |
37 |
if __name__ == '__main__':
    # Script entry point: scrape the cnblogs "recommended news" listing
    get_cnblog("https://news.cnblogs.com/n/recommend")
--------------------------------------------------------------------------------
/Spider/request_html_demo_3.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | @version: v1.0
5 | @author: W_H_J
6 | @license: Apache Licence
7 | @contact: 415900617@qq.com
8 | @software: PyCharm
9 | @file: request_html_demo_3.py
10 | @time: 2018/10/31 16:18
11 | @describe: 最高清壁纸库-桌面下载
12 | requests_html 解析方式
13 | """
14 | import logging
15 | import sys
16 | import os
17 | sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
18 | sys.path.append("..")
19 | from BaseFile.Logger import Logger
20 | from requests_html import HTMLSession
21 | import requests
22 | import time
# Module-level logger created with the name 'img.log' and two logging levels.
# NOTE(review): the two level arguments are presumably the console and file
# thresholds -- confirm against BaseFile.Logger.
logger = Logger('img.log', logging.WARNING, logging.DEBUG)
# requests_html session used for the list and detail page requests
session = HTMLSession()
# Running count of download attempts, incremented by save_image()
i = 0
26 |
27 |
# Parse the gallery list page
def get_girl_list():
    """Fetch the wallpaper topic page and process every gallery linked from it.

    Each ``<li>`` inside ``div.Left_bar`` is expected to hold an anchor whose
    ``href`` is handed to ``get_girl_detail``.

    Returns:
        None. If the list container is missing, the problem is logged and the
        function returns without processing anything.
    """
    response = session.get('http://www.win4000.com/zt/xinggan.html')
    content = response.html.find('div.Left_bar', first=True)
    if content is None:
        # find(..., first=True) gives None when nothing matches the selector
        logger.error("gallery list container 'div.Left_bar' not found")
        return
    for li in content.find('li'):
        anchor = li.find('a', first=True)
        url = anchor.attrs.get('href') if anchor is not None else None
        if not url:
            # List item without a usable link: nothing to follow
            continue
        get_girl_detail(url)
37 |
38 |
# Parse a gallery detail page
def _full_size_url(img_url):
    """Return the full-size image URL for a thumbnail URL.

    Everything from the first underscore of the file name onward is replaced
    by ``.jpg``. A URL whose file name has no underscore is returned unchanged.
    """
    head, sep, name = img_url.rpartition('/')
    cut = name.find('_')
    if cut == -1:
        # No suffix to strip; slicing with -1 would chop the last character
        return img_url
    return head + sep + name[:cut] + '.jpg'


def get_girl_detail(url):
    """Fetch one gallery page and download every image listed in it.

    Args:
        url: Address of the gallery detail page.

    Returns:
        None. Each derived image URL is printed and passed to ``save_image``.
        If the image strip is missing, the problem is logged and the function
        returns without downloading anything.
    """
    response = session.get(url)
    content = response.html.find('div.scroll-img-cont', first=True)
    if content is None:
        # find(..., first=True) gives None when nothing matches the selector
        logger.error("image container 'div.scroll-img-cont' not found: %s" % url)
        return
    for li in content.find('li'):
        img = li.find('img', first=True)
        thumb_url = img.attrs.get('data-original') if img is not None else None
        if not thumb_url:
            # Item without a thumbnail source: nothing to download
            continue
        img_url = _full_size_url(thumb_url)
        print(img_url)
        save_image(img_url)
51 |
# Save the full-size image
def save_image(img_url):
    """Download one image and store it as ../Data/img/<millisecond timestamp>.jpg.

    Args:
        img_url: URL of the image to download.

    Returns:
        None. The module-level counter ``i`` is incremented per attempt. Any
        failure (network error, HTTP error status, file error) is printed and
        passed to the module logger instead of being raised.
    """
    global i
    try:
        i += 1
        print("=="*10 + ">>", i, "==>img")
        # Bound the wait so one stalled download cannot hang the whole run
        img_response = requests.get(img_url, timeout=30)
        # Do not store an error page under a .jpg name
        img_response.raise_for_status()
        os.makedirs('../Data/img', exist_ok=True)
        t = int(round(time.time() * 1000))  # millisecond timestamp
        img_path = '../Data/img/%d.jpg' % t
        # Two downloads within the same millisecond must not share a file
        while os.path.exists(img_path):
            t += 1
            img_path = '../Data/img/%d.jpg' % t
        # Binary mode for media; the context manager closes the file on errors too
        with open(img_path, 'wb') as f:
            f.write(img_response.content)
    except Exception as e:
        print("Downloads error:", e)
        logger.error(e)
66 |
67 |
if __name__ == '__main__':
    # Script entry point: announce, pause, download everything, then wait for
    # the user before exiting.
    print("Downloads img start, Please Don't close the window!")
    time.sleep(10)
    get_girl_list()
    print("Dolnloads img successful, Please to see /Data/img")
    a = input("please input q to close the windows!")
    if a == 'q':
        # Terminate explicitly; os.close() is for file descriptors and cannot
        # be used to end the process.
        sys.exit(0)
--------------------------------------------------------------------------------