\n(.*?) 上映')
37 | findscore = re.compile(
38 | r'\n(.*?)')
39 | html = BeautifulSoup(html, 'lxml')
40 | data = []
41 | for item in html.findAll('div', class_='el-card__body'):
42 | item = str(item)
43 | movie = []
44 | chinese_name = re.findall(findname, item)[0][0]
45 | english_name = re.findall(findname, item)[0][1]
46 | type = ''
47 | for i in re.findall(findtype, item):
48 | type += i + ' '
49 | country = re.findall(findinfo, item)[0][0]
50 | time = re.findall(findinfo, item)[0][1]
51 | published = re.findall(findpublished, item)
52 | if len(published) == 1:
53 | published = published[0]
54 | else:
55 | published = None
56 | score = re.findall(findscore, item)[0].strip()
57 | movie.append(chinese_name)
58 | movie.append(english_name)
59 | movie.append(type.strip())
60 | movie.append(country)
61 | movie.append(time)
62 | movie.append(published)
63 | movie.append(score)
64 | data.append(movie)
65 | return data
66 |
67 |
68 | def save_data(data):
69 | sqlite = sqlite3.connect('网络爬虫数据库.db')
70 | cursor = sqlite.cursor()
71 | sql = '''create table 模拟登陆网站爬虫
72 | ('中文名称' text primary key not null,
73 | '其他名称' text not null,
74 | '类型' text not null,
75 | '国家' text not null,
76 | '时长' text not null,
77 | '上映' date,
78 | '评分' text)'''
79 | try:
80 | cursor.execute(sql)
81 | sqlite.commit()
82 | except sqlite3.OperationalError:
83 | print('数据表已存在')
84 | for item in data:
85 | for index in range(len(item)):
86 | item[index] = '"' + str(item[index]) + '"'
87 | sql = '''insert into 模拟登陆网站爬虫
88 | values(%s)''' % ', '.join(item)
89 | cursor.execute(sql)
90 | sqlite.commit()
91 | sqlite.close()
92 |
93 |
94 | def main():
95 | url = 'https://login3.scrape.center/api/login'
96 | """
97 | 对接 JWT 模拟登录方式,适合用作 JWT 模拟登录练习。
98 | 代码测试时间:
99 | """
100 | start = time.time()
101 | html = get_html(url)
102 | # data = get_data(html)
103 | # save_data(data)
104 | print('运行时间:{:.6f}'.format(time.time() - start))
105 |
106 |
107 | if __name__ == '__main__':
108 | """网站异常,无法测试"""
109 | pass
110 | # main()
111 |
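上面的 save_data() 先给每个字段手工加引号,再用 % 拼接 INSERT 语句,字段里一旦含有引号或特殊字符就会出错。下面是一个使用 sqlite3 参数占位符的替代写法示意(表名、字段沿用上文,假设 data 即 get_data() 的返回值):

    def save_data_safe(data):
        sqlite = sqlite3.connect('网络爬虫数据库.db')
        cursor = sqlite.cursor()
        # ? 占位符由 sqlite3 负责转义,无需手工拼接 SQL 字符串
        cursor.executemany(
            'insert into 模拟登陆网站爬虫 values(?, ?, ?, ?, ?, ?, ?)',
            data)
        sqlite.commit()
        sqlite.close()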
--------------------------------------------------------------------------------
/代码示例/JA3指纹.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import requests
4 | from fake_useragent import FakeUserAgent
5 |
6 | ua = [FakeUserAgent().chrome for _ in range(10)]
7 | for i in ua:
8 | response = requests.get(
9 | 'https://ja3er.com/json',
10 | headers={
11 | 'User-Agent': i})
12 | print(response.json())
13 | time.sleep(1)
14 |
--------------------------------------------------------------------------------
/代码示例/JA3指纹破解.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | import requests
4 | from requests.adapters import HTTPAdapter
5 | from requests.packages.urllib3.util.ssl_ import create_urllib3_context
6 |
7 | ORIGIN_CIPHERS = (
8 | 'ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
9 | 'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')
10 |
11 |
12 | class DESAdapter(HTTPAdapter):
13 | def __init__(self, *args, **kwargs):
14 | """
15 | A TransportAdapter that re-enables 3DES support in Requests.
16 | """
17 | CIPHERS = ORIGIN_CIPHERS.split(':')
18 | random.shuffle(CIPHERS)
19 | CIPHERS = ':'.join(CIPHERS)
20 | self.CIPHERS = CIPHERS + ':!aNULL:!eNULL:!MD5'
21 | super().__init__(*args, **kwargs)
22 |
23 | def init_poolmanager(self, *args, **kwargs):
24 | context = create_urllib3_context(ciphers=self.CIPHERS)
25 | kwargs['ssl_context'] = context
26 | return super(DESAdapter, self).init_poolmanager(*args, **kwargs)
27 |
28 | def proxy_manager_for(self, *args, **kwargs):
29 | context = create_urllib3_context(ciphers=self.CIPHERS)
30 | kwargs['ssl_context'] = context
31 | return super(DESAdapter, self).proxy_manager_for(*args, **kwargs)
32 |
33 |
34 | headers = {
35 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67'}
36 | s = requests.Session()
37 | s.headers.update(headers)
38 |
39 | for _ in range(10):
40 | s.mount('https://ja3er.com', DESAdapter())
41 | resp = s.get('https://ja3er.com/json').json()
42 | print(resp)
43 |
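每个 DESAdapter 实例在构造时都会随机打乱密码套件顺序,TLS ClientHello 随之变化,JA3 指纹也就不同。下面的小片段不发网络请求,只对比两次实例化得到的套件串,用来直观确认顺序确实在变(仅作演示;另外 ja3er.com 如今可能已停止服务,实测时可换用其它能回显 JA3 的站点):

    a1 = DESAdapter()
    a2 = DESAdapter()
    print(a1.CIPHERS == a2.CIPHERS)   # 几乎总是 False:两个适配器的套件顺序不同
    print(a1.CIPHERS.split(':')[:3])  # 查看打乱后排在前面的几个套件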
--------------------------------------------------------------------------------
/代码示例/URL参数修改.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
2 |
3 |
4 | def replace_field(url, name, value):
5 | parse = urlparse(url) # 把网址转成 ParseResult 对象
6 | query = parse.query # ParseResult 对象的.query 属性,是一个字符串,也就是网址中,问号后面的内容
7 | query_pair = parse_qs(query) # 把 .query 输出的字符串转成字典
8 | query_pair[name] = value # 修改值
9 | new_query = urlencode(query_pair, doseq=True)  # 把字典转回 query 形式的字符串
10 | new_parse = parse._replace(query=new_query)
11 | return urlunparse(new_parse) # 把ParseResult对象转回网址字符串
12 |
13 |
14 | url_list = [
15 | 'https://xxx.com/articlelist?category=technology',
16 | 'https://xxx.com/articlelist?category=technology&after=',
17 | 'https://xxx.com/articlelist?category=technology&after=asdrtJKSAZFD',
18 | 'https://xxx.com/articlelist?category=technology&after=asdrtJKSAZFD&other=abc'
19 | ]
20 |
21 | for url in url_list:
22 | next_page = replace_field(url, 'after', '0000000')
23 | print(next_page)
24 |
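replace_field() 的关键在于 parse_qs 把查询串解析成"参数名 → 值列表"的字典,改完值后再用 urlencode(doseq=True) 拼回查询串。按上面的循环,四个网址的输出大致如下(键的先后顺序由解析结果决定,可能与原网址略有出入):

    https://xxx.com/articlelist?category=technology&after=0000000
    https://xxx.com/articlelist?category=technology&after=0000000
    https://xxx.com/articlelist?category=technology&after=0000000
    https://xxx.com/articlelist?category=technology&after=0000000&other=abc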
--------------------------------------------------------------------------------
/代码示例/分布式爬虫示例/加入任务队列.py:
--------------------------------------------------------------------------------
1 | import parsel
2 | import requests
3 | from fake_useragent import FakeUserAgent
4 | from redis import Redis
5 |
6 |
7 | def push_redis_list(text):
8 | redis = Redis(host='127.0.0.1', port=6379, password='')
9 | for item in text:
10 | redis.lpush('标题', item)
11 |
12 |
13 | def get_url():
14 | header = {'user-agent': FakeUserAgent().chrome}
15 | response = requests.get('https://www.baidu.com/', headers=header)
16 | code = response.encoding
17 | html = parsel.Selector(text=response.content.decode(code))
18 | return [
19 | item
20 | for item in html.xpath(
21 | '//ul[@class="s-hotsearch-content"]/li/a/span[2]'
22 | )
23 | .css('::text')
24 | .getall()
25 | ]
26 |
27 |
28 | if __name__ == '__main__':
29 | push_redis_list(get_url())
30 |
--------------------------------------------------------------------------------
/代码示例/分布式爬虫示例/读取任务队列.py:
--------------------------------------------------------------------------------
1 | from redis import Redis
2 |
3 |
4 | def get_redis_list():
5 | redis = Redis(host='127.0.0.1', port=6379, password='')
6 | while True:
7 | text = redis.lpop('标题')
8 | if text is None:
9 | break
10 | else:
11 | print(text.decode('utf-8'))
12 |
13 |
14 | if __name__ == '__main__':
15 | get_redis_list()
16 |
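上面的消费端用 lpop 轮询,队列取空就直接退出;多台机器分布式消费时,更常见的做法是阻塞式弹出,worker 空闲时挂起等待新任务。下面是一个基于 redis-py brpop 的示意(键名沿用上文的"标题",30 秒超时为假设值;lpush 搭配 brpop 即构成先进先出队列):

    from redis import Redis

    def worker():
        redis = Redis(host='127.0.0.1', port=6379, password='')
        while True:
            task = redis.brpop('标题', timeout=30)  # 阻塞等待新任务,最多 30 秒
            if task is None:  # 超时仍无任务则退出
                break
            _, value = task  # brpop 返回 (键名, 值) 二元组
            print(value.decode('utf-8'))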
--------------------------------------------------------------------------------
/代码示例/加密解密.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import hashlib
3 | import hmac
4 |
5 | import rsa
6 | from Crypto.Cipher import AES
7 | from Crypto.Cipher import DES3
8 | from pyDes import CBC
9 | from pyDes import PAD_PKCS5
10 | from pyDes import des
11 |
12 |
13 | class USE_AES:
14 | """
15 | AES
16 | 除了MODE_SIV模式key长度为:32, 48, or 64,
17 | 其余key长度为16, 24 or 32
18 | 详细见AES内部文档
19 | CBC模式传入iv参数
20 | 本例使用常用的ECB模式
21 | """
22 |
23 | def __init__(self, key):
24 | if len(key) > 32:
25 | key = key[:32]
26 | self.key = self.to_16(key)
27 |
28 | def to_16(self, key):
29 | """
30 | 转为16倍数的bytes数据
31 | :param key:
32 | :return:
33 | """
34 | key = bytes(key, encoding="utf8")
35 | while len(key) % 16 != 0:
36 | key += b'\0'
37 | return key # 返回bytes
38 |
39 | def aes(self):
40 | return AES.new(self.key, AES.MODE_ECB) # 初始化加密器
41 |
42 | def encrypt(self, text):
43 | aes = self.aes()
44 | return str(base64.encodebytes(aes.encrypt(self.to_16(text))),
45 | encoding='utf8').replace('\n', '') # 加密
46 |
47 | def decode_bytes(self, text):
48 | aes = self.aes()
49 | return str(aes.decrypt(base64.decodebytes(bytes(
50 | text, encoding='utf8'))).rstrip(b'\0').decode("utf8")) # 解密
51 |
52 |
53 | class USE_RSA:
54 | """
55 | 生成密钥可保存.pem格式文件
56 | 1024位的证书,加密时最大支持117个字节,解密时为128;
57 | 2048位的证书,加密时最大支持245个字节,解密时为256。
58 | 加密大文件时需要先用AES或者DES加密,再用RSA加密密钥,详细见文档
59 | 文档:https://stuvel.eu/files/python-rsa-doc/usage.html#generating-keys
60 | """
61 |
62 | def __init__(self, number=1024):
63 | """
64 | :param number: 密钥长度(单位:位),如 1024、2048
65 | """
66 | self.pubkey, self.privkey = rsa.newkeys(number)
67 |
68 | def rsaEncrypt(self, text):
69 | """
70 | :param text: str
71 | :return: bytes
72 | """
73 | content = text.encode('utf-8')
74 | return rsa.encrypt(content, self.pubkey)
75 |
76 | def rsaDecrypt(self, text):
77 | """
78 | :param text:bytes
79 | :return: str
80 | """
81 | content = rsa.decrypt(text, self.privkey)
82 | return content.decode('utf-8')
83 |
84 | def savePem(self, path_name, text):
85 | """
86 | :param path_name: 保存路径
87 | :param text: str
88 | :return:bytes
89 | """
90 | if "PEM" in path_name.upper():
91 | path_name = path_name[:-4]
92 | with open('{}.pem'.format(path_name), 'bw') as f:
93 | f.write(text.save_pkcs1())
94 |
95 | def readPem(self, path_name, key_type):
96 | """
97 | :param path_name: 密钥文件路径
98 | :param key_type: 'pubkey' 读取公钥,其余按私钥处理
99 | """
100 | key_data = open(path_name, 'rb').read()  # load_pkcs1 需要 PEM 文件内容而非路径
101 | if 'pubkey' in key_type:
102 | self.pubkey = rsa.PublicKey.load_pkcs1(key_data)
103 | else:
104 | self.privkey = rsa.PrivateKey.load_pkcs1(key_data)
105 | return True
106 |
107 | def sign(self, message, priv_key=None, hash_method='SHA-1'):
108 | """
109 | 生成明文的哈希签名以便还原后对照
110 | :param message: str
111 | :param priv_key:
112 | :param hash_method: 哈希的模式
113 | :return:
114 | """
115 | if priv_key is None:
116 | priv_key = self.privkey
117 | return rsa.sign(message.encode(), priv_key, hash_method)
118 |
119 | def checkSign(self, mess, result, pubkey=None):
120 | """
121 | 验证签名:传入解密后明文、签名、公钥,验证成功返回哈希方法,失败则报错
122 | :param mess: str
123 | :param result: bytes
124 | :param pubkey:
125 | :return: str
126 | """
127 | if pubkey is None:
128 | pubkey = self.pubkey
129 | try:
130 | result = rsa.verify(mess.encode(), result, pubkey)
131 | return result
132 | except BaseException:
133 | return False
134 |
135 |
136 | class USE_DES:
137 | """
138 | des(key,[mode], [IV], [pad], [pad mode])
139 | key:必须正好8字节
140 | mode(模式):ECB、CBC
141 | iv:CBC模式中必须提供长8字节
142 | pad:填充字符
143 | padmode:加密填充模式PAD_NORMAL or PAD_PKCS5
144 | """
145 |
146 | def __init__(self, key, iv):
147 | if not isinstance(key, bytes):
148 | key = bytes(key, encoding="utf8")
149 | if not isinstance(iv, bytes):
150 | iv = bytes(iv, encoding="utf8")
151 | self.key = key
152 | self.iv = iv
153 |
154 | def encrypt(self, text):
155 | """
156 | DES 加密
157 | :param text: 原始字符串
158 | :return: 加密后字符串,bytes
159 | """
160 | if not isinstance(text, bytes):
161 | text = bytes(text, "utf-8")
162 | secret_key = self.key
163 | iv = self.iv
164 | k = des(secret_key, CBC, iv, pad=None, padmode=PAD_PKCS5)
165 | return k.encrypt(text, padmode=PAD_PKCS5)
166 |
167 | def descrypt(self, text):
168 | """
169 | DES 解密
170 | :param text: 加密后的字符串,bytes
171 | :return: 解密后的字符串
172 | """
173 | secret_key = self.key
174 | iv = self.iv
175 | k = des(secret_key, CBC, iv, pad=None, padmode=PAD_PKCS5)
176 | de = k.decrypt(text, padmode=PAD_PKCS5)
177 | return de.decode()
178 |
179 |
180 | class USE_DES3:
181 | """
182 | new(key, mode, *args, **kwargs)
183 | key:长度必须为16或24字节
184 | mode:
185 | iv:初始化向量适用于MODE_CBC、MODE_CFB、MODE_OFB、MODE_OPENPGP,4种模式
186 | ``MODE_CBC``, ``MODE_CFB``, and ``MODE_OFB``长度为8bytes
187 | ```MODE_OPENPGP```加密时8bytes解密时10bytes
188 | 未提供默认随机生成
189 | nonce:仅在 ``MODE_EAX`` and ``MODE_CTR``模式中使用
190 | ``MODE_EAX``建议16bytes
191 | ``MODE_CTR``建议[0, 7]长度
192 | 未提供则随机生成
193 | segment_size:分段大小,仅在 ``MODE_CFB``模式中使用,长度为8倍数,未指定则默认为8
194 | mac_len: 适用``MODE_EAX``模式,身份验证标记的长度(字节),它不能超过8(默认值)
195 | initial_value:适用```MODE_CTR```,计数器的初始值计数器块。默认为**0**。
196 | """
197 |
198 | def __init__(self, key):
199 | self.key = key
200 | self.mode = DES3.MODE_ECB
201 |
202 | def encrypt(self, text):
203 | """
204 | 传入明文
205 | :param text:bytes类型,长度是KEY的倍数
206 | :return:
207 | """
208 | if not isinstance(text, bytes):
209 | text = bytes(text, 'utf-8')
210 | x = (8 - len(text) % 8) % 8  # 补齐到8字节的倍数
211 | text = text + b'\0' * x
212 | cryptor = DES3.new(self.key, self.mode)
213 | return cryptor.encrypt(text)
214 |
215 | def decrypt(self, text):
216 | cryptor = DES3.new(self.key, self.mode)
217 | plain_text = cryptor.decrypt(text)
218 | return str(plain_text.decode("utf-8")).rstrip('\0')
219 |
220 |
221 | def USE_MD5(test):
222 | if not isinstance(test, bytes):
223 | test = bytes(test, 'utf-8')
224 | m = hashlib.md5()
225 | m.update(test)
226 | return m.hexdigest()
227 |
228 |
229 | def USE_HMAC(key, text):
230 | if not isinstance(key, bytes):
231 | key = bytes(key, 'utf-8')
232 | if not isinstance(text, bytes):
233 | text = bytes(text, 'utf-8')
234 | h = hmac.new(key, text, digestmod='MD5')
235 | return h.hexdigest()
236 |
237 |
238 | def USE_SHA(text):
239 | if not isinstance(text, bytes):
240 | text = bytes(text, 'utf-8')
241 | sha = hashlib.sha1(text)
242 | return sha.hexdigest()
243 |
244 |
245 | if __name__ == '__main__':
246 | aes_test = USE_AES("secretKey")
247 | ciphertext = aes_test.encrypt("测试")
248 | plaintext = aes_test.decode_bytes(ciphertext)
249 | print(ciphertext, plaintext)
250 | rsa_test = USE_RSA()
251 | ciphertext = rsa_test.rsaEncrypt("测试加密")
252 | plaintext = rsa_test.rsaDecrypt(ciphertext)
253 | print(ciphertext, plaintext)
254 | des_test = USE_DES(b"12345678", b"12345678")
255 | ciphertext = des_test.encrypt("测试加密")
256 | plaintext = des_test.descrypt(ciphertext)
257 | print(ciphertext, plaintext)
258 | des3_test = USE_DES3(b"123456789qazxswe")
259 | ciphertext = des3_test.encrypt("测试加密")
260 | plaintext = des3_test.decrypt(ciphertext)
261 | print(ciphertext, plaintext)
262 | md5_test = USE_MD5("测试签名")
263 | hmac_test = USE_HMAC("123456", "测试")
264 | sha_test = USE_SHA("测试加密")
265 | print(md5_test, hmac_test, sha_test)
266 |
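USE_RSA 的注释里提到,大段数据应先用 AES/DES 等对称算法加密,再用 RSA 只加密对称密钥。下面给出一个沿用本文件 USE_AES、USE_RSA 的混合加密示意(session_key 为假设值):

    session_key = 'randomSessionKey'                # 假设的对称密钥
    aes = USE_AES(session_key)
    rsa_box = USE_RSA(1024)
    body_cipher = aes.encrypt('需要加密的大段正文内容')   # 1. 用 AES 加密正文
    key_cipher = rsa_box.rsaEncrypt(session_key)     # 2. 用 RSA 只加密对称密钥(1024 位证书不超过 117 字节)
    # 传输 body_cipher 与 key_cipher;接收方先还原对称密钥,再解密正文
    recovered_key = rsa_box.rsaDecrypt(key_cipher)
    print(USE_AES(recovered_key).decode_bytes(body_cipher))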
--------------------------------------------------------------------------------
/代码示例/重试装饰器示例.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 |
4 | def retry(func):
5 | max_retry = 10
6 |
7 | def run():
8 | for i in range(max_retry):
9 | status_code = func()
10 | if status_code == 200:
11 | return f'第{i + 1}次访问 ' + str(status_code) + ' 访问成功!'
12 | else:
13 | print(f'第{i + 1}次访问', status_code, '重试中!')
14 | return '访问失败!'
15 |
16 | return run
17 |
18 |
19 | @retry
20 | def requests():
21 | return random.choice([200, 404, 404, 404, 404])
22 |
23 |
24 | def common():
25 | pass
26 |
27 |
28 | def main():
29 | response = requests()
30 | print(response)
31 | print(requests)
32 | print(retry)
33 | print(common)
34 |
35 |
36 | if __name__ == '__main__':
37 | main()
38 |
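main() 里除了调用 requests(),还打印了几个函数对象,用来说明装饰器的效果:被 @retry 装饰后,名字 requests 实际指向 retry 内部定义的 run,而未装饰的 common 仍是普通函数。输出大致类似下面这样(重试次数随机、内存地址每次不同):

    第1次访问 404 重试中!
    第2次访问 200 访问成功!
    <function retry.<locals>.run at 0x...>
    <function retry at 0x...>
    <function common at 0x...>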
--------------------------------------------------------------------------------
/其他网络爬虫/GXNAS壁纸爬虫_1.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import os
3 | import random
4 | import re
5 | import time
6 |
7 | import requests
8 | import xlwt
9 | from bs4 import BeautifulSoup
10 | from pyppeteer import launch
11 |
12 |
13 | async def get_html(url, num):
14 | img = re.compile(r'![(.*)]()')
99 | if len(all_data) >= 100:
100 | print(len(all_data))
101 | break
102 | with open('data', 'wb') as f:
103 | pickle.dump(all_data, f)
104 |
105 |
106 | def get_info():
107 | headers = {
108 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q'
109 | '=0.8,application/signed-exchange;v=b3;q=0.9',
110 | 'accept-encoding': 'gzip, deflate, br',
111 | 'accept-language': 'zh-CN,zh;q=0.9',
112 | 'cache-control': 'no-cache',
113 | 'dnt': '1',
114 | 'pragma': 'no-cache',
115 | 'referer': 'https://www.mycancergenome.org/content/biomarkers/',
116 | 'sec-fetch-dest': 'document',
117 | 'sec-fetch-mode': 'navigate',
118 | 'sec-fetch-site': 'same-origin',
119 | 'sec-fetch-user': '?1',
120 | 'upgrade-insecure-requests': '1',
121 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/8'
122 | '7.0.4280.66 Safari/537.36'}
123 | path = os.getcwd() + '\\progress'
124 | if os.path.exists(path):
125 | with open('progress', 'rb') as f:
126 | progress = pickle.load(f)
127 | print('当前数据数量:', progress)
128 | else:
129 | progress = 0
130 | with open('data', 'rb') as f:
131 | data = pickle.load(f)
132 | client = MongoClient()
133 | db = client['癌症基因组数据库']
134 | collection = db['详细数据']
135 | z = 0
136 | for i in data[progress:]:
137 | """
138 | 部分基因无法访问详情页
139 | """
140 | if i[0] in ['MALT1']:
141 | continue
142 | if len(i) == 1:
143 | info = {'Drugs': None, 'Info': None, 'Name': i[0]}
144 | save_data(info, collection)
145 | progress += 1
146 | elif len(i) == 2 and i[1] == 0:
147 | info = {'Info': None, 'Name': i[0], 'Drugs': i[1]}
148 | save_data(info, collection)
149 | progress += 1
150 | elif len(i) == 2:
151 | try:
152 | z += 1
153 | cache = open_url_T_1(headers, i[0])
154 | if cache:
155 | info = {'Name': i[0], 'Drugs': i[1], 'Info': cache}
156 | save_data(info, collection)
157 | time.sleep(random.randrange(1, 4, 1) + random.random())
158 | else:
159 | print('访问超时或响应码错误,请重试')
160 | break
161 | except BaseException:
162 | time.sleep(random.randrange(1, 4, 1) + random.random())
163 | z += 1
164 | try:
165 | cache = open_url_T_2(headers, i[0])
166 | except ValueError:
167 | break
168 | if cache == 'None':
169 | info = {'Name': '', 'Drugs': '', 'Info': None}
170 | info['Name'] = i[0]
171 | info['Drugs'] = i[1]
172 | save_data(info, collection)
173 | else:
174 | if cache:
175 | info = {'Name': '', 'Drugs': '', 'Info': ''}
176 | info['Name'] = i[0]
177 | info['Drugs'] = i[1]
178 | info['Info'] = cache
179 | save_data(info, collection)
180 | else:
181 | print('访问超时或响应码错误,请重试')
182 | break
183 | progress += 1
184 | else:
185 | raise ValueError(i)
186 | # break # 调试使用
187 | """
188 | 启用此代码则每次运行发送500次请求后关闭程序
189 | 注释此代码则运行获取全部数据
190 | """
191 | if z >= 500:
192 | break
193 | with open('progress', 'wb') as f:
194 | pickle.dump(progress, f)
195 | print('已获取数据数量:', progress) # 调试使用
196 |
197 |
198 | def auto_name(name):
199 | name = name.replace(')',
200 | '').replace('*',
201 | '').replace('_',
202 | '-').replace('(',
203 | '-').replace(';',
204 | '-').replace(' ',
205 | '-').replace('--',
206 | '-').lower()
207 | return name
208 |
209 |
210 | def open_url_F(name):
211 | """
212 | 暂时不需要调用
213 | """
214 | headers = {
215 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q='
216 | '0.8,application/signed-exchange;v=b3;q=0.9',
217 | 'accept-encoding': 'gzip, deflate, br',
218 | 'accept-language': 'zh-CN,zh;q=0.9',
219 | 'cache-control': 'no-cache',
220 | 'dnt': '1',
221 | 'pragma': 'no-cache',
222 | 'referer': 'https://www.mycancergenome.org/content/biomarkers/',
223 | 'sec-fetch-dest': 'document',
224 | 'sec-fetch-mode': 'navigate',
225 | 'sec-fetch-site': 'same-origin',
226 | 'sec-fetch-user': '?1',
227 | 'upgrade-insecure-requests': '1',
228 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/8'
229 | '7.0.4280.66 Safari/537.36'}
230 | url = 'https://www.mycancergenome.org/content/alteration/{}/'
231 | response = requests.get(url.format(name), headers=headers)
232 | html = response.content.decode(response.encoding)
233 | soup = BeautifulSoup(html, 'lxml')
234 | num = soup.select(
235 | 'body > div.main-content > div.alteration-detail > div.row.header > div.small-12.medium-9.column'
236 | 's > div:nth-child(1) > div > p > a')[0].text
237 | print(soup.select('body > div.main-content > div.alteration-detail > div.row.header > div.small-12.medium-9.column'
238 | 's > div:nth-child(2) > div > p:nth-child({})'.format(int(num) + 1))[0].text.strip())
239 |
240 |
241 | def open_url_T_1(
242 | headers,
243 | name,
244 | url='https://www.mycancergenome.org/content/alteration/{}/'):
245 | try:
246 | response = requests.get(
247 | url.format(
248 | auto_name(name)),
249 | headers=headers,
250 | timeout=10)
251 | # print(response.url) # 调试代码
252 | # print(response.status_code)
253 | except BaseException:
254 | return None
255 | if response.status_code == 200:
256 | data = [name, response.url]
257 | html = response.content.decode(response.encoding)
258 | soup = BeautifulSoup(html, 'lxml')
259 | info = soup.select('div#therapies-toggle > p')
260 | content = ''.join(i.text for i in info)
261 | data.append(content.replace('\n', '').replace(' ', '').strip())
262 | num = soup.select('div#therapies-toggle > p:last-of-type > a')[0].text
263 | reference = soup.select(
264 | 'div.small-12.columns > p.reference')[int(num) - 1].text
265 | data.append(reference.replace('\n', '').replace(' ', '').strip())
266 | BDT = soup.select(
267 | 'div#therapies-toggle > div.about-alteration-therapy-row')
268 | data.append({})
269 | for a in BDT:
270 | title = a.select(
271 | 'p.about-alteration-therapy-header')[0].text.replace('+', '').strip()
272 | if title not in ['Bosutinib', 'Imatinib']:
273 | continue
274 | data[4][title] = []
275 | Bosutinib = a.select(
276 | 'div.about-alteration-therapy-content > div.about-alteration-therapy-disease-row')
277 | for b, c in enumerate(Bosutinib):
278 | data[4][title].append([])
279 | t1 = c.select(
280 | 'p.about-alteration-therapy-disease-header')[0].text.replace('-', '')
281 | data[4][title][b].append(
282 | t1.replace(
283 | '\n', '').replace(
284 | ' ', '').strip())
285 | t2 = c.select(
286 | 'div.row.table-row.targeted-therapy-table-small-screen-container')
287 | for f in t2:
288 | data[4][title][b].append([])
289 | t3 = f.select('div.small-12.columns.biomarker-criteria')
290 | data[4][title][b][-1].append(t3[0].text.replace(
291 | '\n', '').replace(' ', '').strip())
292 | row = f.select(
293 | 'div.small-12.columns.response-setting-note > div.row')
294 | data[4][title][b][-1].append([])
295 | for d, e in enumerate(row):
296 | data[4][title][b][-1][-1].append(
297 | e.text.replace('\n', '').replace(' ', '').strip())
298 | return data
299 | elif response.status_code == 404:
300 | raise ValueError
301 | else:
302 | print(response.url)
303 | return None
304 |
305 |
306 | def open_url_T_2(
307 | headers,
308 | name,
309 | url='https://www.mycancergenome.org/content/gene/{}/'):
310 | try:
311 | response = requests.get(
312 | url.format(
313 | auto_name(name)),
314 | headers=headers,
315 | timeout=10)
316 | # print(response.url) # 调试代码
317 | # print(response.status_code)
318 | except BaseException:
319 | return None
320 | if response.status_code == 200:
321 | data = [name, response.url]
322 | html = response.content.decode(response.encoding)
323 | soup = BeautifulSoup(html, 'lxml')
324 | info = soup.select('div#therapies-toggle > p')
325 | content = ''.join(i.text for i in info)
326 | data.append(content.replace('\n', '').replace(' ', '').strip())
327 | try:
328 | num = soup.select(
329 | 'div#therapies-toggle > p:last-of-type > a')[0].text
330 | except IndexError:
331 | return 'None'
332 | reference = soup.select(
333 | 'div.small-12.columns > p.reference')[int(num) - 1].text
334 | data.append(reference.replace('\n', '').replace(' ', '').strip())
335 | BDT = soup.select('div#therapies-toggle > div.about-gene-therapy-row')
336 | data.append({})
337 | for a in BDT:
338 | title = a.select(
339 | 'p.about-gene-therapy-header')[0].text.replace('+', '').strip()
340 | if title not in ['Bosutinib', 'Imatinib']:
341 | continue
342 | data[4][title] = []
343 | Bosutinib = a.select(
344 | 'div.about-gene-therapy-content > div.about-gene-therapy-disease-row')
345 | for b, c in enumerate(Bosutinib):
346 | data[4][title].append([])
347 | t1 = c.select(
348 | 'p.about-gene-therapy-disease-header')[0].text.replace('-', '')
349 | data[4][title][b].append(
350 | t1.replace(
351 | '\n', '').replace(
352 | ' ', '').strip())
353 | t2 = c.select(
354 | 'div.row.table-row.targeted-therapy-table-small-screen-container')
355 | for f in t2:
356 | data[4][title][b].append([])
357 | t3 = f.select('div.small-12.columns.biomarker-criteria')
358 | data[4][title][b][-1].append(t3[0].text.replace(
359 | '\n', '').replace(' ', '').strip())
360 | row = f.select(
361 | 'div.small-12.columns.response-setting-note > div.row')
362 | data[4][title][b][-1].append([])
363 | for d, e in enumerate(row):
364 | data[4][title][b][-1][-1].append(
365 | e.text.replace('\n', '').replace(' ', '').strip())
366 | return data
367 | elif response.status_code == 302:
368 | print(name, response.url)
369 | raise ValueError
370 | else:
371 | print(response.url)
372 | return None
373 |
374 |
375 | def save_data(data, collection):
376 | collection.insert_one(data)
377 |
378 |
379 | def main():
380 | hits = 16380 # 数据总数,手动修改
381 | get_data(hits)
382 | get_info()
383 | print('程序已结束')
384 |
385 |
386 | if __name__ == '__main__':
387 | main()
388 |
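这个爬虫靠 pickle 落盘 progress 实现断点续爬:每成功处理一条就把计数写回文件,下次启动时从 data[progress:] 继续。把这个模式单独抽出来大致是下面的样子(文件名沿用上文):

    import os
    import pickle

    def load_progress(path='progress'):
        # 有进度文件则从上次位置继续,否则从头开始
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return pickle.load(f)
        return 0

    def save_progress(progress, path='progress'):
        with open(path, 'wb') as f:
            pickle.dump(progress, f)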
--------------------------------------------------------------------------------
/其他网络爬虫/下厨房网爬虫.py:
--------------------------------------------------------------------------------
1 | import random
2 | import sqlite3
3 | import time
4 |
5 | from requests_html import HTMLSession
6 |
7 |
8 | def get_page_url():
9 | """返回每页的网址"""
10 | list_ = []
11 | base = 'http://www.xiachufang.com/explore/?page='
12 | for page in range(1, 2): # 测试时减少爬取量
13 | """生成 1 ~ 20 页的网页地址"""
14 | url = base + str(page)
15 | list_.append(url)
16 | return list_ # 返回 1 ~ 20 页的地址
17 |
18 |
19 | def get_url(list_):
20 | """获取每个菜品的详细链接"""
21 | item_url = []
22 | for page in list_: # 遍历每一页的网址
23 | html = open_url(page) # 请求网页
24 | time.sleep(random.randrange(4, 7, 1)) # 减慢爬取速度
25 | href = html.html.find(
26 | '.recipe.recipe-215-horizontal.pure-g.image-link.display-block > a') # 查找每一个菜品的详细链接
27 | for url in href:
28 | item_url.append(
29 | 'http://www.xiachufang.com' +
30 | url.attrs['href']) # 拼接每一个菜品的详细链接
31 | return item_url # 返回 1 ~ 20 页的全部菜品详细链接
32 |
33 |
34 | def open_url(url):
35 | session = HTMLSession()
36 | response = session.get(url) # 向网页发送请求
37 | return response # 返回网页响应
38 |
39 |
40 | def get_data(url_list):
41 | data = [] # 储存总数据
42 | for item in url_list[:3]: # 遍历每一个菜品的详细链接,切片减少爬取数据量
43 | cache = [] # 储存每个菜品的数据
44 | html = open_url(item) # 发送请求
45 | if str(html) == '<Response [200]>': # 判断是否请求成功
46 | time.sleep(random.randrange(4, 7, 1)) # 减慢爬取速度
47 | title = html.html.find('.page-title') # 匹配菜名
48 | if bool(title): # 判断匹配结果
49 | cache.append(title[0].text) # 添加有效数据
50 | else:
51 | print(item) # 输出检查以便判断问题
52 | cache.append('') # 添加空数据,避免直接报错
53 | recipeIngredient = html.html.find('.ings') # 匹配用料
54 | if bool(recipeIngredient): # 判断匹配结果
55 | cache.append(recipeIngredient[0].text) # 添加有效数据
56 | else:
57 | print(item) # 输出检查以便判断问题
58 | cache.append('') # 添加空数据,避免直接报错
59 | recipeInstructions = html.html.find('.steps p.text') # 匹配做法步骤
60 | if bool(recipeInstructions): # 判断匹配结果
61 | """这里匹配的结果是包含多项的列表,要先处理成单个字符串再添加有效数据"""
62 | steps = ''
63 | for i in range(len(recipeInstructions)):
64 | """遍历匹配结果,这个数据是做法步骤,不同菜品的步骤数不相等,通过遍历组成单个字符串"""
65 | steps += recipeInstructions[i].text
66 | cache.append(steps) # 添加有效数据
67 | else:
68 | print(item) # 输出检查以便判断问题
69 | cache.append('') # 添加空数据,避免直接报错
70 | image = html.html.find(
71 | 'div.cover.image.expandable.block-negative-margin > img') # 匹配效果图链接
72 | if bool(image): # 判断匹配结果
73 | cache.append(image[0].attrs['src']) # 添加有效数据
74 | else:
75 | print(item) # 输出检查以便判断问题
76 | cache.append('') # 添加空数据,避免直接报错
77 | url = html.html.find('link[rel=canonical]') # 匹配详细链接
78 | if bool(url): # 判断匹配结果
79 | cache.append(url[0].attrs['href']) # 添加有效数据
80 | else:
81 | print(item) # 输出检查以便判断问题
82 | cache.append('') # 添加空数据,避免直接报错
83 | data.append(cache) # 添加一个菜品的数据到总数据
84 | else:
85 | break # 请求失败说明被封IP,跳出循环,爬取结束
86 | return data # 返回总数据的列表
87 |
88 |
89 | def save_data(data):
90 | sqlite = sqlite3.connect('本周最受欢迎.db') # 连接数据库
91 | cursor = sqlite.cursor() # 获取数据库游标
92 | sql = '''create table 本周最受欢迎
93 | ('菜名' text,
94 | '用料' text,
95 | '做法' text,
96 | '效果图' text,
97 | '链接' text)''' # 创建表
98 | cursor.execute(sql) # 执行SQL语句
99 | sqlite.commit() # 提交更改
100 | for item in data: # 遍历保存数据
101 | for index in range(len(item)): # 数据预处理,SQL语句格式要求
102 | item[index] = '"' + str(item[index]) + '"'
103 | sql = '''insert into 本周最受欢迎
104 | values(%s)''' % ', '.join(item) # 插入数据
105 | cursor.execute(sql) # 执行SQL语句
106 | sqlite.commit() # 提交更改
107 | sqlite.close() # 关闭数据库
108 |
109 |
110 | def main():
111 | list_ = get_page_url() # 生成每一页的网址
112 | url = get_url(list_) # 获取 1 ~ 20 页的全部菜品详细链接
113 | data = get_data(url) # 提取数据
114 | save_data(data) # 保存数据
115 |
116 |
117 | if __name__ == '__main__':
118 | main()
119 |
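get_data() 里用 str(html) == '<Response [200]>' 判断请求是否成功;requests_html 的响应对象继承自 requests.Response,直接读 status_code 更直观,也不依赖字符串形式,示意如下:

    def is_ok(response):
        # 与 str(response) == '<Response [200]>' 等价
        return response.status_code == 200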
--------------------------------------------------------------------------------
/其他网络爬虫/北京市政务数据资源网爬虫.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import random
4 | import re
5 | import time
6 |
7 | import requests
8 | from bs4 import BeautifulSoup
9 |
10 |
11 | def request_data(start):
12 | url = 'https://data.beijing.gov.cn/search/1_file/elevate'
13 | header = {
14 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.41'
15 | '83.121 Safari/537.36'}
16 | data = {
17 | 'q': '_text_:*',
18 | 'wt': 'json',
19 | 'rows': '10',
20 | 'start': start,
21 | 'enableElevation': 'true',
22 | 'forceElevation': 'true',
23 | 'sort': 'publishDate desc',
24 | 'fl': '_uuid_,title,content,indexUrl,pubDateStr,pubDate,publishDate,publishDateStr,size,score,unitName,download'
25 | 'Count,callCount,imgsrc,[elevated],imgsrc',
26 | 'fq': '',
27 | }
28 | response = requests.post(url=url, headers=header, data=data)
29 | if response.status_code != 200:
30 | raise ValueError('请求项目列表失败')
31 | time.sleep(random.randrange(2, 5, 1))
32 | return response
33 |
34 |
35 | def get_data(key):
36 | start = 0
37 | data = request_data(start)
38 | data = data.content.decode('utf-8')
39 | data = json.loads(data)
40 | content_id = data['response']['docs']
41 | content_id_list = []
42 | for item in content_id:
43 | indexUrl = item['indexUrl']
44 | indexUrl = re.findall(re.compile(r'/([0-9]*?).htm'), indexUrl)
45 | if bool(indexUrl):
46 | content_id_list.append(indexUrl[0])
47 | else:
48 | raise ValueError('提取数据失败')
49 | break
50 | get_api_id(content_id_list, key)
51 | pages = data['response']['numFound']
52 | for page in range(pages // 10):
53 | start = (page + 1) * 10
54 | data = request_data(start)
55 | data = data.content.decode('utf-8')
56 | data = json.loads(data)
57 | content_id = data['response']['docs']
58 | content_id_list = []
59 | for item in content_id:
60 | indexUrl = item['indexUrl']
61 | indexUrl = re.findall(re.compile(r'/([0-9]*?).htm'), indexUrl)
62 | if bool(indexUrl):
63 | content_id_list.append(indexUrl[0])
64 | else:
65 | raise ValueError('提取数据失败')
66 | break
67 | get_api_id(content_id_list, key)
68 | break
69 |
70 |
71 | def get_download(id_list, key):
72 | api = 'http://data.beijing.gov.cn:80/cms/web/APIInterface/userApply.jsp?id={}&key={}'
73 | header = {
74 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.41'
75 | '83.121 Safari/537.36'}
76 | for item in id_list:
77 | response = requests.get(api.format(item, key), headers=header)
78 | if response.status_code == 200:
79 | try:
80 | data = json.loads(response.text)
81 | url = data['result']['address']
82 | name = data['result']['name']
83 | save_data(name, url)
84 | except BaseException:
85 | raise ValueError(response.text)
86 | else:
87 | raise ValueError('请求api失败')
88 |
89 |
90 | def save_data(name, url):
91 | root = os.getcwd() + '\\数据结果\\'
92 | path = root + name + '.' + url.split('.')[-1]
93 | if not os.path.exists(root):
94 | os.mkdir(root)
95 | if not os.path.exists(path):
96 | header = {
97 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.41'
98 | '83.121 Safari/537.36'}
99 | response = requests.get(url, headers=header)
100 | time.sleep(random.randrange(2, 5, 1))
101 | with open(path, 'wb') as data:
102 | data.write(response.content)
103 | data.close()
104 |
105 |
106 | def get_api_id(content_id, key):
107 | model = 'https://data.beijing.gov.cn/cms/web/APIInterface/dataDoc.jsp?contentID='
108 | header = {
109 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.41'
110 | '83.121 Safari/537.36'}
111 | for item in content_id:
112 | response = requests.get(url=model + item, headers=header)
113 | time.sleep(random.randrange(2, 5, 1))
114 | if response.status_code != 200:
115 | raise ValueError('请求详情页失败')
116 | soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml')
117 | id_list = [
118 | index.select('td:last-of-type')[0].text
119 | for index in soup.select(
120 | 'div.content-box.fn-clear > table.content-tab:first-of-type > tbody > tr'
121 | )
122 | ]
123 |
124 | if bool(id_list):
125 | get_download(id_list, key)
126 |
127 |
128 | def main():
129 | key = input('输入API唯一标识码(key):')
130 | get_data(key)
131 |
132 |
133 | if __name__ == '__main__':
134 | main()
135 |
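get_data() 按 Solr 风格的 start/rows 做偏移分页:第一次 start=0 取 10 条,再根据总数 numFound 依次偏移 10。把偏移序列的计算单独写出来大致如下(数值仅作举例):

    def page_offsets(num_found, rows=10):
        # 对应 get_data() 中的 start=0 以及 range(pages // 10) 里的 (page + 1) * 10
        return [0] + [(page + 1) * rows for page in range(num_found // rows)]

    print(page_offsets(35))  # [0, 10, 20, 30]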
--------------------------------------------------------------------------------
/其他网络爬虫/小米步数.py:
--------------------------------------------------------------------------------
1 | import random
2 | import re
3 | import requests
4 | import time
5 |
6 | now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
7 | headers = {
8 | 'User-Agent': 'Dalvik/2.1.0 (Linux; U; Android 9; MI 6 MIUI/20.6.18)'
9 | }
10 |
11 |
12 | # 获取登录code
13 | def get_code(location):
14 | code_pattern = re.compile("(?<=access=).*?(?=&)")
15 | return code_pattern.findall(location)[0]
16 |
17 |
18 | # 登录
19 | def login(user, password):
20 | url = f"https://api-user.huami.com/registrations/+86{user}/tokens"
21 | headers = {
22 | "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
23 | "User-Agent": "MiFit/4.6.0 (iPhone; iOS 14.0.1; Scale/2.00)"
24 | }
25 | data = {
26 | "client_id": "HuaMi",
27 | "password": f"{password}",
28 | "redirect_uri": "https://s3-us-west-2.amazonaws.com/hm-registration/successsignin.html",
29 | "token": "access"
30 | }
31 | response = requests.post(url, data=data, headers=headers, allow_redirects=False)
32 | location = response.headers["Location"]
33 | try:
34 | code = get_code(location)
35 | except:
36 | return 0, 0
37 | # print("access_code获取成功!")
38 | # print(code)
39 |
40 | url = "https://account.huami.com/v2/client/login"
41 | data = {
42 | "app_name": "com.xiaomi.hm.health",
43 | "app_version": "4.6.0",
44 | "code": f"{code}",
45 | "country_code": "CN",
46 | "device_id": "2C8B4939-0CCD-4E94-8CBA-CB8EA6E613A1",
47 | "device_model": "phone",
48 | "grant_type": "access_token",
49 | "third_name": "huami_phone",
50 | }
51 | response = requests.post(url, data=data, headers=headers).json()
52 | login_token = response["token_info"]["login_token"]
53 | # print("login_token获取成功!")
54 | # print(login_token)
55 | userid = response["token_info"]["user_id"]
56 | # print("userid获取成功!")
57 | # print(userid)
58 |
59 | return login_token, userid
60 |
61 |
62 | # 主函数
63 | def main(user, password, step):
64 | login_token, userid = login(user, password)
65 | if login_token == 0:
66 | return "login fail!"
67 |
68 | timestamp = str(time.time())[:-3].replace('.', '')
69 | sync_time = str(time.time())[:-6].replace('.', '')
70 |
71 | app_token = get_app_token(login_token)
72 |
73 | today = time.strftime("%F")
74 | data = '%5B%7B%22data_hr%22%3A%22%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F9L%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F' \
75 | '%5C%2F%5C%2F%5C%2F%5C%2F%5C%2FVv%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F0v%5C' \
76 | '%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F9e%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F0n%5C%2Fa' \
77 | '%5C%2F%5C%2F%5C%2FS%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F0b%5C%2F%5C%2F' \
78 | '%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F1FK%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F' \
79 | '%5C%2F%5C%2F%5C%2FR%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F' \
80 | '%5C%2F%5C%2F%5C%2F9PTFFpaf9L%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2FR%5C' \
81 | '%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F0j%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C' \
82 | '%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F9K%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C' \
83 | '%2FOv%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2Fzf%5C%2F%5C%2F%5C%2F86%5C%2Fzr%5C' \
84 | '%2FOv88%5C%2Fzf%5C%2FPf%5C%2F%5C%2F%5C%2F0v%5C%2FS%5C%2F8%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F' \
85 | '%5C%2F%5C%2F%5C%2F%5C%2F%5C%2FSf%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2Fz3%5C' \
86 | '%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F0r%5C%2FOv%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2FS%5C%2F9L%5C%2Fzb%5C' \
87 | '%2FSf9K%5C%2F0v%5C%2FRf9H%5C%2Fzj%5C%2FSf9K%5C%2F0%5C%2F%5C%2FN%5C%2F%5C%2F%5C%2F%5C%2F0D%5C%2FSf83%5C' \
88 | '%2Fzr%5C%2FPf9M%5C%2F0v%5C%2FOv9e%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C' \
89 | '%2FS%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2Fzv%5C%2F%5C%2Fz7%5C%2FO%5C' \
90 | '%2F83%5C%2Fzv%5C%2FN%5C%2F83%5C%2Fzr%5C%2FN%5C%2F86%5C%2Fz%5C%2F%5C%2FNv83%5C%2Fzn%5C%2FXv84%5C%2Fzr%5C' \
91 | '%2FPP84%5C%2Fzj%5C%2FN%5C%2F9e%5C%2Fzr%5C%2FN%5C%2F89%5C%2F03%5C%2FP%5C%2F89%5C%2Fz3%5C%2FQ%5C%2F9N%5C' \
92 | '%2F0v%5C%2FTv9C%5C%2F0H%5C%2FOf9D%5C%2Fzz%5C%2FOf88%5C%2Fz%5C%2F%5C%2FPP9A%5C%2Fzr%5C%2FN%5C%2F86%5C%2Fzz' \
93 | '%5C%2FNv87%5C%2F0D%5C%2FOv84%5C%2F0v%5C%2FO%5C%2F84%5C%2Fzf%5C%2FMP83%5C%2FzH%5C%2FNv83%5C%2Fzf%5C%2FN%5C' \
94 | '%2F84%5C%2Fzf%5C%2FOf82%5C%2Fzf%5C%2FOP83%5C%2Fzb%5C%2FMv81%5C%2FzX%5C%2FR%5C%2F9L%5C%2F0v%5C%2FO%5C%2F9I' \
95 | '%5C%2F0T%5C%2FS%5C%2F9A%5C%2Fzn%5C%2FPf89%5C%2Fzn%5C%2FNf9K%5C%2F07%5C%2FN%5C%2F83%5C%2Fzn%5C%2FNv83%5C' \
96 | '%2Fzv%5C%2FO%5C%2F9A%5C%2F0H%5C%2FOf8%5C%2F%5C%2Fzj%5C%2FPP83%5C%2Fzj%5C%2FS%5C%2F87%5C%2Fzj%5C%2FNv84%5C' \
97 | '%2Fzf%5C%2FOf83%5C%2Fzf%5C%2FOf83%5C%2Fzb%5C%2FNv9L%5C%2Fzj%5C%2FNv82%5C%2Fzb%5C%2FN%5C%2F85%5C%2Fzf%5C' \
98 | '%2FN%5C%2F9J%5C%2Fzf%5C%2FNv83%5C%2Fzj%5C%2FNv84%5C%2F0r%5C%2FSv83%5C%2Fzf%5C%2FMP%5C%2F%5C%2F%5C%2Fzb%5C' \
99 | '%2FMv82%5C%2Fzb%5C%2FOf85%5C%2Fz7%5C%2FNv8%5C%2F%5C%2F0r%5C%2FS%5C%2F85%5C%2F0H%5C%2FQP9B%5C%2F0D%5C' \
100 | '%2FNf89%5C%2Fzj%5C%2FOv83%5C%2Fzv%5C%2FNv8%5C%2F%5C%2F0f%5C%2FSv9O%5C%2F0ZeXv%5C%2F%5C%2F%5C%2F%5C%2F%5C' \
101 | '%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F1X%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C' \
102 | '%2F9B%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2FTP%5C%2F%5C%2F%5C%2F1b%5C%2F' \
103 | '%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F0%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F9N' \
104 | '%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2F%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B' \
105 | '%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
106 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
107 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
108 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
109 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
110 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
111 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
112 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
113 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
114 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
115 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
116 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
117 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
118 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
119 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
120 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
121 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
122 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
123 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
124 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
125 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
126 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
127 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
128 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
129 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C' \
130 | '%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7' \
131 | '%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%5C%2Fv7%2B%22' \
132 | '%2C%22date%22%3A%222021-08-07%22%2C%22data%22%3A%5B%7B%22start%22%3A0%2C%22stop%22%3A1439%2C%22value%22' \
133 | '%3A%22UA8AUBQAUAwAUBoAUAEAYCcAUBkAUB4AUBgAUCAAUAEAUBkAUAwAYAsAYB8AYB0AYBgAYCoAYBgAYB4AUCcAUBsAUB8AUBwAUBIAYBkAYB8AUBoAUBMAUCEAUCIAYBYAUBwAUCAAUBgAUCAAUBcAYBsAYCUAATIPYD0KECQAYDMAYB0AYAsAYCAAYDwAYCIAYB0AYBcAYCQAYB0AYBAAYCMAYAoAYCIAYCEAYCYAYBsAYBUAYAYAYCIAYCMAUB0AUCAAUBYAUCoAUBEAUC8AUB0AUBYAUDMAUDoAUBkAUC0AUBQAUBwAUA0AUBsAUAoAUCEAUBYAUAwAUB4AUAwAUCcAUCYAUCwKYDUAAUUlEC8IYEMAYEgAYDoAYBAAUAMAUBkAWgAAWgAAWgAAWgAAWgAAUAgAWgAAUBAAUAQAUA4AUA8AUAkAUAIAUAYAUAcAUAIAWgAAUAQAUAkAUAEAUBkAUCUAWgAAUAYAUBEAWgAAUBYAWgAAUAYAWgAAWgAAWgAAWgAAUBcAUAcAWgAAUBUAUAoAUAIAWgAAUAQAUAYAUCgAWgAAUAgAWgAAWgAAUAwAWwAAXCMAUBQAWwAAUAIAWgAAWgAAWgAAWgAAWgAAWgAAWgAAWgAAWREAWQIAUAMAWSEAUDoAUDIAUB8AUCEAUC4AXB4AUA4AWgAAUBIAUA8AUBAAUCUAUCIAUAMAUAEAUAsAUAMAUCwAUBYAWgAAWgAAWgAAWgAAWgAAWgAAUAYAWgAAWgAAWgAAUAYAWwAAWgAAUAYAXAQAUAMAUBsAUBcAUCAAWwAAWgAAWgAAWgAAWgAAUBgAUB4AWgAAUAcAUAwAWQIAWQkAUAEAUAIAWgAAUAoAWgAAUAYAUB0AWgAAWgAAUAkAWgAAWSwAUBIAWgAAUC4AWSYAWgAAUAYAUAoAUAkAUAIAUAcAWgAAUAEAUBEAUBgAUBcAWRYAUA0AWSgAUB4AUDQAUBoAXA4AUA8AUBwAUA8AUA4AUA4AWgAAUAIAUCMAWgAAUCwAUBgAUAYAUAAAUAAAUAAAUAAAUAAAUAAAUAAAUAAAUAAAWwAAUAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAeSEAeQ8AcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcBcAcAAAcAAAcCYOcBUAUAAAUAAAUAAAUAAAUAUAUAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcCgAeQAAcAAAcAAAcAAAcAAAcAAAcAYAcAAAcBgAeQAAcAAAcAAAegAAegAAcAAAcAcAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcCkAeQAAcAcAcAAAcAAAcAwAcAAAcAAAcAIAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcCIAeQAAcAAAcAAAcAAAcAAAcAAAeRwAeQAAWgAAUAAAUAAAUAAAUAAAUAAAcAAAcAAAcBoAeScAeQAAegAAcBkAeQAAUAAAUAAAUAAAUAAAUAAAUAAAcAAAcAAAcAAAcAAAcAAAcAAAegAAegAAcAAAcAAAcBgAeQAAcAAAcAAAcAAAcAAAcAAAcAkAegAAegAAcAcAcAAAcAcAcAAAcAAAcAAAcAAAcA8AeQAAcAAAcAAAeRQAcAwAUAAAUAAAUAAAUAAAUAAAUAAAcAAAcBEAcA0AcAAAWQsAUAAAUAAAUAAAUAAAUAAAcAAAcAoAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAYAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcBYAegAAcAAAcAAAegAAcAcAcAAAcAAAcAAAcAAAcAAAeRkAegAAegAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAEAcAAAcAAAcAAAcAUAcAQAcAAAcBIAeQAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcBsAcAAAcAAAcBcAeQAAUAAAUAAAUAAAUAAAUAAAUBQAcBYAUAAAUAAAUAoAWRYAWTQAWQAAUAAAUAAAUAAAcAAAcAAAcAAAcAAAcAAAcAMAcAAAcAQAcAAAcAAAcAAAcDMAeSIAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcAAAcBQAeQwAcAAAcAAAcAAAcAMAcAAAeSoAcA8AcDMAcAYAeQoAcAwAcFQAcEMAeVIAaTYAbBcNYAsAYBIAYAIAYAIAYBUAYCwAYBMAYDYAYCkAYDcAUCoAUCcAUAUAUBAAWgAAYBoAYBcAYCgAUAMAUAYAUBYAUA4AUBgAUAgAUAgAUAsAUAsAUA4AUAMAUAYAUAQAUBIAASsSUDAAUDAAUBAAYAYAUBAAUAUAUCAAUBoAUCAAUBAAUAoAYAIAUAQAUAgAUCcAUAsAUCIAUCUAUAoAUA4AUB8AUBkAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfg
AAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAAfgAA%22%2C%22tz%22%3A32%2C%22did%22%3A%22DA932FFFFE8816E7%22%2C%22src%22%3A24%7D%5D%2C%22summary%22%3A%22%7B%5C%22v%5C%22%3A6%2C%5C%22slp%5C%22%3A%7B%5C%22st%5C%22%3A1628296479%2C%5C%22ed%5C%22%3A1628296479%2C%5C%22dp%5C%22%3A0%2C%5C%22lt%5C%22%3A0%2C%5C%22wk%5C%22%3A0%2C%5C%22usrSt%5C%22%3A-1440%2C%5C%22usrEd%5C%22%3A-1440%2C%5C%22wc%5C%22%3A0%2C%5C%22is%5C%22%3A0%2C%5C%22lb%5C%22%3A0%2C%5C%22to%5C%22%3A0%2C%5C%22dt%5C%22%3A0%2C%5C%22rhr%5C%22%3A0%2C%5C%22ss%5C%22%3A0%7D%2C%5C%22stp%5C%22%3A%7B%5C%22ttl%5C%22%3A18272%2C%5C%22dis%5C%22%3A10627%2C%5C%22cal%5C%22%3A510%2C%5C%22wk%5C%22%3A41%2C%5C%22rn%5C%22%3A50%2C%5C%22runDist%5C%22%3A7654%2C%5C%22runCal%5C%22%3A397%2C%5C%22stage%5C%22%3A%5B%7B%5C%22start%5C%22%3A327%2C%5C%22stop%5C%22%3A341%2C%5C%22mode%5C%22%3A1%2C%5C%22dis%5C%22%3A481%2C%5C%22cal%5C%22%3A13%2C%5C%22step%5C%22%3A680%7D%2C%7B%5C%22start%5C%22%3A342%2C%5C%22stop%5C%22%3A367%2C%5C%22mode%5C%22%3A3%2C%5C%22dis%5C%22%3A2295%2C%5C%22cal%5C%22%3A95%2C%5C%22step%5C%22%3A2874%7D%2C%7B%5C%22start%5C%22%3A368%2C%5C%22stop%5C%22%3A377%2C%5C%22mode%5C%22%3A4%2C%5C%22dis%5C%22%3A1592%2C%5C%22cal%5C%22%3A88%2C%5C%22step%5C%22%3A1664%7D%2C%7B%5C%22start%5C%22%3A378%2C%5C%22stop%5C%22%3A386%2C%5C%22mode%5C%22%3A3%2C%5C%22dis%5C%22%3A1072%2C%5C%22cal%5C%22%3A51%2C%5C%22step%5C%22%3A1245%7D%2C%7B%5C%22start%5C%22%3A387%2
C%5C%22stop%5C%22%3A393%2C%5C%22mode%5C%22%3A4%2C%5C%22dis%5C%22%3A1036%2C%5C%22cal%5C%22%3A57%2C%5C%22step%5C%22%3A1124%7D%2C%7B%5C%22start%5C%22%3A394%2C%5C%22stop%5C%22%3A398%2C%5C%22mode%5C%22%3A3%2C%5C%22dis%5C%22%3A488%2C%5C%22cal%5C%22%3A19%2C%5C%22step%5C%22%3A607%7D%2C%7B%5C%22start%5C%22%3A399%2C%5C%22stop%5C%22%3A414%2C%5C%22mode%5C%22%3A4%2C%5C%22dis%5C%22%3A2220%2C%5C%22cal%5C%22%3A120%2C%5C%22step%5C%22%3A2371%7D%2C%7B%5C%22start%5C%22%3A415%2C%5C%22stop%5C%22%3A427%2C%5C%22mode%5C%22%3A3%2C%5C%22dis%5C%22%3A1268%2C%5C%22cal%5C%22%3A59%2C%5C%22step%5C%22%3A1489%7D%2C%7B%5C%22start%5C%22%3A428%2C%5C%22stop%5C%22%3A433%2C%5C%22mode%5C%22%3A1%2C%5C%22dis%5C%22%3A152%2C%5C%22cal%5C%22%3A4%2C%5C%22step%5C%22%3A238%7D%2C%7B%5C%22start%5C%22%3A434%2C%5C%22stop%5C%22%3A444%2C%5C%22mode%5C%22%3A3%2C%5C%22dis%5C%22%3A2295%2C%5C%22cal%5C%22%3A95%2C%5C%22step%5C%22%3A2874%7D%2C%7B%5C%22start%5C%22%3A445%2C%5C%22stop%5C%22%3A455%2C%5C%22mode%5C%22%3A4%2C%5C%22dis%5C%22%3A1592%2C%5C%22cal%5C%22%3A88%2C%5C%22step%5C%22%3A1664%7D%2C%7B%5C%22start%5C%22%3A456%2C%5C%22stop%5C%22%3A466%2C%5C%22mode%5C%22%3A3%2C%5C%22dis%5C%22%3A1072%2C%5C%22cal%5C%22%3A51%2C%5C%22step%5C%22%3A1245%7D%2C%7B%5C%22start%5C%22%3A467%2C%5C%22stop%5C%22%3A477%2C%5C%22mode%5C%22%3A4%2C%5C%22dis%5C%22%3A1036%2C%5C%22cal%5C%22%3A57%2C%5C%22step%5C%22%3A1124%7D%2C%7B%5C%22start%5C%22%3A478%2C%5C%22stop%5C%22%3A488%2C%5C%22mode%5C%22%3A3%2C%5C%22dis%5C%22%3A488%2C%5C%22cal%5C%22%3A19%2C%5C%22step%5C%22%3A607%7D%2C%7B%5C%22start%5C%22%3A489%2C%5C%22stop%5C%22%3A499%2C%5C%22mode%5C%22%3A4%2C%5C%22dis%5C%22%3A2220%2C%5C%22cal%5C%22%3A120%2C%5C%22step%5C%22%3A2371%7D%2C%7B%5C%22start%5C%22%3A500%2C%5C%22stop%5C%22%3A511%2C%5C%22mode%5C%22%3A3%2C%5C%22dis%5C%22%3A1268%2C%5C%22cal%5C%22%3A59%2C%5C%22step%5C%22%3A1489%7D%2C%7B%5C%22start%5C%22%3A512%2C%5C%22stop%5C%22%3A522%2C%5C%22mode%5C%22%3A1%2C%5C%22dis%5C%22%3A152%2C%5C%22cal%5C%22%3A4%2C%5C%22step%5C%22%3A238%7D%5D%7D%2C%5C%22goal%5C%22%3A8000%2C%5C%22tz%5C%22%3A%5C%2228800%5C%22%7D%22%2C%22source%22%3A24%2C%22type%22%3A0%7D%5D '
134 | # find_date = re.compile(r'.*?date%22%3A%22(.*?)%22%2C%22data.*?')
135 | # find_step = re.compile(r'.*?ttl%5C%22%3A(.*?)%2C%5C%22dis.*?')
136 | # data = re.sub(find_date.findall(data)[0], today, data)
137 | # data = re.sub(find_step.findall(data)[0], step, data)
138 | data = data.replace('1628296479', sync_time)
139 | data = data.replace('2021-08-07', today)
140 | data = data.replace('18272', step)
141 |
142 | url = f'https://api-mifit-cn.huami.com/v1/data/band_data.json?&t={timestamp}'
143 | headers = {
144 | "apptoken": app_token,
145 | "Content-Type": "application/x-www-form-urlencoded"
146 | }
147 |
148 | data = f'userid={userid}&last_sync_data_time={sync_time}&device_type=0&last_deviceid=DA932FFFFE8816E7&data_json={data}'
149 |
150 | response = requests.post(url, data=data, headers=headers).json()
151 | # print(response)
152 | result = f"{user[:4]}****{user[-4:]}: [{now}] 修改步数({step})" + response['message']
153 | print(result)
154 | return result
155 |
156 |
157 | # 获取时间戳
158 | def get_time():
159 | url = 'http://api.m.taobao.com/rest/api3.do?api=mtop.common.getTimestamp'
160 | response = requests.get(url, headers=headers).json()
161 | t = response['data']['t']
162 | return t  # 返回接口给出的毫秒时间戳
163 |
164 |
165 | # 获取app_token
166 | def get_app_token(login_token):
167 | url = f"https://account-cn.huami.com/v1/client/app_tokens?app_name=com.xiaomi.hm.health&dn=api-user.huami.com" \
168 | f"%2Capi-mifit.huami.com%2Capp-analytics.huami.com&login_token={login_token} "
169 | response = requests.get(url, headers=headers).json()
170 | # print("app_token获取成功!")
171 | # print(app_token)
172 | return response['token_info']['app_token']
173 |
174 |
175 | if __name__ == "__main__":
176 | """半成品:不可用"""
177 | user = "00000"
178 | password = "00000"
179 | step_low = "10000"
180 | step_high = "20000"
181 | step = str(random.randint(int(step_low), int(step_high)))
182 | main(user, password, step)
183 |
--------------------------------------------------------------------------------
/其他网络爬虫/彼岸图网爬虫.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import re
4 | import time
5 |
6 | import requests
7 | import xlwt
8 | from bs4 import BeautifulSoup
9 | from fake_useragent import FakeUserAgent
10 |
11 |
12 | def get_url(num):
13 | url_list = []
14 | for i in range(num):
15 | if i == 0:
16 | url = 'http://pic.netbian.com/'
17 | else:
18 | url = 'http://pic.netbian.com/index_' + str(i + 1) + '.html'
19 | url_list.append(url)
20 | return url_list
21 |
22 |
23 | def get_html(url):
24 | header = {'user-agent': FakeUserAgent().chrome}
25 | if isinstance(url, list):
26 | all_html = ''
27 | for i in url:
28 | html = requests.get(url=i, headers=header)
29 | html = BeautifulSoup(html.content, 'html.parser')
30 | html = html.findAll('ul', class_="clearfix")
31 | all_html += str(html)
32 | return all_html
33 | elif isinstance(url, str):
34 | html = requests.get(url=url, headers=header)
35 | html = BeautifulSoup(html.content, 'html.parser')
36 | html = str(html)
37 | return html
38 | else:
39 | raise KeyError('我也不知道哪里错了')
40 |
41 |
42 | def deal_data(html):
43 | data_list = []
44 | findurl = re.compile(r'href="(.*?)"')
45 | finddata = re.compile(
46 | r'<img src="(.*?)".*?alt="(.*?)"')  # 分组1:图片相对路径,分组2:关键字
47 | url = re.findall(findurl, html)
48 | for i in range(len(url)):
49 | cache = []
50 | url[i] = 'http://pic.netbian.com/' + url[i]
51 | html = get_html(url[i])
52 | data = re.findall(finddata, html)
53 | cache.append('http://pic.netbian.com' + data[0][0])
54 | cache.append(data[0][1])
55 | data_list.append(cache)
56 | return data_list
57 |
58 |
59 | def sava_xlsx(data):
60 | book = xlwt.Workbook(encoding='utf-8')
61 | sheet = book.add_sheet('爬取结果')
62 | tap = ('图片链接', '关键字')
63 | for i in range(2):
64 | sheet.write(0, i, tap[i]) # 添加列标签
65 | for i in range(len(data)):
66 | tap = data[i]
67 | if len(data) >= 100:
68 | print('\r', end='')
69 | print('正在保存数据到表格: {:.2f}%'.format(
70 | ((i + 1) / len(data)) * 100), '▉' * ((i + 1) // (len(data) // 50)), end='')
71 | elif len(data) > 0:
72 | print('\r', end='')
73 | print('正在保存数据到表格: {:.2f}%'.format(
74 | ((i + 1) / len(data)) * 100), '▉' * ((i + 1) * 50 // (len(data))), end='')
75 | else:
76 | print('出现错误')
77 | for j in range(2):
78 | data_1 = tap[j]
79 | sheet.write(i + 1, j, data_1)
80 | book.save('图片爬虫2.xlsx')
81 | print('\n')
82 |
83 |
84 | def sava_path(data):
85 | root = os.getcwd() + '\\图片爬虫2\\'
86 | header = {'user-agent': FakeUserAgent().chrome}
87 | repeat = 0
88 | for index, item in enumerate(data):
89 | url = str(item[0])
90 | path = root + url.split('/')[-1]
91 | if len(data) >= 100:
92 | print('\r', end='')
93 | print('正在下载图片: {:.2f}%'.format(((index + 1) / len(data)) * 100),
94 | '▉' * ((index + 1) // (len(data) // 50)),
95 | end='')
96 | else:
97 | print('\r', end='')
98 | print('正在下载图片: {:.2f}%'.format(((index + 1) / len(data)) * 100),
99 | '▉' * ((index + 1) * 50 // (len(data))),
100 | end='')
101 | try:
102 | if not os.path.exists(root): # 判断根目录是否存在
103 | os.mkdir(root) # 创建根目录
104 | if not os.path.exists(path): # 判断文件是否存在
105 | file = requests.get(url=url, headers=header) # 请求文件
106 | time.sleep(random.randrange(2, 5, 1))
107 | with open(path, 'wb') as save:
108 | save.write(file.content)
109 | save.close()
110 | else:
111 | repeat += 1
112 | except BaseException:
113 | print('\n保存失败')
114 | print('\n重复图片:' + str(repeat) + '张')
115 |
116 |
117 | def main():
118 | num = int(input('爬取页数(1 ~ 1250):'))
119 | if num < 1 or num > 1250:
120 | raise ValueError('页数输入错误')
121 | url = get_url(num)
122 | html = get_html(url)
123 | data = deal_data(html)
124 | sava_xlsx(data)
125 | sava_path(data)
126 | print('程序结束')
127 |
128 |
129 | if __name__ == '__main__':
130 | main()
131 |
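
A note on `finddata`: the detail-page pattern above depends on the exact `<img>` markup, which the site can change at any time. Below is a hedged sketch of the same extraction done with BeautifulSoup instead of a regex; the `#img img` selector is an assumption about the page layout, not something taken from the original script.

    # Hedged sketch: read the image path and alt text from a detail page with
    # BeautifulSoup. The '#img img' selector is an assumed location of the main
    # picture and may need adjusting to the current markup.
    from bs4 import BeautifulSoup

    def parse_detail(detail_html):
        soup = BeautifulSoup(detail_html, 'html.parser')
        img = soup.select_one('#img img') or soup.find('img')  # fall back to the first <img>
        if img is None:
            return None
        return 'http://pic.netbian.com' + img.get('src', ''), img.get('alt', '').strip()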
--------------------------------------------------------------------------------
/其他网络爬虫/抖音无水印视频爬虫.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import time
4 |
5 | import requests
6 |
7 |
8 | class DouYin:
9 | def __init__(self):
10 | self.headers = {
11 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
12 | 'Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'}
13 | self.android_headers = {'user-agent': 'Android'}
14 |
15 | def get_share_url(self, url):
16 | response = requests.get(
17 | url,
18 | headers=self.headers,
19 | allow_redirects=False)
20 | if 'location' in response.headers.keys():
21 | return response.headers['location']
22 | else:
23 | raise Exception("解析失败")
24 |
25 | def get_data(self, url, share):
26 | response = requests.get(url, headers=self.headers).text
27 | json_str = json.loads(response)
28 | download_url = json_str['item_list'][0]['video']['play_addr']['url_list'][0].replace(
29 | "playwm", "play")
30 |         name = json_str[
31 |             'item_list'][0]['share_info']['share_title']
32 | with open(name + '.mp4', 'wb') as f:
33 | f.write(
34 | requests.get(
35 | url=download_url,
36 | headers=self.android_headers).content)
37 | print('视频下载完成!')
38 | print('软件即将退出')
39 | for i in range(1, 6):
40 | time.sleep(1)
41 | print('\r', end='')
42 | print(6 - i, end='')
43 |
44 | def run(self):
45 | share = input("请输入抖音短视频分享链接:")
46 | url = re.findall(r'https://v.douyin.com/.*/', share)[0]
47 | location = self.get_share_url(url)
48 | vid = re.findall(r'/share/video/(\d*)', location)[0]
49 | url = 'https://www.iesdouyin.com/web/api/v2/aweme/iteminfo/?item_ids={}'.format(
50 | vid)
51 | self.get_data(url, share)
52 |
53 |
54 | if __name__ == '__main__':
55 | dy = DouYin()
56 | dy.run()
57 |
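
The core of `get_share_url` is requesting the short share link with `allow_redirects=False`, so the real address can be read from the `Location` response header instead of being followed. A minimal standalone sketch of that step (the User-Agent value is only a placeholder):

    # Hedged sketch: resolve a short link to its redirect target without following it.
    import requests

    def resolve_short_link(short_url):
        response = requests.get(short_url,
                                headers={'user-agent': 'Mozilla/5.0'},
                                allow_redirects=False)
        # A short link normally answers with 301/302 plus a Location header.
        return response.headers.get('location', '')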
--------------------------------------------------------------------------------
/其他网络爬虫/有道翻译.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import random
3 | import time
4 |
5 | import requests
6 |
7 |
8 | class YouDao:
9 | def __init__(self):
10 | self.cookie = self.get_cookie()
11 |
12 | def get_cookie(self):
13 | headers = {
14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,applicat'
15 | 'ion/signed-exchange;v=b3;q=0.9',
16 | 'Accept-Encoding': 'gzip, deflate',
17 | 'Accept-Language': 'zh-CN,zh;q=0.9',
18 | 'Connection': 'keep-alive',
19 | 'DNT': '1',
20 | 'Host': 'fanyi.youdao.com',
21 | 'Upgrade-Insecure-Requests': '1',
22 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/8'
23 | '7.0.4280.141 Safari/537.36 Edg/87.0.664.75'}
24 | response = requests.get('http://fanyi.youdao.com/', headers=headers)
25 | return [i.name + '=' + i.value for i in response.cookies]
26 |
27 | def get_data(self, key):
28 | url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
29 | headers = {
30 | "Accept": "application/json, text/javascript, */*; q=0.01",
31 | "Accept-Encoding": "gzip, deflate",
32 | "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
33 | "Cache-Control": "no-cache",
34 | "Connection": "keep-alive",
35 | "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
36 | 'Cookie': '%s' % ','.join(
37 | self.cookie),
38 | "DNT": "1",
39 | "Host": "fanyi.youdao.com",
40 | "Origin": "http://fanyi.youdao.com",
41 | "Pragma": "no-cache",
42 | "Referer": "http://fanyi.youdao.com/",
43 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chro"
44 | "me/87.0.4280.141 Safari/537.36 Edg/87.0.664.75",
45 | "X-Requested-With": "XMLHttpRequest"}
46 | lts = str(int(time.time() * 1000))
47 | salt = lts + str(random.randint(0, 10))
48 | sign = self.get_sign(key, salt)
49 | form = {'i': key,
50 | 'from': 'AUTO',
51 | 'to': 'AUTO',
52 | 'smartresult': 'dict',
53 | 'client': 'fanyideskweb',
54 | 'salt': salt,
55 | 'sign': sign,
56 | 'lts': lts,
57 | 'bv': '02c2dd94fb562b4304f9b0c657990444',
58 | 'doctype': 'json',
59 | 'version': '2.1',
60 | 'keyfrom': 'fanyi.web',
61 | 'action': 'FY_BY_REALTlME'}
62 |         response = requests.post(url=url, headers=headers, data=form)
63 | data = response.json()
64 | print('翻译结果:', data['translateResult'][0][0]['tgt'])
65 | print('来自有道词典结果:')
66 | for i in data['smartResult']['entries']:
67 | if i:
68 | print(i.replace('\r\n', ''))
69 | else:
70 | continue
71 |
72 | def get_sign(self, key, salt):
73 | sign = "fanyideskweb" + key + salt + "Tbh5E8=q6U3EXe+&L[4c@"
74 | data = hashlib.md5()
75 | data.update(sign.encode('utf-8'))
76 | return data.hexdigest()
77 |
78 | def translate(self, key):
79 | self.get_data(key) if key else print('翻译内容为空')
80 |
81 |
82 | def main():
83 | key = input('翻译内容:')
84 | YouDao().translate(key)
85 |
86 |
87 | if __name__ == '__main__':
88 | main()
89 |
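
`get_sign` reproduces the client-side signature: `sign = md5("fanyideskweb" + 待翻译文本 + salt + 固定密钥)`, where `salt` is the millisecond timestamp plus one random digit and the key is the string hard-coded above (whether it still matches the live site is not guaranteed). A minimal worked sketch of just that step:

    # Hedged sketch of the signing step used by YouDao.get_sign above.
    import hashlib
    import random
    import time

    SECRET = "Tbh5E8=q6U3EXe+&L[4c@"  # copied from the script above; may be outdated

    def make_sign(text):
        lts = str(int(time.time() * 1000))      # millisecond timestamp
        salt = lts + str(random.randint(0, 9))  # timestamp plus one random digit
        raw = "fanyideskweb" + text + salt + SECRET
        return lts, salt, hashlib.md5(raw.encode('utf-8')).hexdigest()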
--------------------------------------------------------------------------------
/其他网络爬虫/百度百科爬虫项目.py:
--------------------------------------------------------------------------------
1 | from urllib import parse
2 |
3 | import chardet
4 | import requests
5 | from bs4 import BeautifulSoup
6 | from fake_useragent import FakeUserAgent
7 |
8 |
9 | class UrlManager():
10 | def __init__(self):
11 | self.new_urls = set()
12 | self.old_urls = set()
13 |
14 | def new_url_size(self):
15 | """返回未爬取网址数量"""
16 | return len(self.new_urls)
17 |
18 | def old_url_size(self):
19 | """返回已爬取网址数量"""
20 | return len(self.old_urls)
21 |
22 | def has_new_url(self):
23 | """判断有无未爬取网址"""
24 | return self.new_url_size() != 0
25 |
26 | def get_new_url(self):
27 | """获取一个待爬取网址"""
28 | new_url = self.new_urls.pop()
29 | self.old_urls.add(new_url)
30 | return new_url
31 |
32 | def add_new_url(self, url):
33 | """添加待爬取单个网址"""
34 | if url is None:
35 | return None
36 | if url not in self.new_urls and url not in self.old_urls:
37 | self.new_urls.add(url)
38 |
39 | def add_new_urls(self, urls):
40 | """添加待爬取多个网址"""
41 | if urls is None or len(urls) == 0:
42 | return None
43 | else:
44 | for url in urls:
45 | self.add_new_url(url)
46 |
47 |
48 | class HtmlDownloader():
49 | def __init__(self):
50 | self.header = {'user-agent': FakeUserAgent().chrome}
51 |
52 | def download(self, url):
53 | if url is None:
54 | return None
55 | response = requests.get(url, headers=self.header)
56 | if response.status_code == 200:
57 | return response.content.decode(
58 | chardet.detect(response.content)['encoding'])
59 | else:
60 | return None
61 |
62 |
63 | class HtmlParser():
64 | def parser(self, url, html):
65 | if url is None or html is None:
66 | return
67 | soup = BeautifulSoup(html, 'lxml')
68 | new_urls = self._get_new_url(url, soup)
69 | new_data = self._get_new_data(url, soup)
70 | return new_urls, new_data
71 |
72 | def _get_new_url(self, url, soup):
73 | new_urls = set()
74 | links = soup.select('div.basic-info.cmn-clearfix a')
75 | if bool(links):
76 | for item in links:
77 | new_url = parse.urljoin(url, item['href'])
78 | new_urls.add(new_url)
79 | return new_urls
80 |
81 | def _get_new_data(self, url, soup):
82 | new_data = [url]
83 | info = soup.select('div.lemma-summary > div.para')
84 | title = soup.select('.lemmaWgt-lemmaTitle-title > h1')
85 | if bool(info) and bool(title):
86 | new_data.append(title[0].text)
87 | new_data.append(info[0].text)
88 | return new_data
89 | return None
90 |
91 |
92 | class DataOutput():
93 | def __init__(self):
94 | self.datas = []
95 |
96 | def store_data(self, data):
97 | if data is None:
98 | return
99 | self.datas.append(data)
100 |
101 | def output_file(self):
102 | with open('百度百科爬虫结果.html', 'w+', encoding='utf-8') as html:
103 |             html.write('<!DOCTYPE html>')
104 |             html.write('<html>')
105 |             html.write('<head>')
106 |             html.write(' <meta charset="utf-8">')
107 |             html.write(' <title>爬取结果</title>')
108 |             html.write('</head>')
109 |             html.write('<body>')
110 |             html.write('<table border="1">')
111 |             html.write('<tr>\n<td>链接</td>\n<td>关键词</td>\n<td>描述</td>\n</tr>')
112 |             for item in self.datas:
113 |                 html.write('<tr>')
114 |                 html.write('<td>%s</td>' % item[0])
115 |                 html.write('<td>%s</td>' % item[1])
116 |                 html.write('<td>%s</td>' % item[2])
117 |                 html.write('</tr>')
118 |             html.write('</table>')
119 |             html.write('</body>')
120 |             html.write('</html>')
121 |
122 |
123 | class SpiderMan():
124 | def __init__(self):
125 | self.manager = UrlManager()
126 | self.downloader = HtmlDownloader()
127 | self.parser = HtmlParser()
128 | self.output = DataOutput()
129 |
130 | def crawl(self, root_url):
131 | self.manager.add_new_url(root_url)
132 | while (self.manager.new_url_size()
133 | and self.manager.old_url_size() <= 100):
134 | new_url = self.manager.get_new_url()
135 | html = self.downloader.download(new_url)
136 | new_urls, new_data = self.parser.parser(new_url, html)
137 | self.manager.add_new_urls(new_urls)
138 | self.output.store_data(new_data)
139 | self.output.output_file()
140 |
141 |
142 | if __name__ == '__main__':
143 | spider = SpiderMan()
144 | spider.crawl(
145 | 'https://baike.baidu.com/item/%E8%8F%B2%E5%BE%8B%E5%AE%BE%E5%B8%98%E8%9B%A4')
146 |
--------------------------------------------------------------------------------
/其他网络爬虫/糗事百科爬虫.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from requests_html import HTMLSession
4 |
5 |
6 | def get_html(list):
7 | session = HTMLSession()
8 | data = []
9 | have_url = re.compile('查看全文')
10 | for url in list:
11 | html = session.get(url)
12 | all_article = html.html.find('.contentHerf')
13 | all_maintext = html.html.find('.main-text')
14 | for item in range(len(all_article)):
15 | cache = []
16 | article = all_article[item].text
17 | cache.append(article)
18 | if bool(re.findall(have_url, article)):
19 | cache.append(
20 | 'https://www.qiushibaike.com' +
21 | all_article[item].attrs['href'])
22 | else:
23 | cache.append(None)
24 | try:
25 | maintext = all_maintext[item].text
26 | cache.append(maintext)
27 | except BaseException:
28 | cache.append('无')
29 | data.append(cache)
30 | return data
31 |
32 |
33 | def deal_text(data):
34 | modify2 = re.compile(r'\n[0-9]*')
35 | for i in data:
36 | for j in range(3):
37 | if i[j] is not None:
38 | i[j] = modify2.sub('', i[j])
39 | if j == 0:
40 | print('段子:', i[j])
41 | elif j == 1:
42 | if i[j] is not None:
43 | print('查看全文:', i[j])
44 | else:
45 | print('神评:', i[j])
46 | print('\n')
47 |
48 |
49 | def url_list(pages):
50 | url_list = []
51 | for i in range(pages):
52 | if i == 0:
53 | url = 'https://www.qiushibaike.com/text/'
54 | else:
55 | url = 'https://www.qiushibaike.com/text/page/' + str(i + 1) + '/'
56 | url_list.append(url)
57 | return (url_list)
58 |
59 |
60 | def input_data():
61 | try:
62 | print('爬取范围( 1 ~ 13 )')
63 | pages = int(input('爬取页数:'))
64 | except BaseException:
65 | raise ValueError('页数输入错误')
66 | if pages >= 1 and pages <= 13:
67 | return pages
68 | else:
69 | raise ValueError('页数输入错误')
70 |
71 |
72 | def main():
73 | pages = input_data()
74 | url = url_list(pages)
75 | data = get_html(url)
76 | deal_text(data)
77 |
78 |
79 | if __name__ == '__main__':
80 | main()
81 |
--------------------------------------------------------------------------------
/其他网络爬虫/豆瓣Top250爬虫.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sqlite3
3 | import time
4 | import urllib.request
5 |
6 | import pymysql
7 | import xlwt
8 | from bs4 import BeautifulSoup
9 | from termcolor import colored
10 |
11 |
12 | def main(means): # 以不同的格式进行保存数据
13 | if means == '表格': # 以xlsx格式保存
14 | douban = 'https://movie.douban.com/top250?start='
15 | datalist = get_data(douban)
16 | try:
17 | savexlsx(datalist)
18 | print('爬取成功,文件已保存')
19 | except PermissionError:
20 | print('文件已打开,无法写入数据')
21 | print('请重新运行')
22 | elif means == 'SQLite': # 以sqlite格式保存数据
23 | douban = 'https://movie.douban.com/top250?start='
24 | datalist = get_data(douban)
25 | dbpath = '数据库.db'
26 | savedb(datalist, dbpath)
27 | elif means == 'MySQL':
28 | douban = 'https://movie.douban.com/top250?start='
29 | datalist = get_data(douban)
30 | mysql_save(datalist)
31 | else:
32 | print('保存格式错误')
33 |
34 |
35 | # 正则表达式规则
36 | findlink = re.compile(r'<a href="(.*?)">')
37 | findtitle = re.compile(r'<span class="title">(.*?)</span>')
38 | findscore = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
39 | findpeople = re.compile(r'<span>(.*)人评价</span>')
40 | findinfo = re.compile(r'<span class="inq">(.*?)</span>')
41 | findtype = re.compile(r'''导演.*?
42 | .*&nbsp;/&nbsp;.*&nbsp;/&nbsp;(.*)''')
43 |
44 |
45 | # 解析网址
46 | def askURL(douban):
47 | # 设置浏览器UA
48 | User = {
49 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.41"
50 | "47.89 Safari/537.36 Edg/84.0.522.40Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.3"
51 | "6 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 Edg/84.0.522.40"}
52 | # 携带参数访问网址
53 | request = urllib.request.Request(url=douban, headers=User) # 指定参数
54 | response = urllib.request.urlopen(request) # 解析网址
55 | html = response.read().decode('utf-8') # 记录源码
56 | return html # 返回数据
57 |
58 |
59 | # 翻页爬取
60 | def get_data(douban):
61 | data_list = [] # 总数据,包含全部电影信息
62 | for i in range(10): # 每一页获取网页源码
63 |         print('正在解析第' + str(i + 1) + '页')
64 | url = douban + str(i * 25)
65 | html = askURL(url) # 返回一页网页源码
66 | time.sleep(2)
67 | print('获取成功')
68 | soup = BeautifulSoup(html, 'html.parser') # 解析网页源码
69 | for item in soup.find_all('div', class_="item"): # 解析每一部电影信息
70 | data = [] # 临时储存数据,包含一部电影信息
71 | item = str(item) # 转化为字符串
72 | link = re.findall(findlink, item)[0]
73 | data.append(link) # 添加临时储存数据
74 | title = re.findall(findtitle, item)
75 | if len(title) == 2:
76 | c = title[0].replace('/', '')
77 | data.append(c)
78 | e = title[1].replace('/', '')
79 | e = e.strip()
80 | data.append(e)
81 | else:
82 | data.append(title[0])
83 | data.append('无')
84 | score = re.findall(findscore, item)[0]
85 | score = score.strip()
86 | data.append(score)
87 | people = re.findall(findpeople, item)[0]
88 | people = people.strip()
89 | data.append(people)
90 | info = re.findall(findinfo, item)
91 | if len(info) == 0:
92 | data.append('无')
93 | else:
94 | data.append(info[0])
95 | type = re.findall(findtype, item)[0]
96 | type = type.strip()
97 | data.append(type)
98 | data_list.append(data) # 把一部电影的信息以列表形式添加到总数据
99 | return data_list # 返回全部电影信息
100 |
101 |
102 | def savexlsx(datalist):
103 | book = xlwt.Workbook(encoding='utf-8')
104 | sheet = book.add_sheet('爬取结果', cell_overwrite_ok=True) # 覆盖写入
105 | tap = ('链接', '电影名称', '英文名称', '评分', '评价人数', '介绍', '类型')
106 | for i in range(7):
107 | sheet.write(0, i, tap[i]) # 添加列标签
108 | for i in range(250):
109 | tap = datalist[i]
110 | print('正在保存第' + str(i + 1) + '条')
111 | for j in range(7):
112 | data_1 = tap[j]
113 | sheet.write(i + 1, j, data_1)
114 | book.save('豆瓣TOP250.xlsx')
115 |
116 |
117 | def savedb(datalist, dbpath):
118 | list = sqlite3.connect(dbpath)
119 | cursor = list.cursor()
120 | basesql = '''create table 豆瓣top250
121 | ('TOP' integer primary key not null ,
122 | '链接' text not null,
123 | '电影名称' text not null,
124 | '英文名称' text,
125 | '评分' number not null,
126 | '评价人数' number not null,
127 | '介绍' text,
128 | '类型' text not null)'''
129 | try:
130 | cursor.execute(basesql)
131 | list.commit()
132 | except sqlite3.OperationalError:
133 | print('数据表已存在')
134 | for i, data in enumerate(datalist): # 把索引赋给i,把元素赋给data
135 | for index in range(7):
136 | data[index] = '"' + str(data[index]) + '"'
137 | sql = '''insert into 豆瓣top250('链接', '电影名称',
138 | '英文名称', '评分', '评价人数', '介绍', '类型')
139 | values(%s)''' % ','.join(data)
140 | cursor.execute(sql)
141 | print('正在保存第' + str(i + 1) + '条数据')
142 | list.commit()
143 | print('数据表已保存完毕')
144 | list.close()
145 | print('数据库已关闭')
146 |
147 |
148 | def mysql_save(datalist):
149 | try:
150 |         database = pymysql.connect(host='localhost', user='root', password='数据库密码', db='数据库名称')
151 | try:
152 | sql = '''create table 豆瓣top250
153 | (TOP int(3) primary key auto_increment,
154 | 链接 text,
155 | 电影名称 text,
156 | 英文名称 text,
157 | 评分 float,
158 | 评价人数 mediumint,
159 | 介绍 text,
160 | 类型 text)'''
161 | cursor = database.cursor()
162 | cursor.execute(sql)
163 | database.commit()
164 | print('新建表成功')
165 | except BaseException:
166 | print('数据表已存在')
167 | for i, data in enumerate(datalist): # 把索引赋给i,把元素赋给data
168 | for index in range(7):
169 | data[index] = '"' + str(data[index]) + '"'
170 | sql = '''insert into 豆瓣top250(链接, 电影名称,
171 | 英文名称, 评分, 评价人数, 介绍, 类型)
172 | values(%s)''' % ','.join(data)
173 | cursor.execute(sql)
174 | print('正在保存第' + str(i + 1) + '条数据')
175 | database.commit()
176 | print('数据表已保存完毕')
177 | database.close()
178 | print('数据库已关闭')
179 | except BaseException:
180 | raise ValueError('数据库名称或密码错误')
181 |
182 |
183 | if __name__ == '__main__':
184 |     print(colored('使用MySQL数据库保存爬取数据需要先安装MySQL\n且需要在代码中修改MySQL数据库密码和连接数据库名称', 'red'))
185 | print('输入“表格”或“SQLite”或“MySQL”')
186 | means = input('选择保存形式:xlsx表格、SQLite数据库、MySQL数据库')
187 | start = time.time()
188 | main(means)
189 | print('运行时间:%.5f' % float(time.time() - start))
190 | print('程序已关闭')
191 |
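
Both `savedb` and `mysql_save` build the INSERT statement by wrapping every value in double quotes and splicing it into the SQL string, which breaks as soon as a field contains a quote. A hedged sketch of the same SQLite insert using parameter placeholders so the driver does the quoting (table and column names as above):

    # Hedged sketch: parameterized insert for the SQLite branch.
    import sqlite3

    def savedb_params(datalist, dbpath):
        db = sqlite3.connect(dbpath)
        cursor = db.cursor()
        sql = '''insert into 豆瓣top250('链接', '电影名称', '英文名称', '评分', '评价人数', '介绍', '类型')
                 values(?, ?, ?, ?, ?, ?, ?)'''
        cursor.executemany(sql, datalist)  # each row is the 7-item list built in get_data
        db.commit()
        db.close()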
--------------------------------------------------------------------------------
/其他网络爬虫/铅笔小说网爬虫.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import re
4 | import time
5 |
6 | import unicodedata
7 | from requests_html import HTMLSession
8 | from termcolor import colored
9 |
10 |
11 | def input_data():
12 | url = input('输入网址:')
13 | check_url = re.compile(r'^https://www.x23qb.com/book/.*?/')
14 | url = re.findall(check_url, url)
15 | if bool(url):
16 | return url
17 | else:
18 | raise ValueError('网址输入错误')
19 |
20 |
21 | def open_url(url):
22 | session = HTMLSession()
23 | if len(url) == 1:
24 | response = session.get(url[0])
25 | return response
26 | elif len(url) > 1:
27 | html = []
28 | for page in url:
29 | response = session.get(page)
30 | print('获取小说内容中...')
31 | time.sleep(random.randrange(2, 5, 1))
32 | html.append(response)
33 | return html
34 | else:
35 | raise TypeError('传入网址发生异常')
36 |
37 |
38 | def get_url(html):
39 | title = []
40 | url = []
41 | for page in html.html.find('#chapterList li > a'):
42 | url.append('https://www.x23qb.com' + page.attrs['href'])
43 | title.append(unicodedata.normalize('NFKC', page.text))
44 | return title, url
45 |
46 |
47 | def get_text(text):
48 | findtext = re.compile(r'chapter\(\);(.*?)铅笔小说', flags=re.DOTALL)
49 | data = []
50 | for page in text:
51 | content = page.html.find('#TextContent')
52 | if len(content) != 1:
53 | raise ValueError(content)
54 | read = re.findall(findtext, content[0].text)
55 | if len(read) == 1:
56 | data.append(read[0])
57 | else:
58 | raise ValueError(read)
59 | return data
60 |
61 |
62 | def save_txt(book, title, text):
63 | if len(title) == len(text):
64 | for index in range(len(title)):
65 | root = os.getcwd() + '\\' + book + '\\'
66 | file = root + title[index] + '.txt'
67 | if not os.path.exists(root):
68 | os.mkdir(root)
69 | if not os.path.exists(file):
70 | with open(file, 'w+', encoding='utf-8') as txt:
71 | txt.write(unicodedata.normalize('NFKC', text[index]))
72 | txt.close()
73 | print(colored(title[index] + '保存成功', 'yellow'))
74 | else:
75 | save = input(
76 | colored(
77 | title[index] +
78 | '已存在,是否覆盖保存?\n覆盖保存直接回车,不保存请输入任意字符后回车\n需要关闭文件后再覆盖保存\n',
79 | 'red'))
80 | if bool(save):
81 | print(colored(title[index] + '已存在,未保存', 'red'))
82 | else:
83 | with open(file, 'w+', encoding='utf-8') as txt:
84 | txt.write(unicodedata.normalize('NFKC', text[index]))
85 | txt.close()
86 | print(colored(title[index] + '覆盖保存成功', 'yellow'))
87 | else:
88 | print('获取小说数据异常')
89 |
90 |
91 | def book_root(name):
92 | name = name.replace('\\', '')
93 | name = name.replace('/', '')
94 | name = name.replace('?', '')
95 | name = name.replace(':', '')
96 | name = name.replace('*', '')
97 | name = name.replace('|', '')
98 | name = name.replace('<', '')
99 | name = name.replace('>', '')
100 | name = name.replace('"', '')
101 | return name
102 |
103 |
104 | def main():
105 | url = input_data()
106 | start = time.time()
107 | html = open_url(url)
108 | book = html.html.find('.d_title h1')[0].text
109 | book = book_root(book)
110 | title, url = get_url(html)
111 | html = open_url(url)
112 | text = get_text(html)
113 | save_txt(book, title, text)
114 | print('爬取结束,运行时间:{:.6f}'.format(time.time() - start))
115 |
116 |
117 | if __name__ == '__main__':
118 | main()
119 |
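
`book_root` removes the characters Windows forbids in file names one `replace` at a time. A hedged one-pass equivalent with a single character class:

    # Hedged sketch: same sanitising as book_root, done in one regex substitution.
    import re

    def book_root_re(name):
        return re.sub(r'[\\/?:*|<>"]', '', name)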
--------------------------------------------------------------------------------
/前程无忧/51job爬虫_1.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from urllib import parse
3 | import json
4 | import pymysql
5 | import time
6 | import random
7 |
8 |
9 | def get_html(key, page=1):
10 | header = {
11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.418'
12 | '3.121 Safari/537.36',
13 | 'Accept': 'application/json, text/javascript, */*; q=0.01',
14 | 'Host': 'search.51job.com',
15 | 'Accept-Encoding': 'gzip, deflate, br',
16 | 'Sec-Fetch-Dest': 'empty',
17 | 'Sec-Fetch-Mode': 'cors',
18 | 'Sec-Fetch-Site': 'same-origin',
19 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
20 | 'Cache-Control': 'no-cache',
21 | 'Connection': 'keep-alive',
22 | 'DNT': '1',
23 | 'Pragma': 'no-cache',
24 | 'X-Requested-With': 'XMLHttpRequest',
25 | 'Referer': 'https://search.51job.com/list/030000,000000,0000,00,9,99,{},2,{}.html?lang=c&postchannel=0000'
26 |                    '&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&w'
27 | 'elfare='.format(
28 | parse.quote(key),
29 | page)}
30 | url = 'https://search.51job.com/list/030000,000000,0000,00,9,99,{},2,{}.html?lang=c&postchannel=0000&workyear=99&' \
31 |           'cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
32 | session = requests.Session()
33 | response = session.get(url.format(parse.quote(key), page), headers=header)
34 | if response.status_code == 200:
35 | return response.content.decode(response.encoding)
36 | else:
37 | raise Warning('请求网页失败')
38 |
39 |
40 | def get_data(key, html, page):
41 | data, pages = deal_data(html)
42 | save_data(data)
43 | if page != 0:
44 | for i in range(2, page + 2):
45 | time.sleep(random.random() + random.randrange(2, 4, 1))
46 | html = get_html(key, page=i)
47 | data, pages = deal_data(html)
48 | save_data(data)
49 | print('获取数据结束')
50 |
51 |
52 | def save_data(data):
53 | """首次使用需要在下方代码填写数据库信息(自行创建数据库)"""
54 | try:
55 | database = pymysql.connect(
56 | host='localhost',
57 | user='root',
58 | password='数据库密码',
59 | db='数据库名称')
60 | except pymysql.err.OperationalError:
61 | raise ValueError("连接数据库失败,请检查数据库密码与数据库名称(运行前手动创建数据库)")
62 | cursor = database.cursor()
63 | sql = '''create table if not exists 51job
64 | (链接 text,
65 | 职位 text,
66 | 发布日期 DATETIME,
67 | 月薪 text,
68 | 信息 text,
69 | 福利 text,
70 | 公司名称 text,
71 | 公司性质 text,
72 | 公司规模 text,
73 | 行业分类 text)'''
74 | cursor.execute(sql)
75 | database.commit()
76 | sql = '''insert into 51job VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
77 | cursor.executemany(sql, data)
78 | database.commit()
79 | database.close()
80 |
81 |
82 | def deal_data(data):
83 | data = json.loads(data)
84 | items = []
85 | engine_search_result = data['engine_search_result']
86 | for i in engine_search_result:
87 | cache = []
88 | job_href = i['job_href']
89 | job_name = i['job_name']
90 | issuedate = i['issuedate']
91 | if bool(i['providesalary_text']):
92 | providesalary_text = i['providesalary_text']
93 | else:
94 | providesalary_text = None
95 | if bool(i['attribute_text']):
96 | attribute_text = '%s' % ','.join(i['attribute_text'])
97 | else:
98 | attribute_text = None
99 | jobwelf_list = '%s' % ','.join(i['jobwelf_list'])
100 | if not bool(jobwelf_list):
101 | jobwelf_list = None
102 | company_name = i['company_name']
103 | companytype_text = i['companytype_text']
104 | if bool(i['companysize_text']):
105 | companysize_text = i['companysize_text']
106 | else:
107 | companysize_text = None
108 | companyind_text = i['companyind_text']
109 | cache.append(job_href)
110 | cache.append(job_name)
111 | cache.append(issuedate)
112 | cache.append(providesalary_text)
113 | cache.append(attribute_text)
114 | cache.append(jobwelf_list)
115 | cache.append(company_name)
116 | cache.append(companytype_text)
117 | cache.append(companysize_text)
118 | cache.append(companyind_text)
119 | items.append(cache)
120 | pages = data['jobid_count']
121 | return items, int(pages)
122 |
123 |
124 | def get_page(html, page):
125 | item, items = deal_data(html)
126 | pages = items // 50 + 1 if items % 50 != 0 else items // 50
127 | if pages >= page >= 1:
128 | return page - 1
129 | print('获取到总页数为:' + str(pages) + '\n输入页数超出总页数或页数输入错误\n本次运行程序只获取第一页数据')
130 | return 0
131 |
132 |
133 | def main():
134 | print('首次运行请在代码中修改MySQL数据库密码和数据库名称')
135 | key = input('请输入关键字:')
136 | page = int(input('请输入获取页数:'))
137 | html = get_html(key)
138 | page = get_page(html, page)
139 | get_data(key, html, page)
140 |
141 |
142 | if __name__ == '__main__':
143 | main()
144 |
--------------------------------------------------------------------------------
/前程无忧/51job爬虫_2.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from urllib import parse
3 | import json
4 | import sqlite3
5 | import time
6 | import random
7 |
8 |
9 | def get_html(key, page=1):
10 | header = {
11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.418'
12 | '3.121 Safari/537.36',
13 | 'Accept': 'application/json, text/javascript, */*; q=0.01',
14 | 'Host': 'search.51job.com',
15 | 'Accept-Encoding': 'gzip, deflate, br',
16 | 'Sec-Fetch-Dest': 'empty',
17 | 'Sec-Fetch-Mode': 'cors',
18 | 'Sec-Fetch-Site': 'same-origin',
19 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
20 | 'Cache-Control': 'no-cache',
21 | 'Connection': 'keep-alive',
22 | 'DNT': '1',
23 | 'Pragma': 'no-cache',
24 | 'X-Requested-With': 'XMLHttpRequest',
25 | 'Referer': 'https://search.51job.com/list/030000,000000,0000,00,9,99,{},2,{}.html?lang=c&postchannel=0000'
26 |                    '&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&w'
27 | 'elfare='.format(
28 | parse.quote(key),
29 | page)}
30 | url = 'https://search.51job.com/list/030000,000000,0000,00,9,99,{},2,{}.html?lang=c&postchannel=0000&workyear=99&' \
31 |           'cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
32 | session = requests.Session()
33 | response = session.get(url.format(parse.quote(key), page), headers=header)
34 | if response.status_code == 200:
35 | return response.content.decode(response.encoding)
36 | else:
37 | raise Warning('请求网页失败')
38 |
39 |
40 | def get_data(key, html, page):
41 | data, pages = deal_data(html)
42 | save_data(data)
43 | if page != 0:
44 | for i in range(2, page + 2):
45 | time.sleep(random.random() + random.randrange(2, 4, 1))
46 | html = get_html(key, page=i)
47 | data, pages = deal_data(html)
48 | save_data(data)
49 | print('获取数据结束')
50 |
51 |
52 | def save_data(data):
53 | database = sqlite3.connect('51job.db')
54 | cursor = database.cursor()
55 | sql = '''create table if not exists 前程无忧
56 | (链接 text,
57 | 职位 text,
58 | 发布日期 DATETIME,
59 | 月薪 text,
60 | 信息 text,
61 | 福利 text,
62 | 公司名称 text,
63 | 公司性质 text,
64 | 公司规模 text,
65 | 行业分类 text)'''
66 | cursor.execute(sql)
67 | database.commit()
68 | sql = '''insert into 前程无忧 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'''
69 | cursor.executemany(sql, data)
70 | database.commit()
71 | database.close()
72 |
73 |
74 | def deal_data(data):
75 | data = json.loads(data)
76 | items = []
77 | engine_search_result = data['engine_search_result']
78 | for i in engine_search_result:
79 | cache = []
80 | job_href = i['job_href']
81 | job_name = i['job_name']
82 | issuedate = i['issuedate']
83 | if bool(i['providesalary_text']):
84 | providesalary_text = i['providesalary_text']
85 | else:
86 | providesalary_text = None
87 | if bool(i['attribute_text']):
88 | attribute_text = '%s' % ','.join(i['attribute_text'])
89 | else:
90 | attribute_text = None
91 | jobwelf_list = '%s' % ','.join(i['jobwelf_list'])
92 | if not bool(jobwelf_list):
93 | jobwelf_list = None
94 | company_name = i['company_name']
95 | companytype_text = i['companytype_text']
96 | if bool(i['companysize_text']):
97 | companysize_text = i['companysize_text']
98 | else:
99 | companysize_text = None
100 | companyind_text = i['companyind_text']
101 | cache.append(job_href)
102 | cache.append(job_name)
103 | cache.append(issuedate)
104 | cache.append(providesalary_text)
105 | cache.append(attribute_text)
106 | cache.append(jobwelf_list)
107 | cache.append(company_name)
108 | cache.append(companytype_text)
109 | cache.append(companysize_text)
110 | cache.append(companyind_text)
111 | items.append(cache)
112 | pages = data['jobid_count']
113 | return items, int(pages)
114 |
115 |
116 | def get_page(html, page):
117 | item, items = deal_data(html)
118 | pages = items // 50 + 1 if items % 50 != 0 else items // 50
119 | if pages >= page >= 1:
120 | return page - 1
121 | print('获取到总页数为:' + str(pages) + '\n输入页数超出总页数或页数输入错误\n本次运行程序只获取第一页数据')
122 | return 0
123 |
124 |
125 | def main():
126 | """数据保存到SQLite"""
127 | key = input('请输入关键字:')
128 | page = int(input('请输入获取页数:'))
129 | html = get_html(key)
130 | page = get_page(html, page)
131 | get_data(key, html, page)
132 |
133 |
134 | if __name__ == '__main__':
135 | main()
136 |
--------------------------------------------------------------------------------
/哔哩哔哩/B站弹幕爬虫.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import re
4 | from fake_useragent import FakeUserAgent
5 | import json
6 | import jieba
7 | import wordcloud
8 |
9 |
10 | def input_url():
11 | url = input('输入视频网址:')
12 | if url[-1] != '/':
13 | url += '/'
14 | check_url = re.compile(r'^https://www.bilibili.com/video/(.*?)/$')
15 | bv = re.findall(check_url, url)
16 |     if bool(bv):
17 |         if len(bv[0].split('/')) == 1:
18 |             return bv[0]
19 |     # 网址不匹配或包含多级路径时均视为输入错误
20 |     raise ValueError('视频网址输入错误')
21 |
22 |
23 | def get_url(bv):
24 | content, text = open_url(
25 | 'https://api.bilibili.com/x/player/pagelist?bvid={}&jsonp=jsonp'.format(bv))
26 | text = json.loads(text)
27 | av = text['data'][0]['cid']
28 | return 'http://comment.bilibili.com/' + str(av) + '.xml'
29 |
30 |
31 | def open_url(url):
32 | header = {'user-agent': FakeUserAgent().chrome}
33 | html = requests.get(url=url, headers=header)
34 | return html.content, html.text
35 |
36 |
37 | def deal_data(html):
38 |     find = re.compile(r'<d p=".*?">(.*?)</d>')
39 | data = ''
40 | html = BeautifulSoup(html, 'html.parser')
41 | for item in html.findAll('d'):
42 | item = str(item)
43 | item = re.findall(find, item)
44 | data += item[0]
45 | return data
46 |
47 |
48 | def cloud(data):
49 | data = jieba.cut(data)
50 | word = wordcloud.WordCloud(
51 | font_path='msyh.ttc',
52 | background_color='white',
53 | width=1920,
54 | height=1080)
55 | word.generate('%s' % ' '.join(data))
56 | word.to_file('弹幕词云图.png')
57 |
58 |
59 | def main():
60 | url = input_url()
61 | url = get_url(url)
62 | content, text = open_url(url)
63 | data = deal_data(content)
64 | cloud(data)
65 |
66 |
67 | if __name__ == '__main__':
68 | main()
69 |
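
Each `<d>` element in the danmaku XML carries the comment text as its content and its metadata as a comma-separated `p` attribute (commonly documented as appearance time, mode, font size, colour, send timestamp and so on, though that layout is not verified here). `deal_data` above keeps only the text; a hedged sketch that keeps both:

    # Hedged sketch: parse the danmaku XML, keeping the text and the raw `p` attribute.
    from bs4 import BeautifulSoup

    def parse_danmaku(xml_bytes):
        soup = BeautifulSoup(xml_bytes, 'html.parser')
        result = []
        for d in soup.find_all('d'):
            result.append({'text': d.text, 'meta': d.get('p', '').split(',')})
        return result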
--------------------------------------------------------------------------------
/哔哩哔哩/B站评论爬虫.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import re
4 | from fake_useragent import FakeUserAgent
5 | import json
6 | import jieba
7 | import wordcloud
8 | import xlwt
9 |
10 |
11 | def input_data():
12 | url = input('输入视频网址:')
13 | if url[-1] != '/':
14 | url += '/'
15 | check_url = re.compile(r'^https://www.bilibili.com/video/(.*?)/$')
16 | bv = re.findall(check_url, url)
17 | if not bool(bv):
18 | raise ValueError('视频网址格式错误')
19 | if len(bv[0].split('/')) != 1:
20 | raise ValueError('视频网址格式错误')
21 | page = int(input('爬取评论页数(仅爬取指定页数评论):'))
22 | if page < 1:
23 | raise ValueError('爬取评论总页数输入错误')
24 | return url, page
25 |
26 |
27 | def get_url(url, page):
28 | findav = re.compile(
29 |         r'
99 |             if i[2] > i[3]:
100 | print('起始页数大于结束页数:', i)
101 | return False
102 | return True
103 |
104 | @property
105 | def tasks_length(self):
106 | return len(self.tasks)
107 |
108 | def save_config(self):
109 | with open(self.config_file, 'w') as f:
110 | f.write(json.dumps(self.config_new))
111 | print('配置文件已更新!\n未完成全部爬取任务前不要修改', self.config_file, '文件!')
112 |
113 |
114 | class Database:
115 | db_name = 'haodouDB'
116 | sql = f"""create database IF NOT EXISTS {db_name} CHARACTER SET utf8mb4"""
117 |
118 | def __init__(self, mysql):
119 | self.mysql_db = pymysql.connect(
120 | host=mysql['host'],
121 | user=mysql['user'],
122 | password=mysql['password'])
123 | self.cursor = self.mysql_db.cursor()
124 | self.initial_db(mysql)
125 |
126 | def initial_db(self, mysql):
127 | self.cursor.execute(self.sql)
128 | self.mysql_db.commit()
129 | self.mysql_db.close()
130 | self.mysql_db = pymysql.connect(
131 | host=mysql['host'],
132 | user=mysql['user'],
133 | password=mysql['password'],
134 | db=self.db_name)
135 | self.cursor = self.mysql_db.cursor()
136 |
137 | def create_table(self):
138 | sql = """CREATE TABLE IF NOT EXISTS 好豆网数据(
139 | ID INTEGER primary key,
140 | 链接 text not null,
141 | 菜名 text not null,
142 | 主料_1 text not null,
143 | 主料_2 text,
144 | 主料_3 text,
145 | 主料_4 text,
146 | 主料_5 text,
147 | 主料_6 text,
148 | 主料_7 text,
149 | 主料_8 text,
150 | 主料_9 text,
151 | 主料_10 text,
152 | 主料_11 text,
153 | 主料_12 text,
154 | 主料_13 text,
155 | 主料_14 text,
156 | 主料_15 text,
157 | 主料_16 text,
158 | 主料_17 text,
159 | 主料_18 text,
160 | 主料_19 text,
161 | 主料_20 text,
162 | 辅料 text not null,
163 | 步骤 text not null,
164 | 收藏 INTEGER not null,
165 | 类型 text not null
166 | )"""
167 | self.cursor.execute(sql)
168 | self.mysql_db.commit()
169 |
170 | def insert_data(self, data):
171 | if not data:
172 | return
173 | sql = """insert ignore into 好豆网数据 values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
174 | %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
175 | self.cursor.execute(sql, data)
176 | self.mysql_db.commit()
177 |
178 | def __del__(self):
179 | self.mysql_db.close()
180 | print('数据库已关闭')
181 |
182 |
183 | class Spider:
184 | base_url = 'https://www.haodou.com'
185 | ajax = 'https://vhop.haodou.com/hop/router/rest.json'
186 | headers_ajax = {
187 | 'Accept': 'application/json, text/plain, */*',
188 | 'Accept-Encoding': 'gzip, deflate, br',
189 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
190 | 'Cache-Control': 'no-cache',
191 | 'Connection': 'keep-alive',
192 | 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
193 | 'DNT': '1',
194 | 'Host': 'vhop.haodou.com',
195 | 'Origin': 'https://www.haodou.com',
196 | 'Pragma': 'no-cache',
197 | 'Referer': 'https://www.haodou.com/',
198 | 'Sec-Fetch-Dest': 'empty',
199 | 'Sec-Fetch-Mode': 'cors',
200 | 'Sec-Fetch-Site': 'same-site',
201 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.43'
202 | '89.114 Safari/537.36 Edg/89.0.774.68'}
203 | headers_item = {
204 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,applicatio'
205 | 'n/signed-exchange;v=b3;q=0.9',
206 | 'Accept-Encoding': 'gzip, deflate, br',
207 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
208 | 'Cache-Control': 'no-cache',
209 | 'Connection': 'keep-alive',
210 | 'DNT': '1',
211 | 'Host': 'www.haodou.com',
212 | 'Pragma': 'no-cache',
213 | 'Referer': 'https://www.haodou.com/recipe/all/',
214 | 'Sec-Fetch-Dest': 'document',
215 | 'Sec-Fetch-Mode': 'navigate',
216 | 'Sec-Fetch-Site': 'none',
217 | 'Sec-Fetch-User': '?1',
218 | 'Upgrade-Insecure-Requests': '1',
219 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.43'
220 | '89.114 Safari/537.36 Edg/89.0.774.68'}
221 | form_first = {
222 | '_HOP_': None,
223 | 'from': 'mvue',
224 | 'adcode': '100000',
225 | 'appid': '100',
226 | 'Siger': None,
227 | 'uuid': '0',
228 | 'uid': '0',
229 | 'hduid': '0',
230 | 'vc': '177',
231 | 'vn': '1.0.0'}
232 | form_second = {'numbers': '[]',
233 | 'moduleId': '5d35709cfd96c61a103a13c2',
234 | 'id': None,
235 | '_HOP_': None,
236 | 'from': 'mvue',
237 | 'adcode': '100000',
238 | 'appid': '100',
239 | 'Siger': None,
240 | 'uuid': '0',
241 | 'uid': '0',
242 | 'hduid': '0',
243 | 'vc': '177',
244 | 'vn': '1.0.0',
245 | 'last': None}
246 |
247 | class Parameter:
248 |
249 | @staticmethod
250 | def get_hop(time_, sign):
251 | return f'{{"version":"1.0.0","action":"api.www.recipe.category","secret_id":"5722f877e4b0d4512e3fd872","c' \
252 | f'urrent_time":{time_},"sign":"{sign}"}}'
253 |
254 | @staticmethod
255 | def url_encode(text):
256 | return quote(text)
257 |
258 | @staticmethod
259 | def get_current_time():
260 | return str(time.time())[:10]
261 |
262 | def get_last(self, page, total):
263 | base = f'{{current:{page},total:{total},offset:{page * 40},limit:40}}'
264 | return self.url_encode(base)
265 |
266 | @staticmethod
267 | def get_sign(time_, siger_, last=None, id_=None):
268 | if last and id_:
269 | return (
270 | f"Siger{siger_}_HOP_.actionapi.www.recipe.category_HOP_.current_time{time_}_HOP_.secret_id5722f"
271 | f"877e4b0d4512e3fd872_HOP_.version1.0.0adcode100000appid100frommvuehduid0id{id_}last{last}mod"
272 | f"uleId5d35709cfd96c61a103a13c2numbers%5B%5Duid0uuid0vc177vn1.0.01bc0d50feafb484b863d4100a561"
273 | f"a9cf")
274 |
275 | else:
276 | return (
277 | f"Siger{siger_}_HOP_.actionapi.www.search.default_HOP_.current_time{time_}_HOP_.secret_id5722f8"
278 | f"77e4b0d4512e3fd872_HOP_.version1.0.0adcode100000appid100frommvuehduid0uid0uuid0vc177vn1.0.0"
279 | f"1bc0d50feafb484b863d4100a561a9cf")
280 |
281 | @staticmethod
282 | def get_siger():
283 | return time.strftime("%Y%m%d")
284 |
285 | def __init__(self, progress, tasks, length):
286 | self.session = HTMLSession()
287 | self.type_ = progress['type']
288 | self.page = progress['page']
289 | self.tasks = tasks
290 | self.length = length
291 | self.parameter = self.Parameter()
292 | self.state = False
293 | self.last = None
294 |
295 | def open_url(self, url, id_, page):
296 | current = 1 if page < 2 else page - 1
297 | if not self.last or page == 1:
298 | html = self.get_last(url)
299 | if self.state:
300 | return ['0']
301 | if page == 1:
302 | print('获取菜品列表成功:', url, page)
303 | return [i.attrs['href'] for i in html.find('div > a.lists')]
304 | current_time = self.parameter.get_current_time()
305 | siger = self.md5(self.md5(self.parameter.get_siger()))
306 | last = '{{"current":{},"total":{},"offset":{},"limit":40}}'.format(
307 | current, self.last, (page - 1) * 40)
308 | sign = self.md5(
309 | self.parameter.get_sign(
310 | current_time,
311 | siger,
312 | self.parameter.url_encode(last),
313 | id_))
314 | hop = self.parameter.get_hop(current_time, sign)
315 | form = copy.deepcopy(self.form_second)
316 | form['id'] = id_
317 | form['Siger'] = siger
318 | form['_HOP_'] = hop
319 | form['last'] = last
320 | response = self.session.post(
321 | self.ajax, headers=self.headers_ajax, data=form)
322 | if response.status_code != 200:
323 | print('获取菜品列表失败:', url, page)
324 | self.state = True
325 | return ['0']
326 | self.wait()
327 | print('获取菜品列表成功:', url, page)
328 | return self.get_url(response.html.html)
329 |
330 | def get_html(self, url, id_):
331 | headers = copy.deepcopy(self.headers_item)
332 | headers['Referer'] += id_
333 | response = self.session.get(url, headers=headers)
334 | if response.status_code != 200:
335 | print('获取菜品详情失败:', url)
336 | self.state = True
337 | return None
338 | self.wait()
339 | print('获取菜品详情成功:', url)
340 | return response.html
341 |
342 | def get_url(self, content):
343 | try:
344 | initial = json.loads(content)
345 | return ["/recipe/" + str(item["id"])
346 | for item in initial['data']['dataset']]
347 | except TypeError as e:
348 | for item in initial['data']['dataset']:
349 | print(type(item["id"]), item["id"])
350 | print(e)
351 | except KeyError:
352 | pass
353 | self.state = True
354 | return ['0']
355 |
356 | def get_data(self, html):
357 | if not html:
358 | return None
359 | try:
360 | name = html.find('div.content-right > h1.title-p')[0].text
361 | ingredients = [i.text for i in html.find('div.ingredients')]
362 | condiment = [i.text for i in html.find('div.condiment')]
363 | practices = [i.text for i in html.find('div.practices > div.pai')]
364 | favorite = html.find(
365 | 'div.read > div.cntFavorite:nth-child(2)')[0].text
366 | return [
367 | self._filter(name),
368 | ','.join(self._filter(condiment)),
369 | ','.join(self._filter(practices)),
370 | self._filter(favorite)
371 | ], self._filter(ingredients)
372 | except IndexError:
373 | return None
374 |
375 | def run(self, config, database):
376 | for i in self.tasks[self.type_:self.length]:
377 | id_ = self.get_id(i[0])
378 | start = max(self.page, i[2])
379 | config["Progress"]["page"] = start
380 | for j in range(start, i[3] + 1):
381 | if self.state:
382 | return
383 | url = self.open_url(i[0], id_, j)
384 | for x in url:
385 | if self.state:
386 | return
387 | html = self.get_html(self.base_url + x, id_)
388 |                     data, ingredients = self.get_data(html) or (None, None)  # get_data 失败时返回 None,避免解包报错
389 | if data:
390 | data = self.merge(x, data, i[1], ingredients)
391 | database.insert_data(data)
392 | else:
393 | self.state = True
394 | # break
395 | else:
396 | config["Progress"]["page"] += 1
397 | # break
398 | else:
399 | config["Progress"]["type"] += 1
400 | config["Progress"]["page"] = 0
401 | self.last = None
402 | return # 默认每次运行仅获取一种类型的数据,注释此代码可实现一次性爬取全部数据
403 | else:
404 | print('已完成全部爬取任务!')
405 |
406 | def get_last(self, url):
407 | response = self.session.get(url, headers=self.headers_item)
408 | if response.status_code != 200:
409 | self.state = True
410 | print('获取 last 异常:', url)
411 | return
412 | last = re.findall(
413 | re.compile(r'total:(\d+),'),
414 | response.html.html)
415 | if len(last) != 2:
416 | self.state = True
417 | print('获取 last 异常:', last)
418 | return
419 | self.wait()
420 | self.last = last[1]
421 | return response.html
422 |
423 | def get_id(self, url):
424 | id_ = re.findall(r'^https://www.haodou.com/recipe/all/(\d+$)', url)
425 | if not id_:
426 | self.state = True
427 | return
428 | return id_[0]
429 |
430 | @staticmethod
431 | def wait():
432 | time.sleep(random.random() + random.randint(2, 5))
433 | return
434 |
435 | @staticmethod
436 | def md5(text):
437 | hash_ = hashlib.md5()
438 | hash_.update(bytes(text, encoding='utf-8'))
439 | return hash_.hexdigest()
440 |
441 | @staticmethod
442 | def _filter(content):
443 | if isinstance(content, str):
444 | return content.replace('\n', '').strip()
445 | elif isinstance(content, list):
446 | return [i.replace('\n', '').strip() for i in content]
447 | else:
448 | raise ValueError
449 |
450 | def merge(self, link, data, type_, ingredients):
451 | try:
452 | id_ = re.findall(r'^/recipe/(\d+)$', link)[0]
453 | except IndexError:
454 | self.state = True
455 | return None
456 | item = [id_, self.base_url + link, type_]
457 | if len(ingredients) > 20:
458 | self.state = True
459 | print('主料信息过多:', self.base_url + link, '请联系开发者或自行修改代码!')
460 | return None
461 | while len(ingredients) < 20:
462 | ingredients.append(None)
463 | data[1:1] = ingredients
464 | item[2:2] = data
465 | return item
466 |
467 |
468 | class Core:
469 | def __init__(self, config, database, spider):
470 | self.config = config()
471 | if not self.config.check_all():
472 | print(f'请修改 {self.config.config_file} 配置文件')
473 | self.run = False
474 | else:
475 | self.mysql_config, self.progress_config, self.tasks_config, self.tasks_length = self.config.get_config()
476 | self.database = database(self.mysql_config)
477 | self.spider = spider(
478 | self.progress_config,
479 | self.tasks_config,
480 | self.tasks_length)
481 | self.run = True
482 |
483 | def start(self):
484 | if not self.run:
485 | print('未开始爬取数据!')
486 | return
487 | print('若程序报错请保留错误信息,并联系开发者!')
488 | self.database.create_table()
489 | config_new = self.config.new_config()
490 | self.spider.run(config_new, self.database)
491 | self.config.save_config()
492 |
493 | def __del__(self):
494 | print('程序已退出!')
495 |
496 |
497 | def main():
498 | corn = Core(Config, Database, Spider)
499 | corn.start()
500 |
501 |
502 | if __name__ == '__main__':
503 | main()
504 |
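
The request signature built in `Spider.open_url` has three parts: `Siger = md5(md5(当天日期))`, a `last` pagination JSON whose URL-encoded form is folded into the string from `Parameter.get_sign`, and `sign = md5(...)`, which is finally embedded in the `_HOP_` JSON from `Parameter.get_hop`. A condensed, self-contained sketch of that chain (the secret_id, moduleId and trailing constant are copied from the class above and may no longer match the live site):

    # Hedged sketch of the signing chain used by Spider.open_url above.
    import hashlib
    import time
    from urllib.parse import quote

    def _md5(text):
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def build_hop(category_id, page, total):
        current_time = str(time.time())[:10]           # 10-digit unix timestamp
        siger = _md5(_md5(time.strftime('%Y%m%d')))    # md5 of md5 of today's date
        current = 1 if page < 2 else page - 1
        last = '{{"current":{},"total":{},"offset":{},"limit":40}}'.format(
            current, total, (page - 1) * 40)
        raw = ('Siger{}_HOP_.actionapi.www.recipe.category_HOP_.current_time{}'
               '_HOP_.secret_id5722f877e4b0d4512e3fd872_HOP_.version1.0.0adcode100000appid100'
               'frommvuehduid0id{}last{}moduleId5d35709cfd96c61a103a13c2'
               'numbers%5B%5Duid0uuid0vc177vn1.0.01bc0d50feafb484b863d4100a561a9cf').format(
            siger, current_time, category_id, quote(last))
        sign = _md5(raw)
        hop = ('{{"version":"1.0.0","action":"api.www.recipe.category",'
               '"secret_id":"5722f877e4b0d4512e3fd872","current_time":{},"sign":"{}"}}').format(
            current_time, sign)
        return siger, last, hop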
--------------------------------------------------------------------------------
/淘宝/淘宝数据爬虫.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | import re
4 | import time
5 |
6 | import requests
7 | import xlwt
8 |
9 |
10 | def get_html(url, cookie):
11 | header = {
12 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrom'
13 | 'e/85.0.4183.83 Safari/537.36',
14 | 'cookie': cookie}
15 | html = requests.get(url=url, headers=header)
16 | html = html.content.decode('utf-8')
17 | return html
18 |
19 |
20 | def deal_data(html):
21 | find = re.compile(r'g_page_config = \{(.*)\};')
22 | data = re.findall(find, html)
23 | file = json.loads('{' + data[0] + '}')
24 | data = []
25 | for i in range(44):
26 | cache = [
27 | file['mods']['itemlist']['data']['auctions'][i]['raw_title'],
28 | file['mods']['itemlist']['data']['auctions'][i]['view_price'],
29 | file['mods']['itemlist']['data']['auctions'][i]['item_loc']]
30 | try:
31 | cache.append(file['mods']['itemlist']['data']
32 | ['auctions'][i]['view_sales'])
33 | except BaseException:
34 | cache.append(None)
35 | cache.append(file['mods']['itemlist']['data']['auctions'][i]['nick'])
36 | data.append(cache)
37 | return data
38 |
39 |
40 | def savexlsx(datalist):
41 | book = xlwt.Workbook(encoding='utf-8')
42 | sheet = book.add_sheet('爬取结果', cell_overwrite_ok=True)
43 | tap = ('描述', '价格', '发货地', '销量', '店铺')
44 | for i in range(5):
45 | sheet.write(0, i, tap[i])
46 | for i in range(len(datalist)):
47 | tap = datalist[i]
48 | for j in range(5):
49 | data_1 = tap[j]
50 | sheet.write(i + 1, j, data_1)
51 | book.save('淘宝爬虫结果.xlsx')
52 | print('文件已保存')
53 |
54 |
55 | def main():
56 | url = 'https://s.taobao.com/search?q={}&s={}'
57 | key = str(input('爬取关键字:'))
58 | page = int(input('爬取页数:'))
59 | cookie = str(input('粘贴cookie到此处:'))
60 | datalist = []
61 | try:
62 |         for i in range(page):
63 |             data = get_html(url.format(key, i * 44), cookie)
64 | data = deal_data(data)
65 | for j in range(len(data)):
66 | datalist.append(data[j])
67 | time.sleep(random.randrange(3, 7, 1))
68 | savexlsx(datalist)
69 | except BaseException:
70 | print('请检查 cookie 是否有误')
71 | print('程序结束')
72 |
73 |
74 | if __name__ == '__main__':
75 | main()
76 |
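
Taobao search results are addressed by an item offset rather than a page number: page `n` (counting from zero) corresponds to `s = n * 44`, which is also why `deal_data` expects 44 `auctions` entries per response. A minimal sketch of the URL construction used by `main`:

    # Hedged sketch: build the search URLs for the first `pages` result pages.
    from urllib.parse import quote

    def search_urls(keyword, pages):
        return ['https://s.taobao.com/search?q={}&s={}'.format(quote(keyword), i * 44)
                for i in range(pages)]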
--------------------------------------------------------------------------------
/美食天下/美食天下.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import time
3 | import random
4 | from bs4 import BeautifulSoup
5 | import pymysql
6 | import json
7 | import os
8 | import re
9 |
10 | HEADERS_1 = {
11 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,applicatio'
12 | 'n/signed-exchange;v=b3;q=0.9',
13 | 'Accept-Encoding': 'gzip, deflate, br',
14 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
15 | 'Cache-Control': 'no-cache',
16 | 'Connection': 'keep-alive',
17 | 'DNT': '1',
18 | 'Host': 'home.meishichina.com',
19 | 'Pragma': 'no-cache',
20 | 'Referer': 'https://www.meishichina.com/',
21 | 'Sec-Fetch-Dest': 'document',
22 | 'Sec-Fetch-Mode': 'navigate',
23 | 'Sec-Fetch-Site': 'same-site',
24 | 'Sec-Fetch-User': '?1',
25 | 'Upgrade-Insecure-Requests': '1',
26 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/8'
27 | '8.0.4324.150 Safari/537.36 Edg/88.0.705.63'}
28 |
29 | HEADERS_2 = HEADERS_1.copy()
30 | del HEADERS_2['Referer']
31 | DATABASE = 'deliciousFood'
32 |
33 | ROOT = os.getcwd() + '\\cache\\'
34 |
35 |
36 | # if not os.path.exists(ROOT):
37 | # os.mkdir(ROOT)
38 |
39 |
40 | def get_data():
41 | if os.path.exists('data.json'):
42 | with open('data.json', 'r', encoding='utf-8') as f:
43 | data = json.load(f)
44 | data = data['data']
45 | return data
46 | else:
47 | with open('data.json', 'w', encoding='utf-8') as f:
48 | data = {
49 | "data": [[0, "url_1", "demo_1", 100],
50 | [1, "url_2", "demo_2", 100]]
51 | }
52 | f.write(json.dumps(data))
53 | print('已在当前目录生成 data.json 文件')
54 | print('请在 data.json 文件输入程序必要信息后再运行本程序!')
55 | print(
56 | '格式: [索引(整数), 链接(字符串), 类型(字符串), 爬取页数(整数)]\n按格式在 data.json 输入相关信息,注意最外侧还有一对中括号')
57 | print('爬取任务未完成时不要修改 data.json')
58 | return None
59 |
60 |
61 | def check_data(data):
62 | if not data:
63 | return None
64 | _ = -1
65 | for i, j in enumerate(data):
66 | if j[0] - _ != 1:
67 | print(j, '索引错误')
68 | return None
69 | _ = i
70 | if not re.findall(
71 | r'^https://home.meishichina.com/recipe/[a-z0-9]*?/$',
72 | j[1]):
73 | print(j, '链接错误')
74 | return None
75 | if not j[2]:
76 | print(j, '类型错误')
77 | return None
78 | if j[3] < 1 or j[3] > 100:
79 | print(j, '爬取页数错误')
80 | return None
81 | data[i][1] = j[1] + 'page/{}/'
82 | return data
83 |
84 |
85 | HEADERS_3 = {
86 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q'
87 | '=0.8,application/signed-exchange;v=b3;q=0.9',
88 | 'accept-encoding': 'gzip, deflate, br',
89 | 'accept-language': 'zh-CN,zh;q=0.9',
90 | 'cache-control': 'no-cache',
91 | 'dnt': '1',
92 | 'pragma': 'no-cache',
93 | 'referer': 'https://home.meishichina.com/',
94 | 'sec-fetch-dest': 'document',
95 | 'sec-fetch-mode': 'navigate',
96 | 'sec-fetch-site': 'none',
97 | 'sec-fetch-user': '?1',
98 | 'upgrade-insecure-requests': '1',
99 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chro'
100 | 'me/87.0.4280.141 Safari/537.36'}
101 |
102 |
103 | def get_json():
104 | if os.path.exists('MySQL.json'):
105 | with open('MySQL.json', 'r') as f:
106 | template = json.load(f)
107 | host = template['host']
108 | user = template['user']
109 | password = template['password']
110 | return host, user, password
111 | else:
112 | with open('MySQL.json', 'w') as f:
113 | template = {'host': '', 'user': '', 'password': ''}
114 | f.write(json.dumps(template))
115 | print('已在当前目录生成 MySQL.json 文件')
116 | print('请在 MySQL.json 文件输入数据库信息后再运行本程序!')
117 | return None, None, None
118 |
119 |
120 | def create_db(host, user, password):
121 | try:
122 | db = pymysql.connect(host=host, user=user, password=password)
123 | except pymysql.err.OperationalError:
124 | print('连接数据库失败,请检查 MySQL.json 文件')
125 | exit()
126 | sql = f"create database {DATABASE} CHARACTER SET utf8mb4"
127 | cursor = db.cursor()
128 | try:
129 | cursor.execute(sql)
130 | except pymysql.err.ProgrammingError:
131 | pass
132 | db.commit()
133 | db.close()
134 |
135 |
136 | def create_table(db, cursor):
137 | # sql = """create table {}(
138 | # ID MEDIUMINT primary key,
139 | # 链接 text not null,
140 | # 菜名 text not null,
141 | # 食材 text not null,
142 | # 步骤 text not null,
143 | # 效果图 text not null)"""
144 | # # 图片数据 MEDIUMBLOB
145 | # for i in list_:
146 | # try:
147 | # cursor.execute(sql.format(i[2]))
148 | # db.commit()
149 | # except pymysql.err.OperationalError:
150 | # continue
151 | sql = """create table if not exists 美食天下(
152 | ID MEDIUMINT primary key,
153 | 链接 text not null,
154 | 菜名 text not null,
155 | 食材 text not null,
156 | 步骤 text not null,
157 | 效果图 text not null,
158 | 类型 text not null)"""
159 | # 图片数据 MEDIUMBLOB
160 | cursor.execute(sql)
161 | db.commit()
162 |
163 |
164 | def wait_time():
165 | time.sleep(random.random() + random.randint(1, 5))
166 |
167 |
168 | def get_urls(session, url, page):
169 | global HEADERS_1
170 | url = url.format(page)
171 | print('当前网址:', url)
172 | response = session.get(url, headers=HEADERS_1)
173 | if response.status_code == 200:
174 | soup = BeautifulSoup(response.content, 'lxml')
175 | urls = [i['href']
176 | for i in soup.select('ul > li > div.detail > h2 > a')]
177 | wait_time()
178 | return session, urls
179 | else:
180 | print(url, response.status_code)
181 | return None, None
182 |
183 |
184 | def open_url(session, url):
185 | global HEADERS_2
186 | response = session.get(url, headers=HEADERS_2)
187 | if response.status_code == 200:
188 | wait_time()
189 | return session, response.content
190 | else:
191 | print(url, response.status_code)
192 | return None, None
193 |
194 |
195 | def get_img(url):
196 | global HEADERS_3
197 | response = requests.get(url, headers=HEADERS_3)
198 | if response.status_code == 200:
199 | wait_time()
200 | return response.content
201 | else:
202 | print(response.status_code, url)
203 | return None
204 |
205 |
206 | def save_img(id_, img):
207 | if img:
208 | global ROOT
209 | root = os.path.join(ROOT, id_ + '.jpg')
210 | with open(root, 'wb') as f:
211 | f.write(img)
212 | print(f'已保存图片:{id_}')
213 | else:
214 | print(f'保存图片失败:{id_}')
215 |
216 |
217 | def image_data(id_):
218 | global ROOT
219 | root = os.path.join(ROOT, id_ + '.jpg')
220 | with open(root, 'rb') as f:
221 | img = f.read()
222 | return img
223 |
224 |
225 | def deal_data(html):
226 | soup = BeautifulSoup(html, 'lxml')
227 | try:
228 | link = soup.select('h1.recipe_De_title > a')[0]['href']
229 | title = soup.select('h1.recipe_De_title > a')[0]['title']
230 | img = soup.select('div.recipe_De_imgBox > a')[0].img['src']
231 | ingredients = [i.text for i in soup.select('fieldset.particulars b')]
232 | step = [i.text for i in soup.select(
233 | 'div.recipeStep > ul > li > div.recipeStep_word')]
234 | except IndexError:
235 | return 'Error'
236 | id_ = get_id(link)
237 | """需要下载图片到本地请取消注释"""
238 | # save_img(id_, get_img(img))
239 | data = [
240 | id_,
241 | link,
242 | title,
243 | '%s' %
244 | ','.join(ingredients),
245 | '%s' %
246 | ''.join(step).replace('"', "'"),
247 | img]
248 | if None not in data:
249 | return data
250 | print(data)
251 | return None
252 |
253 |
254 | def get_id(url):
255 | return re.findall(
256 | r'^https://home.meishichina.com/recipe-([0-9]*?).html',
257 | url)[0]
258 |
259 |
260 | def save_data(db, cursor, data, type_):
261 | save = True
262 | data.append(type_)
263 | for i in range(7):
264 | data[i] = '"' + data[i] + '"'
265 | sql = """insert into 美食天下 (ID, 链接, 菜名, 食材, 步骤, 效果图, 类型)
266 | values(%s)""" % ','.join(data)
267 | try:
268 | cursor.execute(sql)
269 | db.commit()
270 | print('已保存数据', data[0], data[2])
271 | except pymysql.err.IntegrityError:
272 | save = False
273 | # try:
274 | # if data[-1] and save:
275 | # sql = "insert into %s (图片数据) values (%s)"
276 | # args = (type_, pymysql.Binary(data[-1]))
277 | # cursor.execute(sql, args)
278 | # db.commit()
279 | # print('已保存图片', data[0], data[2])
280 | # except pymysql.err.IntegrityError:
281 | # save = False
282 | return save
283 |
284 |
285 | def save_process(progress):
286 | with open('progress.json', 'w') as f:
287 | f.write(json.dumps(progress))
288 |
289 |
290 | def main():
291 | print('除非发生未知异常,否则不要直接关闭程序')
292 | print('爬取任务未完成时不要修改 data.json')
293 | _ = get_data()
294 | crawler_data = check_data(_)
295 | host, user, password = get_json()
296 | if None in [crawler_data, host, user, password]:
297 | print('data.json 或 MySQL.json 文件内容错误')
298 | exit()
299 | create_db(host, user, password)
300 | db = pymysql.connect(
301 | host=host,
302 | user=user,
303 | password=password,
304 | db=DATABASE)
305 | cursor = db.cursor()
306 | create_table(db, cursor)
307 | over = False
308 | if os.path.exists('progress.json'):
309 | with open('progress.json', 'r') as f:
310 | progress = json.load(f)
311 | if progress['type'] == len(
312 | crawler_data) - 1 and progress['page'] > crawler_data[progress['type']][3]:
313 | print('已获取全部数据,现在可以修改 data.json 文件')
314 | over = True
315 | progress['type'] += 1
316 | progress['page'] = 1
317 | elif progress['type'] == len(crawler_data):
318 | print('已获取全部数据,现在可以修改 data.json 文件')
319 | over = True
320 | else:
321 | progress = {'type': 0, 'page': 1}
322 | _ = progress.copy()
323 | start_type = _['type']
324 | start_page = _['page']
325 | session = requests.Session()
326 | for item in crawler_data[start_type:]:
327 | if over:
328 | break
329 | progress['type'] = item[0]
330 | if progress['page'] > item[3]:
331 | progress['page'] = 1
332 | break # 单次运行只爬取一种类型
333 | for page in range(start_page, item[3] + 1):
334 | # if page - start_page >= 10:
335 | # """单次运行爬取10页,注释代码块可取消限制,修改代码可修改单次爬取页数"""
336 | # over = True
337 | time.sleep(random.random() + random.randint(5, 15))
338 | if over:
339 | break
340 | print('正在爬取 {} 的第 {} 页数据'.format(item[2], page))
341 | session, urls = get_urls(session, item[1], page)
342 | if session and urls:
343 | for info in urls:
344 |                     # break  # for testing only
345 | if over:
346 | break
347 | session, html = open_url(session, info)
348 | if session and html:
349 | data = deal_data(html)
350 | if data == 'Error':
351 | print('疑似无效链接:', info)
352 | continue
353 | elif data:
354 | result = save_data(db, cursor, data, item[2])
355 | if not result:
356 | over = True
357 | break
358 | else:
359 | over = True
360 | break
361 | else:
362 | over = True
363 | break
364 |                 # break  # for testing only; if left active it stops after one page and skips the progress update below
365 | progress['page'] = page if over else page + 1
366 | else:
367 | over = True
368 | break
369 | save_process(progress)
370 | db.close()
371 | print('程序已退出')
372 |
373 |
374 | if __name__ == '__main__':
375 | start_time = time.time()
376 | main()
377 | print('本次运行时间:{:.6f}'.format(time.time() - start_time))
378 |
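
main() resumes interrupted runs by writing a small {'type': ..., 'page': ...} cursor to progress.json before it exits and reloading it on the next start. A stripped-down sketch of that checkpoint pattern (file name and keys taken from the code above; the crawl itself is elided):

import json
import os

PROGRESS_FILE = 'progress.json'


def load_progress():
    # pick up where the previous run stopped, or start from the beginning
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, 'r') as f:
            return json.load(f)
    return {'type': 0, 'page': 1}


def save_progress(progress):
    # persist the cursor so a crash or manual stop loses at most one run's work
    with open(PROGRESS_FILE, 'w') as f:
        json.dump(progress, f)


progress = load_progress()
# ... crawl crawler_data[progress['type']:] starting from progress['page'] ...
save_progress(progress)
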
--------------------------------------------------------------------------------
/超能网/超能网文章内容爬虫.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 | from parsel import Selector
5 |
6 |
7 | def get_code(url):
8 | headers = {
9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,"
10 | "application/signed-exchange;v=b3;q=0.9",
11 | "Accept-Encoding": "gzip, deflate, br",
12 | "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
13 | "Cache-Control": "no-cache",
14 | "Connection": "keep-alive",
15 | "DNT": "1",
16 | "Host": "www.expreview.com",
17 | "Pragma": "no-cache",
18 |         "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="96", "Microsoft Edge";v="96"',
19 | "sec-ch-ua-mobile": "?0",
20 |         "sec-ch-ua-platform": '"Windows"',
21 | "Sec-Fetch-Dest": "document",
22 | "Sec-Fetch-Mode": "navigate",
23 | "Sec-Fetch-Site": "same-origin",
24 | "Sec-Fetch-User": "?1",
25 | "Upgrade-Insecure-Requests": "1",
26 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
27 | "Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62 "}
28 | resource = requests.get(url, headers=headers)
29 | if resource.status_code == requests.codes.ok:
30 | return resource.text
31 | return ""
32 |
33 |
34 | def selector(code):
35 | return Selector(text=code)
36 |
37 |
38 | def get_text(select):
39 | text = select.xpath("//div[@id='post_body']/p/text()").getall()
40 |     print('\n'.join(text))
41 |
42 |
43 | def main():
44 | # url = input('输入文章链接:')
45 | url = 'https://www.expreview.com/80774.html'
46 |     if re.match(r'https://www\.expreview\.com/\d+\.html', url):
47 | code = get_code(url)
48 | select = selector(code)
49 | get_text(select)
50 | else:
51 | print('文章链接格式错误!')
52 |
53 |
54 | if __name__ == '__main__':
55 | main()
56 |
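
get_text() pulls the article paragraphs out with parsel's XPath API. A self-contained sketch of the same extraction against an inline HTML snippet (the snippet is made up for illustration; the div id matches the selector used above):

from parsel import Selector

html = '''
<div id="post_body">
    <p>First paragraph.</p>
    <p>Second paragraph.</p>
</div>
'''

select = Selector(text=html)
# XPath, exactly as in get_text()
print(select.xpath("//div[@id='post_body']/p/text()").getall())
# the equivalent CSS selector, using parsel's ::text pseudo-element
print(select.css('div#post_body p::text').getall())
# both print: ['First paragraph.', 'Second paragraph.']
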
--------------------------------------------------------------------------------
/链家/链家二手房爬虫_1.py:
--------------------------------------------------------------------------------
1 | from pyppeteer import launch
2 | import asyncio
3 | import time
4 | import random
5 | import re
6 | from bs4 import BeautifulSoup
7 | import xlwt
8 |
9 |
10 | async def get_html(_url, _pg):
11 |     browser = await launch()
12 |     try:
13 |         page = await browser.newPage()
14 |         # register the stealth overrides so they run on every navigation; page.evaluate here would only patch about:blank
15 |         await page.evaluateOnNewDocument('''() => {
16 |             Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
17 |             window.navigator.chrome = { runtime: {} };
18 |             Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
19 |             Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5, 6] });
20 |         }''')
21 | for pg in range(1, _pg + 1):
22 | print('正在爬取第 %d 页数据' % pg)
23 |             await page.goto(_url + 'pg%d' % pg)  # open page pg of the listing
24 | await page.waitFor(random.randrange(2000, 4000, 200))
25 | html = await page.content()
26 | data = deal_data(html)
27 | save_data(data, pg)
28 | finally:
29 |         await browser.close()  # close the browser
30 |
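
get_html() mixes the stealth setup with pagination and saving. A minimal, self-contained sketch of just the pyppeteer part, showing that the anti-webdriver script must be registered before the first goto() so it is injected into every navigation (the launch option and the fetch helper name are illustrative):

import asyncio

from pyppeteer import launch


async def fetch(url):
    browser = await launch(headless=True)
    try:
        page = await browser.newPage()
        # registered before goto(), so it runs in every new document
        await page.evaluateOnNewDocument(
            '() => { Object.defineProperty(navigator, "webdriver", { get: () => undefined }); }')
        await page.goto(url)
        return await page.content()
    finally:
        await browser.close()


# html = asyncio.get_event_loop().run_until_complete(fetch('https://bj.lianjia.com/ershoufang/dongcheng/'))
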
31 |
32 | def deal_data(html):
33 | soup = BeautifulSoup(html, 'lxml')
34 | item = soup.select('ul.sellListContent > li > div.info.clear')
35 | data = []
36 | for i in item:
37 | url = i.find('div', class_='title').a['href']
38 | title = i.select('div.title > a')[0].text.strip()
39 | flood = [_.text.strip() for _ in i.select('div.flood > div > a')]
40 | address = i.select('div.address > div')[0].text.strip()
41 | follow_info = i.select('div.followInfo')[0].text.strip()
42 | tag = [_.text.strip() for _ in i.select('div.tag > span')]
43 | info = [_.text.strip() for _ in i.select('div.priceInfo > div')]
44 | cache = [
45 | url,
46 | title,
47 |             # location parts joined with '-'
48 |             '-'.join(flood),
49 |             address,
50 |             follow_info,
51 |             # tags joined with ','
52 |             ','.join(tag),
53 |             info[0],  # total price
54 |             info[1]]  # unit price
55 | data.append(cache)
56 | return data
57 |
58 |
59 | def save_data(data, pg):
60 | top = ['链接', '标题', '位置', '详情', '数据', '标签', '总价', '单价']
61 | excel = xlwt.Workbook(encoding='utf-8')
62 | sheet = excel.add_sheet('第%s页爬取结果' % pg, cell_overwrite_ok=True)
63 | for i in range(len(top)):
64 | sheet.write(0, i, top[i])
65 | for i in range(len(data)):
66 | for x, y in enumerate(data[i]):
67 | sheet.write(i + 1, x, y)
68 | excel.save('链家二手房爬取结果_%s.xls' % time.time())
69 |
70 |
71 | if __name__ == '__main__':
72 |     # Crawl with a headless browser (pyppeteer)
73 | url = input(
74 | '正确网址示例:https://bj.lianjia.com/ershoufang/dongcheng/\n错误网址示例:https://bj.lianjia.com/ershoufang/\n输入网址:')
75 | pg = int(input('输入爬取页数(1~100):'))
76 | if pg < 1 or pg > 100:
77 | print('爬取页数输入错误,本次运行只爬取第一页数据')
78 | pg = 1
79 |     if re.match(r'https://[a-z]+\.lianjia\.com/ershoufang/[a-z]+/$', url):
80 | asyncio.get_event_loop().run_until_complete(get_html(url, pg))
81 | else:
82 | print('网址输入错误')
83 |
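
save_data() writes one .xls workbook per page, each with a single sheet. A sketch of an alternative layout that collects every page into one workbook with one sheet per page (a design variation, not what the script above does; save_all and its pages argument are illustrative):

import time

import xlwt


def save_all(pages):
    # pages: list of (pg, rows) pairs, rows being the per-listing lists built by deal_data()
    top = ['链接', '标题', '位置', '详情', '数据', '标签', '总价', '单价']
    excel = xlwt.Workbook(encoding='utf-8')
    for pg, rows in pages:
        sheet = excel.add_sheet('第%s页爬取结果' % pg, cell_overwrite_ok=True)
        for col, name in enumerate(top):
            sheet.write(0, col, name)
        for row, item in enumerate(rows, start=1):
            for col, value in enumerate(item):
                sheet.write(row, col, value)
    excel.save('链家二手房爬取结果_%s.xls' % time.time())
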
--------------------------------------------------------------------------------