├── 58同城
│   └── 58同城网址获取.py
├── 99藏书网
│   ├── 99藏书网小说_1.py
│   ├── 99藏书网小说_2.py
│   └── 99藏书网小说_3.py
├── LICENSE
├── README.md
├── 丁香医生
│   └── 疾病标签.py
├── 京东
│   └── 京东商品爬虫_1.py
├── 从零开始的网络爬虫
│   ├── 01_基础网站爬虫.py
│   ├── 02_基础网站爬虫.py
│   ├── 03_基础网站爬虫.py
│   ├── 04_无HTTPS证书网站爬虫.py
│   ├── 05_HTTP认证网站爬虫.py
│   ├── 06_慢速网站爬虫.py
│   ├── 07_异步加载网站爬虫.py
│   ├── 08_动态页面渲染网站爬虫.js
│   ├── 08_动态页面渲染网站爬虫_1.py
│   ├── 08_动态页面渲染网站爬虫_2.py
│   ├── 09_无页码翻页网站爬虫_1.py
│   ├── 09_无页码翻页网站爬虫_2.py
│   ├── 10_异步智能页面网站爬虫.py
│   ├── 11_大批量动态页面网站爬虫.py
│   ├── 12_数据接口参数加密网站爬虫.py
│   ├── 13_模拟登陆网站爬虫_1.py
│   ├── 13_模拟登陆网站爬虫_2.py
│   ├── 15_CSS样式偏移网站爬虫.py
│   └── 21_模拟登陆网站爬虫.py
├── 代码示例
│   ├── JA3指纹.py
│   ├── JA3指纹破解.py
│   ├── URL参数修改.py
│   ├── 分布式爬虫示例
│   │   ├── 加入任务队列.py
│   │   └── 读取任务队列.py
│   ├── 加密解密.py
│   └── 重试装饰器示例.py
├── 其他网络爬虫
│   ├── GXNAS壁纸爬虫_1.py
│   ├── GXNAS壁纸爬虫_2.py
│   ├── myCancerGenome.py
│   ├── 下厨房网爬虫.py
│   ├── 北京市政务数据资源网爬虫.py
│   ├── 小米步数.py
│   ├── 彼岸图网爬虫.py
│   ├── 抖音无水印视频爬虫.py
│   ├── 有道翻译.py
│   ├── 百度百科爬虫项目.py
│   ├── 糗事百科爬虫.py
│   ├── 豆瓣Top250爬虫.py
│   └── 铅笔小说网爬虫.py
├── 前程无忧
│   ├── 51job爬虫_1.py
│   └── 51job爬虫_2.py
├── 哔哩哔哩
│   ├── B站弹幕爬虫.py
│   ├── B站评论爬虫.py
│   └── msyh.ttc
├── 好豆网
│   └── 好豆网爬虫.py
├── 淘宝
│   └── 淘宝数据爬虫.py
├── 美食天下
│   └── 美食天下.py
├── 超能网
│   └── 超能网文章内容爬虫.py
└── 链家
    └── 链家二手房爬虫_1.py

/58同城/58同城网址获取.py:
--------------------------------------------------------------------------------
from requests_html import HTMLSession
import json


def get_html():
    session = HTMLSession()
    html = session.get('https://www.58.com/changecity.html?catepath=chuzu')
    html.html.render()
    data_1 = html.html.find('.hot-city')
    data_2 = html.html.find('.content-city')
    url = {item.text: 'https:' + item.attrs['href'] for item in data_1}
    for item in data_2:
        url[item.text] = 'https:' + item.attrs['href']
    return url


def save(data):
    with open('area.txt', 'w+') as file:
        data = json.dumps(data)
        file.write(data)


def main():
    data = get_html()
    save(data)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/99藏书网/99藏书网小说_1.py:
--------------------------------------------------------------------------------
from CharacterFilter import clean
from pyppeteer import launch
import asyncio
import re
import time


async def get_html(url):
    """Scrape a single chapter page."""
    browser = await launch({'headless': True})
    page = await browser.newPage()
    await page.setViewport({'width': 1920, 'height': 1080})
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chro'
                            'me/89.0.4389.90 Safari/537.36')
    await page.evaluateOnNewDocument('''() =>{
        Object.defineProperties(navigator,{ webdriver:{ get: () => false } });
        window.navigator.chrome = { runtime: {}, };
        Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
        Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], });
    }''')
    await page.goto(url)
    title = await page.content()
    title = re.findall(re.compile(r'(.*?)_在线阅读_九九藏书网'), title)
    if not title:
        await browser.close()
        print('出错了')
        return
    title = clean(title[0])
    for _ in range(20):  # Increase the number of iterations if the captured content is incomplete
        await page.keyboard.press('End')
        await page.waitFor(500)
    await page.waitFor(3000)
    png = await page.querySelector('div#content')
    await png.screenshot(path=f'{title}.png')
    await browser.close()


def main():
    url = input('输入网址:')
    start = time.time()
    if re.findall(
            re.compile(r'^http://www.99csw.com/book/[0-9]+?/[0-9]+?.htm$'),
            url):
        asyncio.get_event_loop().run_until_complete(get_html(url))
    print('程序运行时间:{:.6f}'.format(time.time() - start))


if __name__ == '__main__':
    """Scrape the chapter body and save it as an image."""
    main()

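Note: 99藏书网小说_1.py imports `clean` from a local `CharacterFilter` module that does not appear in the directory tree above. The script only uses it to turn the chapter title into a safe file name for the screenshot, so a minimal stand-in along the following lines should be enough to run it. This sketch is an assumption for illustration, not the repository's actual implementation.

import re


def clean(name):
    """Hypothetical stand-in for CharacterFilter.clean: strip characters that are illegal in file names."""
    return re.sub(r'[\\/:*?"<>|\r\n]', '', name).strip()


if __name__ == '__main__':
    print(clean('第1章: 开端?'))  # -> '第1章 开端'
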
--------------------------------------------------------------------------------
/99藏书网/99藏书网小说_2.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
from fake_useragent import FakeUserAgent
import base64
import time
import re


class Spider:
    def __init__(self):
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                      'application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'DNT': '1',
            'Host': 'www.99csw.com',
            'Pragma': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': FakeUserAgent().chrome}

    def get_html(self, url):
        response = requests.get(url, headers=self.headers)
        if response.status_code == 200:
            return response.content.decode('utf-8')
        return None

    def get_data(self, tree):
        if tree is None:
            return None
        meta = [int(i) for i in self.get_meta(tree)]
        text = self.get_text(tree)
        if len(meta) != len(text) - 1:
            return None
        result = ['' for _ in meta]
        x = 0
        for i, j in enumerate(meta):
            if j < 3:
                result[j] = text[i + 1]
                x += 1
            else:
                result[j - x] = text[i + 1]
                x += 2
        return result

    @staticmethod
    def get_meta(tree):
        meta = tree.xpath(".//meta[5]/@content")[0]
        if meta:
            data = base64.b64decode(meta).decode("ascii")
            return re.split(r'[A-Z]+%', data)
        return None

    @staticmethod
    def get_text(tree):
        if tree is None:
            return None
        content = [tree.xpath(".//div[@id='content']/h2/text()")[0]]
        content[1:1] = [i.text for i in tree.xpath(
            ".//div[@id='content']/div")]
        return content

    def run(self, url):
        html = self.get_html(url)
        tree = etree.HTML(html) if html else None
        data = self.get_data(tree)
        for i in data:
            print(i)


def main():
    url = input('输入网址:')
    if not re.findall(r'http://www.99csw.com/book/[0-9]+/[0-9]+.htm', url):
        print('网址格式错误!')
    else:
        start = time.time()
        spider = Spider()
        spider.run(url)
        print('程序运行时间:{:.6f}'.format(time.time() - start))


if __name__ == '__main__':
    main()

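Note: the trickiest part of 99藏书网小说_2.py above (and of 99藏书网小说_3.py below) is the index arithmetic that undoes the site's paragraph shuffling: the fifth `<meta>` tag carries a base64 string which, once decoded and split on `[A-Z]+%`, tells the script where each scraped `<div>` belongs in reading order. The following self-contained sketch replays that logic; the offsets and paragraph strings are invented purely to illustrate the rule and are not taken from a real page.

import base64
import re


def decrypt(meta_content):
    """Mirror of the scripts' decode step: base64 -> ASCII -> split on [A-Z]+%."""
    data = base64.b64decode(meta_content).decode('ascii')
    return [int(i) for i in re.split(r'[A-Z]+%', data)]


def reorder(meta, paragraphs):
    """Place each scraped paragraph at the reading-order index encoded in meta."""
    result = [''] * len(meta)
    x = 0
    for i, j in enumerate(meta):
        if j < 3:
            result[j] = paragraphs[i]
            x += 1
        else:
            result[j - x] = paragraphs[i]
            x += 2
    return result


if __name__ == '__main__':
    # Invented offsets that form a valid permutation under the rule above.
    raw = '3AB%0CD%8EF%1GH%2IJ%11'
    meta_content = base64.b64encode(raw.encode('ascii')).decode('ascii')
    shuffled = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6']
    print(reorder(decrypt(meta_content), shuffled))
    # -> ['p2', 'p4', 'p5', 'p1', 'p6', 'p3']
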
--------------------------------------------------------------------------------
/99藏书网/99藏书网小说_3.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
from fake_useragent import FakeUserAgent
import time
import re
import base64


def get_meta(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'DNT': '1',
        'Host': 'www.99csw.com',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': FakeUserAgent().chrome}
    response = requests.get(url, headers=headers)
    tree = etree.HTML(response.content)
    return tree.xpath('//meta[5]/@content')[0]


def decrypt(text):
    data = base64.b64decode(text).decode("ascii")
    return [int(i) for i in re.split(r'[A-Z]+%', data)]


def get_data(text):
    data = []
    i = 0
    for j in text:
        if j < 3:
            data.append(j)
            i += 1
        else:
            data.append(j - i)
            i += 2
    return data


def main():
    url = input('输入网址:')
    start = time.time()
    meta = get_meta(url)
    data = get_data(decrypt(meta))
    print(data)
    print('程序运行时间:{:.6f}'.format(time.time() - start))


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Python Web Scraping Examples

**A collection of data-scraping examples for a wide range of websites. All spider code is open source and may be used for study and personal use; it must not be used for illegal purposes.**

_**2021-12-19: the project will continue to be updated from time to time.**_

## Case Studies

1. Basic website spider (1)
2. Basic website spider (2)
3. Basic website spider (3)
4. Site without a valid HTTPS certificate
5. Site behind HTTP authentication
6. Slow-responding site
7. Asynchronously loaded site
8. Dynamically rendered pages
9. Pagination without page numbers
10. Asynchronous "smart" pages
11. Dynamic pages in large batches
12. Data API with encrypted request parameters
13. Simulated login
14. CSS style-offset obfuscation

## Spider List

* Douban Top 250 movies
* JD.com product data
* Taobao product data
* Qiushibaike jokes
* Netbian (彼岸图网) wallpapers
* GXNAS wallpapers
* Douyin watermark-free videos
* Bilibili video danmaku
* Bilibili video comments
* Pencil Novel (铅笔小说网) fiction
* Bcy (半次元) watermark-free images
* Xiachufang (下厨房) recipes
* Baidu Baike project
* Beijing municipal open-data portal
* 51job job listings
* My Cancer Genome database
* Youdao online translation
* Lianjia second-hand housing data
* Meishi Tianxia (美食天下) recipes
* DXY (丁香医生) medical data
* 99csw (九九藏书网) novels
* Haodou (好豆网) recipes

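Note: the 丁香医生 spider below relies on a common pattern: the data is not in the visible HTML but in a JSON literal assigned to `window.$$data` inside a `<script>` tag, which the script pulls out with a regular expression and parses with `json.loads`. Here is a minimal, self-contained sketch of that extraction step; the embedded HTML snippet and field values are invented for illustration and do not reproduce the real dxy.com payload.

import json
import re

SAMPLE_HTML = '''
<script>
window.$$data={"diseases": [{"tag_list": [{"tag_id": 1, "tag_name": "感冒"}]}]}
</script>
'''


def extract_state(html):
    """Pull the JSON literal assigned to window.$$data out of a page."""
    match = re.search(r'window\.\$\$data=(\{.*\})', html)
    if not match:
        raise ValueError('window.$$data not found')
    return json.loads(match.group(1))


if __name__ == '__main__':
    state = extract_state(SAMPLE_HTML)
    print(state['diseases'][0]['tag_list'][0]['tag_name'])  # -> 感冒
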
--------------------------------------------------------------------------------
/丁香医生/疾病标签.py:
--------------------------------------------------------------------------------
import json
import re

from fake_useragent import FakeUserAgent
from requests_html import HTMLSession


def get_link(session, headers):
    url = 'https://dxy.com/diseases'
    response = session.get(url, headers=headers)
    link = [
        i.attrs['href']
        for i in response.html.find('div.section-filter-box-wrap.normal > a')
    ]
    return session, link


def get_disease_link(session, link, headers):
    json_re = re.compile(r'window.\$\$data=(.*)')
    disease = []
    tag_name = []
    for i in link:
        response = session.get(i, headers=headers)
        script = response.html.find('script')
        if not script[3]:
            print('匹配 script 失败!')
            break
        json_ = re.findall(json_re, script[3].text)
        if not json_:
            print('JSON 格式化失败!')
            break
        data = json.loads(json_[0])
        for j in data['diseases'][1:]:
            for x in j['tag_list']:
                tag_id = x['tag_id']
                tag_name.append(x['tag_name'])
                disease.append(tag_id)
        break  # debug code
    return session, disease, tag_name


def get_tag(session, disease, headers):
    url = 'https://dxy.com/disease/{}/detail'
    data = []
    for i in disease:
        response = session.get(url.format(i), headers=headers)
        data.append([j.text for j in response.html.find(
            'div.tag-content-tags > a')])
        break  # debug code
    return data


def main():
    session = HTMLSession()
    headers = {
        'user-agent': FakeUserAgent().chrome,
        'referer': 'https://dxy.com/',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,applic'
                  'ation/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'upgrade-insecure-requests': '1'}
    session, link = get_link(session, headers)
    session, disease, tag_name = get_disease_link(session, link, headers)
    data = get_tag(session, disease, headers)
    # if len(tag_name) != len(data):
    #     print('爬取发生异常!')
    #     exit()
    print(data)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/京东/京东商品爬虫_1.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
from pyppeteer.launcher import launch
import asyncio
import time
import random
from fake_useragent import FakeUserAgent
import csv


def get_url(keyword, pages):
    url_list = []
    while 1 <= pages <= 100:
        if pages == 1:
            url = 'https://search.jd.com/Search?keyword={}'.format(
                keyword)
        else:
            url = 'https://search.jd.com/Search?keyword={}&page={}'.format(
                keyword, pages * 2 - 1)
        pages -= 1
        url_list.append(url)
    if not url_list:
        raise ValueError('页数输入错误')
    url_list.reverse()
    return url_list


def deal_data(html):
    soup = BeautifulSoup(html, 'lxml')
    data = []
    for item in soup.select('ul.gl-warp.clearfix > li.gl-item'):
        _ = []
        cache = item.select('div.p-price > strong > i')
        price = cache[0].text.strip() if cache else None
        cache = item.select('div.p-name.p-name-type-2 > a')
        link = 'https:' + cache[0]['href'] if cache else None
        cache = item.select('div.p-name.p-name-type-2 > a > em')
        if cache and len(cache[0].text) <= 4:
            cache = item.select('div.p-name.p-name-type-2 > a > i')
        describe = cache[0].text.replace('\n', '').strip() if cache else None
        cache = item.select('div.p-commit > strong > a')
        evaluate = cache[0].text.strip() if cache else None
        cache = item.select('div.p-shop > span > a')
        shop_name = cache[0].text.strip() if cache else None
        cache = item.select('div.p-icons > i')
        label = [i.text.strip() for i in cache] if cache else None
        _.append(link)
        _.append(price)
        _.append(describe)
        _.append(evaluate)
        _.append(shop_name)
        _.append(label)
        data.append(_)
    return data


def save_data(data):
    with open(f'{time.time()}.csv', 'w', encoding='utf-8', newline='') as f:
        f = csv.writer(f, delimiter=',')
        f.writerows(data)


async def get_html(url):
    browser = await launch()
    page = await browser.newPage()
    await page.setUserAgent(FakeUserAgent().chrome)
    await page.evaluateOnNewDocument('Object.defineProperty(navigator, "webdriver", {get: () => undefined})')
    for i in url:
        await page.goto(i)
        await page.waitFor(random.randrange(2, 5, 1))
        for _ in range(12):
            await page.keyboard.press('PageDown')
            time.sleep(random.randrange(1, 4, 1))
        html = await page.content()
        data = deal_data(html)
        save_data(data)
    await browser.close()


def main():
    kw = input('搜索关键字:')
    pages = int(input('爬取页数:'))
    urls = get_url(kw, pages)
    asyncio.get_event_loop().run_until_complete(get_html(urls))


if __name__ == '__main__':
    """Scrape with a headless browser."""
    main()

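Note: `save_data()` in the JD spider above writes one CSV file per results page with no header row, so the columns are documented only by the order in which `deal_data()` appends them. If named columns are wanted, a small variant such as the sketch below could be used instead; the English header names and the sample row are assumptions chosen to match the order link, price, describe, evaluate, shop_name, label.

import csv
import time


def save_data_with_header(rows):
    """Write scraped rows to a timestamped CSV file, with a header row first."""
    header = ['link', 'price', 'describe', 'evaluate', 'shop_name', 'label']
    with open(f'{int(time.time())}.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(header)
        writer.writerows(rows)


if __name__ == '__main__':
    # Made-up sample row, in the same field order as deal_data() produces.
    save_data_with_header([
        ['https://item.jd.com/100.html', '199.00', '示例商品', '1万+', '示例旗舰店', ['自营']],
    ])
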
--------------------------------------------------------------------------------
/从零开始的网络爬虫/01_基础网站爬虫.py:
--------------------------------------------------------------------------------
import urllib.request
from bs4 import BeautifulSoup
import re
import sqlite3
import time


def get_html(url):
    template = url + 'page/'
    html = ''
    for page in range(1, 11):
        url = template + str(page)
        response = open_url(url)
        html += response
    return html


def open_url(url):
    response = urllib.request.urlopen(url)
    return response.read().decode('utf-8')


def get_data(html):
    findname = re.compile(
        r'(.*?) - (.*?)')
    findtype = re.compile(
        r'