├── .gitattributes
├── .gitignore
├── README.md
├── 工具
│   └── 刷Github访问量.py
├── 第1章-基本库requests和re的使用
│   └── 爬取电影列表.py
├── 第2章-网页数据的提取
│   ├── BeautifulSoup实战案例一.py
│   ├── BeautifulSoup实战案例二.py
│   ├── BeautifulSoup库的使用.py
│   ├── PyQuery实战案例.py
│   ├── PyQuery库的使用.py
│   ├── Xpath实战案例.py
│   ├── Xpath解析HTML页面.py
│   └── test.html
├── 第3章-进程线程和协程的使用
│   ├── 协程的使用.py
│   ├── 多线程和协程的练习.py
│   ├── 多线程的使用.py
│   └── 多进程的使用.py
├── 第4章-python操作主流数据库
│   ├── MongoDB数据库.py
│   ├── MySQL数据库.py
│   ├── Redis数据库.py
│   └── 三大数据库的案例.py
├── 第5章-selenium和Scrapy
│   ├── __init__.py
│   ├── base_scrapy
│   │   ├── base_scrapy
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-311.pyc
│   │   │   │   ├── items.cpython-311.pyc
│   │   │   │   ├── pipelines.cpython-311.pyc
│   │   │   │   └── settings.cpython-311.pyc
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       ├── __pycache__
│   │   │       │   ├── __init__.cpython-311.pyc
│   │   │       │   └── joke.cpython-311.pyc
│   │   │       └── joke.py
│   │   └── scrapy.cfg
│   ├── base_scrapy_plus
│   │   ├── __init__.py
│   │   ├── base_scrapy_plus
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-311.pyc
│   │   │   │   ├── items.cpython-311.pyc
│   │   │   │   ├── middlewares.cpython-311.pyc
│   │   │   │   ├── pipelines.cpython-311.pyc
│   │   │   │   └── settings.cpython-311.pyc
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   ├── spiders
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __pycache__
│   │   │   │   │   ├── __init__.cpython-311.pyc
│   │   │   │   │   └── blogspider.cpython-311.pyc
│   │   │   │   └── blogspider.py
│   │   │   └── start.py
│   │   └── scrapy.cfg
│   ├── selenium入门.py
│   ├── selenium练习—模拟qq邮箱登录.py
│   ├── 案例-爬取某上市公司网站.py
│   └── 案例-爬取网易新闻.py
├── 第6章-JS基础
│   ├── Flask服务.py
│   ├── jsonp页面.html
│   └── templates
│       ├── JQuery发送AJAX请求.html
│       └── success.html
├── 第7章-JS逆向
│   ├── B站逆向
│   │   ├── Tkinter.py
│   │   ├── cookie加密.py
│   │   ├── 代理.py
│   │   ├── 刷播放量(单线程).py
│   │   ├── 多线程.py
│   │   └── 请求头加密.py
│   ├── py里面执行js代码.py
│   ├── rsa.privatekey.pem
│   ├── rsa.publickey.pem
│   ├── 抠出来的代码文件.js
│   ├── 招标网站
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── 爬取数据.py
│   │   ├── 请求参数加密.js
│   │   └── 返回数据解密.js
│   ├── 注意点.md
│   ├── 看准网
│   │   └── 爬取数据.py
│   ├── 网易云音乐
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── 网易云-扣代码.js
│   │   └── 网易云下载音乐.py
│   ├── 网易有道翻译
│   │   └── 发送请求.py
│   └── 逆向常见加密算法.py
├── 第8章-JS补环境
│   └── 抖音
│       ├── 分析移民问题
│       │   ├── comments.csv
│       │   ├── 分析分数线.py
│       │   ├── 分析评论.py
│       │   ├── 存储到mysql数据库.py
│       │   ├── 测试.py
│       │   ├── 爬取分数线.py
│       │   └── 爬取评论.py
│       └── 接单
│           └── 扣代码.js
└── 第9章-APP逆向
    └── 代练通
        ├── Heroes.py
        ├── 分析数据.py
        ├── 数据库操作.py
        └── 爬取数据.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | node_modules
3 | crypto-js
4 | see you again.m4a
5 | *.cpython-311.pyc
6 | *.xlsx
7 | *.m4a
8 |
9 | /哈理工教务在线/
10 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python爬虫🕷️
2 |
3 | ## 介绍📘
4 |
5 | - **🧭 参考资料:** 崔大出版的书[Python3爬虫](https://github.com/Python3WebSpider/Python3WebSpider)和路飞的课程。
6 |
7 | - 📅 **时间:** 所有案例和代码均为`2023年10月`—`2024年1月`的最新案例
8 | - 📝 **注释:** 代码附有详细注释
9 |
10 | ## 即将推出🔜
11 |
12 | 1. 所有爬虫案例整理成一个**HTML页面**,方便学习。
13 | 2. 有实际应用价值的案例统一整理成**exe程序**或**部署在网站**上
14 |
15 | ## 警告⚠️
16 |
17 | 请不要将本仓库所提供的代码或技术用于**任何可能影响目标网站正常运行**的用途
18 |
19 | 进行爬虫学习过程中,务必遵守相关法律法规。爬虫普法参考视频 [如何避免面向监狱编程](https://www.bilibili.com/video/BV1d64y1N7LW/)
20 |
21 |
--------------------------------------------------------------------------------
/工具/刷Github访问量.py:
--------------------------------------------------------------------------------
1 | from selenium.webdriver import Chrome
2 | from selenium.webdriver.chrome.options import Options
3 |
4 | """
5 | 使用selenium无头浏览器反复访问Github主页,玩玩就行了。刷徽标(仓库)的访问量、刷fork次数也是一样的道理:fork完再删,删完再fork。
6 | """
7 |
8 | # 无头浏览器
9 | opt = Options()
10 | opt.add_argument("--headless")
11 | opt.add_argument('--disable-gpu')
12 |
13 | # todo:自己的主页或者仓库地址
14 | url = "https://github.com/AZCodingAccount"
15 | noHeaderDriver = Chrome(options=opt)
16 | try:
17 | for i in range(0, 51):
18 | noHeaderDriver.get(url)
19 |
20 | # 隐式等待主要作用于查找元素时的超时,get本身会阻塞到页面加载完成
21 | noHeaderDriver.implicitly_wait(5)
22 |
23 | print(f"访问第{i + 1}次")
24 | except Exception as e:
25 | print("出现异常,异常原因", e)
26 | finally:
27 | # 关闭浏览器,释放资源
28 | noHeaderDriver.quit()
--------------------------------------------------------------------------------
/第1章-基本库requests和re的使用/爬取电影列表.py:
--------------------------------------------------------------------------------
1 | # 导入模块
2 | import requests
3 | import re
4 | # 这里简单的解析打印了一下相关内容,后续没有封装,主要正则太麻烦了,后面使用一些解析库就行了
5 |
6 | # 定义函数封装爬取的方法
7 | def get_one_page(url):
8 | headers = {
9 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
10 | }
11 | # 状态码为200才返回页面文本,否则返回None(也可以再包一层try-except捕捉网络异常)
12 | response = requests.get(url, headers=headers)
13 | if response.status_code == 200:
14 | return response.text
15 | return None
16 |
17 |
18 | # 定义接收爬取回来的数据的方法
19 | def get_data():
20 | # 抓取前10页数据
21 | for page in range(1, 11):
22 | url = f'https://ssr1.scrape.center/page/{page}'
23 | one_page = get_one_page(url)
24 | parse_one_page(one_page)
25 |
26 |
27 | total_title_list = []
28 |
29 | # 利用正则解析爬取回来的字符串
30 | def parse_one_page(html):
31 | title_pattern = re.compile('<h2.*?class="m-b-sm">(.*?)</h2>')
32 | # 这里之所以会匹配到\n和空字符串,原因在于.*?可能会匹配空和\n,使用strip函数去掉即可
33 | score_pattern = re.compile('<p.*?class="score.*?">(.*?)</p>', re.S)
34 | # search为第一次查找,findall查找全部
35 | scores = re.findall(score_pattern, html)
36 | for score in scores:
37 | score = score.strip()
38 | print(f"score为:{score}")
39 | time_pattern = re.compile(r'\d{4}-\d{2}-\d{2}\s?上映')
40 | times = re.findall(time_pattern, html)
41 | for time in times:
42 | print(f"time为:{time}")
43 | title = re.findall(title_pattern, html)
44 | total_title_list.append(title)
45 |
46 |
47 | get_data()
48 | # 这是对标题的处理
49 | i = 0
50 | for idx1, title_list in enumerate(total_title_list):
51 | for idx2, title in enumerate(title_list):
52 | prev = str(i) + ':'
53 | total_title_list[idx1][idx2] = prev + title
54 | i = i + 1
55 |
56 | result1 = ''.join([str(item) for item in total_title_list])
57 | result = result1.replace(', ', '\n').replace('\'', '').replace('[', '').replace(']', '')
58 | print(result)
59 |
--------------------------------------------------------------------------------
/第2章-网页数据的提取/BeautifulSoup实战案例一.py:
--------------------------------------------------------------------------------
1 | # 导入请求和解析数据模块
2 | import time
3 |
4 | import requests
5 | from datetime import datetime, timedelta
6 | from bs4 import BeautifulSoup
7 |
8 | # 采用新发地这个网站做演示。由于这个网站的数据是用JS动态加载的,可以直接向它的数据接口发请求,简单起见直接请求json数据再解析即可
9 | # 简单一点,就直接算蔬菜大类下一个小类的平均价格了,因为如果不传品类id的话,这个接口默认返回的就是蔬菜类的数据
10 | # 当然,还可以拿到所有品类的详细数据,导出做各种数据分析,这里就不这样做了,毕竟是练习bs4库(虽然网站改版了没有练习到)
11 | # 发送请求
12 | url = 'http://www.xinfadi.com.cn/getPriceData.html'
13 | headers = {
14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
15 | }
16 | # 定义页数,总价格,查找时间
17 | page_count = 1
18 | sum_avg_Price = 0
19 | pubDateEndTime = datetime.today().strftime("%Y/%m/%d")
20 | pubDateStartTime = (datetime.today() - timedelta(days=30)).strftime("%Y/%m/%d")
21 | # 选择蔬菜->水菜这个品类来进行统计,统计最近30天的平均价格(没办法不传品类id,不传品类id不能通过时间段请求)
22 | data = {
23 | 'limit': 20,
24 | 'current': page_count,
25 | "pubDateStartTime": pubDateStartTime,
26 | "pubDateEndTime": pubDateEndTime,
27 | "prodPcatid": 1186,
28 | "prodCatid": 1199
29 | }
30 | response = requests.post(url, headers=headers, data=data)
31 | total = response.json()['count']
32 | # 定义一个while循环爬取数据,一次爬取20条
33 | while response.json()['list']:
34 | # 睡眠0.5秒以免给别人的服务器造成负担
35 | time.sleep(0.5)
36 | # 指针+1
37 | page_count = page_count + 1
38 | # 拿出来每个品类的avgPrice,然后求一个平均
39 | goodsList = response.json()['list']
40 | for goods in goodsList:
41 | sum_avg_Price += float(goods['avgPrice'])
42 | # 更新请求体重发请求
43 | data = {
44 | 'limit': 20,
45 | 'current': page_count,
46 | "pubDateStartTime": pubDateStartTime,
47 | "pubDateEndTime": pubDateEndTime,
48 | "prodPcatid": 1186,
49 | "prodCatid": 1199
50 | }
51 | response = requests.post(url, headers=headers, data=data)
52 |
53 | print(
54 | f"根据爬虫数据显示:最近一个月新发地蔬菜大类下的水菜数据一共有{page_count - 1}页{total}条,平均销售价格为:{round(sum_avg_Price / total, 2)}")
55 |
--------------------------------------------------------------------------------
/第2章-网页数据的提取/BeautifulSoup实战案例二.py:
--------------------------------------------------------------------------------
1 | # 爬取优美图库的图片,因为之前那个菜市场直接请求的接口,没有练习到BS4。BS4这个库常用的也就是find和find_all,select也可以用吧,节点啥的感觉用不到
2 | # 需求:爬取可爱分类下的10张图片,存储到当前目录下面的images文件夹下面,文件命名要与网站一致
3 | # 导入模块
4 | import time
5 |
6 | import requests
7 | from bs4 import BeautifulSoup
8 |
9 | # 请求数据
10 | response = requests.get("https://www.umei.cc/weimeitupian/keaitupian/")
11 | response.encoding = "utf-8"
12 | # 如果只是爬取图片的话直接找img里面的data-original属性就可以了,但是我还想要他的图片名,还需要找title那个div标签
13 | # 提取所有a里面的href属性,初始化
14 | html = BeautifulSoup(response.text, 'html.parser')
15 | divs = html.find_all('div', attrs={'class': 'item masonry_brick'})
16 | # 定义一个计数器,读取到10个就停
17 | count = 0
18 | for div in divs:
19 | time.sleep(0.2)
20 | if count >= 10:
21 | break
22 | image_src = div.find('img').get('data-original')
23 | image_name = div.find('img').get('alt').split(' ')[0]
24 | # 发送一次请求获取图片的字节数据
25 | image = requests.get(image_src).content
26 | # 把图片保存到文件夹里面
27 | with open(f"./images/{image_name}.jpg", 'wb') as f:
28 | f.write(image)
29 | print(f"{image_name}.jpg图片写入成功")
30 | count += 1
31 | print("写入完成")
--------------------------------------------------------------------------------
/第2章-网页数据的提取/BeautifulSoup库的使用.py:
--------------------------------------------------------------------------------
1 | import re
2 | from bs4 import BeautifulSoup
3 |
4 | """
5 | BeautifulSoup这个选择器可以选择很多解析器,html.parser,lxml等等。
6 | 就是根据标签名和属性值获取,获取多了就是一个列表再遍历,还想再往下获取就再find。
7 | 可以使用css选择器 (select方法)。个人比较习惯用pyquery,这个感觉没啥特点
8 | """
9 | html = """
10 | <html><head><title>The Dormouse's story</title></head>
11 | <body>
12 | <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
13 | <p class="story">Once upon a time there were three little sisters; and their names were
14 | <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
15 | <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
16 | <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
17 | and they lived at the bottom of a well.</p>
18 | <p class="story">...</p>
19 | """
20 | # 解析html页面
21 | soup = BeautifulSoup(html, 'lxml')
22 | # 把html标准化输出
23 | # print(soup.prettify())
24 | print("--------------------------------")
25 |
26 | # 1.节点选择器,相较于xpath更加简洁,直接选取节点
27 | print(type(soup.title)) # 打印title标签的类型
28 | print(soup.title) # 打印title标签
29 | print(soup.head) # 会嵌套获取标签
30 | print(soup.p) # 只会选中相同标签的第一个
31 | print(soup.title.string) # 取出html中的title标签里面的内容,如果是父标签且没内容,就是None
32 | print("--------------------------------")
33 |
34 | # 2.获取名称或者属性
35 | print(soup.p.attrs) # 获取第一个p节点的所有属性
36 | print(soup.title.name) # 获取当前节点的名称
37 | print(soup.p['name']) # 获取当前节点的name属性的值,可能返回的是一个列表
38 | print("--------------------------------")
39 |
40 | # 3.嵌套进行获取。直接往下.就好了
41 | print(soup.html.p) # 获取html标签下的第一个p标签
42 | print("--------------------------------")
43 |
44 | # 4.进行节点之间的关联选择
45 | # 子节点
46 | print(soup.p.contents) # 获取第一个p节点的直接子节点,会选择里面的内容
47 | print(soup.p.children) # 获取子节点,返回一个可迭代的对象
48 | for i, child in enumerate(soup.p.children):
49 | print(i, child) # 获取的跟直接子节点一样,但是这个是迭代器对象
50 |
51 | print(soup.p.descendants)
52 | for i, child in enumerate(soup.p.descendants):
53 | print(i, child) # 这个会把孙子(重孙子...)节点也遍历出来,甚至还有最后节点的内容
54 | # 父节点
55 | print(soup.a.parent) # 获取第一个a标签的父节点,应该是第一个p
56 | print(soup.a.parents) # 往上递归寻找父节点。
57 | # print(list(enumerate(soup.a.parents)))
58 | # 兄弟节点。BS4会把空格,换行符和普通文本都当成节点,这个需要注意,因此第一个就是/n
59 | print('下一个兄弟:', soup.p.next_sibling)
60 | print('上一个兄弟:', soup.p.next_sibling.previous_sibling)
61 | print('下一个兄弟们:', soup.p.next_siblings)
62 | print('下一个兄弟们:', soup.p.next_sibling.next_sibling.previous_siblings)
63 | print("--------------------------------")
64 |
65 | # 调用方法进行查询
66 | html2 = '''
67 | <div class="panel">
68 | <div class="panel-heading">
69 | <h4>Hello</h4>
70 | </div>
71 | <div class="panel-body">
72 | <ul class="list" id="list-1">
73 | <li class="element">Foo</li>
74 | <li class="element">Bar</li>
75 | <li class="element">Jay</li>
76 | </ul>
77 | <ul class="list list-small" id="list-2">
78 | <li class="element">Foo</li>
79 | <li class="element">Bar</li>
80 | </ul>
81 | </div>
82 | </div>
83 | '''
84 | soup2 = BeautifulSoup(html2, 'lxml')
85 | # 根据节点名查询
86 | print(soup2.find_all(name='ul')) # 查找所有标签为ul的节点,返回一个列表
87 | print(type(soup2.find_all(name='ul')[0])) # 查找第一个ul标签的节点
88 | # 根据属性查询。如果class有多个值也可以筛选出来(选择其中任意一个)
89 | # 查询id为list-1,且class为list的元素
90 | print(soup2.find_all(attrs={
91 | 'id': 'list-1',
92 | 'class': 'list'
93 | }))
94 | # 根据文本查询
95 | print(soup2.find_all(string=re.compile('ar'))) # 查找所有包含ar字符串的文本,返回一个列表
96 | print("--------------------------------")
97 |
98 | # find方法查找第一个匹配的元素。
99 | print(soup2.find(class_='list'))
100 | print("--------------------------------")
101 |
102 | # 使用CSS选择器
103 | tag = soup2.select('#list-2 .element:first-child') # 选择id为list-2下的类为element下的第一个节点,返回的是一个列表
104 | print(tag)
105 | # 选取到节点之后就可以获取一系列属性,属性名,string文本,标签名
106 | print(tag[0].attrs) # 获取当前标签所有属性,返回的是一个字典
107 | print(tag[0].string) # 获取标签里面的文本
108 | print(tag[0].name) # 获取当前标签名
109 |
--------------------------------------------------------------------------------
/第2章-网页数据的提取/PyQuery实战案例.py:
--------------------------------------------------------------------------------
1 | """
2 | 背景1: 当我们浏览到了一个非常有吸引力的网站——诗词名句网,作为一个诗迷,我们想把所有的诗据为己有。而网站没有提供下载功能,这可咋办呢?
3 | 需求:诗人选择:1:输出网站默认推荐的前100位诗人,2:自定义输入作者,如果不存在这个作者,重新输入
4 | 3:选择下载诗的数量:输入参数,下载即可(会输出总数量,不能大于总数量)
5 | 4:存储:默认存储到当前目录下面的poems文件夹,一首诗一个txt文件
6 | 背景2: 这个时候我们突然看到了古籍,它对我们的吸引力远大于古诗,我们想把所有古籍据为己有,这个时候咋办呢?
7 | 实现2: 1:请求所有的古籍名,存到一个字典中,进行对比,并输出排名前10位的古籍。
8 | 2:输入古籍名称和下载模式,开始下载,可以选择下载到一个txt文件或者分开章节下载
9 | 3:存储到./books这个文件夹下
10 | """
11 | import os.path
12 | import time
13 | import requests
14 | from pyquery import PyQuery as pq
15 |
16 |
17 | # 定义交互的方法
18 | def say_aloha(books_name):
19 | print(
20 | "-----------------------------------------------你好^_^,欢迎使用----------------------------------------------")
21 | print(f"您查询到的前10个古籍为:{' '.join(list(books_name.keys())[0:10])}")
22 | book_name = input("请输入您想下载的古籍:(输入古籍名即可,不需书名号)")
23 | mode = int(input("请输入您选择的下载的存储模式:(1:所有章节存储到一个txt文件中。2:章节分开存储。)"))
24 | user_choice = {'book_name': book_name, 'mode': mode}
25 | return user_choice
26 |
27 |
28 | # 定义获取html字符串的方法
29 | def get_html(url):
30 | headers = {
31 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
32 | }
33 | response = requests.get(url, headers=headers)
34 | return response.content.decode()
35 |
36 |
37 | # 定义获取所有古籍的方法
38 | def get_books_name():
39 | cate_html = get_html("https://www.shicimingju.com/book/")
40 | # 开始提取所有书名,并把书名和url映射存储到一个字典中返回回去
41 | cate_doc = pq(cate_html)
42 | a_s = cate_doc('#main_left ul li a').items()
43 | books_dict = {}
44 | for a in a_s:
45 | book_name = a.text()
46 | href = domain_name + a.attr('href')
47 | books_dict[book_name] = href
48 | return books_dict
49 |
50 |
51 | # 开始下载书籍每一章的数据
52 | def get_section(url):
53 | # 获取章节页面
54 | cate_html = get_html(url)
55 | # 解析章节名称和url
56 | sections = pq(cate_html)('#main_left ul li a').items()
57 | sections_dict = {}
58 | for section in sections:
59 | section_name = section.text()
60 | section_href = domain_name + section.attr('href')
61 | sections_dict[section_name] = section_href
62 | return sections_dict
63 |
64 |
65 | def save_section_content(book_name, section_name, url, mode):
66 | # 睡眠0.3秒再访问
67 | time.sleep(0.3)
68 | if mode == 2:
69 | content_html = get_html(url)
70 | section_content = pq(content_html)(".chapter_content").text()
71 | section_content = section_content.replace('\n', '\n\n ')
72 | if not content_html or not section_content:
73 | return False
74 | # 写入文件
75 | if not os.path.exists("./books"):
76 | os.mkdir('./books')
77 | with open(f'./books/{book_name}-{section_name}.txt', 'w', encoding='utf-8') as f:
78 | f.write(section_content)
79 | return True
80 | else:
81 | # 直接创建一个文件,追加写入,章节名写一个换个行
82 | content_html = get_html(url)
83 | section_content = pq(content_html)(".chapter_content").text()
84 | section_content = section_content.replace('\n', '\n\n')
85 | if not content_html or not section_content:
86 | return False
87 | # 写入文件
88 | if not os.path.exists("./books"):
89 | os.mkdir('./books')
90 | with open(f'./books/{book_name}.txt', 'a', encoding='utf-8') as f:
91 | f.write(f'\n\n {section_name}\n\n\n')
92 | f.write(section_content)
93 | return True
94 |
95 |
96 | # 控制下载各个章节的方法
97 | def save_sections_text(sections_dict, mode, book_name):
98 | # 这里就直接遍历了,判断用户选择的模式,看看是写到一个还是多个文件中
99 | flag = True
100 | for (section_name, url) in sections_dict.items():
101 | ok = save_section_content(book_name, section_name, url, mode)
102 | print("章节" + section_name + ("下载完成~" if ok else "下载失败~"))
103 | if not ok:
104 | flag = False
105 |
106 | return flag
107 |
108 |
109 | # 下载古籍的主方法
110 | def save_book(books_name, user_choice):
111 | # 匹配书籍url
112 | url = books_name.get('《' + user_choice.get('book_name') + '》')
113 | if not url:
114 | return False
115 | # 获取章节url和章节名
116 | sections_dict = get_section(url)
117 | # 存储章节文本到文件中
118 | flag = save_sections_text(sections_dict, user_choice.get('mode'), user_choice.get('book_name'))
119 | return flag
120 |
121 |
122 | # 主函数
123 | if __name__ == '__main__':
124 | try:
125 | domain_name = 'https://www.shicimingju.com'
126 | # 获取网站所能提供的所有方法,并存储到一个字典里面
127 | books_name = get_books_name()
128 | # 获取到下载的名称和模式
129 | user_choice = say_aloha(books_name)
130 | # 开始下载书籍,下载成功返回一个标志
131 | print(f"开始下载书籍{user_choice.get('book_name')}~")
132 | flag = save_book(books_name, user_choice)
133 | if flag:
134 | print(f"书籍{user_choice.get('book_name')}下载成功~")
135 | else:
136 | print(f"书籍{user_choice.get('book_name')}下载失败~")
137 | except Exception as e:
138 | print(f"未知异常~异常信息为:{e}")
139 |
--------------------------------------------------------------------------------
/第2章-网页数据的提取/PyQuery库的使用.py:
--------------------------------------------------------------------------------
1 | # pyquery官网:http://pyquery.readthedocs.io/
2 | # 导包
3 | from pyquery import PyQuery as pq
4 |
5 | """
6 | pyquery是完美使用css选择器的,类似于jquery,常用有:
7 | PyQuery(html)初始化,doc()选择元素,items()获取所有节点迭代器对象,text()获取文本,attr("属性名")获取属性
8 | html()获取html文本
9 | """
10 |
11 | # 定义要演示的html字符串,接下来的操作就对它做信息提取
12 | html = '''
13 | <div class="wrap">
14 | <ul id="container" class="list">
15 | <li class="item-0">first item</li>
16 | <li class="item-1"><a href="link2.html" class="link">second item</a></li>
17 | <li class="item-2 active"><a href="link3.html" class="link"><span class="bold">third item</span></a></li>
18 | <li class="item-3 active"><a href="link4.html" class="link">fourth item</a></li>
19 | <li class="item-4"><a href="link5.html" class="link">fifth item</a></li>
20 | </ul>
21 | </div>
22 | '''
23 | # 初始化
24 | # 1:使用字符串
25 | doc1 = pq(html)
26 | # print(doc1('li'))
27 | # 2:也可以传入一个url
28 | doc2 = pq(url='http://8.130.167.39/')
29 | # print(doc2('title'))
30 | # 3:读取本地文件
31 | doc3 = pq(filename='test.html')
32 | # print(doc3('li'))
33 |
34 | # 使用CSS选择器选择节点,会选择全部
35 | # print(doc1('#container .item-1 a'))
36 | # print(type(doc1('#container .item-1 a')))
37 |
38 | # 遍历选择出来的节点调用text方法就可以获得里面的内容
39 | # for item in doc1('#container .item-1 a').items():
40 | # print(item.text())
41 |
42 | # TODO:查找节点
43 | # 子节点
44 | # 1:find方法,这个相当于把选择器分开写,它不是逐个遍历,而是直接一次拿到所有符合条件的节点(一个PyQuery对象)
45 | items = doc1('#container')
46 | lis1 = items.find('.active')
47 | # print(lis1)
48 | # 2:children方法,children会忽略孙子等节点只关心子节点,传入选择器筛选子节点
49 | lis2 = items.children('.active')
50 | # print(lis2)
51 |
52 |
53 | # 父节点
54 | # 1:parent方法,这个方法只会获取直接父节点
55 | parent = doc1('#container li').parent()
56 | # print(parent) # 获取到ul节点
57 | # 2:parents方法,获取到祖先节点
58 | parents = doc1('#container li').parents()
59 | # print(parents) # 获取到div和ul
60 | # 3:进一步筛选祖先节点
61 | parents_checked = doc1('#container li').parents('div')
62 | # print(parents_checked) # 获取到div
63 |
64 | # 兄弟节点
65 | # 1:siblings方法
66 | single_li = doc1('#container .item-0')
67 | siblings_li = single_li.siblings()
68 | # print(siblings_li) # 获取到所有li
69 | # 使用选择器过滤兄弟节点
70 | second_li = single_li.siblings('.item-1')
71 | # print(second_li) # 获取到第二个li
72 |
73 | # 遍历节点
74 | items_li = doc1('#container li')
75 | # for li in items_li.items():
76 | # print(li.text()) # 打印所有li标签里面的文本
77 |
78 | # TODO:提取信息
79 | # 获取属性
80 | # 需求:1:获取第三个li的所有class属性,2:获取第三个li里面的a的href
81 | # 选中这个节点->调用attr方法得到属性值
82 | classes = doc1('#container .item-2').attr('class')
83 | a_href_value = doc1('#container .item-2 a').attr('href')
84 | # print('第三个li的所有属性值为:' + classes + "第三个li的a里面的href属性值为" + str(a_href_value))
85 | # 需求3:获取所有class="link"的a节点的href属性(!!!如果使用attr是不能实现的,只会返回节点中第一个的href,这个时候需要借助items这个方法遍历)
86 | a_s = doc1('.link')
87 | # print(a_s)
88 | # for a in a_s.items():
89 | # print(a.attr('href'))
90 |
91 | # 获取文本
92 | # 需求1:获取第一个li里面的文本。需求2:获取所有li里面的文本(!!! 获取到所有节点,然后获取这些节点的文本,pyquery库会封装拼接成一个新的字符串)
93 | li_text = doc1('#container .item-0').text()
94 | lis_text = doc1('#container li').text()
95 | # print("第一个li的文本内容是:" + li_text + "所有li里面的内容是:" + lis_text)
96 |
97 | # TODO:节点操作
98 | # 可以对节点的属性和节点本身进行操作,修改原始的pyquery对象
99 | # 需求1:给第一个节点添加一个highlight类。把所有li节点的active类给移除了(如果有)
100 | new_doc1 = doc1('#container .item-0').add_class('highlight')
101 | doc1('#container li').remove_class('active')
102 | # print(doc1)
103 |
104 | # 需求2:给第四个li(.item-3)加上id为third-li,修改它的内容为new third li
105 | doc1('#container .item-3').attr('id', 'third-li').text('new third li')
106 | # print(doc1)
107 |
108 | # 需求3:移除第三个li(对于想要提纯数据的时候挺有效的)
109 | doc1('#container').find('.item-2').remove()
110 | # print(doc1)
111 |
112 | # 需求4:选中第一个li和第四个li
113 | first_li = doc1('li:first-child')
114 | fourth_li = doc1('li:nth-child(4)')
115 | print(f"第一个li:{first_li},第四个li{fourth_li}")
116 |
--------------------------------------------------------------------------------
/第2章-网页数据的提取/Xpath实战案例.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import etree
3 |
4 | """
5 | 现在考虑这样一个需求:小张想在猪八戒平台上面找一个logo设计的商家,但是它需要多种维度进行分析,
6 | 销量,价格,好评率,主营业务和商家名称这些信息都是很重要的参考指标,现在抓取前6条数据存储到猪八戒logo设计商家.csv这个文件中以便进一步数据分析
7 | 因为只能抓取6条,好像是ajax动态加载,需要用后面的库才能实现模拟请求,这里就不深究了
8 | """
9 |
10 | url = "https://www.zbj.com/logosjzbj/f.html?r=2"
11 | headers = {
12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
13 | }
14 | response = requests.get(url, headers=headers)
15 | response.encoding = 'utf-8'
16 | # 初始化
17 | html_zbj = etree.HTML(response.text)
18 | # 获取到所有div对象(一个div对应一个商家信息)
19 | divs = html_zbj.xpath("//div[@class='search-result-list-service']/div")
20 | i = 1
21 | total_html = ''
22 | for div in divs:
23 | if i > 6:
24 | break
25 | price = div.xpath("./div/div/div[1]/span[1]/text()")[0]
26 | sell_count = "".join(div.xpath("./div/div[@class='bot-content']/div[@class='descprit-box']/div[2]//text()"))
27 | top_rate_count = "".join(div.xpath("./div/div[@class='bot-content']/div[@class='descprit-box']/div[3]//text()"))
28 | main_business = div.xpath("./div/div[@class='bot-content']/div[@class='name-pic-box']//text()")[0]
29 | shop_name = div.xpath("./div//div[@class='shop-detail']//text()")[0]
30 | # 把这些数据写入到csv文件中
31 | with open('猪八戒logo设计商家信息统计.csv', 'a', encoding="utf-8") as f:
32 | if i == 1:
33 | with open('猪八戒logo设计商家信息统计.csv', 'w'):
34 | pass
35 | f.write("商家名,主营业务,价格,销售量,好评数\n")
36 | f.write(f"{shop_name},{main_business},{price},{sell_count},{top_rate_count}\n")
37 | i += 1
38 | print("写入完成")
39 | # 数据已经封装好了,美中不足的就是数据有点少,后面学了一些高级的请求数据的框架就可以解决这个问题了
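40 |
41 | # 补充一个小示意:如果字段里可能含英文逗号(比如主营业务),直接用逗号拼接会把列错开,改用csv模块写入更稳妥:
42 | # import csv
43 | # with open('猪八戒logo设计商家信息统计.csv', 'w', newline='', encoding='utf-8') as f:
44 | #     writer = csv.writer(f)
45 | #     writer.writerow(["商家名", "主营业务", "价格", "销售量", "好评数"])
46 | #     writer.writerow([shop_name, main_business, price, sell_count, top_rate_count])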
40 |
--------------------------------------------------------------------------------
/第2章-网页数据的提取/Xpath解析HTML页面.py:
--------------------------------------------------------------------------------
1 | from lxml import etree
2 |
3 | # Xpath解析HTML页面只需要三个步骤。!!!可以解析html,也可以解析xml。
4 | # 1. 构造xpath对象(把html字符串转换成xpath对象)
5 | # 2. 编写选择器(最重要) 这个主要可以通过节点的属性,节点相互之间的层级关系,进行选择,一般最后都是//text()|@属性名
6 | # 3. 取出需要的text数据
7 | text = '''
8 | <!DOCTYPE html>
9 | <html lang="en">
10 | <head>
11 | <meta charset="UTF-8">
12 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
13 | <title>Document</title>
14 | </head>
15 | <body>
16 | <div>
17 | <ul>
18 | <li class="item-0 li li-first" name="li-1"><a href="link1.html"><span class="bold">first item</span></a></li>
19 | <li class="item-1 li li-second" name="li-2"><a href="link2.html">second item</a></li>
20 | <li class="item-0"><a href="link3.html">third item</a></li>
21 | <li class="item-1"><a href="link4.html">fourth item</a></li>
22 | <li class="item-0"><a href="link5.html">fifth item</a></li>
23 | </ul>
24 | </div>
25 | </body>
26 | </html>
27 |
28 |
29 | '''
30 | # 使用etree.HTML可以自动修正html结构(补全缺失的标签),tostring返回的结果是bytes类型
31 | html = etree.HTML(text)
32 | result = etree.tostring(html)
33 | # print(result.decode('utf-8'))
34 | # 从外部导入HTML文件并解析
35 | new_html = etree.parse('./test.html', etree.HTMLParser())
36 | # 获取文件调用etree里面的xpath方法就可以解析出节点了,学习xpath的过程就是写选择器的过程
37 | # 1.选取所有节点
38 | result0 = new_html.xpath('//*')
39 |
40 | # 2.选取子节点,使用/获取子节点,使用//获取子孙节点。注意层级的关系
41 | result1 = html.xpath('//ul/li') # 选取ul里面的所有li,只会直接选中,不会跨层级选中
42 | result2 = html.xpath('//ul//a') # 选取ul里面的所有a,会跨层级选中
43 |
44 | # 3.根据子节点的属性选择到父节点,类似于相对路径
45 | result3 = html.xpath('//a[@href="link4.html"]/../@class') # 根据属性选择器选择到a标签。利用../回到父级获取父级的class属性
46 |
47 | # 4.根据节点里面的class类进行选择
48 | result4 = new_html.xpath('//li[@class="item-0"]')
49 |
50 | # 5.获取节点中的文本,使用text()函数。
51 | # 如果直接匹配子节点,可能会匹配到其他子节点的数据。/n之类的。这里是因为自动添加了标签,自动换行。
52 | result5 = html.xpath('//li[@class="item-0"]//text()')
53 | result6 = html.xpath('//li[@class="item-0"]/a/text()')
54 |
55 | # 6.获取节点的属性,使用@href
56 | result7 = html.xpath('//li/a/@href') # 获取所有li下面的所有a的href属性
57 |
58 | # 7.根据多个属性值进行匹配,采用contains方法.只有包含这个属性就被筛选出来
59 | result8 = html.xpath('//li[contains(@class,"li")]/a/text()') # 查找li里面class属性下面包含li的a标签的值
60 |
61 | # 8.根据多个属性的值进行匹配,利用and运算符
62 | result9 = html.xpath('//li[contains(@class,"li") and @name="li-2"]/a/text()') # 筛选出类名包含li的,name值为li-2的li,
63 | # 并且求出它下面的text值
64 |
65 | # 9.按序选择,匹配到多个节点后,选择多个节点当中的第几个
66 | result10 = html.xpath('//li[1]/a/text()') # 选择第一个li里面的a的text值
67 | result11 = html.xpath('//li[last()]/a/text()') # 选择最后一个li里面的a的值
68 | result12 = html.xpath('//li[position()>3]/a/text()') # 选择3以后的li标签下面的a的值
69 |
70 | # 10.选取跟本节点相关的节点
71 |
72 | result13 = html.xpath('//li[1]/ancestor::*') # 获取第一个li的所有父节点,递归获取,会往上继续找
73 | result14 = html.xpath('//li[1]/ancestor::div') # 获取所有父节点,但是只要div标签的
74 | result15 = html.xpath('//li[1]/attribute::*') # 获取当前li节点的所有属性值
75 | result16 = html.xpath('//li[1]/child::a[@href="link1.html"]') # 获取当前li节点的子节点,并且需要满足href=link1.html条件
76 | result17 = html.xpath('//li[1]/descendant::span') # 获取当前li节点的所有子孙节点,但是只要span标签的
77 | result18 = html.xpath('//li[1]/following::*[2]') # 获取当前节点之后的所有节点(不是同级的),但是只要第2个。就是a
78 | result19 = html.xpath('//li[1]/following-sibling::*') # 获取当前节点之后的所有节点(是同级的)。
79 |
80 | # for循环专门打印运行结果
81 | for i in range(0, 20):
82 | print(f'result{i}: {locals()["result" + str(i)]}')
83 | print('--------------------------')
84 |
--------------------------------------------------------------------------------
/第2章-网页数据的提取/test.html:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/第3章-进程线程和协程的使用/协程的使用.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import aiohttp
3 | import aiofiles
4 | import time
5 |
6 | from aiohttp import TCPConnector
7 |
8 | """
9 | 协程是爬虫的重中之重。前面提到多进程利于处理计算密集型任务,多线程和协程利于处理IO密集型任务,而爬虫里计算密集型的场景
10 | 几乎没有,因此我们选择的范围是线程和协程。但是线程的切换还是有成本的,协程在一个线程里面切换,成本会小很多。这里的用法类似于JS的异步。
11 | 主要介绍下面几个方面:
12 | 1:协程的基本使用。创建协程,封装协程为task对象,创建事件循环,添加task到事件循环中
13 | 2:aiohttp和aiofiles的基本使用。(协程当中发请求和存数据的库)
14 | 3:控制协程的并发量
15 | """
16 |
17 |
18 | async def run(i):
19 | await asyncio.sleep(2)
20 | print(f"第{i}号协程在执行任务")
21 | return i + 1
22 |
23 |
24 | # TODO:协程的基本使用
25 | # 简单解释一下下面这段代码,run(i)创建了一个coroutine对象(因为有async修饰),传递给create_task封装成task对象。
26 | # 后面的for循环是创建了4个任务封装成task。通过await关键字等待任务执行的结果添加到res_list里面并打印
27 | # 最后通过asyncio.run(main())函数创建了事件循环并管理这些任务
28 | # async def main():
29 | # begin_time = time.time()
30 | # task_list = [asyncio.create_task(run(i)) for i in range(1, 5)]
31 | # res_list = []
32 | # for task in task_list:
33 | # res = await task
34 | # res_list.append(res)
35 | #
36 | # print(res_list, time.time() - begin_time) # 2s多,处理IO也是这样,等待IO时候调用其他协程完成其他任务
37 | #
38 | #
39 | # # 运行主协程
40 | # asyncio.run(main())
41 |
42 | # TODO:aiohttp和aiofiles基本使用
43 | # 解释一下下面代码,首先创建aiohttp的客户端会话,然后向服务端发送请求获取数据。创建会话的时候可以配置ssl连接,指定自己的cookies。
44 | # 发送请求时候指定请求头,请求体,请求查询参数。获取数据后使用aiofiles这个模块进行文件的写入。设置编码格式,异步等待文件写入完成
45 | # async def aio_main():
46 | # cookies = {}
47 | # async with aiohttp.ClientSession(connector=TCPConnector(ssl=False), cookies=cookies) as session:
48 | # params = [('s', '后端')]
49 | # data = {}
50 | # headers = {}
51 | # # proxy = "http://127.0.0.1:10080" # 有需要自己配置代理
52 | # html = ""
53 | # async with session.get('https://bugdesigner.cn', params=params, data=data, headers=headers) as res:
54 | # print(res.status)
55 | # # 使用这个所有进行异步加载的都必须加上await等待结果,比如说请求数据,io操作
56 | # html = await res.text(encoding='utf-8')
57 | # print(html)
58 | # # print(await res.read()) 请求回来的数据不是文本格式,比如说图片
59 | # async with aiofiles.open('index.html', mode='w',encoding='utf-8') as f:
60 | # await f.write(html)
61 | # asyncio.run(aio_main())
62 |
63 | # TODO:控制协程并发量
64 | # 简单解释一下下面代码。首先创建一个信号量对象,在创建任务的时候创建了20个协程,但是传入了信号量进行约束,因此可以实现5个协程一起执行,控制并发
65 | # async def sem_run(semaphore, i):
66 | # async with semaphore:
67 | # await asyncio.sleep(5)
68 | # print(f"协程正在执行工作——{i}")
69 | #
70 | #
71 | # async def sem_main():
72 | # semaphore = asyncio.Semaphore(5)
73 | # task_list = [asyncio.create_task(sem_run(semaphore, i)) for i in range(1, 21)]
74 | # for task in task_list:
75 | # await task
76 | #
77 | #
78 | # asyncio.run(sem_main())
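79 |
80 | # 补充一个小示意:上面逐个await task的写法,也可以换成asyncio.gather一次性并发等待所有协程:
81 | # async def gather_main():
82 | #     results = await asyncio.gather(*(run(i) for i in range(1, 5)))
83 | #     print(results)  # [2, 3, 4, 5],gather会按传入顺序返回各个协程的返回值
84 | # asyncio.run(gather_main())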
79 |
--------------------------------------------------------------------------------
/第3章-进程线程和协程的使用/多线程和协程的练习.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import os.path
3 | import re
4 | from urllib.parse import urljoin
5 | import aiohttp
6 | import aiofiles
7 | import time
8 | import requests
9 | from aiohttp import TCPConnector
10 | from pyquery import PyQuery as pq
11 | from lxml import etree
12 |
13 | """
14 | 下面这个案例实现这个功能,爬取美剧天堂的视频,去网站查看有没有这个视频的源,点到集数这个页面复制上面的url,输入到程序中,
15 | 程序会打印出所有集数。指定集数,会爬取指定集数的视频。并把视频存储为一个mp4文件,命名为视频名称-集数
16 |
17 | 思路:
18 | 1:首先找到美剧天堂的域名,输入url后,程序向这个网页发送请求获取到这个网页的数据。找到集数并打印在控制台。
19 | 2:输入集数,程序再发一次请求,返回的是播放页的源码。然而我需要的是m3u8文件,播放页源码里面并不存在
20 | 分析网络请求和源码可以发现,这个网站是向这个https://php.playerla.com/mjplay这个uri发送查询请求,携带一个查询参数id
21 | (这个id怎么获取呢,在每个播放页的源码里面会有所有相关集的id,利用正则匹配即可。)
22 | 3:请求这个拼接一下这个url,发送第三次请求,获取与播放器相关的html,使用正则匹配到url地址。
23 | 4:根据上述正则匹配的结果发送第四次请求,拿到这个文件了以后,拼接上前缀和里面的资源。就可以实现一个一个下载了。
24 | (这个前缀查看网络请求跟请求m3u8的url差不多,字符串切割一下存成baseurl)
25 | 5:程序使用协程下载再发请求,这次下载的就是.ts文件。最后下载完毕使用merge函数合并,执行ffmpeg的一个命令即可
26 | """
27 | # 定义全局变量,title为集数名,href为集数url,m3u8_url为请求m3u8文件的url,baseurl为请求,video_name为影片名,episode_n为第几集
28 | title = href = m3u8_url = baseurl = video_name = episode_n = None
29 |
30 |
31 | # 通过用户输入的url来查找当前美剧的名称和对应的episode
32 | def get_section(url):
33 | global title, href
34 | page_code = requests.get(url, headers=headers)
35 | lis = pq(page_code.text)(".z-pannel_bd ul li a").items()
36 | name = pq(page_code.text)('h1').text()
37 | res = []
38 | # 得到里面的title和href属性值
39 | for li in lis:
40 | title = li.attr("title")
41 | href = "https://www.mjtt5.tv" + li.attr('href')
42 | if title and title.startswith('第') and title.endswith('集'):
43 | res.append({'title': title, 'href': href})
44 | res.append({'name': name})
45 | return res
46 |
47 |
48 | # 从选集页面源码里面找到播放器相关的地址,供得到m3u8url使用
49 | def find_m3u8_url(href):
50 | res = requests.get(href, headers=headers)
51 | ids = etree.HTML(res.text).xpath('/html/body/div[2]/div/div[1]/div[1]/script[1]/text()')
52 | id = str(ids[0])
53 | re_str = "\\\\/".join(href.rsplit("/", 3)[-3:])
54 | pattern = r'"(\w+?=?)","\\/{}"'.format(re_str)
55 | query_id = re.search(pattern, id).group(1)
56 | return f"https://php.playerla.com/mjplay/?id={query_id}"
57 |
58 |
59 | # 通过上一个函数得到的播放器地址请求得到m3u8的url
60 | def get_m3u8_url(m3u8_url):
61 | res = requests.get(m3u8_url, headers=headers)
62 | # 编写正则找到m3u8文件的url
63 | pattern = r'var playconfig = {\s*"url":\s*"([^"]+)"'
64 | m3u8_url = re.search(pattern, res.text).group(1)
65 | return m3u8_url
66 |
67 |
68 | # 交互页面
69 | def say_aloha():
70 | global m3u8_url, baseurl, video_name, episode_n
71 | print(
72 | "-----------------------------------------------你好^_^,欢迎使用----------------------------------------------")
73 | url = input("请输入需要下载的电影的url").strip()
74 | # 列表里面包字典
75 | section_list = get_section(url)
76 | section = int(input(f'美剧{section_list[-1].get("name")}共有{len(section_list) - 1}集,请输入你需要下载第几集'))
77 | video_name = section_list[-1].get("name")
78 | episode_n = section
79 | # 得到的这个url原来是在线播放器的地址
80 | m3u8_url_page = find_m3u8_url(section_list[len(section_list) - section].get('href'))
81 | m3u8_url = get_m3u8_url(m3u8_url_page)
82 | # 得到这个url就可以传递给真正干活的协程那些方法干活去了
83 | baseurl = m3u8_url.rsplit("/", 1)[0] + "/"
84 |
85 |
86 | # 创建m3u8文件,通过上面得到的m3u8的url请求,把m3u8文件存储到本地
87 | async def find_m3u8(url):
88 | async with aiohttp.ClientSession(connector=TCPConnector(ssl=False)) as session:
89 | async with session.get(url, headers=headers) as res:
90 | data = await res.text(encoding="UTF-8")
91 | if not os.path.exists("./ts"):
92 | os.mkdir("ts")
93 | async with aiofiles.open('./ts/index.m3u8', mode='w', encoding='UTF-8') as f:
94 | await f.write(data)
95 |
96 |
97 | # 下载一个ts文件的方法,加上try-except和while循环确保全部下完,使用协程和信号量
98 | async def download_one(url, sem, i):
99 | headers = {
100 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
101 | "authority": "https://cdn8.tvtvgood.com"
102 | }
103 | async with sem:
104 | while True:
105 | try:
106 | async with aiohttp.ClientSession(connector=TCPConnector(ssl=False)) as session:
107 | async with session.get(url, headers=headers, timeout=60) as res:
108 | print("第" + str(i) + "个" + url + "爬取中~状态码为:", res.status)
109 | data = await res.read()
110 | async with aiofiles.open(f'./ts/{i}.ts', mode='wb') as f:
111 | await f.write(data)
112 | break
113 | except Exception as e:
114 | print(f"请求出错,原因是:{e},正在重新爬取中......")
115 |
116 |
117 | # 构建一个拼接电影文件的m3u8文件,供ffmpeg使用
118 | def create_join_ts_file():
119 | i = 0
120 | with open('./ts/index.m3u8', mode='r') as f:
121 | lines = f.readlines()
122 | with open('./ts/index.m3u8', mode='w') as f:
123 | for line in lines:
124 | if line.startswith('#'):
125 | f.write(line)
126 | else:
127 | f.write(f'{i}.ts\n')
128 | i += 1
129 |
130 |
131 | # 下载所有的ts文件,创建一个任务列表,调用download_one方法干活
132 | async def download_all():
133 | task_list = []
134 | with open('./ts/index.m3u8', mode='r', encoding='UTF-8') as f:
135 | line = "line"
136 | i = 0
137 | # 开始下载视频,首先创建一个信号量,控制并发协程数
138 | sem = asyncio.Semaphore(10)
139 | while line:
140 | line = f.readline()
141 | if line.startswith('#'):
142 | continue
143 | url = urljoin(baseurl, line.strip())
144 | if url == baseurl:
145 | continue
146 | task = asyncio.create_task(download_one(url, sem, i))
147 | task_list.append(task)
148 | i += 1
149 | await asyncio.wait(task_list)
150 | # 下完以后再合并
151 | create_join_ts_file()
152 | merge()
153 |
154 |
155 | # 合并所有ts文件,并且把剩下的ts文件给删了
156 | def merge():
157 | os.chdir('./ts')
158 | cmd = f'ffmpeg -i index.m3u8 -c copy {video_name}-{episode_n}.mp4'
159 | os.system(cmd)
160 |
161 |
162 | def delete():
163 | os.chdir('./ts')
164 | # 删除m3u8和ts文件
165 | for file in os.listdir('./'):
166 | if file.endswith('.ts') or file.endswith('.m3u8'):
167 | os.remove(file)
168 |
169 |
170 |
171 | # 程序主入口
172 | async def main():
173 | # 进行爬虫数据收集和预先数据处理
174 | say_aloha()
175 | begin_time = time.time()
176 | print("开始下载视频......")
177 | # 得到m3u8文件
178 | task1 = asyncio.create_task(find_m3u8(m3u8_url))
179 | await task1
180 | # 开始下载
181 | task2 = asyncio.create_task(download_all())
182 | await task2
183 | end_time = time.time()
184 | print(f"下载视频完成,共用时{end_time - begin_time}秒")
185 |
186 |
187 | # 文件的入口,启动程序主入口那个异步方法
188 | if __name__ == '__main__':
189 | headers = {
190 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
191 | }
192 | # asyncio.run(main())
193 | delete()
194 |
--------------------------------------------------------------------------------
/第3章-进程线程和协程的使用/多线程的使用.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import time
3 | from concurrent.futures import ThreadPoolExecutor, wait
4 |
5 | """
6 | 前面提到的进程在windows系统下消耗太大,因此爬虫一般不使用多进程抓取。然而多线程也有弊端,由于Python语言中的GIL锁的存在,导致一些场景使用
7 | 多线程并不能显著地提高性能。IO密集型任务中可以使用多线程,但是后面又有协程,因此多线程在python里的位置有点高不成低不就,比较尴尬。
8 | 计算密集型使用多进程好一点(有独立的Python解释器)
9 | 下面实现这些方面:
10 | 1:如何创建并使用多线程,多线程之间如何传参,如何获取线程名称,查看当前主线程,
11 | 2:线程之间数据共享,发生数据不一致问题如何解决(指多线程同时读写同一块数据可能出现竞争,像append这种单个操作有GIL兜底,复合操作仍需要加锁,见文件末尾的Lock示意)
12 | 3:线程池如何使用,如何获取线程任务的返回值。获取返回值可通过数据共享实现(kv键值对)
13 | """
14 |
15 |
16 | def run(i):
17 | print(f"{i}号线程{threading.current_thread().name}在执行任务,当前主线程为{threading.main_thread().name}")
18 |
19 |
20 | def get_name():
21 | print(f"当前姓名集合里面有{name_list}")
22 |
23 |
24 | def set_name():
25 | name_list.append('赵六')
26 | print("往姓名集合里面添加一个姓名赵六")
27 |
28 |
29 | def pool_run(i):
30 | time.sleep(1) # 睡一秒,不然线程池之间线程直接被重用了,看不到5个线程都被利用起来的效果
31 | print(f"{i}号线程{threading.current_thread().name}在执行任务,当前主线程为{threading.main_thread().name}")
32 | return i
33 |
34 |
35 | if __name__ == '__main__':
36 | # TODO:创建并使用线程,线程传参,获取当前线程名和主线程名
37 | # 下面代码创建了4个线程,并把循环变量i传入run方法中,由run方法并发执行多个线程,由循环等待运行完毕,并打印运行run方法的线程的信息
38 | # # 创建存储线程对象的列表
39 | # thread_list = []
40 | # for i in range(1, 5):
41 | # thread = threading.Thread(target=run, args=(i,), name=f'thread{i}')
42 | # thread_list.append(thread)
43 | # thread.start()
44 | # for thread in thread_list:
45 | # thread.join()
46 | # print("任务执行完毕~")
47 |
48 | # TODO:线程之间如何进行数据共享(显然直接定义全局变量即可)
49 | # 下面代码定义了一个姓名列表,创建了三个不同的线程,这三个线程对这个列表进行读写操作,观察数据是否可以共享
50 | name_list = ['张三', '李四', '王五']
51 | # get_name_thread = threading.Thread(target=get_name)
52 | # set_name_thread = threading.Thread(target=set_name)
53 | # get_name2_thread = threading.Thread(target=get_name)
54 | # get_name_thread.start()
55 | # set_name_thread.start()
56 | # get_name2_thread.start()
57 | # get_name2_thread.join()
58 | # set_name_thread.join()
59 | # get_name_thread.join()
60 | # print("姓名操作完毕~")
61 |
62 | # TODO:线程池的使用
63 | # 下面代码创建了一个5个容量的线程池,并把任务分配给了五个线程,最后获取线程的返回值
64 | # pool = ThreadPoolExecutor(5)
65 | # # 采用简化的写法
66 | # futures = [pool.submit(pool_run, i) for i in range(1, 6)]
67 | # # 接收返回值,还可以使用as_completed,那个不返回not_done对象
68 | # done, not_done = wait(futures)
69 | # for future in done:
70 | # print(future.result())
71 |
--------------------------------------------------------------------------------
/第3章-进程线程和协程的使用/多进程的使用.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from multiprocessing import Process, Manager, Pool
4 |
5 | """
6 | 主要介绍下面几个方面:
7 | 1:进程的创建,进程如何传参,进程如何提升效率?
8 | 2:进程之间如何通信
9 | 3:进程池的使用
10 | """
11 |
12 |
13 | # 老生常谈的run方法
14 | def run(i):
15 | # 祖传先睡一秒
16 | time.sleep(1)
17 | print("数据正在爬取中~", i)
18 |
19 |
20 | # 生产者消费者模拟进程通信,应用场景是生产者抓取数据,消费者进程下载数据
21 | def run_provider(msg_queue, time_dict):
22 | msg_queue.put("product1")
23 | msg_queue.put("product2")
24 | msg_queue.put("product3")
25 | time_dict['producer'] = time.time_ns()
26 | print(f"生产者——当前父进程是{os.getppid()},当前子进程是{os.getpid()}")
27 |
28 |
29 | def run_consumer(msg_queue, time_dict):
30 | product1 = msg_queue.get()
31 | time_dict['consumer'] = time.time_ns()
32 | product2 = msg_queue.get()
33 | product3 = msg_queue.get() # 设置timeout时间,几秒内没有生产者放东西就直接结束
34 | print(f"消费者——当前父进程是{os.getppid()},当前子进程是{os.getpid()},{product1},{product2},{product3}")
35 |
36 |
37 | def run_process_pool(i):
38 | time.sleep(3)
39 | print(f"进程开始工作了~{i}")
40 |
41 |
42 | if __name__ == '__main__':
43 | # TODO:进程创建,传参,提高效率
44 | # 看一下相对于单进程节约多长时间
45 | # start_time = time.time()
46 | # # 定义一个列表存储进程对象
47 | # p_list = []
48 | # for i in range(1, 7):
49 | # # 创建进程,target传入要让这个子进程干的事(即方法),args传入要传递给这个子进程的参数(加,号防止误认为是字符串或是int等)
50 | # p = Process(target=run, args=(i,))
51 | # p_list.append(p)
52 | # p.start()
53 | # # p.terminate() # 这个方法用于终止进程
54 | # for p in p_list:
55 | # # 为了让主进程等待这些子进程执行完
56 | # p.join()
57 | # end_time = time.time()
58 | # print(f"一共运行了{end_time - start_time}") # 只运行了1.6s,而单进程至少是6s多
59 |
60 | # TODO:进程之间如何通讯
61 | # msg_queue = Manager().Queue()
62 | # time_dict = Manager().dict()
63 | # # 创建一个生产者和一个消费者
64 | # p_provider = Process(target=run_provider, args=(msg_queue, time_dict))
65 | # p_consumer = Process(target=run_consumer, args=(msg_queue, time_dict))
66 | # p_provider.start()
67 | # p_consumer.start()
68 | # p_provider.join()
69 | # p_consumer.join()
70 | # # 这个时间不太固定,探究也没啥意义,生产者生产,消费者消费
71 | # print(f"父进程结束了~{time_dict.get('producer')},{time_dict.get('consumer')}")
72 |
73 | # 进程池的使用,跟Java线程池,数据库连接池都差不多,为了减少频繁创建和销毁
74 | p = Pool() # 不传参默认cpu核心数,传参代表进程池有几个进程
75 | print(f"cpu核心数为{os.cpu_count()}")
76 | # 向进程池提交32个任务(实际并发的进程数由进程池大小决定)
77 | for i in range(1, 33):
78 | p.apply_async(run_process_pool, args=(i,))
79 | p.close()
80 | p.join()
81 | # 打印时候可能同一时刻两个进程打印到控制台就会出现没有正常换行的现象
82 | print("父进程结束了~")
83 |
--------------------------------------------------------------------------------
/第4章-python操作主流数据库/MongoDB数据库.py:
--------------------------------------------------------------------------------
1 | # 导包
2 | from bson import ObjectId
3 | from pymongo import MongoClient
4 |
5 | """
6 | MongoDB是Node生态里面十分常用的一个数据库,它的特点是非关系型(文档型),非常自由,数据以键值对(文档)的形式存储。
7 | 你可以不用关心数据的约束,因为默认没有约束(当然也可以自定义约束,文件末尾附了一个校验器的示意)。
8 | 这里演示数据库的连接和关闭和简单的CRUD
9 | """
10 |
11 | # # 连接数据库
12 | # conn = MongoClient(host="localhost", port=27017)
13 | # collection = conn.test.test
14 | #
15 | # # 获取当前数据库名称
16 | # database_name = conn.test.name
17 | # print(database_name)
18 |
19 | # TODO:添加数据
20 | # 插入数据
21 | # data = collection.insert_one({"name": "zhangsan", "age": 18, "gender": "women"})
22 | # # print(data.inserted_id)
23 | # # 插入多条数据
24 | # many_data = collection.insert_many([{"name": "lisi", "age": 18, "gender": "women"},
25 | # {"name": "wangwu", "age": 20, "gender": "man"},
26 | # {"name": "zhaoliu", "age": 22, "gender": "women", "hobby": "唱跳rap"}])
27 | # 打印插入的id列表
28 | # print(list(map(str, many_data.inserted_ids)))
29 |
30 | # TODO:查询数据
31 | # 查询单个数据
32 | # result_one = collection.find_one({"name": "zhangsan"})
33 | # print(result_one)
34 | # 查询所有数据
35 | # result_all = collection.find({"name": "zhangsan"})
36 | # 把列表转换成字符串形式并且换行
37 | # print("\n".join(map(str, result_all)))
38 | # 根据id查询
39 | # result_id = collection.find({"_id": ObjectId("652fe0ef60e487842ba421f5")})
40 | # print(list(result_id)[0])
41 | # 排序查询,查询大于等于20的并降序排列
42 | # result_sort = collection.find({"age": {"$gte": 20}}).sort("age", -1)
43 | # 使用列表推导式打印所有符合条件的数据
44 | # [print(item) for item in result_sort]
45 | # 限制打印5条
46 | # result_sort_limit = collection.find({"age": {"$gte": 20}}).sort("age", -1).limit(5)
47 | # [print(item) for item in result_sort_limit]
48 |
49 | # # TODO:更新数据
50 | # # 修改第一个符合条件的数据,把name为zhangsan的age设置为99
51 | # update_one_result = collection.update_one({"name": "zhangsan"}, {"$set": {"age": 99}})
52 | # print(f"受影响的条数为{update_one_result.matched_count}条")
53 | # # 修改所有符合条件的数据,把name为lisi的name都设置为李四,age设置为10086
54 | # update_many_result = collection.update_many({"name": "lisi"}, {"$set": {"name": "李四", "age": 10086}})
55 | # print(f"受影响的条数为{update_many_result.matched_count}条")
56 |
57 | # # TODO:删除数据
58 | # # 删除第一条符合条件的数据,删除第一个age为99的数据
59 | # delete_result = collection.delete_one({"age": 99})
60 | # print(f"删除的条数为{delete_result.deleted_count}条")
61 | # # 删除多条数据,删除所有名字为zhangsan的用户和age为10086的用户(就是李四)
62 | # delete_many_result = collection.delete_many({"$or": [{"name": "zhangsan"}, {"age": 10086}]})
63 | # print(f"删除的条数为{delete_many_result.deleted_count}条")
64 |
65 | # # 关闭数据库连接
66 | # conn.close()
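67 |
68 | # 补充一个小示意(users这个集合名是假设的,未实际执行):前面说"可以自定义约束",
69 | # MongoDB可以在建集合时通过JSON Schema校验器实现
70 | # conn.test.create_collection("users", validator={
71 | #     "$jsonSchema": {"bsonType": "object", "required": ["name", "age"],
72 | #                     "properties": {"age": {"bsonType": "int", "minimum": 0}}}})
73 | # 之后往users里插入缺少name/age或age为负数的文档就会报错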
67 |
--------------------------------------------------------------------------------
/第4章-python操作主流数据库/MySQL数据库.py:
--------------------------------------------------------------------------------
1 | import pymysql
2 |
3 | """
4 | 这里主要演示MySQL数据库的连接和如何执行sql语句。其他的分页、排序、分组、多表就是编写sql语句的事情了(文件末尾附了一个参数化分页查询的示意)
5 | """
6 |
7 | # 连接MySQL数据库
8 | db = pymysql.connect(host='localhost', user='root', password='123456', database='spidertestdb')
9 | # 创建游标对象
10 | cursor = db.cursor()
11 | # 执行sql语句验证是否连接无误
12 | cursor.execute('show databases')
13 | databases = cursor.fetchall()
14 | # print(databases)
15 | # print(type(databases))
16 |
17 | # TODO:创建表并插入数据
18 | create_table_sql = """
19 | CREATE TABLE IF NOT EXISTS novel_info (
20 | id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
21 | novel_name VARCHAR(50),
22 | novel_author VARCHAR(50),
23 | novel_desc TEXT
24 | )
25 | """
26 | cursor.execute(create_table_sql)
27 |
28 | insert_sql = """
29 | insert into novel_info values(null,"三国演义","罗贯中","这是一个三国的小说"),
30 | (null,"西游记","吴承恩","这是一个西游的小说"),
31 | (null,"水浒传","施耐庵","这是一部水浒的小说")
32 | """
33 | # 必须try-except,因为开启了事务;execute返回的是影响的行数
34 | try:
35 | insert_count = cursor.execute(insert_sql)
36 | print(f"此次插入影响了{insert_count}行")
37 | db.commit()
38 | except:
39 | db.rollback()
40 |
41 | # TODO:查询数据,查询表中的所有数据,返回的是元组
42 | query_sql = """select * from novel_info"""
43 | cursor.execute(query_sql)
44 | datas = cursor.fetchall()
45 | print(f"目前数据库的所有数据为{datas}")
46 |
47 | # TODO:修改数据,把西游记的作者修改成张狗蛋(pymysql默认不自动提交,改完要commit才会生效)
48 | update_sql = """update novel_info set novel_author='张狗蛋' where novel_name='西游记' """
49 | update_count = cursor.execute(update_sql)
50 | db.commit()
51 | print(f"此次更改影响了{update_count}行")
52 |
53 | # TODO:删除数据,删除三国演义这部小说
54 | delete_sql = """delete from novel_info where novel_name='三国演义'"""
55 | delete_count = cursor.execute(delete_sql)
56 | db.commit()
57 | print(f"此次删除影响了{delete_count}行")
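58 |
59 | # 分页、排序这些只是SQL写法不同,传参建议用%s占位符交给驱动转义,不要自己拼接字符串,下面是一个分页查询的示意:
60 | # page, page_size = 1, 2
61 | # cursor.execute("select * from novel_info order by id desc limit %s offset %s", (page_size, (page - 1) * page_size))
62 | # print(cursor.fetchall())
63 |
64 | # 用完释放游标和连接
65 | cursor.close()
66 | db.close()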
56 |
--------------------------------------------------------------------------------
/第4章-python操作主流数据库/Redis数据库.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import redis
4 |
5 | """
6 | redis是一个非常好用的nosql数据库,一般跟后端生态连接起来了,redis一共有10大数据类型,这里就简单介绍一下
7 | 1:redis的连接和关闭
8 | 2:redis5大数据类型,str,list,hash,set,zset的存取操作
9 | 3:redis的常见命令,删除键,获取键的数据类型,获取所有键,查看库,切换库,刷新库
10 | """
11 |
12 | # TODO:连接数据库
13 | # 这里连接是我本地虚拟机的redis,要确定redis.conf配置文件可以允许外部连接,并且要打开虚拟机的6380端口,Centos7
14 | '''
15 | sudo firewall-cmd --list-all
16 | sudo firewall-cmd --zone=public --add-port=6380/tcp --permanent
17 | sudo firewall-cmd --reload
18 | '''
19 | r = redis.StrictRedis(host="192.168.182.100", port=6380, password='123456', db=0, decode_responses=True)
20 |
21 | # TODO:五大数据类型的操作
22 | # 对字符串的操作
23 | r.set('name', "张狗蛋")
24 | print(r.get("name"))
25 | # 设置过期时间
26 | r.set("expire_key", "value", 20)
27 | time.sleep(1)
28 | ttl = r.ttl("expire_key")
29 | print(f"还有{ttl}秒过期")
30 |
31 | # 使用列表
32 | # 左插法,因此先进去的应该是最后打印的
33 | r.lpush('hobby', '唱', '跳', 'rap')
34 | r.lpush('hobby', '篮球')
35 | print(r.lrange('hobby', 0, -1))
36 |
37 | # 使用哈希表
38 | r.hset("person", "name", "zhangsan")
39 | r.hset("person", "age", "18")
40 | # 一次添加多个键值对
41 | r.hset('person', mapping={"name": "李四", "age": 23, "gender": "man"})
42 | # 获取单个键的值
43 | print(r.hget("person", "age"))
44 | # 获取person里面的所有键值对
45 | print(r.hgetall("person"))
46 |
47 | # 使用集合
48 | r.sadd('features', "thin", "handsome", "wealthy")
49 | print(r.smembers("features"))
50 | print(r.sismember("features", "fat"))
51 |
52 | # 使用zset
53 | r.zadd("ips", {"ip1": 100, "ip2": 98, "ip3": 60, "ip4": 58})
54 | # 从低到高,并且返回分数
55 | print(r.zrange("ips", 0, -1, withscores=True))
56 | # 从高到低返回
57 | print(r.zrevrange("ips", 0, -1, withscores=True))
58 | # 修改分数的值,给ip2这个元素的分数-1
59 | r.zincrby("ips", -1, "ip2")
60 | print(r.zrange("ips", 0, -1, withscores=True))
61 |
62 | # TODO:redis的常用操作
63 | # 删除单个键
64 | r.delete("name")
65 | # 查询所有键
66 | keys = r.keys("*")
67 | print(f"所有的键为:{keys}")
68 | # 查看当前键的数据类型
69 | r_type = r.type("ips")
70 | print(f"ips键的数据类型为:{r_type}")
71 | # 查询当前数据库
72 | current_db = r.connection_pool.connection_kwargs['db']
73 | print(f"当前数据库为:{current_db}库")
74 | # 切换数据库
75 | db = r.select(1)
76 | print(f"切换结果:{db}")
77 | # 清空所有数据库缓存
78 | r.flushall()
79 | print("数据库清除成功~")
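80 |
81 | # 补充一个小示意:除了上面演示的5种,redis还有位图(Bitmap)、Stream等扩展类型,位图常用来做签到/去重这类统计:
82 | # r.setbit("sign:user1", 6, 1)       # 把第7天的签到位置1
83 | # print(r.getbit("sign:user1", 6))   # 1
84 | # print(r.bitcount("sign:user1"))    # 已签到的总天数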
80 |
--------------------------------------------------------------------------------
/第4章-python操作主流数据库/三大数据库的案例.py:
--------------------------------------------------------------------------------
1 | # 数据处理相关的包
2 | import re
3 | # 请求相关的包
4 | import random
5 | import time
6 | import requests
7 | from lxml import etree
8 | # 数据库的包
9 | import pymysql
10 | from pymongo import MongoClient
11 | import redis
12 |
13 | """
14 | 案例爬取:段子星。需求:爬取段子星首页的前20页段子,段子名—段子日期-段子作者-段子阅读量-段子评论-段子详细内容描述存储到数据库。
15 | 由于主要是数据库案例,因此主要是存储到数据库的操作
16 | """
17 |
18 |
19 | # 创建一个递归函数,用来转换文章详情的多重列表(如果有的话)
20 | def recursive_join(lst, separator='\n'):
21 | result = []
22 | for item in lst:
23 | if isinstance(item, list):
24 | result.append(recursive_join(item, separator))
25 | else:
26 | result.append(item)
27 | return separator.join(result)
28 |
29 |
30 | # 把之前封装好的集合给组装成一条一条的数据返回回去,清洗一下数据
31 | def trans_dict(crawl_result):
32 | result = []
33 | for i in range(0, len(crawl_result['title'])):
34 | one_res = {}
35 | for key, value in crawl_result.items():
36 | # print(key, value) # 此时value是一个列表,接下来通过索引组装每一个段子
37 | # print(value[i])
38 | one_res[key] = value[i]
39 | one_res['detail'] = recursive_join(one_res['detail']) if isinstance(one_res['detail'], list) else one_res[
40 | 'detail']
41 | result.append(one_res)
42 | return result
43 |
44 |
45 | # 对请求回来的数据进行解析封装
46 | def extract_html():
47 | # 定义一下抓取的列表
48 | title = []
49 | date = []
50 | author = []
51 | views_counts = []
52 | comments_counts = []
53 | detail_list = []
54 | for page_count in range(1, 21):
55 | time.sleep(random.randint(1, 3))
56 | url = "https://duanzixing.com/page/%d/" % page_count
57 | response = requests.get(url, headers=headers)
58 | # 使用xpath解析
59 | html = etree.HTML(response.text)
60 | # 获取段子标题
61 | one_page_title = html.xpath("/html/body/section/div/div/article/header/h2/a/text()")
62 | # 获取段子发布日期
63 | one_page_date = html.xpath("/html/body/section/div/div/article/p[1]/time/text()")
64 | # 获取段子作者
65 | one_page_author = html.xpath("/html/body/section/div/div/article/p[1]/span[1]/text()")
66 | # 获取段子浏览量
67 | views = html.xpath("/html/body/section/div/div/article/p[1]/span[2]/text()")
68 | one_page_views_counts = []
69 | # 处理views
70 | for view in views:
71 | views_count = re.findall(r"\d{1,}", view)[0]
72 | one_page_views_counts.append(views_count)
73 | # 获取评论数
74 | comments = html.xpath("/html/body/section/div/div/article/p[1]/a/text()")
75 | # 处理comments
76 | # comments_count = []
77 | # [comments_count.append(re.findall(r"\d+", comment)[0]) for comment in comments]
78 | one_page_comments_counts = [re.findall(r"\d+", comment)[0] for comment in comments]
79 | # 获取段子详情,这个需要进到段子里面,先获取段子详情页的url
80 | detail_urls = html.xpath("/html/body/section/div/div/article/header/h2/a/@href")
81 | # 对每个url进行requests请求,获取详情页的源码
82 | detail_code_list = [requests.get(detail_url, headers=headers).text for detail_url in detail_urls]
83 | # 再对每个详情页进行提取数据,存到detail_list这个列表中,这里需要注意,源码是根据<p>标签分割的,因此会转换成列表。
84 | one_page_detail_list = [etree.HTML(detail_code).xpath("/html/body/section/div/div/article/p/text()") for
85 | detail_code in detail_code_list]
86 | # 处理特殊字符
87 | for i in range(len(one_page_detail_list)):
88 | for j in range(len(one_page_detail_list[i])):
89 | one_page_detail_list[i][j] = one_page_detail_list[i][j].replace("\u200b", "")
90 | # 装到之前的列表里面
91 | title.extend(one_page_title)
92 | date.extend(one_page_date)
93 | author.extend(one_page_author)
94 | views_counts.extend(one_page_views_counts)
95 | comments_counts.extend(one_page_comments_counts)
96 | detail_list.extend(one_page_detail_list)
97 | print(f"第{page_count}页抓取封装完成~")
98 | # 把之前的数据封装成一个字典
99 | crawl_result = {'title': title, 'date': date, 'author': author, 'views_counts': views_counts,
100 | 'comments_counts': comments_counts, 'detail': detail_list}
101 | print("正在存储到数据库中~")
102 | return crawl_result
103 |
104 |
105 | # 存储到MySQL数据库
106 | def save_to_mysql(result):
107 | try:
108 | # TODO:存储到MySQL数据库
109 | # 连接数据库
110 | db = pymysql.connect(host="127.0.0.1", port=3306, user="root", password="123456", database="spidertestdb")
111 | cursor = db.cursor()
112 | # 创建表(title的varchar给大一点,VARCHAR(500)按字符数计算,500个字符存段子标题足够了)
113 | create_sql = """
114 | create table if not exists jokes
115 | (
116 | id int primary key auto_increment comment '主键',
117 | title varchar(500) comment '段子标题',
118 | date date comment '段子发布日期',
119 | author varchar(100) comment '段子作者',
120 | views_count int comment '浏览量',
121 | comments_count int comment '评论数',
122 | detail text comment '段子内容'
123 | )
124 | """
125 | # 建表语句
126 | cursor.execute(create_sql)
127 | delete_sql = "delete from jokes"
128 | # 删除之前的数据
129 | cursor.execute(delete_sql)
130 | # 往数据库插入数据
131 | # 遍历之前的字典
132 | for one_res in result:
133 | insert_sql = f"""
134 | insert into jokes (title, date, author, views_count, comments_count, detail)
135 | values (%s, %s, %s, %s, %s, %s)
136 | """
137 | try:
138 | inserted_count = cursor.execute(insert_sql, (
139 | one_res['title'], one_res['date'], one_res['author'], one_res['views_counts'],
140 | one_res['comments_counts'],
141 | one_res['detail']))
142 | # print(f"插入了{inserted_count}条数据")
143 | db.commit()
144 | except Exception as e:
145 | db.rollback()
146 | print(f"数据异常,回滚了~. 错误信息: {e},{one_res}")
147 | print("向MySQL数据库存储数据存储成功")
148 | except Exception as e:
149 | print(f"Mysql-未知异常~,异常信息为{e}")
150 |
151 |
152 | # 存储到MongoDB数据库
153 | def save_to_mongodb(result):
154 | try:
155 | # TODO:存储到MongoDB数据库
156 | # 连接数据库
157 | conn = MongoClient(host="localhost", port=27017)
158 | # 创建集合,插入数据时会自动创建
159 | spider_collection = conn.test.spider
160 | # 删除集合里面的数据
161 | spider_collection.delete_many({})
162 | # 遍历列表,把每一条数据都插入进去
163 | for one_result in result:
164 | try:
165 | one = spider_collection.insert_one(one_result)
166 | # print(f"数据插入成功,插入数据的id为{one.inserted_id}")
167 | except Exception as e:
168 | print(f"数据插入失败,异常信息为{e}")
169 | print("向MongoDB数据库存储数据存储成功")
170 | except Exception as e:
171 | print(f"未知异常~异常信息为{e}")
172 |
173 |
174 | # 存储到redis数据库
175 | def save_to_redis(result):
176 | try:
177 | # TODO:存储到redis数据库
178 | # 连接数据库
179 | r = redis.StrictRedis(host="192.168.182.100", port=6380, password='123456', db=0, decode_responses=True)
180 | # 清空之前库中的数据
181 | r.flushall()
182 | # 这里采用redis的hash来存储每一个段子,一条段子对应一个hash键
183 | # 键名用incr生成的自增id,那么1到joke_id_counter当前值就都是hash的键了
184 | # 但是这样没考虑删除的情况:要是删了某条,计数器不会回退,这时可以再用一个set(或Bitmap)维护现存的id,文件末尾附了一个示意
185 | # 遍历列表,把每一条数据都插入进去
186 | for one_result in result:
187 | # 首先生成一个id
188 | id = r.incr("joke_id_counter")
189 | r.hset(id, mapping=one_result)
190 | # 获取一下这个joke_id_counter里面的值,看看是不是正常的增加了
191 | count = r.get("joke_id_counter")
192 | # print(f"redis数据库现在段子数据共有{count}条")
193 | print("向Redis数据库存储数据存储成功")
194 | except Exception as e:
195 | print(f"Redis-未知异常~,异常信息为{e}")
196 |
197 |
198 | if __name__ == '__main__':
199 | headers = {
200 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.57'
201 | }
202 | url = "https://duanzixing.com/"
203 | # 封装数据
204 | crawl_result = extract_html()
205 | # 把数据转换成易于存储于数据库的格式
206 | result = trans_dict(crawl_result)
207 | # 所有数据都准备完毕了,接下来就是存储到数据库
208 | # 存储到mysql数据库
209 | save_to_mysql(result)
210 | # 存储到mongoDB数据库
211 | save_to_mongodb(result)
212 | # 这时需要注意,result被MongoDB数据库修改了加了一个_id,因此解决方法可以是在redis中把主键给删了或者重新转换一下
213 | result = trans_dict(crawl_result)
214 | # 存储到redis数据库
215 | save_to_redis(result)
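216 |
217 | # 补充一个小示意(演示用的假设代码,未实际运行):save_to_redis里提到删除段子后joke_id_counter不会回退,
218 | # 可以在写入hash的同时把id放进一个set,删除时再从set里移除,遍历时以set为准:
219 | # r.sadd("joke_ids", id)                # 写入hash的同时记录id
220 | # r.srem("joke_ids", some_id)           # 删除某条段子时同步移除(some_id为假设的变量)
221 | # for jid in r.smembers("joke_ids"):    # 遍历现存的段子
222 | #     print(r.hgetall(jid))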
216 |
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/__init__.py
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy/base_scrapy/__init__.py
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy/base_scrapy/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/__pycache__/items.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy/base_scrapy/__pycache__/items.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/__pycache__/pipelines.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy/base_scrapy/__pycache__/pipelines.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/__pycache__/settings.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy/base_scrapy/__pycache__/settings.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class ScrapyItem(scrapy.Item):
10 | # define the fields for your item here like:
11 | # name = scrapy.Field()
12 | title = scrapy.Field()
13 | date = scrapy.Field()
14 | author = scrapy.Field()
15 | views_counts = scrapy.Field()
16 | comments_counts = scrapy.Field()
17 | detail = scrapy.Field()
18 |
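For reference, a scrapy.Item behaves like a dict restricted to the declared fields, which is what makes it useful for catching typos early; ItemAdapter (imported by the pipeline later in this project) exposes it as a plain dict. A minimal usage sketch:

```python
# Sketch: populating the ScrapyItem declared above and reading it back.
from itemadapter import ItemAdapter

from base_scrapy.items import ScrapyItem  # project-relative import

item = ScrapyItem()
item["title"] = "某个段子标题"
item["views_counts"] = 123
# item["ttile"] = "..."  # a typo like this raises KeyError instead of passing silently
print(ItemAdapter(item).asdict())  # {'title': '某个段子标题', 'views_counts': 123}
```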
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/middlewares.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your spider middleware
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
5 |
6 | from scrapy import signals
7 |
8 | # useful for handling different item types with a single interface
9 | from itemadapter import is_item, ItemAdapter
10 |
11 |
12 | class ScrapySpiderMiddleware:
13 | # Not all methods need to be defined. If a method is not defined,
14 | # scrapy acts as if the spider middleware does not modify the
15 | # passed objects.
16 |
17 | @classmethod
18 | def from_crawler(cls, crawler):
19 | # This method is used by Scrapy to create your spiders.
20 | s = cls()
21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
22 | return s
23 |
24 | def process_spider_input(self, response, spider):
25 | # Called for each response that goes through the spider
26 | # middleware and into the spider.
27 |
28 | # Should return None or raise an exception.
29 | return None
30 |
31 | def process_spider_output(self, response, result, spider):
32 | # Called with the results returned from the Spider, after
33 | # it has processed the response.
34 |
35 | # Must return an iterable of Request, or item objects.
36 | for i in result:
37 | yield i
38 |
39 | def process_spider_exception(self, response, exception, spider):
40 | # Called when a spider or process_spider_input() method
41 | # (from other spider middleware) raises an exception.
42 |
43 | # Should return either None or an iterable of Request or item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info("Spider opened: %s" % spider.name)
57 |
58 |
59 | class ScrapyDownloaderMiddleware:
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info("Spider opened: %s" % spider.name)
104 |
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 | import time
6 |
7 | import pymysql
8 | # useful for handling different item types with a single interface
9 | from itemadapter import ItemAdapter
10 |
11 |
12 | class ScrapyPipeline:
13 | # 定义一些日志的变量
14 | page_count = 0
15 | count = 0
16 | start_time = None
17 | end_time = None
18 |
19 | # 创建一个递归函数,用来转换文章详情的多重列表(如果有的话)
20 | def recursive_join(self, lst, separator='\n'):
21 | result = []
22 | for item in lst:
23 | if isinstance(item, list):
24 | result.append(self.recursive_join(item, separator))
25 | else:
26 | result.append(item)
27 | return separator.join(result)
28 |
29 | # 把之前封装好的集合给组装成一条一条的数据返回回去,清洗一下数据
30 | def trans_dict(self, crawl_result):
31 | result = []
32 | for i in range(0, len(crawl_result['title'])):
33 | one_res = {}
34 | for key, value in crawl_result.items():
35 | # print(key, value) # 此时value是一个列表,接下来通过索引组装每一个段子
36 | # print(value[i])
37 | one_res[key] = value[i]
38 | one_res['detail'] = self.recursive_join(one_res['detail']) if isinstance(one_res['detail'], list) else \
39 | one_res[
40 | 'detail']
41 | result.append(one_res)
42 | return result
43 |
44 | def open_spider(self, spider):
45 | self.start_time = time.time()
46 | print('初始化数据库连接中......')
47 |
48 | # TODO:存储到MySQL数据库
49 | # 连接数据库
50 | self.db = pymysql.connect(host="127.0.0.1", port=3306, user="root", password="123456", database="spidertestdb")
51 | self.cursor = self.db.cursor()
52 |         # 创建表(title的varchar给大点,MySQL中varchar(500)按字符数计,存500个汉字没问题)
53 | create_sql = """
54 | create table if not exists jokes
55 | (
56 | id int primary key auto_increment comment '主键',
57 | title varchar(500) comment '段子标题',
58 | date date comment '段子发布日期',
59 | author varchar(100) comment '段子作者',
60 | views_count int comment '浏览量',
61 | comments_count int comment '评论数',
62 | detail text comment '段子内容'
63 | )
64 | """
65 | # 建表语句
66 | self.cursor.execute(create_sql)
67 | delete_sql = "delete from jokes"
68 | # 删除之前的数据
69 | self.cursor.execute(delete_sql)
70 | print("数据库连接初始化成功")
71 |
72 | # 爬虫文件中提取数据的方法每yield一次item,就会运行一次
73 | # 在这里转换数据类型和存储到mysql数据库
74 | def process_item(self, item, spider):
75 | # spider可以获取转发过来的item的spider的属性,方法和名称。如
76 | # print(spider.name) joke
77 | self.page_count += 1
78 | print(f"请稍等,正在为您努力爬取数据......已爬取{self.page_count}页")
79 | # 这里需要转换一下,把返回的数据转换成可以插入到数据库的格式
80 | result = self.trans_dict(item)
81 | # 往数据库插入数据
82 | # 遍历之前的字典
83 | for one_res in result:
84 | insert_sql = f"""
85 | insert into jokes (title, date, author, views_count, comments_count, detail)
86 | values (%s, %s, %s, %s, %s, %s)
87 | """
88 | try:
89 | inserted_count = self.cursor.execute(insert_sql, (
90 | one_res['title'], one_res['date'], one_res['author'], one_res['views_counts'],
91 | one_res['comments_counts'],
92 | one_res['detail']))
93 | self.count += inserted_count
94 | # print(f"插入了{inserted_count}条数据")
95 | self.db.commit()
96 | except Exception as e:
97 | self.db.rollback()
98 | print(f"数据异常,回滚了~. 错误信息: {e},{one_res}")
99 | return item
100 |
101 | def close_spider(self, spider):
102 | self.end_time = time.time()
103 | print(
104 | f'共爬取了{self.count}条数据,已存储到MySQL数据库中,花费{self.end_time - self.start_time}秒\n关闭数据库连接中......')
105 | self.db.close()
106 | print("数据库连接关闭成功")
107 |
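process_item above inserts and commits one row at a time, which is easy to follow but costs a round trip per joke. If the volume grows, the same loop can be collapsed into a single executemany call with one commit per page. A sketch of that variant, meant to live inside ScrapyPipeline and reuse its cursor/connection:

```python
# Sketch: batch-insert variant of the loop in process_item, for the same
# jokes table and the same `result` list of dicts produced by trans_dict.
def insert_page(self, result):
    insert_sql = """
        insert into jokes (title, date, author, views_count, comments_count, detail)
        values (%s, %s, %s, %s, %s, %s)
    """
    rows = [(r['title'], r['date'], r['author'], r['views_counts'],
             r['comments_counts'], r['detail']) for r in result]
    try:
        self.count += self.cursor.executemany(insert_sql, rows)
        self.db.commit()  # one commit per page instead of one per row
    except Exception as e:
        self.db.rollback()
        print(f"批量插入失败,已回滚: {e}")
```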
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for base_scrapy project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = "base_scrapy"
11 |
12 | SPIDER_MODULES = ["base_scrapy.spiders"]
13 | NEWSPIDER_MODULE = "base_scrapy.spiders"
14 | LOG_LEVEL = 'ERROR'
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | # USER_AGENT = "base_scrapy (+http://www.yourdomain.com)"
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = False
21 |
22 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
23 | # CONCURRENT_REQUESTS = 32
24 |
25 | # Configure a delay for requests for the same website (default: 0)
26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
27 | # See also autothrottle settings and docs
28 | # DOWNLOAD_DELAY = 3
29 | # The download delay setting will honor only one of:
30 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
31 | # CONCURRENT_REQUESTS_PER_IP = 16
32 |
33 | # Disable cookies (enabled by default)
34 | # COOKIES_ENABLED = False
35 |
36 | # Disable Telnet Console (enabled by default)
37 | # TELNETCONSOLE_ENABLED = False
38 |
39 | # Override the default request headers:
40 | DEFAULT_REQUEST_HEADERS = {
41 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
42 | "Accept-Language": "en",
43 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61'
44 | }
45 |
46 | # Enable or disable spider middlewares
47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
48 | # SPIDER_MIDDLEWARES = {
49 | # "base_scrapy.middlewares.ScrapySpiderMiddleware": 543,
50 | # }
51 |
52 | # Enable or disable downloader middlewares
53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
54 | # DOWNLOADER_MIDDLEWARES = {
55 | # "base_scrapy.middlewares.ScrapyDownloaderMiddleware": 543,
56 | # }
57 |
58 | # Enable or disable extensions
59 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
60 | # EXTENSIONS = {
61 | # "scrapy.extensions.telnet.TelnetConsole": None,
62 | # }
63 |
64 | # Configure item pipelines
65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
66 | ITEM_PIPELINES = {
67 | "base_scrapy.pipelines.ScrapyPipeline": 300,
68 | }
69 |
70 | # Enable and configure the AutoThrottle extension (disabled by default)
71 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
72 | # AUTOTHROTTLE_ENABLED = True
73 | # The initial download delay
74 | # AUTOTHROTTLE_START_DELAY = 5
75 | # The maximum download delay to be set in case of high latencies
76 | # AUTOTHROTTLE_MAX_DELAY = 60
77 | # The average number of requests Scrapy should be sending in parallel to
78 | # each remote server
79 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
80 | # Enable showing throttling stats for every response received:
81 | # AUTOTHROTTLE_DEBUG = False
82 |
83 | # Enable and configure HTTP caching (disabled by default)
84 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85 | # HTTPCACHE_ENABLED = True
86 | # HTTPCACHE_EXPIRATION_SECS = 0
87 | # HTTPCACHE_DIR = "httpcache"
88 | # HTTPCACHE_IGNORE_HTTP_CODES = []
89 | # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
90 |
91 | # Set settings whose default value is deprecated to a future-proof value
92 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
93 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
94 | FEED_EXPORT_ENCODING = "utf-8"
95 |
96 |
97 |
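The joke spider further down throttles itself with time.sleep; the commented-out settings above can do the same job at the framework level. A sketch with illustrative values (not the project's actual configuration):

```python
# Illustrative politeness settings for settings.py; the numbers are examples only.
DOWNLOAD_DELAY = 1                   # base delay between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 4   # cap parallel requests per domain
AUTOTHROTTLE_ENABLED = True          # let Scrapy adapt the delay to observed latency
AUTOTHROTTLE_START_DELAY = 2
AUTOTHROTTLE_MAX_DELAY = 30
```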
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/spiders/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy/base_scrapy/spiders/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/spiders/__pycache__/joke.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy/base_scrapy/spiders/__pycache__/joke.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/base_scrapy/spiders/joke.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | import re
4 | import time
5 |
6 | import requests
7 | import scrapy
8 | from lxml import etree
9 |
10 | from ..items import ScrapyItem
11 |
12 | """
13 | scrapy底层是基于协程的,这里还是之前的数据库的案例,看一下同样是存储20页数据,scrapy有多快。基础的scrapy使用就演示下面几点:
14 | 1:scrapy发送请求和转发请求
15 | 2:scrapy把数据转发到管道存储到数据库
16 | 3:items的使用
17 | """
18 |
19 |
20 | class JokeSpider(scrapy.Spider):
21 | name = "joke"
22 | allowed_domains = ["duanzixing.com"]
23 | start_urls = ["https://duanzixing.com"]
24 |
25 | def get_data(self, response):
26 | # TODO:常用的就是一个url和text,响应头也可以
27 | # # 获取响应的url地址
28 | print(f"响应的url:{response.url}")
29 | # # 获取当前响应对应的请求的url地址
30 | # print(f"请求的url:{response.request.url}")
31 | # # 解码响应头
32 | # decoded_response_headers = {
33 | # k.decode('utf-8'): [v.decode('utf-8') for v in v_list]
34 | # for k, v_list in response.headers.items()
35 | # }
36 | # print(f"响应头:{decoded_response_headers}")
37 | #
38 | # # 解码请求头
39 | # decoded_request_headers = {
40 | # k.decode('utf-8'): [v.decode('utf-8') for v in v_list]
41 | # for k, v_list in response.request.headers.items()
42 | # }
43 | # print(f"响应的请求头:{decoded_request_headers}")
44 | # # 获取响应体
45 | # # print(f"响应体为:{response.body}")
46 | # # 返回响应的内容(字符串形式)
47 | # # print(f"响应内容为:{response.text}")
48 | # # 获取响应状态码
49 | # print(f"响应状态码:{response.status}")
50 | # # 获取返回的json数据(解析不了会报错,用于请求后台接口的场景)
51 | # # json_data = json.loads(response.text)
52 | # # print(f"返回的json数据为:{json_data}")
53 |
54 | def parse(self, response, **kwargs):
55 | # self.get_data(response)
56 | # 获取请求的url
57 | url = "https://duanzixing.com/page/%d/"
58 | # 循环获取数据
59 | for page_count in range(1, 21):
60 | time.sleep(random.randint(1, 3))
61 | page_url = url % page_count
62 | # 把请求转发给parse_data这个方法进行处理
63 | yield scrapy.Request(page_url, callback=self.parse_data, meta={'url': page_url})
64 |
65 | def parse_data(self, response, **kwargs):
66 | # 这个url还是有用的,这个案例不按照顺序存,如果需要顺序存的时候把字典加个标志key就可以了,到时候排个序。
67 | url = response.meta['url']
68 | # 获取到这个response,就可以进行解析了
69 | # 获取段子标题
70 | one_page_title = response.xpath("/html/body/section/div/div/article/header/h2/a/text()").extract()
71 | # 获取段子发布日期
72 | one_page_date = response.xpath("/html/body/section/div/div/article/p[1]/time/text()").extract()
73 | # 获取段子作者
74 | one_page_author = response.xpath("/html/body/section/div/div/article/p[1]/span[1]/text()").extract()
75 | # 获取段子浏览量
76 | views = response.xpath("/html/body/section/div/div/article/p[1]/span[2]/text()").extract()
77 | one_page_views_counts = []
78 | # 处理views
79 | for view in views:
80 | views_count = re.findall(r"\d{1,}", view)[0]
81 | one_page_views_counts.append(views_count)
82 | # 获取评论数
83 | comments = response.xpath("/html/body/section/div/div/article/p[1]/a/text()").extract()
84 | # 处理comments
85 | # comments_count = []
86 | # [comments_count.append(re.findall(r"\d+", comment)[0]) for comment in comments]
87 | one_page_comments_counts = [re.findall(r"\d+", comment)[0] for comment in comments]
88 |
89 | # 获取段子详情,这个需要进到段子里面,先获取段子详情页的url
90 | detail_urls = response.xpath("/html/body/section/div/div/article/header/h2/a/@href").extract()
91 | # 对每个url进行requests请求,获取详情页的源码,这里当然也可以把所有参数带着转发给一个函数。scrapy.Request(...),
92 | # 但是为了简便就不这么写了。缺点就是变成同步的了。也可以使用async那一套
93 | detail_code_list = [requests.get(detail_url, headers=self.settings.get('DEFAULT_REQUEST_HEADERS')).text for
94 | detail_url in detail_urls]
95 |         # 再对每个详情页进行提取数据,存到detail_list这个列表中,这里需要注意,正文是按<p>标签分割的,因此会转换成列表。
96 | one_page_detail_list = [etree.HTML(detail_code).xpath("/html/body/section/div/div/article/p/text()") for
97 | detail_code in detail_code_list]
98 | # 处理特殊字符
99 | for i in range(len(one_page_detail_list)):
100 | for j in range(len(one_page_detail_list[i])):
101 | one_page_detail_list[i][j] = one_page_detail_list[i][j].replace("\u200b", "")
102 | # 把之前的数据封装成一个字典
103 | crawl_result = {'title': one_page_title, 'date': one_page_date, 'author': one_page_author,
104 | 'views_counts': one_page_views_counts,
105 | 'comments_counts': one_page_comments_counts, 'detail': one_page_detail_list}
106 | # 转发给管道存储数据
107 | # yield crawl_result
108 | # 当然也可以使用之前的item
109 | item = ScrapyItem()
110 | item['title'] = one_page_title
111 | item['date'] = one_page_date
112 | item['author'] = one_page_author
113 | item['views_counts'] = one_page_views_counts
114 | item['comments_counts'] = one_page_comments_counts
115 | item['detail'] = one_page_detail_list
116 | yield item
117 |
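The comment in parse_data already points out that fetching detail pages with requests.get makes that step synchronous, and names scrapy.Request as the alternative. A simplified sketch of that route follows; JokeDetailSketch and parse_detail are illustrative names, and the other list-page fields are omitted for brevity.

```python
import scrapy


class JokeDetailSketch(scrapy.Spider):
    """Hypothetical cut-down spider showing the scrapy.Request variant
    mentioned in parse_data, instead of blocking on requests.get."""
    name = "joke_detail_sketch"
    allowed_domains = ["duanzixing.com"]
    start_urls = ["https://duanzixing.com/"]

    def parse(self, response, **kwargs):
        # same list-page xpath as the original spider
        detail_urls = response.xpath(
            "/html/body/section/div/div/article/header/h2/a/@href").extract()
        for detail_url in detail_urls:
            yield scrapy.Request(detail_url, callback=self.parse_detail)

    def parse_detail(self, response):
        # same detail-page xpath as the original, joined into a single string
        yield {
            "url": response.url,
            "detail": "\n".join(response.xpath(
                "/html/body/section/div/div/article/p/text()").extract()),
        }
```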
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = base_scrapy.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = base_scrapy
12 |
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy_plus/__init__.py
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/__init__.py
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/__pycache__/items.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/__pycache__/items.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/__pycache__/middlewares.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/__pycache__/middlewares.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/__pycache__/pipelines.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/__pycache__/pipelines.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/__pycache__/settings.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/__pycache__/settings.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class BaseScrapyPlusItem(scrapy.Item):
10 | # 定义一下需要爬取的数据,文章标题,内容,标签下面的文章标题,分类下面的文章标题
11 | # 这里规范的应该是定义三个不同的item类,这里为了简便就不定义了,不能为了规范而规范
12 | title = scrapy.Field()
13 | content = scrapy.Field()
14 |     tag = scrapy.Field()
15 |     tag_title = scrapy.Field()
16 |     cate = scrapy.Field()
17 |     cate_title = scrapy.Field()
18 |
19 |
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/middlewares.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your spider middleware
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
5 | import random
6 |
7 | from scrapy import signals
8 | from base_scrapy_plus.settings import USER_AGENTS_LIST # 注意导入路径,请忽视pycharm的错误提示
9 |
10 | # useful for handling different item types with a single interface
11 | from itemadapter import is_item, ItemAdapter
12 | # 配置随机UA中间件
13 | class UserAgentMiddleware(object):
14 | def process_request(self, request, spider):
15 | user_agent = random.choice(USER_AGENTS_LIST)
16 | request.headers['User-Agent'] = user_agent
17 |
18 | class BaseScrapyPlusSpiderMiddleware:
19 | # Not all methods need to be defined. If a method is not defined,
20 | # scrapy acts as if the spider middleware does not modify the
21 | # passed objects.
22 |
23 | @classmethod
24 | def from_crawler(cls, crawler):
25 | # This method is used by Scrapy to create your spiders.
26 | s = cls()
27 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
28 | return s
29 |
30 | def process_spider_input(self, response, spider):
31 | # Called for each response that goes through the spider
32 | # middleware and into the spider.
33 |
34 | # Should return None or raise an exception.
35 | return None
36 |
37 | def process_spider_output(self, response, result, spider):
38 | # Called with the results returned from the Spider, after
39 | # it has processed the response.
40 |
41 | # Must return an iterable of Request, or item objects.
42 | for i in result:
43 | yield i
44 |
45 | def process_spider_exception(self, response, exception, spider):
46 | # Called when a spider or process_spider_input() method
47 | # (from other spider middleware) raises an exception.
48 |
49 | # Should return either None or an iterable of Request or item objects.
50 | pass
51 |
52 | def process_start_requests(self, start_requests, spider):
53 | # Called with the start requests of the spider, and works
54 | # similarly to the process_spider_output() method, except
55 | # that it doesn’t have a response associated.
56 |
57 | # Must return only requests (not items).
58 | for r in start_requests:
59 | yield r
60 |
61 | def spider_opened(self, spider):
62 | spider.logger.info("Spider opened: %s" % spider.name)
63 |
64 |
65 | class BaseScrapyPlusDownloaderMiddleware:
66 | # Not all methods need to be defined. If a method is not defined,
67 | # scrapy acts as if the downloader middleware does not modify the
68 | # passed objects.
69 |
70 | @classmethod
71 | def from_crawler(cls, crawler):
72 | # This method is used by Scrapy to create your spiders.
73 | s = cls()
74 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
75 | return s
76 |
77 | def process_request(self, request, spider):
78 | # Called for each request that goes through the downloader
79 | # middleware.
80 |
81 | # Must either:
82 | # - return None: continue processing this request
83 | # - or return a Response object
84 | # - or return a Request object
85 | # - or raise IgnoreRequest: process_exception() methods of
86 | # installed downloader middleware will be called
87 | return None
88 |
89 | def process_response(self, request, response, spider):
90 | # Called with the response returned from the downloader.
91 |
92 | # Must either;
93 | # - return a Response object
94 | # - return a Request object
95 | # - or raise IgnoreRequest
96 | return response
97 |
98 | def process_exception(self, request, exception, spider):
99 | # Called when a download handler or a process_request()
100 | # (from other downloader middleware) raises an exception.
101 |
102 | # Must either:
103 | # - return None: continue processing this exception
104 | # - return a Response object: stops process_exception() chain
105 | # - return a Request object: stops process_exception() chain
106 | pass
107 |
108 | def spider_opened(self, spider):
109 | spider.logger.info("Spider opened: %s" % spider.name)
110 |
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 | import os.path
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | # 这里为了简便,写到三个不同的md文件里面就可以了
12 | class BaseScrapyPlusPipeline:
13 | content = None
14 | tag = None
15 | cate = None
16 |
17 | def open_spider(self, spider):
18 | print("初始化环境中~")
19 | if not os.path.exists('./爬取的文件'):
20 | os.mkdir('./爬取的文件')
21 | self.content = open('./爬取的文件/文章.md', mode='w', encoding='utf-8')
22 | self.tag = open('./爬取的文件/标签.md', mode='w', encoding='utf-8')
23 | self.cate = open('./爬取的文件/分类.md', mode='w', encoding='utf-8')
24 |
25 | def process_item(self, item, spider):
26 | # 判断传过来的item是哪个方法传过来的
27 | if 'title' in item:
28 | print("文章正在写入中......")
29 | self.content.write("# ")
30 | self.content.write(item['title'])
31 | self.content.write("\n")
32 | self.content.write(item['content'].replace("#", ""))
33 | self.content.write("\n")
34 | elif 'tag' in item:
35 | print("标签正在写入中......")
36 | self.tag.write("## ")
37 | self.tag.write(item['tag'])
38 | self.tag.write("\n")
39 | for title in item['tag_title']:
40 | self.tag.write('- ')
41 | self.tag.write(title)
42 | self.tag.write("\n")
43 | elif 'cate' in item:
44 | print("分类正在写入中......")
45 | self.cate.write("## ")
46 | self.cate.write(item['cate'])
47 | self.cate.write("\n")
48 | for title in item['cate_title']:
49 | self.cate.write('- ')
50 | self.cate.write(title)
51 | self.cate.write("\n")
52 | return item
53 |
54 | def close_spider(self, spider):
55 | print(f"网站经过筛选的总url数量为:{spider.count}")
56 | print("关闭环境中")
57 | self.content.close()
58 | self.tag.close()
59 | self.cate.close()
60 |
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for base_scrapy_plus project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = "base_scrapy_plus"
11 |
12 | SPIDER_MODULES = ["base_scrapy_plus.spiders"]
13 | NEWSPIDER_MODULE = "base_scrapy_plus.spiders"
14 |
15 | LOG_LEVEL="ERROR"
16 |
17 |
18 | # 随机UA,给下载中间件使用
19 | USER_AGENTS_LIST = [
20 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
21 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
22 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
23 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
24 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
25 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
26 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
27 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
28 | ]
29 |
30 |
31 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
32 | #USER_AGENT = "base_scrapy_plus (+http://www.yourdomain.com)"
33 |
34 | # Obey robots.txt rules
35 | ROBOTSTXT_OBEY = False
36 |
37 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
38 | #CONCURRENT_REQUESTS = 32
39 |
40 | # Configure a delay for requests for the same website (default: 0)
41 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
42 | # See also autothrottle settings and docs
43 | #DOWNLOAD_DELAY = 3
44 | # The download delay setting will honor only one of:
45 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
46 | #CONCURRENT_REQUESTS_PER_IP = 16
47 |
48 | # Disable cookies (enabled by default)
49 | #COOKIES_ENABLED = False
50 |
51 | # Disable Telnet Console (enabled by default)
52 | #TELNETCONSOLE_ENABLED = False
53 |
54 | # Override the default request headers:
55 | DEFAULT_REQUEST_HEADERS = {
56 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
57 | "Accept-Language": "en",
58 | }
59 |
60 | # Enable or disable spider middlewares
61 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
62 | #SPIDER_MIDDLEWARES = {
63 | # "base_scrapy_plus.middlewares.BaseScrapyPlusSpiderMiddleware": 543,
64 | #}
65 |
66 | # Enable or disable downloader middlewares
67 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
68 | # 下载中间件添加随机UA
69 | DOWNLOADER_MIDDLEWARES = {
70 |     "base_scrapy_plus.middlewares.UserAgentMiddleware": 543,  # 注册上面middlewares.py里定义的随机UA中间件,否则随机UA不会生效
71 | }
72 |
73 | # Enable or disable extensions
74 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
75 | #EXTENSIONS = {
76 | # "scrapy.extensions.telnet.TelnetConsole": None,
77 | #}
78 |
79 | # Configure item pipelines
80 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
81 | ITEM_PIPELINES = {
82 | "base_scrapy_plus.pipelines.BaseScrapyPlusPipeline": 300,
83 | 'scrapy_redis.pipelines.RedisPipeline': 800,
84 |
85 | }
86 |
87 | # Enable and configure the AutoThrottle extension (disabled by default)
88 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
89 | #AUTOTHROTTLE_ENABLED = True
90 | # The initial download delay
91 | #AUTOTHROTTLE_START_DELAY = 5
92 | # The maximum download delay to be set in case of high latencies
93 | #AUTOTHROTTLE_MAX_DELAY = 60
94 | # The average number of requests Scrapy should be sending in parallel to
95 | # each remote server
96 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
97 | # Enable showing throttling stats for every response received:
98 | #AUTOTHROTTLE_DEBUG = False
99 |
100 | # Enable and configure HTTP caching (disabled by default)
101 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
102 | #HTTPCACHE_ENABLED = True
103 | #HTTPCACHE_EXPIRATION_SECS = 0
104 | #HTTPCACHE_DIR = "httpcache"
105 | #HTTPCACHE_IGNORE_HTTP_CODES = []
106 | #HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
107 |
108 | # Set settings whose default value is deprecated to a future-proof value
109 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
110 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
111 | FEED_EXPORT_ENCODING = "utf-8"
112 |
113 | # 配置分布式
114 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
115 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
116 | SCHEDULER_PERSIST = True
117 | #
118 | #
119 | # 配置redis数据库信息
120 | REDIS_HOST = "192.168.182.100"
121 | REDIS_PORT = 6380
122 | REDIS_PARAMS = {'password': '123456'}
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/spiders/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/spiders/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/spiders/__pycache__/blogspider.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/spiders/__pycache__/blogspider.cpython-311.pyc
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/spiders/blogspider.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy.linkextractors import LinkExtractor
3 | from scrapy.spiders import CrawlSpider, Rule
4 | from scrapy_redis.spiders import RedisCrawlSpider
5 | from base_scrapy_plus.items import BaseScrapyPlusItem
6 |
7 | """
8 | 这里演示scrapy的全站抓取和分布式爬虫,配置随机UA,scrapy的知识点基本都用到了。下载中间件,items,管道,分布式爬取,全站抓取。
9 | 全站抓取一定要慎重,不要影响别人网站的正常运行。配置好rules和delay。这里以我的博客演示,一共就几十个url,快一点点。
10 | 但是这里需要思考,我们要哪个页面是为了提取什么数据,
11 | 打开具体的文章,就获取文章标题和内容。定义一个提取文章数据的方法parse_article
12 | 比如说我打开博客的标签,就是获取标签下的文章标题,就定义一个提取文章标题的方法parse_tag_headlines。
13 | 打开博客的分类,要获取当前分类下的文章标题,定义一个提取分类下面文章标题的方法parse_cate_headlines
14 | 分别转发给不同的parse方法就可以了。引入分布式爬虫,快一些
15 | """
16 |
17 |
18 | class BlogspiderSpider(RedisCrawlSpider):
19 | count = 0
20 | name = "blogspider"
21 | allowed_domains = ["bugdesigner.cn"]
22 | # start_urls = ["https://bugdesigner.cn"]
23 |     redis_key = 'blogQuene'  # 监听的redis队列名(key),起始url要lpush到这个key里
24 |
25 | rules = (
26 | Rule(LinkExtractor(allow=r"https://www.bugdesigner.cn/(?!tag/|cate/|aboutme/)[^/]+/$"),
27 | callback="parse_article",
28 | follow=True),
29 | # 匹配https://www.bugdesigner.cn/tag/开头的,中间匹配不是/的字符,遇到/结束
30 | Rule(LinkExtractor(allow=r"https://www.bugdesigner.cn/tag/[^/]+/$"),
31 | callback="parse_tag_headlines",
32 | follow=True),
33 | Rule(LinkExtractor(allow=r"https://www.bugdesigner.cn/category/[^/]+/$"),
34 | callback="parse_cate_headlines",
35 | follow=True),
36 | Rule(LinkExtractor(allow=r".*"), callback="parse_count", follow=False),)
37 |
38 | # 保底的方法
39 | def parse_count(self, response):
40 | self.count += 1
41 | # print(response.request.url)
42 |
43 | # 批量抓取文章
44 | def parse_article(self, response):
45 | # print(response.request.url)
46 | # 实例化item,一个字典
47 | item = BaseScrapyPlusItem()
48 | # 记录爬取的url个数
49 | self.count += 1
50 | # 提取博客文章标题
51 | item['title'] = response.xpath("/html/body/div[2]/div/div/div[1]/div[1]/div[2]/h1/text()").extract_first()
52 | # print(title)
53 | # print(response.text)
54 | # 提取博客文章内容
55 | item['content'] = response.xpath('//div[@id="lightgallery"]').extract_first()
56 | # print(content)
57 | yield item
58 |
59 | # 获取标签和对应的文章标题
60 | def parse_tag_headlines(self, response):
61 | self.count += 1
62 | # print(response.request.url)
63 | # 提取当前标签下面的所有文章标题,存储到item里面
64 | item = BaseScrapyPlusItem()
65 | # 获取标签名
66 | item['tag'] = response.xpath('//div[@class="k-header"]//div[@class="title"]/text()').extract_first()
67 | # 获取标签名下面的文章,文章不止一个
68 | item['tag_title'] = response.xpath('//div[@class="row"]//h3[@class="title"]/a/text()').extract()
69 | # print(item['tag'], item['tag_title'], self.i)
70 | yield item
71 |
72 | # 获取分类和对应的文章标题
73 | def parse_cate_headlines(self, response):
74 | self.count += 1
75 | # print(response.request.url)
76 | item = BaseScrapyPlusItem()
77 | # 提取文章分类名
78 | item['cate'] = response.xpath('//div[@class="k-header"]//div[@class="title"]/text()').extract_first()
79 | # 提取下面的文章名
80 | item['cate_title'] = response.xpath('//div[@class="row"]//h3[@class="title"]/a/text()').extract()
81 | # print(item['cate'], item['cate_title'])
82 | yield item
83 |
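A RedisCrawlSpider stays idle until a start URL shows up under its redis_key ('blogQuene' above), so the crawl has to be kicked off by seeding that queue. A sketch using the Redis connection values from this project's settings.py:

```python
# Sketch: seed the queue that blogspider listens on; host/port/password come
# from base_scrapy_plus/settings.py.
import redis

r = redis.StrictRedis(host="192.168.182.100", port=6380, password="123456")
r.lpush("blogQuene", "https://www.bugdesigner.cn")  # the spider picks this up and starts crawling
```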
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/base_scrapy_plus/start.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 |
4 |
5 | if __name__ == '__main__':
6 | cmdline.execute("scrapy crawl blogspider".split())
7 |
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/base_scrapy_plus/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = base_scrapy_plus.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = base_scrapy_plus
12 |
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/selenium入门.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from selenium.webdriver import Chrome
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.chrome.options import Options
6 |
7 | """
8 | selenium本来是一个自动化测试工具,后来因为它是直接操作浏览器不容易被反爬就被用到了爬虫领域,但是一般用的也不太多。实在请求不到数据了,
9 | 再使用它。
10 | 下面演示这几个方面:
11 | 1:selenium快速启动
12 | 2:selenium定位元素
13 | 3:selenium进阶操作,无头浏览器,处理cookie,解决frame嵌套问题
14 | """
15 | # # TODO:selenium快速启动
16 | # # 创建浏览器对象
17 | # blog = Chrome()
18 | # # 访问我的博客网站
19 | # blog.get("http://www.bugdesigner.cn")
20 | # blog.maximize_window() #最大化浏览器窗口
21 | # # 打印我的博客网站的标题
22 | # print(blog.title)
23 | # # 给主页截个图
24 | # blog.save_screenshot("homepage.png")
25 | #
26 | # # 搜索关于Docker的文章
27 | # # 找到输入框,给输入框设置值为Docker
28 | # blog.find_element("id", "search-widgets").send_keys("Docker")
29 | # # 点击搜索按钮
30 | # blog.find_element("id", "searchsubmit").click()
31 | #
32 | # # 获取页面内容
33 | # source_code = blog.page_source
34 | # # print(source_code)
35 | # # 获取cookie
36 | # cookies = blog.get_cookies()
37 | # # print(cookies)
38 | # # 获取当前url
39 | # url = blog.current_url
40 | # # print(url)
41 |
42 | # # TODO:selenium定位元素
43 | # # selenium提供很多选择器,但是常用的只有三个,css,id,xpath。
44 | # # 使用选择器值定位。获取博客内有关Docker的文章内容
45 | # labels = blog.find_elements(By.CSS_SELECTOR, ".a-post .label")
46 | # titles = blog.find_elements(By.CSS_SELECTOR, ".a-post .title a")
47 | # contents = blog.find_elements(By.CSS_SELECTOR, ".a-post .content p")
48 | # data = {
49 | # "label": labels,
50 | # "title": titles,
51 | # "content": contents
52 | # }
53 | # for key, value in data.items():
54 | # for index, item in enumerate(value):
55 | # print(f"第{index + 1}个文章的{key}为{item.text}")
56 |
57 | # # TODO:selenium进阶操作
58 | # # 1. 获取当前所有的窗口
59 | # current_windows = blog.window_handles
60 | # # 2. 根据窗口索引进行切换
61 | # blog.switch_to.window(current_windows[1])
62 |
63 | # # 无头浏览器
64 | # opt = Options()
65 | # opt.add_argument("--headless")
66 | # opt.add_argument('--disable-gpu')
67 | # opt.add_argument("--window-size=4000,1600") # 设置窗口大小
68 | #
69 | # noHeaderDriver = Chrome(options=opt)
70 | # noHeaderDriver.get("https://www.bugdesigner.cn")
71 | # cookies = noHeaderDriver.get_cookies()
72 |
73 | # # 添加cookie
74 | # for cookie in cookies:
75 | # print(cookie)
76 | # noHeaderDriver.add_cookie(cookie)
77 | # noHeaderDriver.get("https://www.bugdesigner.cn")
78 | # title1 = noHeaderDriver.title
79 | # print(title1)
80 | # # 不让进程停止,这样浏览器就不会自动退出
81 | # input("输入任何东西结束")
82 | # 对页面的操作
83 | # blog.forward() # 前进
84 | # blog.back() # 后退
85 | # blog.refresh() # 刷新
86 | # blog.close() # 退出当前页面
87 | # blog.quit() # 退出浏览器
88 |
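The cookies collected with get_cookies() above are not only useful for re-injecting into the headless browser; they can also be copied into a requests session so that follow-up pages are fetched without driving the browser at all. A small sketch of that hand-off:

```python
# Sketch: reuse cookies collected by Selenium in a plain requests session.
import requests
from selenium.webdriver import Chrome

driver = Chrome()
driver.get("https://www.bugdesigner.cn")
session = requests.Session()
for cookie in driver.get_cookies():  # same get_cookies() call as above
    session.cookies.set(cookie["name"], cookie["value"])
resp = session.get("https://www.bugdesigner.cn")
print(resp.status_code)
driver.quit()
```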
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/selenium练习—模拟qq邮箱登录.py:
--------------------------------------------------------------------------------
1 | # 实现一些访问时候的停止功能模块
2 | import random
3 | import time
4 | # 创建Chrome的web驱动模块和选择器模块和可选项模块
5 | from selenium.webdriver import Chrome
6 | from selenium.webdriver.common.by import By
7 | from selenium.webdriver.chrome.options import Options
8 | # 显式等待模块,没用上
9 | from selenium.webdriver.support.ui import WebDriverWait
10 | # 模拟鼠标行为的模块
11 | from selenium.webdriver.common.action_chains import ActionChains
12 | # 打码平台用到的模块
13 | import base64
14 | import requests
15 |
16 | """
17 | 这个练习实现了QQ邮箱的自动登录,技术点有:
18 | 1:iframe的嵌套
19 | 2:打码平台的使用
20 | 3:一些选择器,基本语法,隐式显式等待,全局option等等
21 | 用户名和密码改一下就能用,打码平台token需要自己整一个。
22 | """
23 |
24 |
25 | # 定义一个切换iframe的函数
26 | def change_iframe():
27 | qq_driver.switch_to.default_content() # 先回到主页面
28 | iframe = qq_driver.find_element(By.XPATH,
29 | "//iframe[contains(@src,'https://graph.qq.com/oauth2.0/authorize?response_type=code&client_id=102013353&scope=get_user_info%2Cget_app_friends&theme=10&auth_item_state=1&redirect_uri=https%3A%2F%2Fwx.mail.qq.com%2Flist%2Freadtemplate%3Fname%3Dlogin_jump.html%26scene%3D1%26login_type%3Dqq')]")
30 | qq_driver.switch_to.frame(iframe)
31 | iframe2 = qq_driver.find_element(By.XPATH,
32 | "//iframe[contains(@src,'https://xui.ptlogin2.qq.com/cgi-bin/xlogin?appid=716027609&daid=383&style=33&login_text=%E7%99%BB%E5%BD%95&hide_title_bar=1&hide_border=1&target=self&s_url=https%3A%2F%2Fgraph.qq.com%2Foauth2.0%2Flogin_jump&pt_3rd_aid=102013353&pt_feedback_link=https%3A%2F%2Fsupport.qq.com%2Fproducts%2F77942%3FcustomInfo%3D.appid102013353&theme=10&verify_theme=')]")
33 | qq_driver.switch_to.frame(iframe2)
34 |
35 |
36 | # 打码平台的函数
37 | def decode_code():
38 | # 开始打码
39 | url = "http://www.jfbym.com/api/YmServer/customApi"
40 | with open(r'verifyCode.png', 'rb') as f:
41 | im = base64.b64encode(f.read()).decode()
42 | data = {
43 | 'token': 'xxxxxxxx', # 输入自己的token
44 | 'type': '30221',
45 | 'image': im,
46 | }
47 | _headers = {
48 | 'Content-Type': 'application/json'
49 | }
50 | response = requests.request("POST", url, headers=_headers, json=data)
51 | # 其实也就是中点位置,这个时候对数据进行处理,边框大概是4px,忽略就好了
52 | # 对x值进行这样的处理,如果说x>290px,那么直接给x赋值为135(最右边)。如果说<10px,直接给x赋值为-135(最左边).其他的就给x值减去150px
53 | # 对y值进行这样的处理,直接给y-115px就可以了
54 | # 点击验证码模拟鼠标操作
55 | coords_str = response.json().get('data').get('data')
56 | coords = [tuple(map(int, coord.split(','))) for coord in coords_str.split('|')]
57 | print(coords)
58 | # 使用ActionChains模拟鼠标操作
59 | actions = ActionChains(qq_driver)
60 | verify_frame = qq_driver.find_element(By.ID, "newVcodeIframe")
61 | for coord in coords:
62 | x, y = coord
63 | if x > 290:
64 | x = 135
65 | elif x < 10:
66 | x = -135
67 | else:
68 | x = x - 150
69 | y = y - 115
70 | # 为了提供更精确的点击,将鼠标移到验证码元素的左上角,然后偏移特定的x, y坐标
71 | actions.move_to_element_with_offset(verify_frame, x, y).click().perform()
72 |
73 |
74 | # 通过验证码的函数
75 | def pass_verifyCode():
76 | global newVcodeIframe
77 | while newVcodeIframe.is_displayed():
78 | # 出现弹框了截个图,截取指定部分的,等下发给打码平台
79 | qq_driver.find_element(By.ID, "newVcodeIframe").screenshot("verifyCode.png")
80 | print("验证码截图成功~")
81 | # 开始打码
82 | decode_code()
83 | change_iframe()
84 | # 这个非常重要,你还得进一层iframe,不然找不到那个按钮
85 | path3 = "//iframe[contains(@src,'https://captcha.gtimg.com/1/template/drag_ele.html')]"
86 | iframe3 = qq_driver.find_element(By.XPATH, path3)
87 | qq_driver.switch_to.frame(iframe3)
88 | # 点击确认按钮
89 | qq_driver.find_element(By.CLASS_NAME, "verify-btn-text").click()
90 | # 强制等待5s,看看页面是不是跳转了,如果验证失败或者二次验证就继续循环验证
91 | time.sleep(5)
92 | change_iframe()
93 | # 更新页面的可见性,这里之所以获取一下是因为如果再循环就报错了,严谨一点
94 | newVcodeIframe = qq_driver.find_element(By.ID, "newVcodeIframe")
95 |
96 |
97 | def input_info():
98 | # 获取用户名和密码这个输入框,填充数据并点击登录
99 | qq_driver.find_element(By.ID, "u").send_keys("xxxxxx")
100 | qq_driver.find_element(By.ID, "p").send_keys("xxxxx")
101 | # 旧版验证码,因此点击太快会提示你没有输入验证码
102 | # verify_area = qq_driver.find_element(By.ID, "verifycode")
103 | # current_value = verify_area.get_attribute("value")
104 | qq_driver.implicitly_wait(10)
105 | # 获取新版验证码对象
106 | newVcodeArea = qq_driver.find_element(By.ID, "newVcodeArea")
107 | # 腾讯本身表单有问题,就一直点击这个按钮,除非出现了验证码。还需要考虑不需要登录的情况,点击一次系统会自动更新这个可见性
108 | # 因此循环内不需更新条件也不会出现死循环,
109 | while not newVcodeArea.is_displayed():
110 | try:
111 | qq_driver.find_element(By.ID, "login_button").click()
112 | except:
113 | time.sleep(random.randint(1, 3))
114 | print("继续点击按钮~")
115 |
116 |
117 | if __name__ == '__main__':
118 | # 禁用弹出框
119 | chrome_options = Options()
120 | chrome_options.add_argument("--disable-popup-blocking")
121 | # 创建驱动对象
122 | qq_driver = Chrome(options=chrome_options)
123 | # 启动浏览器
124 | qq_driver.get("https://mail.qq.com")
125 | qq_driver.maximize_window()
126 | # 改变当前frame
127 | change_iframe()
128 | # 获取密码登录这个超链接并点击
129 | qq_driver.find_element(By.ID, "switcher_plogin").click()
130 | # 输入用户名和密码
131 | input_info()
132 | # 强制等5s,给验证码加载的时间
133 | time.sleep(5)
134 | newVcodeIframe = qq_driver.find_element(By.ID, "newVcodeIframe")
135 | # 通过验证码
136 | pass_verifyCode()
137 | # 通过验证码以后一般就是进去了,但是还有的还需要进行短信验证,这里需要虚拟机或者说第三方云平台啥的。但是原理是一样的
138 | # 来个死循环,让页面不自动关闭
139 | while True:
140 | pass
141 |
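The script imports WebDriverWait but ends up relying on fixed time.sleep(5) pauses. A sketch of the explicit-wait pattern that could replace them, reusing the qq_driver object and the newVcodeIframe id from the code above (the 10-second timeout is an arbitrary example):

```python
# Sketch: explicit wait instead of a fixed sleep, using the already imported
# WebDriverWait together with expected_conditions.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

wait = WebDriverWait(qq_driver, 10)  # qq_driver is the Chrome driver created in __main__
# block until the new-style captcha iframe is actually visible, or time out after 10s
newVcodeIframe = wait.until(
    EC.visibility_of_element_located((By.ID, "newVcodeIframe")))
```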
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/案例-爬取某上市公司网站.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AZCodingAccount/python-spider/4447dd3277f492e5b63f94391938baf402f701ea/第5章-selenium和Scrapy/案例-爬取某上市公司网站.py
--------------------------------------------------------------------------------
/第5章-selenium和Scrapy/案例-爬取网易新闻.py:
--------------------------------------------------------------------------------
1 | import time
2 | import csv
3 |
4 | import requests
5 | from selenium.webdriver import Chrome
6 | from selenium.webdriver.common.by import By
7 |
8 | def selenium_crawl():
9 | chrome = Chrome()
10 | chrome.get("https://news.163.com/")
11 | chrome.maximize_window()
12 | load_more_tips = False
13 | count = 1
14 |     # 模拟页面滑动,页面数据是动态加载的,循环滑动并点击"加载更多"共6次(这样容易被检测)
15 | while count <= 6:
16 | print(f"第{count}次滑动")
17 | count += 1
18 | chrome.execute_script("window.scrollTo(0, document.body.scrollHeight);")
19 | # 点击加载更多
20 | button = chrome.find_element(By.XPATH,
21 | '//*[@id="index2016_wrap"]/div[3]/div[2]/div[3]/div[2]/div[5]/div/a[3]')
22 | # button.click()
23 | chrome.execute_script('arguments[0].click();', button)
24 | time.sleep(4)
25 | # 错误示范,没有考虑到一个文章有多个img
26 | # html = chrome.page_source
27 | # print(html)
28 | # # 从页面上提取数据
29 | # titles = chrome.find_elements(By.XPATH,
30 | # '//div[contains(@class,"ndi_main")]/div[contains(@class,"data_row")]//div[contains(@class,"news_title")]/h3//a')
31 | # # 这里的图片可能有多个
32 | # image_hrefs = chrome.find_elements(By.XPATH, "//div[contains(@class,'ndi_main')]/div[contains(@class,'data_row')]//img")
33 | # article_hrefs = chrome.find_elements(By.XPATH,
34 | # "//div[contains(@class,'ndi_main')]/div[contains(@class,'data_row')]//div[contains(@class,'news_title')]/h3//a")
35 | # 遍历取出每一项,并把他们添加到列表中,列表中
36 | # articles = [{'title': title, 'url': url, 'img_url': img_url} for title, url, img_url in
37 | # zip(titles, article_hrefs, image_hrefs)]
38 | # for index, title in enumerate(titles):
39 | # print(index, title.text)
40 | #
41 | # for index, title in enumerate(image_hrefs):
42 | # print(index, title.get_attribute("src"))
43 | # for index, title in enumerate(article_hrefs):
44 | # print(index, title.get_attribute("href"))
45 |
46 | article_elements = chrome.find_elements(By.XPATH,
47 | '//div[contains(@class,"ndi_main")]/div[contains(@class,"data_row")]') # 定位所有文章
48 | # 遍历所有文章取出每一个文章,并对这些文章进行提取
49 | articles = []
50 | for article_element in article_elements:
51 | # 对每篇文章,提取标题、URL等信息
52 | title = article_element.find_element(By.XPATH, ".//h3/a").text
53 | url = article_element.find_element(By.XPATH, ".//h3/a").get_attribute('href')
54 |
55 | # 提取当前文章内的所有图片链接
56 | img_urls = [img.get_attribute('src') for img in article_element.find_elements(By.XPATH, ".//img")]
57 |
58 | # 将信息添加到列表中
59 | articles.append({'title': title, 'url': url, 'img_urls': img_urls})
60 |
61 | # 打印共有多少新闻和新闻详情
62 | print(len(articles))
63 | for article in articles:
64 | print(article)
65 |
66 | # 接下来存储到数据库或者csv文件中
67 | with open('网易新闻数据.csv', mode='w', newline='', encoding='utf-8') as file:
68 | writer = csv.DictWriter(file, fieldnames=['title', 'url', 'img_urls'])
69 | # 写入表头
70 | writer.writeheader()
71 | # 写入数据行
72 | for row in articles:
73 | writer.writerow(row)
74 |
75 |
76 | def save_article(url):
77 | # 对于新闻,可以把所有数据都爬下来存储到一个html文件里面
78 | return requests.get(url, headers={
79 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
80 | })
81 |
82 |
83 | if __name__ == '__main__':
84 | # selenium爬取网易新闻信息
85 | selenium_crawl()
86 | # 演示一下怎么据为己有
87 | res = save_article("https://www.163.com/dy/article/IJG52MJK051282JL.html")
88 | with open("article.html", mode="w", encoding="utf-8") as f:
89 |         f.write('<meta charset="utf-8">')  # 设置一下编码,防止保存的页面乱码
90 | f.write(res.text)
91 |
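save_article returns the raw response, and different article hosts may declare different charsets, so it is worth letting requests detect the encoding before the text is written out. A sketch of a slightly safer save step (same example URL as in __main__):

```python
# Sketch: detect the page encoding before saving, so the local copy is not garbled.
res = save_article("https://www.163.com/dy/article/IJG52MJK051282JL.html")
res.encoding = res.apparent_encoding  # e.g. utf-8 or gbk, guessed from the body
with open("article.html", mode="w", encoding="utf-8") as f:
    f.write('<meta charset="utf-8">\n')  # force the saved copy to render as UTF-8
    f.write(res.text)
```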
--------------------------------------------------------------------------------
/第6章-JS基础/Flask服务.py:
--------------------------------------------------------------------------------
1 | import json
2 | from flask import Flask, render_template, request # pip install Flask
3 | from flask_cors import CORS
4 | app = Flask(__name__)
5 |
6 |
7 | # 跨域相关配置,最后也没带过来cookie~(失败的尝试,话说为啥该配的都配了本地还是带不过来cookie啊)
8 | # CORS(app, origins="http://localhost:63342")
9 | #
10 | # CORS(app, supports_credentials=True) # 注意添加supports_credentials=True
11 | #
12 | #
13 | # @app.after_request
14 | # def after_request(response):
15 | # response.headers['Access-Control-Allow-Origin'] = 'http://localhost:63342'
16 | # response.headers['Access-Control-Allow-Credentials'] = 'true'
17 | # response.headers['Access-Control-Allow-Headers'] = 'Content-Type,token,token2'
18 | # response.headers['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS'
19 | # return response
20 |
21 |
22 | @app.route("/")
23 | def index():
24 | # 跳转到首页
25 | print("你曾经来过服务器")
26 | name = "alex"
27 | # 数据是在这里渲染后, 返回个客户端的html
28 | return render_template("JQuery发送AJAX请求.html", name=name)
29 |
30 |
31 | # 开发一个接收get请求的接口
32 | @app.route("/ajax_get")
33 | def ajax_get_req():
34 | print(request.headers.get("token2"))
35 | # 接收cookie中的信息
36 | print(request.cookies.get("name"))
37 | n = request.cookies.get('name')
38 | print(n)
39 | if not n:
40 | return "没有cookie就不要来了."
41 | # 接收header中的信息
42 | token = request.headers.get('token')
43 | if not token:
44 | return "没token还想来?"
45 |
46 | # Flask接收get请求的参数
47 | name = request.args.get('name')
48 | _ = request.args.get('_')
49 | if name and _:
50 | # 返回json
51 | return {"name": 'zhangsan', "id": 10086, "isMen": True}
52 | else:
53 | return "回家去吧"
54 |
55 |
56 | # 开发一个接收post请求的接口
57 | @app.route("/ajax_post", methods=['POST'])
58 | def ajax_get_post():
59 | # time.sleep(3)
60 | # 接收JSON数据
61 | print(request.json)
62 |
63 | lst = [
64 | {"id": 1, "name": "张飞", "age": 16},
65 | {"id": 2, "name": "孙斌", "age": 16},
66 | {"id": 3, "name": "樵夫", "age": 16},
67 | {"id": 4, "name": "大佬", "age": 16},
68 | ]
69 |
70 | return json.dumps(lst)
71 |
72 |
73 | # 开发一个处理jsonp的接口
74 | @app.route("/process_jsonp", methods=["GET"])
75 | def process_jsonp():
76 | # 获取回调字符串
77 | cb = request.args.get("cb")
78 | data = {
79 | "name": "zhangsan",
80 | "age": 18
81 | }
82 | # 实际上就是导入script标签(不受同源策略的影响),自己就运行了,AJAX自己封装了一下。现在不用这个了,都是CORS,设置一下就可以访问了
83 | """
84 | 我们这里返回的时候就相当于把这个函数给运行了
85 | cb({"name":"zhangsan","age":18})。客户端那里有个函数等着接数据,数据就是这个cb里面的参数,
86 | 这个相当于各种注入,SQL注入,URL注入,就是拼接。用来过CrossOrigin的 。因此逆向的时候可能会有a{..........}里面一大坨东西就是jsonp
87 | """
88 | return cb + "(" + json.dumps(data) + ")"
89 |
90 | if __name__ == '__main__':
91 | app.run()
92 |
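process_jsonp returns the data wrapped in the callback, e.g. cb({...}), which is exactly the shape a scraper sees when a site still uses JSONP. On the consuming side the JSON can be recovered by stripping the wrapper; a sketch against the endpoint above, assuming the Flask app runs on its default http://127.0.0.1:5000:

```python
# Sketch: call the /process_jsonp endpoint above and unwrap the callback.
import json
import re

import requests

res = requests.get("http://127.0.0.1:5000/process_jsonp", params={"cb": "handle"})
# body looks like: handle({"name": "zhangsan", "age": 18})
payload = re.search(r"^\s*\w+\((.*)\)\s*$", res.text, re.S).group(1)
data = json.loads(payload)
print(data["name"], data["age"])
```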
--------------------------------------------------------------------------------
/第6章-JS基础/jsonp页面.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Document
7 |
8 |
9 |
23 |
24 |
25 | 正在发送数据
26 |
27 |
28 |
--------------------------------------------------------------------------------
/第6章-JS基础/templates/JQuery发送AJAX请求.html:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 |
6 |
7 |
8 | Title
9 |
10 |
124 |
125 |
126 |
152 |
153 |
154 |
155 |
156 | id |
157 | name |
158 | age |
159 |
160 |
161 |
162 |
163 |
--------------------------------------------------------------------------------
/第6章-JS基础/templates/success.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | 成功页面
7 |
8 |
9 | 上传成功!
10 |
11 |
12 |
--------------------------------------------------------------------------------
/第7章-JS逆向/B站逆向/Tkinter.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | import re
4 | import threading
5 | import time
6 | from requests.adapters import HTTPAdapter
7 | import requests
8 | from urllib3 import Retry
9 |
10 | from 请求头加密 import convert_md5, get_secret_key
11 | from cookie加密 import gen_b_lsid, gen_uuid
12 | from 代理 import get_tunnel_proxies, get_proxy_dict
13 |
14 | start_time = time.time()
15 | # 全局计数器和线程锁
16 | count_lock = threading.Lock()
17 | play_count = 0
18 | time_limit = 60 * 30  # 运行时间限制,30分钟
19 |
20 | import tkinter as tk
21 | from tkinter import scrolledtext
22 | import threading
23 | import time
24 |
25 |
26 | # 图形化
27 | class App:
28 | def __init__(self, root):
29 | self.root = root
30 | self.root.title("B站刷视频播放量小工具")
31 | self.root.geometry('800x400')
32 | self.create_widgets()
33 | self.running = False # 控制增加播放量线程
34 |
35 | def create_widgets(self):
36 | # 设置整体布局
37 | input_frame = tk.Frame(self.root)
38 | input_frame.pack(pady=10)
39 |
40 | tk.Label(input_frame, text="输入视频URL:").grid(row=0, column=0, sticky="e")
41 | self.url_entry = tk.Entry(input_frame, width=60)
42 | self.url_entry.grid(row=0, column=1, padx=5)
43 |
44 | tk.Label(input_frame, text="输入并发线程数:").grid(row=1, column=0, sticky="e")
45 | self.thread_count = tk.Entry(input_frame, width=40)
46 | self.thread_count.grid(row=1, column=1)
47 |
48 | self.log_text = scrolledtext.ScrolledText(self.root, wrap=tk.WORD, width=80, height=20, bg='black', fg='green')
49 | self.log_text.pack(padx=10, pady=10)
50 |
51 | self.start_button = tk.Button(self.root, text="开始", command=self.toggle_start)
52 | self.start_button.pack(pady=5)
53 |
54 | def toggle_start(self):
55 | if not self.running:
56 | self.running = True
57 | self.start_button.config(text="停止")
58 | self.start_play_count_threads()
59 | else:
60 | self.running = False
61 | self.start_button.config(text="开始")
62 |
63 | # 获取查询参数
64 | def get_query_params(self, session, click_url):
65 | # 获取w_aid
66 | res = session.get(url=click_url, timeout=5)
67 | # 使用正则提取,直接把这个对象都给提取出来,后面还要拿东西。
68 | data_dict = re.search(r'window\.__INITIAL_STATE__=(.+);\(function\(', res.text).group(1)
69 | # 获取aid
70 | w_aid = json.loads(data_dict)['aid']
71 | # 获取播放量并打印一下,日志
72 | self.log_message(
73 | f"当前视频:{json.loads(data_dict)['videoData']['title']},实际的播放量为:{json.loads(data_dict)['videoData']['stat']['view']}")
74 | # 模拟w_ftime,w_stime
75 | w_ftime = int(time.time()) + random.randint(1, 3)
76 | w_stime = int(time.time())
77 | # 模拟wts
78 | wts = int(time.time()) + random.randint(2, 6)
79 | # 获取web_location
80 | res1 = session.get("https://s1.hdslb.com/bfs/static/player/main/core.d98a5476.js", timeout=5) # 获取js文件
81 |         web_location = re.findall(r"function p\(e,t,r\){if\(e.web_location=(.*?),t", res1.text)[0]  # 正则提取,findall返回的是列表,取第一个匹配,避免把"['xxx']"拼进签名串
82 | # 获取w_rid
83 | res2 = session.get("https://api.bilibili.com/x/web-interface/nav", timeout=5) # 获取img_url,sub_url
84 | secret_key = get_secret_key(res2) # 将img_url和sub_url使用py代码复现加密逻辑
85 | str = f'w_aid={w_aid}&w_ftime={w_ftime}&w_part=1&w_stime={w_stime}&w_type=3&web_location={web_location}&wts={wts}' + secret_key
86 | # 获取到w_rid
87 | w_rid = convert_md5(str)
88 | params = {
89 | "w_aid": w_aid,
90 | "w_part": 1,
91 | "w_ftime": w_ftime,
92 | "w_stime": w_stime,
93 | "w_type": 3,
94 | "web_location": web_location,
95 | "w_rid": w_rid,
96 | "wts": wts
97 | }
98 | return params, json.loads(data_dict), res.text, res.cookies
99 |
100 | # 获取请求体相关参数
101 | def get_body_data(self, params, data_dict, text):
102 | # 获取cid和spm_id
103 | cid = data_dict['videoData']['cid']
104 | spm_id = data_dict['videoData']['embedPlayer']['stats']['spmId']
105 |
106 | # 获取session
107 |         session_data_dict = re.search(r'window\.__playinfo__=(.*?)</script>', text, re.DOTALL).group(1)  # 惰性匹配要给右边界(这里假设该对象同样写在<script>标签里),否则(.*?)只会匹配到空串
108 |
109 | session = json.loads(session_data_dict)['session']
110 | data = {
111 | "aid": params.get('w_aid'),
112 | 'cid': cid,
113 | 'part': 1,
114 | 'lv': 0,
115 | 'ftime': params.get('w_ftime'),
116 | 'stime': params.get('w_stime'),
117 | 'type': params['w_type'],
118 | 'sub_type': 0,
119 | 'refer_url': "",
120 | 'outer': 0,
121 | 'spmid': spm_id,
122 | 'from_spmid': "",
123 | 'session': session,
124 | 'csrf': ''
125 | }
126 | return data
127 |
128 | # 获取cookie
129 | def get_cookie(self, first_cookies, params, data, session):
130 | # buvid3和b_nut
131 | buvid3 = first_cookies.get('buvid3')
132 | b_nut = first_cookies.get('b_nut')
133 |
134 | # b_lsid和_uuid
135 | b_lsid = gen_b_lsid()
136 | _uuid = gen_uuid()
137 | # 获取sid
138 | params = {"aid": params['w_aid'],
139 | 'cid': data['cid'],
140 | 'web_location': params['web_location'],
141 | 'w_rid': params['w_rid'],
142 | 'wts': int(time.time())
143 | }
144 | res = session.get("https://api.bilibili.com/x/player/wbi/v2", params=params, timeout=5) # 向这个请求发,获取cookie里面的sid
145 | sid = res.cookies.get('sid')
146 |
147 | # 获取buvid4和buvid_fp
148 | res = session.get("https://api.bilibili.com/x/frontend/finger/spi", timeout=5)
149 | buvid4 = res.json()['data']['b_4']
150 | # f700b2fa0217e916d769bf691fb41f92,浏览器的型号,所以buvid_fp这个是固定的
151 | cookies = {
152 | 'buvid3': buvid3,
153 | 'b_nut': b_nut,
154 | 'CURRENT_FNVAL': '4048',
155 | 'b_lsid': b_lsid,
156 | '_uuid': _uuid,
157 | 'sid': sid,
158 | 'buvid_fp': 'f700b2fa0217e916d769bf691fb41f92',
159 | 'buvid4': buvid4
160 | }
161 | return cookies
162 |
163 | # 获取当前ip
164 | def get_current_ip(self, session):
165 | try:
166 | response = session.get("https://httpbin.org/ip", timeout=5)
167 | ip = response.json()["origin"]
168 | return ip
169 | except requests.RequestException as e:
170 | self.log_message(f"Error getting IP: {e}")
171 | return None
172 |
173 | # 真正干活的函数
174 | def increase_video_play_count(self, session, click_url):
175 | global play_count # 全局计数器
176 | try:
177 | params, data_dict, text, first_cookies = self.get_query_params(session, click_url)
178 | data = self.get_body_data(params, data_dict, text)
179 | cookies = self.get_cookie(first_cookies, params, data, session)
180 | request_url = "https://api.bilibili.com/x/click-interface/click/web/h5"
181 | response = session.post(url=request_url, params=params, data=data, cookies=cookies, timeout=5)
182 | ip = self.get_current_ip(session)
183 | self.log_message(f"当前请求的ip是:{ip}")
184 | # 更新计数器
185 | with count_lock:
186 | play_count += 1
187 | self.log_message(f"当前播放量理论上刷了: {play_count}个")
188 | except Exception as e:
189 | self.log_message(f"发生错误:{e}")
190 |
191 | # 创建一个带有重试机制的session
192 | def create_session_with_retry(self):
193 | session = requests.Session()
194 |
195 | # 定义重试策略
196 | retries = Retry(
197 | total=5,
198 | backoff_factor=1,
199 | status_forcelist=[429, 500, 502, 503, 504], # 重试的状态码
200 | )
201 |
202 | # 创建带有重试策略的 HTTPAdapter
203 | adapter = HTTPAdapter(max_retries=retries)
204 |
205 | # 将该适配器挂载到 HTTP 和 HTTPS
206 | session.mount('http://', adapter)
207 | session.mount('https://', adapter)
208 |
209 | return session
210 |
211 | # 启动线程
212 | def start_play_count_threads(self):
213 |         for i in range(int(self.thread_count.get())):  # 按输入框指定的并发数启动线程
214 | threading.Thread(target=self.thread_worker, args=(self.url_entry.get(),)).start()
215 |
216 | # 主要线程工作函数
217 | def thread_worker(self, click_url):
218 | proxies = get_tunnel_proxies()
219 | while self.running:
220 | session = self.create_session_with_retry()
221 | session.proxies.update(proxies)
222 | session.headers.update({
223 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
224 | 'Referer': self.url_entry.get(),
225 | 'origin': 'https://www.bilibili.com'
226 | })
227 | self.increase_video_play_count(session, click_url)
228 |
229 | # 日志打印
230 | def log_message(self, message):
231 | if self.running:
232 | self.log_text.insert(tk.END, message + "\n")
233 | self.log_text.yview(tk.END)
234 | self.root.update_idletasks()
235 |
236 |
237 | if __name__ == "__main__":
238 | root = tk.Tk()
239 | app = App(root)
240 | root.mainloop()
241 |
242 | # # 主程序
243 | # if __name__ == '__main__':
244 | # click_url = "https://www.bilibili.com/video/BV1T64y1n7wG/"
245 | # headers = {
246 | # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
247 | # 'Referer': 'https://www.bilibili.com/video/BV1Ce411q786/',
248 | # 'origin': 'https://www.bilibili.com'
249 | # }
250 | #
251 | # threads = []
252 | # for i in range(16): # 10个线程
253 | # t = threading.Thread(target=thread_worker, args=(click_url, headers))
254 | # threads.append(t)
255 | # t.start()
256 | #
257 | # for t in threads:
258 | # t.join()
259 | # end_time = int(time.time())
260 | #
261 | # print(f"播放量刷取完毕,一共刷取{play_count}个播放量,耗时{end_time - start_time}秒")
262 |
--------------------------------------------------------------------------------
/第7章-JS逆向/B站逆向/cookie加密.py:
--------------------------------------------------------------------------------
1 | import math
2 | import random
3 | import time
4 | import uuid
5 |
6 |
7 | # 生成uuid
8 | def gen_uuid():
9 | uuid_sec = str(uuid.uuid4())
10 | time_sec = str(int(time.time() * 1000 % 1e5)).rjust(5, "0")
11 | return f"{uuid_sec}{time_sec}infoc"
12 |
13 |
14 | # 生成b_lsid
15 | def gen_b_lsid():
16 | data = ""
17 | for i in range(8):
18 | v1 = math.ceil(16 * random.uniform(0, 1))
19 | v2 = hex(v1)[2:].upper()
20 | data += v2
21 | result = data.rjust(8, "0")
22 |
23 | e = int(time.time() * 1000)
24 | t = hex(e)[2:].upper()
25 |
26 | b_lsid = f"{result}_{t}"
27 | return b_lsid
28 |
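补一个最小的本地自测入口(仅作演示,方便肉眼核对两个cookie字段的格式,被其它脚本import时不会执行):

if __name__ == '__main__':
    print(gen_uuid())    # 形如 "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx12345infoc"
    print(gen_b_lsid())  # 形如 "大写16进制串_毫秒时间戳的16进制"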
--------------------------------------------------------------------------------
/第7章-JS逆向/B站逆向/代理.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 |
4 | # 普通代理
5 | def get_proxy_dict():
6 | key = "6JD3LFEN" # 用户key
7 | passwd = "D3BDB526FEE2" # 用户密码
8 |
9 | res = requests.get(
10 | url="https://share.proxy.qg.net/get?key=6JD3LFEN&num=1&area=&isp=&format=json&seq=&distinct=false&pool=1"
11 | )
12 | host = res.json()['data'][0]['server'] # 121.29.81.215:52001
13 |
14 | # 账密模式
15 | proxy = 'http://{}:{}@{}'.format(key, passwd, host)
16 |
17 | return {"http": proxy, "https": proxy}
18 |
19 |
20 | # 隧道代理
21 | def get_tunnel_proxies():
22 | proxy_host = "tunnel3.qg.net:15156"
23 | proxy_username = "2F9CDB09"
24 | proxy_pwd = "F120C8FC7845"
25 |
26 | return {
27 | "http": f"http://{proxy_username}:{proxy_pwd}@{proxy_host}",
28 | "https": f"http://{proxy_username}:{proxy_pwd}@{proxy_host}"
29 | }
30 |
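补一个使用示意(假设代理账号可用):把返回的字典直接传给 requests 的 proxies 参数即可,用 httpbin 看一下出口ip是否变化。

if __name__ == '__main__':
    proxies = get_tunnel_proxies()  # 或 get_proxy_dict()
    res = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=5)
    print(res.json()["origin"])  # 打印出口ip,正常情况下不是本机ip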
--------------------------------------------------------------------------------
/第7章-JS逆向/B站逆向/刷播放量(单线程).py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | import re
4 | import time
5 | from requests.adapters import HTTPAdapter
6 | import requests
7 | from urllib3 import Retry
8 |
9 | from 请求头加密 import convert_md5, get_secret_key
10 | from cookie加密 import gen_b_lsid, gen_uuid
11 | from 代理 import get_tunnel_proxies, get_proxy_dict
12 |
13 |
14 | # 获取查询参数
15 | def get_query_params():
16 | # 获取w_aid
17 | res = session.get(url=click_url)
18 | # 使用正则提取,直接把这个对象都给提取出来,后面还要拿东西。
19 | data_dict = re.search(r'window\.__INITIAL_STATE__=(.+);\(function\(', res.text).group(1)
20 | # 获取aid
21 | w_aid = json.loads(data_dict)['aid']
22 | # 获取播放量并打印一下,日志
23 | print(
24 | f"当前视频{json.loads(data_dict)['videoData']['title']}:实际的播放量为{json.loads(data_dict)['videoData']['stat']['view']}")
25 | # 模拟w_ftime,w_stime
26 | w_ftime = int(time.time()) + random.randint(1, 3)
27 | w_stime = int(time.time())
28 | # 模拟wts
29 | wts = int(time.time()) + random.randint(2, 6)
30 | # 获取web_location
31 | res1 = session.get("https://s1.hdslb.com/bfs/static/player/main/core.d98a5476.js") # 获取js文件
32 |     web_location = re.findall(r"function p\(e,t,r\){if\(e.web_location=(.*?),t", res1.text)[0]  # 正则提取,findall返回的是列表,取第一个匹配,避免把"['xxx']"拼进签名串
33 | # 获取w_rid
34 | res2 = session.get("https://api.bilibili.com/x/web-interface/nav", headers=headers) # 获取img_url,sub_url
35 | secret_key = get_secret_key(res2) # 将img_url和sub_url使用py代码复现加密逻辑
36 | str = f'w_aid={w_aid}&w_ftime={w_ftime}&w_part=1&w_stime={w_stime}&w_type=3&web_location={web_location}&wts={wts}' + secret_key
37 | # 获取到w_rid
38 | w_rid = convert_md5(str)
39 | params = {
40 | "w_aid": w_aid,
41 | "w_part": 1,
42 | "w_ftime": w_ftime,
43 | "w_stime": w_stime,
44 | "w_type": 3,
45 | "web_location": web_location,
46 | "w_rid": w_rid,
47 | "wts": wts
48 | }
49 | return params, json.loads(data_dict), res.text, res.cookies
50 |
51 |
52 | # 获取请求体相关参数
53 | def get_body_data(params, data_dict, text):
54 | # 获取cid和spm_id
55 | cid = data_dict['videoData']['cid']
56 | spm_id = data_dict['videoData']['embedPlayer']['stats']['spmId']
57 |
58 | # 获取session
59 |     session_data_dict = re.search(r'window\.__playinfo__=(.*?)</script>', text, re.DOTALL).group(1)  # 惰性匹配要给右边界(这里假设该对象同样写在<script>标签里),否则(.*?)只会匹配到空串
60 |
61 | session = json.loads(session_data_dict)['session']
62 | data = {
63 | "aid": params.get('w_aid'),
64 | 'cid': cid,
65 | 'part': 1,
66 | 'lv': 0,
67 | 'ftime': params.get('w_ftime'),
68 | 'stime': params.get('w_stime'),
69 | 'type': params['w_type'],
70 | 'sub_type': 0,
71 | 'refer_url': "",
72 | 'outer': 0,
73 | 'spmid': spm_id,
74 | 'from_spmid': "",
75 | 'session': session,
76 | 'csrf': ''
77 | }
78 | return data
79 |
80 |
81 | # 获取cookie
82 | def get_cookie(first_cookies, params, data):
83 | # buvid3和b_nut
84 | buvid3 = first_cookies.get('buvid3')
85 | b_nut = first_cookies.get('b_nut')
86 |
87 | # b_lsid和_uuid
88 | b_lsid = gen_b_lsid()
89 | _uuid = gen_uuid()
90 | # 获取sid
91 | params = {"aid": params['w_aid'],
92 | 'cid': data['cid'],
93 | 'web_location': params['web_location'],
94 | 'w_rid': params['w_rid'],
95 | 'wts': int(time.time())
96 | }
97 | res = session.get("https://api.bilibili.com/x/player/wbi/v2", params=params) # 向这个请求发,获取cookie里面的sid
98 | sid = res.cookies.get('sid')
99 |
100 | # 获取buvid4和buvid_fp
101 | res = session.get("https://api.bilibili.com/x/frontend/finger/spi", headers=headers)
102 | buvid4 = res.json()['data']['b_4']
103 | # f700b2fa0217e916d769bf691fb41f92,浏览器的型号,所以buvid_fp这个是固定的
104 | cookies = {
105 | 'buvid3': buvid3,
106 | 'b_nut': b_nut,
107 | 'CURRENT_FNVAL': '4048',
108 | 'b_lsid': b_lsid,
109 | '_uuid': _uuid,
110 | 'sid': sid,
111 | 'buvid_fp': 'f700b2fa0217e916d769bf691fb41f92',
112 | 'buvid4': buvid4
113 | }
114 | return cookies
115 |
116 |
117 | # 真正干活的函数
118 | def increase_video_play_count():
119 | # 刷播放量url
120 | request_url = "https://api.bilibili.com/x/click-interface/click/web/h5"
121 | # 获取查询参数
122 | params, data_dict, text, first_cookies = get_query_params()
123 | # 获取请求体
124 | data = get_body_data(params, data_dict, text)
125 | # 获取cookie
126 | cookies = get_cookie(first_cookies, params, data)
127 |
128 | # 直接请求
129 | res = session.post(url=request_url, params=params, data=data, cookies=cookies)
130 |
131 |
132 | # 创建一个带有重试机制的session
133 | def create_session_with_retry():
134 | session = requests.Session()
135 |
136 | # 定义重试策略
137 | retries = Retry(
138 | total=5,
139 | backoff_factor=1,
140 | status_forcelist=[429, 500, 502, 503, 504], # 重试的状态码
141 | )
142 |
143 | # 创建带有重试策略的 HTTPAdapter
144 | adapter = HTTPAdapter(max_retries=retries)
145 |
146 | # 将该适配器挂载到 HTTP 和 HTTPS
147 | session.mount('http://', adapter)
148 | session.mount('https://', adapter)
149 |
150 | return session
151 |
152 |
153 | # main入口
154 | if __name__ == '__main__':
155 |     click_url = "https://www.bilibili.com/video/BV1ju4y1W78T/"  # 要刷的视频url
156 | headers = {
157 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
158 | }
159 | start_time = int(time.time())
160 | # 获取代理
161 | # proxies = get_tunnel_proxies()
162 | count = 2000
163 | for i in range(0, count):
164 | # 获取session
165 | session = create_session_with_retry()
166 | proxies = get_proxy_dict() # 按量计费的代理
167 | # 给session设置代理
168 | session.proxies.update(proxies)
169 | # 给session统一设置请求头
170 | session.headers.update(headers)
171 | # 开始刷播放量
172 | increase_video_play_count()
173 | print(f"理论上刷了{i + 1}个播放量")
174 | print("----------------------------------------")
175 | end_time = int(time.time())
176 |
177 | print(f"播放量刷取完毕,一共刷取{count}个播放量,耗时{end_time - start_time}秒")
178 |
179 | # test1()
180 |
--------------------------------------------------------------------------------
/第7章-JS逆向/B站逆向/多线程.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | import re
4 | import threading
5 | import time
6 | from requests.adapters import HTTPAdapter
7 | import requests
8 | from urllib3 import Retry
9 |
10 | from 请求头加密 import convert_md5, get_secret_key
11 | from cookie加密 import gen_b_lsid, gen_uuid
12 | from 代理 import get_tunnel_proxies, get_proxy_dict
13 |
14 | start_time = time.time()
15 | # 全局计数器和线程锁
16 | count_lock = threading.Lock()
17 | play_count = 0
18 | time_limit = 60 * 10 # 运行时间限制,10分钟
19 |
20 |
21 | # 获取查询参数
22 | def get_query_params(session, click_url):
23 | # 获取w_aid
24 | res = session.get(url=click_url, timeout=5)
25 | # 使用正则提取,直接把这个对象都给提取出来,后面还要拿东西。
26 | data_dict = re.search(r'window\.__INITIAL_STATE__=(.+);\(function\(', res.text).group(1)
27 | # 获取aid
28 | w_aid = json.loads(data_dict)['aid']
29 | # 获取播放量并打印一下,日志
30 | print(
31 | f"当前视频:{json.loads(data_dict)['videoData']['title']},实际的播放量为:{json.loads(data_dict)['videoData']['stat']['view']}")
32 | # 模拟w_ftime,w_stime
33 | w_ftime = int(time.time()) + random.randint(1, 3)
34 | w_stime = int(time.time())
35 | # 模拟wts
36 | wts = int(time.time()) + random.randint(2, 6)
37 | # 获取web_location
38 | res1 = session.get("https://s1.hdslb.com/bfs/static/player/main/core.d98a5476.js", timeout=5) # 获取js文件
39 |     web_location = re.findall(r"function p\(e,t,r\){if\(e.web_location=(.*?),t", res1.text)[0]  # 正则提取,findall返回的是列表,取第一个匹配,避免把"['xxx']"拼进签名串
40 | # 获取w_rid
41 | res2 = session.get("https://api.bilibili.com/x/web-interface/nav", timeout=5) # 获取img_url,sub_url
42 | secret_key = get_secret_key(res2) # 将img_url和sub_url使用py代码复现加密逻辑
43 | str = f'w_aid={w_aid}&w_ftime={w_ftime}&w_part=1&w_stime={w_stime}&w_type=3&web_location={web_location}&wts={wts}' + secret_key
44 | # 获取到w_rid
45 | w_rid = convert_md5(str)
46 | params = {
47 | "w_aid": w_aid,
48 | "w_part": 1,
49 | "w_ftime": w_ftime,
50 | "w_stime": w_stime,
51 | "w_type": 3,
52 | "web_location": web_location,
53 | "w_rid": w_rid,
54 | "wts": wts
55 | }
56 | return params, json.loads(data_dict), res.text, res.cookies
57 |
58 |
59 | # 获取请求体相关参数
60 | def get_body_data(params, data_dict, text):
61 | # 获取cid和spm_id
62 | cid = data_dict['videoData']['cid']
63 | spm_id = data_dict['videoData']['embedPlayer']['stats']['spmId']
64 |
65 | # 获取session
66 |     session_data_dict = re.search(r'window\.__playinfo__=(.*?)</script>', text, re.DOTALL).group(1)  # 惰性匹配要给右边界(这里假设该对象同样写在<script>标签里),否则(.*?)只会匹配到空串
67 |
68 | session = json.loads(session_data_dict)['session']
69 | data = {
70 | "aid": params.get('w_aid'),
71 | 'cid': cid,
72 | 'part': 1,
73 | 'lv': 0,
74 | 'ftime': params.get('w_ftime'),
75 | 'stime': params.get('w_stime'),
76 | 'type': params['w_type'],
77 | 'sub_type': 0,
78 | 'refer_url': "",
79 | 'outer': 0,
80 | 'spmid': spm_id,
81 | 'from_spmid': "",
82 | 'session': session,
83 | 'csrf': ''
84 | }
85 | return data
86 |
87 |
88 | # 获取cookie
89 | def get_cookie(first_cookies, params, data, session):
90 | # buvid3和b_nut
91 | buvid3 = first_cookies.get('buvid3')
92 | b_nut = first_cookies.get('b_nut')
93 |
94 | # b_lsid和_uuid
95 | b_lsid = gen_b_lsid()
96 | _uuid = gen_uuid()
97 | # 获取sid
98 | params = {"aid": params['w_aid'],
99 | 'cid': data['cid'],
100 | 'web_location': params['web_location'],
101 | 'w_rid': params['w_rid'],
102 | 'wts': int(time.time())
103 | }
104 | res = session.get("https://api.bilibili.com/x/player/wbi/v2", params=params, timeout=5) # 向这个请求发,获取cookie里面的sid
105 | sid = res.cookies.get('sid')
106 |
107 | # 获取buvid4和buvid_fp
108 | res = session.get("https://api.bilibili.com/x/frontend/finger/spi", timeout=5)
109 | buvid4 = res.json()['data']['b_4']
110 | # f700b2fa0217e916d769bf691fb41f92,浏览器的型号,所以buvid_fp这个是固定的
111 | cookies = {
112 | 'buvid3': buvid3,
113 | 'b_nut': b_nut,
114 | 'CURRENT_FNVAL': '4048',
115 | 'b_lsid': b_lsid,
116 | '_uuid': _uuid,
117 | 'sid': sid,
118 | 'buvid_fp': 'f700b2fa0217e916d769bf691fb41f92',
119 | 'buvid4': buvid4
120 | }
121 | return cookies
122 |
123 |
124 | # 获取当前ip
125 | def get_current_ip(session):
126 | try:
127 | response = session.get("https://httpbin.org/ip", timeout=5)
128 | ip = response.json()["origin"]
129 | return ip
130 | except requests.RequestException as e:
131 | print(f"Error getting IP: {e}")
132 | return None
133 |
134 |
135 | # 真正干活的函数
136 | def increase_video_play_count(session, headers, click_url):
137 | global play_count
138 | try:
139 | params, data_dict, text, first_cookies = get_query_params(session, click_url)
140 | data = get_body_data(params, data_dict, text)
141 | cookies = get_cookie(first_cookies, params, data, session)
142 | request_url = "https://api.bilibili.com/x/click-interface/click/web/h5"
143 | response = session.post(url=request_url, params=params, data=data, cookies=cookies, timeout=5)
144 | ip = get_current_ip(session)
145 | print(f"当前请求的ip是:{ip}")
146 | # 更新计数器
147 | with count_lock:
148 | play_count += 1
149 | print(f"当前播放量理论上刷了: {play_count}个")
150 | except Exception as e:
151 | print(f"发生错误:{e}")
152 |
153 |
154 | # 创建一个带有重试机制的session
155 | def create_session_with_retry():
156 | session = requests.Session()
157 |
158 | # 定义重试策略
159 | retries = Retry(
160 | total=5,
161 | backoff_factor=1,
162 | status_forcelist=[429, 500, 502, 503, 504], # 重试的状态码
163 | )
164 |
165 | # 创建带有重试策略的 HTTPAdapter
166 | adapter = HTTPAdapter(max_retries=retries)
167 |
168 | # 将该适配器挂载到 HTTP 和 HTTPS
169 | session.mount('http://', adapter)
170 | session.mount('https://', adapter)
171 |
172 | return session
173 |
174 |
175 | # 线程工作函数
176 | def thread_worker(click_url, headers):
177 | # proxies = get_tunnel_proxies()
178 | while True:
179 | session = create_session_with_retry()
180 | proxies = get_proxy_dict() # 按量计费的代理
181 | session.proxies.update(proxies) # 把代理整上去
182 | session.headers.update(headers) # 公共请求头
183 | increase_video_play_count(session, headers, click_url)
184 | # 时间达到限制时退出
185 | if time.time() - start_time > time_limit:
186 | break
187 |
188 |
189 | # 主程序
190 | if __name__ == '__main__':
191 | click_url = input("输入要刷的视频url:")
192 | thread_count = input("输入并发线程数:")
193 | # click_url = "https://www.bilibili.com/video/BV1Ce411q786/"
194 | headers = {
195 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
196 | 'Referer': click_url,
197 | 'origin': 'https://www.bilibili.com'
198 | }
199 |
200 | threads = []
201 |     for i in range(int(thread_count)):  # 按输入的并发线程数启动
202 | t = threading.Thread(target=thread_worker, args=(click_url, headers))
203 | threads.append(t)
204 | t.start()
205 |
206 | for t in threads:
207 | t.join()
208 | end_time = int(time.time())
209 |
210 | print(f"播放量刷取完毕,一共刷取{play_count}个播放量,耗时{end_time - start_time}秒")
211 |
--------------------------------------------------------------------------------
/第7章-JS逆向/B站逆向/请求头加密.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import json
3 | import struct
4 |
5 |
6 |
7 | # 获取秘钥
8 | def get_secret_key(res):
9 | # print(json.loads(res.text)['data']['wbi_img']['img_url'], json.loads(res.text)['data']['wbi_img']['sub_url'])
10 | t = json.loads(res.text)['data']['wbi_img']['img_url']
11 | r = json.loads(res.text)['data']['wbi_img']['sub_url']
12 | # 提取 t 和 r 中的特定部分
13 | t_extracted = t[t.rfind('/') + 1:].split('.')[0]
14 | r_extracted = r[r.rfind('/') + 1:].split('.')[0]
15 |
16 | # 拼接 t 和 r 的提取部分
17 | e = t_extracted + r_extracted
18 |
19 | # 定义索引数组
20 | indices = [46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14,
21 | 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59,
22 | 6, 63, 57, 62, 11, 36, 20, 34, 44, 52]
23 |
24 | # 根据索引数组提取字符并拼接
25 | result = ''.join(e[i] for i in indices if i < len(e))
26 |
27 | # 截取前 32 个字符
28 | return result[:32]
29 |
30 |
31 | def s(input_string):
32 | """
33 | 计算字符串的 MD5 散列,并返回四个整数值组成的数组。
34 | """
35 | md5_hash = hashlib.md5(input_string.encode())
36 | # 将 16 字节的 MD5 哈希分成四个 32 位的整数
37 | return list(struct.unpack('>4I', md5_hash.digest()))
38 |
39 |
40 | def words_to_bytes(words):
41 | """
42 | 将整数数组转换为字节序列。
43 | """
44 | # 每个整数转换为 4 字节
45 | return [byte for word in words for byte in struct.pack('>I', word)]
46 |
47 |
48 | def convert_md5(input_string, as_bytes=False, as_string=False):
49 | """
50 | 根据选项返回 MD5 哈希的不同表示。
51 | """
52 | # 计算 MD5 整数数组
53 | md5_words = s(input_string)
54 |
55 | if as_bytes:
56 | # 返回字节序列
57 | return words_to_bytes(md5_words)
58 | elif as_string:
59 | # 返回字符串表示(假设是 UTF-8 编码的字符串)
60 | return ''.join(chr(byte) for byte in words_to_bytes(md5_words))
61 | else:
62 | # 返回十六进制表示
63 | return ''.join(f'{word:08x}' for word in md5_words)
64 |
65 |
66 | # 测试字符串
67 | # 前面请求的
68 | test_string = 'aid=325318514&cid=1381936481&web_location=1315873&wts=1703944676ea1db124af3c7062474693fa704f4ff8'
69 | # 模拟的
70 | test_string2 = 'w_aid=325318514&w_ftime=1703944675&w_part=1&w_stime=1703944674&w_type=3&web_location=1315873&wts=17039446769d86b01094b49f0347055bdfa8cb479f'
71 | test_string3 = 'w_aid=325318514&w_ftime=1704012605&w_part=1&w_stime=1704012604&w_type=3&web_location=1315873&wts=17040126619d86b01094b49f0347055bdfa8cb479f'
72 |
73 | # 测试函数
74 | md5_as_bytes = convert_md5(test_string, as_bytes=True)
75 | md5_as_string = convert_md5(test_string, as_string=True)
76 | md5_as_hex = convert_md5(test_string2)
77 |
78 | # 156c3c9ccf38bbe3e32c2a8481540e07
79 | # 156c3c9ccf38bbe3e32c2a8481540e07
80 | # print(md5_as_hex)
81 |
--------------------------------------------------------------------------------
/第7章-JS逆向/py里面执行js代码.py:
--------------------------------------------------------------------------------
1 | # 固定格式,用于解决windows执行js文件输出汉字乱码问题
2 | from functools import partial # 锁定参数
3 | import subprocess
4 |
5 | subprocess.Popen = partial(subprocess.Popen, encoding="utf-8")
6 | import execjs # 此时再引入execjs的时候. 里面就可以自动使用你的subprocess.Popen
7 |
8 | # 读取js代码
9 | with open('./抠出来的代码文件.js', mode='r', encoding='utf-8') as f:
10 | js_code = f.read()
11 |
12 | # 加载代码
13 | js = execjs.compile(js_code)
14 |
15 | # 执行js代码中的函数,可以看出,就跟多进程一样,就是直接调用的函数,里面的log是不执行的
16 | # c = js.call("fn", 1, 2)
17 | # print(c)
18 | #
19 | # c2 = js.call("fn2")
20 | # print(c2)
21 |
22 | # # 直接执行js代码
23 | # js_code2="""
24 | # '牛逼666我的宝贝'.substring(0,2)
25 | # """
26 | # c3 = js.eval(js_code2)
27 | # print(c3)
28 |
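把上面注释掉的调用串起来,就是 execjs 最常用的两种姿势(调用js里已定义的函数、直接求值一段表达式),可以直接运行:

print(js.call("fn", 1, 2))   # 3,参数原样传给js里的fn
print(js.call("fn2"))        # 我是抠出来的代码文件2~
print(js.eval("'牛逼666我的宝贝'.substring(0, 2)"))  # 牛逼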
--------------------------------------------------------------------------------
/第7章-JS逆向/rsa.privatekey.pem:
--------------------------------------------------------------------------------
1 | -----BEGIN RSA PRIVATE KEY-----
2 | MIICXQIBAAKBgQC+o4EMhwj500ebItpmsLmTK2Y+MUYdJMZzvJ4oWtyEnKSRygbv
3 | ONdFsxFrmF0UKYMlYzdbvBTA5rHB8D4EIftbqY1o9Y2DNWpoMNtGeXW9/cDpuFzi
4 | IpmocyVHh5SVXFSV0FdyiLMpRyLPd5ZLm62DcMDAoJUr1wckjVEn3/fq6wIDAQAB
5 | AoGAEVXd7M00uD8JsmZDdPYaEfNu2zXw2cKX1ztuKF5G0jpKTsWPxt4k2T401dNn
6 | l2dE+Tfh91INkxMHVArsIMUUTbO4/9oAIZ/8FOrQQCVM/jKd9fFHtCoDJo4b4VbK
7 | Isod41OOa79rIGfFjmZTA8NLHgWGjoKkSNypn0GwxOtskpkCQQDL0xgRYs4pXQNQ
8 | Ypz60oM5bfyHaF5hmmN2h3VB3mPBr7R2nQFcLE9BstS9yxASLKQ26kpcjxeQh3HY
9 | 0yt8ObnzAkEA73BR7BfSiCUnPRQftYe8xTgeeT6uggfbn+A8C7Mpc/Q5Pf06AMfl
10 | Qj6B6laqgJhVkLbSqCoZIbqH16apPSARKQJBAJWMM6uF+EfHFAhl4lHyJQ/9D/z1
11 | 6dHpKjMsXi9PEZhp55njxryvUhOdN5RtqkP7dx1Ht/nDj4GerzxsbAqAfFsCQBRC
12 | M5w/M7/zLMu9LSWl03H4WpO1mvxDf/CZ/1qaERLPghxdlY/FWf9t3H4gURLKUMsK
13 | vO4fibrR1naHkmAqm2kCQQDIW95LR000DxMy6DGLITGlbD2TF2/UK5xyAvqrL8vz
14 | frXBIxIvZlsqff7nDdXRkMmRfTW1OKWN4EuIRm1edPnY
15 | -----END RSA PRIVATE KEY-----
--------------------------------------------------------------------------------
/第7章-JS逆向/rsa.publickey.pem:
--------------------------------------------------------------------------------
1 | -----BEGIN PUBLIC KEY-----
2 | MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC+o4EMhwj500ebItpmsLmTK2Y+
3 | MUYdJMZzvJ4oWtyEnKSRygbvONdFsxFrmF0UKYMlYzdbvBTA5rHB8D4EIftbqY1o
4 | 9Y2DNWpoMNtGeXW9/cDpuFziIpmocyVHh5SVXFSV0FdyiLMpRyLPd5ZLm62DcMDA
5 | oJUr1wckjVEn3/fq6wIDAQAB
6 | -----END PUBLIC KEY-----
--------------------------------------------------------------------------------
/第7章-JS逆向/抠出来的代码文件.js:
--------------------------------------------------------------------------------
1 | function fn(a, b) {
2 | console.log("我是抠出来的代码文件~")
3 | return a + b;
4 | }
5 | function fn2() {
6 | return "我是抠出来的代码文件2~"
7 | }
8 |
9 |
--------------------------------------------------------------------------------
/第7章-JS逆向/招标网站/package-lock.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "招标网站",
3 | "lockfileVersion": 3,
4 | "requires": true,
5 | "packages": {
6 | "": {
7 | "dependencies": {
8 | "crypto-js": "^4.2.0"
9 | }
10 | },
11 | "node_modules/crypto-js": {
12 | "version": "4.2.0",
13 | "resolved": "https://registry.npmjs.org/crypto-js/-/crypto-js-4.2.0.tgz",
14 | "integrity": "sha512-KALDyEYgpY+Rlob/iriUtjV6d5Eq+Y191A5g4UqLAi8CyGP9N1+FdVbkc1SxKc2r4YAYqG8JzO2KGL+AizD70Q=="
15 | }
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/第7章-JS逆向/招标网站/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "dependencies": {
3 | "crypto-js": "^4.2.0"
4 | }
5 | }
6 |
--------------------------------------------------------------------------------
/第7章-JS逆向/招标网站/爬取数据.py:
--------------------------------------------------------------------------------
1 | from functools import partial # 锁定参数
2 | import subprocess
3 |
4 | subprocess.Popen = partial(subprocess.Popen, encoding="utf-8")
5 |
6 | import execjs
7 | import requests
8 |
9 | # 0XwwzxGlqEopfJXc0zE5LyGT0xKVdsseU2Gk3EmS3Ndm5iTe71ct5Eo7wWJ5EmXLFMA19OpyZAccLG10Wqk_2.4AJgAiccOn6
10 | """
11 | 这个是碰运气,我找了两小时,在拦截器发送的时候还没有带查询参数呢,但是发出去的时候带了个很诡异的查询字符串。于是我就继续单步调试
12 | 后来是在VM的js文件里面找到了加密入口,但是太难调用了(见请求参数加密.js),不知道他们是改了axios的源码还是引入了第三方包。
13 | 这个网站的请求带了一个加密的查询参数,试了一下同一个请求每次生成的值还不一样。但它发请求时请求头和查询参数里都没带时间戳之类的信息,那服务端要怎么校验呢?
14 | 因此我猜这个参数只是前端按某种算法随机生成的。既然它是随机的,我直接写死一个值就行了,发个请求还真能返回数据(后来发现就算不带这个参数也能返回数据)
15 | 至于返回数据的解密就太简单了,标准的DES加密(ECB模式),连iv都没用
16 | (ps,这种网站用vue2写的,一看就不太专业,他们的加密也不会很难,猜就行,服务端校验也不会特别严格。
17 | 禁用F12可以使用右边那个开发者工具打开,有的无限debugger等就得用其他方法过了)
18 | """
19 | headers = {
20 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
21 | , "Referer": "https://ctbpsp.com/",
22 |
23 | }
24 | oArpptFq = "0ofDDValqEtvpRign5gCU7Tm8ZF8BW5db.6bPxs1lQ2lSW8c8nRDpEi0nXordd_3e_FWQt..gA3yLTG34PguuJDy5OOkd.fYC"
25 | url = f"https://ctbpsp.com/cutominfoapi/recommand/type/5/pagesize/10/currentpage/1?oArpptFq={oArpptFq}"
26 |
27 | resp = requests.get(url, headers=headers)
28 |
29 | # 打印一下看看是不是返回加密前的疏忽了
30 | print(resp.text)
31 |
32 | # 读取js代码
33 | f = open('返回数据解密.js', mode='r', encoding='utf-8')
34 | js_code = f.read()
35 | f.close()
36 |
37 | # 加载代码
38 | js = execjs.compile(js_code)
39 | # 返回回来的加密数据去解密
40 | result = js.call("decryptByDES", resp.text.strip('"'))
41 | print(result)
42 |
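也可以不走Node,用 pycryptodome 在 Python 里复现 返回数据解密.js 的逻辑,下面是一个小示意:密钥取自上面的JS文件;DES 实际只使用64位密钥,这里截取前8个字节,按我的理解这与 CryptoJS 对超长密钥的处理一致。

import base64
from Crypto.Cipher import DES
from Crypto.Util.Padding import unpad


def decrypt_by_des(ciphertext_b64: str) -> str:
    key = "1qaz@wsx3e".encode("utf-8")[:8]  # DES密钥只取前8字节
    des = DES.new(key=key, mode=DES.MODE_ECB)
    raw = des.decrypt(base64.b64decode(ciphertext_b64))
    return unpad(raw, 8).decode("utf-8")  # 去掉Pkcs7填充


# print(decrypt_by_des(resp.text.strip('"')))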
--------------------------------------------------------------------------------
/第7章-JS逆向/招标网站/返回数据解密.js:
--------------------------------------------------------------------------------
1 | const CryptoJS=require("crypto-js")
2 | function decryptByDES(ciphertext) {
3 | var keyHex = CryptoJS.enc.Utf8.parse("1qaz@wsx3e");
4 | var decrypted = CryptoJS.DES.decrypt({
5 | ciphertext: CryptoJS.enc.Base64.parse(ciphertext),
6 | },
7 | keyHex, {
8 | mode: CryptoJS.mode.ECB,
9 | padding: CryptoJS.pad.Pkcs7,
10 | }
11 | );
12 | return decrypted.toString(CryptoJS.enc.Utf8);
13 | }
14 |
15 |
--------------------------------------------------------------------------------
/第7章-JS逆向/注意点.md:
--------------------------------------------------------------------------------
1 | - 可能会存在JSON格式转换的问题:`json.dumps`默认会在逗号和冒号后面加空格,
2 |   在发送请求或者生成待加密字符串的时候就会和浏览器里的原文对不上,解决方式是:
3 |   ```json_string = json.dumps(data, separators=(',', ':'))```
4 |
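一个直观的小对比(可以看到默认输出多出来的空格,拼进待加密字符串里就会导致sign对不上):

```python
import json

data = {"i": "like", "from": "auto"}
print(json.dumps(data))                         # {"i": "like", "from": "auto"}
print(json.dumps(data, separators=(',', ':')))  # {"i":"like","from":"auto"}
```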
--------------------------------------------------------------------------------
/第7章-JS逆向/看准网/爬取数据.py:
--------------------------------------------------------------------------------
1 | import base64
2 |
3 | import requests
4 | from Crypto.Cipher import AES
5 | from Crypto.Util.Padding import pad, unpad
6 |
7 | url = "https://www.kanzhun.com/api_to/search/salary.json"
8 | # 需要复现的查询参数
9 | kiv = "CSpFHJE0TN9oL3rF"
10 | # AES复现
11 | aes = AES.new(key=b'G$$QawckGfaLB97r', IV=b'CSpFHJE0TN9oL3rF', mode=AES.MODE_CBC)
12 | ming = '{"query":"软件开发工程师","cityCode":"","industryCodes":"","pageNum":1,"limit":15}'
13 | mi = aes.encrypt(pad(ming.encode('utf-8'), 16))
14 | b = base64.b64encode(mi).decode().replace("/", "_").replace("+", "-").replace("=", "~")
15 |
16 | params = {
17 | 'b': b,
18 | 'kiv': kiv
19 | }
20 | headers = {
21 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
22 | 'Cookie': 'wd_guid=5632262a-3e0b-4746-b729-c972d1d21db9; historyState=state; __c=1700303213; __g=-; __l=l=%2Fwww.kanzhun.com%2Fsearch%3FcityCode%3D31%26industryCodes%3D%26pageNum%3D1%26query%3D%25E8%25BD%25AF%25E4%25BB%25B6%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588%26type%3D4&r=; Hm_lvt_1f6f005d03f3c4d854faec87a0bee48e=1700299731,1700303213; R_SCH_CY_V=25761614; W_CITY_S_V=31; pageType=1; lasturl="https://www.kanzhun.com/search?cityCode=31&industryCodes=&pageNum=1&query=%E8%BD%AF%E4%BB%B6%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88&type=4'
23 | }
24 | res = requests.get(url, params=params, headers=headers)
25 | print(res.text)
26 |
27 | # 进行解密的逻辑
28 | aes_decrypt = AES.new(key=b'G$$QawckGfaLB97r', IV=b'CSpFHJE0TN9oL3rF', mode=AES.MODE_CBC)
29 | data=unpad(aes_decrypt.decrypt(base64.b64decode(res.text)),16).decode('utf-8')
30 | print(data)
31 |
--------------------------------------------------------------------------------
/第7章-JS逆向/网易云音乐/package-lock.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "网易云音乐",
3 | "lockfileVersion": 3,
4 | "requires": true,
5 | "packages": {
6 | "": {
7 | "dependencies": {
8 | "crypto-js": "^4.2.0"
9 | }
10 | },
11 | "node_modules/crypto-js": {
12 | "version": "4.2.0",
13 | "resolved": "https://registry.npmjs.org/crypto-js/-/crypto-js-4.2.0.tgz",
14 | "integrity": "sha512-KALDyEYgpY+Rlob/iriUtjV6d5Eq+Y191A5g4UqLAi8CyGP9N1+FdVbkc1SxKc2r4YAYqG8JzO2KGL+AizD70Q=="
15 | }
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/第7章-JS逆向/网易云音乐/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "dependencies": {
3 | "crypto-js": "^4.2.0"
4 | }
5 | }
6 |
--------------------------------------------------------------------------------
/第7章-JS逆向/网易云音乐/网易云下载音乐.py:
--------------------------------------------------------------------------------
1 | # 固定格式,用于解决windows执行js文件输出汉字乱码问题
2 | from functools import partial # 锁定参数
3 | import subprocess
4 | subprocess.Popen = partial(subprocess.Popen, encoding="utf-8")
5 |
6 | import execjs # 此时再引入execjs的时候. 里面就可以自动使用你的subprocess.Popen
7 | import requests
8 |
9 | url = "https://music.163.com/weapi/song/enhance/player/url/v1?csrf_token="
10 | headers = {
11 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
12 | }
13 | data = {
14 | "ids": "[1404596131]",
15 | "level": "standard",
16 | "encodeType": "aac",
17 | "csrf_token": ""
18 | }
19 |
20 | # 读取js代码
21 | with open('./网易云-扣代码.js', mode='r', encoding='utf-8') as f:
22 | js_code = f.read()
23 |
24 | # 加载代码
25 | js = execjs.compile(js_code)
26 |
27 | data = js.call("encrypt_data", data)
28 | print(data)
29 | # 得到加密后的数据,发送请求
30 | res = requests.post(url, headers=headers, data={"params": data.get("encText"), "encSecKey": data.get("encSecKey")})
31 | print(res.json())
32 | # 提取歌曲url
33 | song_url = res.json()['data'][0]['url']
34 | print(song_url)
35 |
36 | # 下载音乐
37 | res = requests.get(song_url, headers=headers)
38 | with open("see you again.m4a", mode='wb') as f:
39 | f.write(res.content)
40 |
--------------------------------------------------------------------------------
/第7章-JS逆向/网易有道翻译/发送请求.py:
--------------------------------------------------------------------------------
1 | import base64
2 |
3 | import requests
4 | import time
5 | from hashlib import md5
6 | from Crypto.Cipher import AES
7 | from Crypto.Util.Padding import pad, unpad
8 |
9 | """
10 | 逆向请求参数的思路是首先抓包,看到请求体里面有sign,时间戳,显然是需要逆向的。这时搜一下webtranslate(或者搜mysticTime等),然后找到加密入口
11 | 发现是md5加密,传过去的时候有一个"fsdsogkndfokasodnaso"字符串,猜测可能是盐,再发一次请求,这个字符串不变,肯定了这个猜想
12 | (注意,那个js文件里面没有这个参数赋值的操作,应该是暴露出去,另外的文件调用传参的)。
13 | 接下来找到加密的函数,标准md5加密,在py里面复现就可以了,只有e(也就是time在变),另外需要注意的是传1,2,这种的时候一般传数字,有时候传字符串也行(看服务端)
14 | """
15 |
16 | url = "https://dict.youdao.com/webtranslate"
17 |
18 | headers = {
19 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
20 | "Referer": "https://fanyi.youdao.com/",
21 | "Cookie": "OUTFOX_SEARCH_USER_ID=1138325494@10.105.137.204; OUTFOX_SEARCH_USER_ID_NCOO=46378145.29139559",
22 | "Origin": "https://fanyi.youdao.com"
23 | }
24 | # 伪造时间戳,py里面是秒,转成毫秒
25 | times = int(time.time() * 1000)
26 |
27 | # 伪造sign,直接复现md5
28 | # 准备数据
29 | d = "fanyideskweb"
30 | e = times
31 | u = "webfanyi"
32 | t = "fsdsogkndfokasodnaso"
33 | str = f"client={d}&mysticTime={e}&product={u}&key={t}"
34 | # 加密得到sign
35 | sign = md5(str.encode("utf-8")).hexdigest()
36 |
37 | data = {
38 | "i": "like",
39 | "from": "auto",
40 | "to": "",
41 | "dictResult": "true",
42 | "keyid": "webfanyi",
43 | "sign": sign,
44 | "client": "fanyideskweb",
45 | "product": "webfanyi",
46 | "appVersion": "1.0.0",
47 | "vendor": "web",
48 | "pointParam": "client,mysticTime,product",
49 | "mysticTime": e,
50 | "keyfrom": "fanyi.web",
51 | "mid": "1",
52 | "screen": "1",
53 | "model": "1",
54 | "network": "wifi",
55 | "abtest": "0",
56 | "yduuid": "abcdefg"
57 | }
58 |
59 | # 发送请求
60 | res = requests.post(url, headers=headers, data=data)
61 | print(res.text)
62 |
63 | key = 'ydsecret://query/key/B*RGygVywfNBwpmBaZg*WT7SIOUP2T0C9WHMZN39j^DAdaZhAnxvGcCY6VYFwnHl'
64 | key=md5(key.encode('utf-8')).digest()
65 | iv = 'ydsecret://query/iv/C@lZe2YzHtZ2CYgaXKSVfsb7Y4QWHjITPPZ0nQp87fBeJ!Iv6v^6fvi2WN@bYpJ4'
66 | iv=md5(iv.encode('utf-8')).digest()
67 | data = res.text.replace(
68 | "_", "/").replace("-", "+")
69 | """
70 | t为加密的数据,o为key,n为iv,接下来就是阅读代码,alloc函数可能是在分配什么东西,不影响o=a的值,i.a.create...这个函数可能是标准的或者非标准的,因此
71 | 先试一下能不能复现,不能的话再具体去扣逻辑
72 | R = (t,o,n)=>{
73 | if (!t)
74 | return null;
75 | const a = e.alloc(16, y(o))
76 | , c = e.alloc(16, y(n))
77 | , r = i.a.createDecipheriv("aes-128-cbc", a, c);
78 | let s = r.update(t, "base64", "utf-8");
79 | return s += r.final("utf-8"),
80 | s
81 | }
82 | """
83 |
84 | aes = AES.new(key=key, IV=iv, mode=AES.MODE_CBC)
85 |
86 | result= unpad(aes.decrypt(base64.b64decode(data)), 16).decode("utf-8")
87 |
88 | print(result)
89 |
--------------------------------------------------------------------------------
/第7章-JS逆向/逆向常见加密算法.py:
--------------------------------------------------------------------------------
1 | # 导入摘要算法
2 | import binascii
3 | from hashlib import md5, sha1, sha256
4 | # 导入URL
5 | from urllib.parse import urlencode, unquote, quote
6 | # 导入base64
7 | import base64
8 | # 导入对称加密相关模块
9 | from Crypto.Cipher import AES, DES, DES3
10 | from Crypto.Util.Padding import pad, unpad
11 | # 导入非对称加密相关模块
12 | from Crypto.PublicKey import RSA
13 | from Crypto.Cipher import PKCS1_v1_5
14 | from Crypto import Random
15 |
16 | """
17 | 在这个文件中,介绍一下JS逆向中常见的加密算法,分别是摘要算法MD5(salt),sha1,sha256和URLEncode、Base64、对称加密AES和DES、非对称加密RSA。
18 | 理解加密时要记住一点:加密算法越复杂,性能就越差,因此实际中仍有很多网站只使用MD5或MD5+salt
19 | 摘要算法不存在"解密"的逻辑,它只是对原文做散列摘要;URLEncode和Base64只是编码转换,可以直接还原出原文。
20 | """
21 |
22 | # TODO:MD5
23 | """
24 | MD5是一个不可逆的摘要算法,特点是速度快,并且非常难被破解,原本是128位二进制字符串,后来为了表示方便,一般显示的都是16进制的32位。(存在MD5相同的可能性)
25 | """
26 | # # 创建md5对象
27 | # obj = md5("张狗蛋".encode("utf-8"))
28 | # obj.update("非常帅".encode("utf-8")) # 添加加密的内容
29 | # md5_encrypt_data = obj.hexdigest() # 生成16进制的md5摘要
30 | # print(md5_encrypt_data)
31 |
32 | # 网上有很多穷举出来md5的网站,如https://www.cmd5.com/。我们可以加盐让md5穷举(撞库)不出来。
33 | # # 这个的实现原理就是直接在后面拼接。之前那个字符串还需要付费,现在这个它就撞不出来了。常见搭配时间戳+原始数据+非常大串的自定义字符串
34 | # salt = "我是张狗蛋非常帅的盐"
35 | # obj=md5(salt.encode("utf-8"))
36 | # obj.update("张狗蛋非常帅".encode("utf-8"))
37 | # md5salt_encrypt_data = obj.hexdigest()
38 | # print(md5salt_encrypt_data)
39 |
40 | # TODO:sha1算法和sha256算法。sha1和sha256可以认为是md5的升级版。sha1被证实会发生碰撞。(长度为40位)sha256是sha1的升级版,产生64位16进制数。
41 | """
42 | sha系列(最大加密长度为2^64位)
43 | """
44 | # # 调用起来都是一样的
45 | # sha = sha256("张狗蛋非常帅".encode("utf-8"))
46 | # sha256_encrypt_data=sha.hexdigest()
47 | # print(sha256_encrypt_data)
48 |
49 | # TODO:URLEncode。
50 | """
51 | 这个URL编码是为了防止URL在传输过程中出现歧义或URL注入问题。将汉字转换成字节,一个字节转换成两个16进制并在前面加上%分割。默认采用UTF-8
52 | """
53 | # base_url = "https://www.bugdesigner.cn/?"
54 | # params = {
55 | # "s": "实用资源"
56 | # }
57 | # url = base_url + urlencode(params)
58 | # print(url)
59 | # 也可以对单个字符串进行编码或解码
60 | # print(quote("张狗蛋"))
61 | # print(unquote("%E5%BC%A0%E7%8B%97%E8%9B%8B"))
62 |
63 | # TODO:Base64。Base64是一种将二进制转换为字符串的方式,可以方便在互联网上传播。它的数据量会增加1/3左右
64 | """
65 | Base64的编码方式是:把二进制数据每3个字节(24位)分成一组,再拆成4个6位的小组,每个6位小组映射到Base64表中的一个字符。这个表包含64个字符,分别是
66 | 26个大写字母+26个小写字母+10个数字+2个特殊符号(+和/)。!!!注意:末尾不足3个字节时输出会用=补齐;反过来,解码时字符串长度不是4的倍数会报错,这时需要自己手动补=
67 | URL安全变种(URLSafe Base64)会把 / 换成 _,+ 换成 -,避免这两个字符对URL产生影响
68 | """
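# 逆向里常见的一个小坑的示意(纯演示,用的还是下面那个示例串):抓下来的base64经常被去掉了结尾的'=',长度不是4的倍数时直接解码会报错,手动补齐即可
# b64_no_pad = "5byg54uX6JuL6Z2e5bi45biFQQ"  # 被去掉了结尾'=='的串
# b64_fixed = b64_no_pad + "=" * (-len(b64_no_pad) % 4)  # 补齐到4的倍数
# print(base64.b64decode(b64_fixed).decode())  # 张狗蛋非常帅A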
69 |
70 | # bs = "张狗蛋非常帅A".encode("utf-8") # 转换成字节
71 | # print(base64.b64encode(bs).decode()) # 将字节数据转换成base64格式的。注意这里就会填充两个==,因为最后总字节数/6余2
72 | #
73 | # b64_encode_data = "5byg54uX6JuL6Z2e5bi45biFQQ=="
74 | # print(base64.b64decode(b64_encode_data).decode()) # 将b64字符串解码,decode是为了从字节变成默认的中文。注意这里如果缺少=会报错,这里填充上=号就好了
75 |
76 | # TODO:对称加密AES,DES
77 | """
78 | AES称为高级加密标准,通过加密秘钥的长度和加密轮数分为AES-128,AES-192,AES-256。但是加密的步骤是一模一样的。
79 | AES分为key,加密模式,IV(受加密模式影响),pad这四个部分。
80 | key很简单,就是秘钥,是16位的字节。
81 | 加密模式一般有CBC和ECB,CBC需要IV(初始向量),不同的IV加密出来的结果是不一样的,也是16位的字节
82 | pad是因为AES这个算法加密的数据需要是16的倍数,如果不是,那么就需要填充到16的倍数。
83 | 逆向的时候需要找到key,mode和iv,一般都是CBC模式,因为安全性比较高
84 | """
85 | # # 创建AES对象
86 | # aes = AES.new(key=b"1234567890123456", IV=b'1234567890123456', mode=AES.MODE_CBC)
87 | # ming = "张狗蛋很帅"
88 | # # 添加填充,必须已经被编码成utf-8的形式了
89 | # ming = pad(ming.encode("utf-8"), 16)
90 | # aes_encrypt_data = aes.encrypt(ming)
91 | # # print(aes_encrypt_data)
92 | # # 一般加密完成以后会处理成字符串。
93 | # str1 = base64.b64encode(aes_encrypt_data).decode()
94 | # # print(str1)
95 | # str2 = binascii.b2a_hex(aes_encrypt_data).decode()
96 | # # print(str2)
97 | #
98 | # # 解密,假设之前加密的转换成字符串了
99 | # aes_decrypt = AES.new(key=b"1234567890123456", IV=b'1234567890123456', mode=AES.MODE_CBC) # 这里一定得新创建一个AES
100 | # mi = base64.b64decode(str1)
101 | # aes_decrypt_data=aes_decrypt.decrypt(mi)
102 | # aes_decrypt_data = unpad(aes_decrypt_data, 16) #去掉填充
103 | # print(aes_decrypt_data.decode()) #解码成字符串
104 |
105 | """
106 | DES是AES的降级版,所有步骤都跟AES一样,除了key和iv的填充从16个字节降低到了8个字节
107 | """
108 | # # 加密
109 | # des = DES.new(key=b'12345678', mode=DES.MODE_CBC, iv=b'12345678')
110 | # data = "张狗蛋很帅"
111 | # data = pad(data.encode("utf-8"), 16) # 填充
112 | # des_encrypt_data = des.encrypt(data) # 加密
113 | # des_encrypt_data = base64.b64encode(des_encrypt_data).decode() # 处理成base64字符串
114 | # print(des_encrypt_data)
115 | #
116 | # # 解密
117 | # des_decrypt = DES.new(key=b'12345678', mode=DES.MODE_CBC, iv=b'12345678') # 创建新的DES对象
118 | # des_decrypt_data = des_decrypt.decrypt(base64.b64decode(des_encrypt_data)) # 解码成字节串
119 | # des_decrypt_data = unpad(des_decrypt_data, 16) # 去掉填充
120 | # print(des_decrypt_data.decode())
121 |
122 | # TODO:RSA
123 | """
124 | 非对称加密常见的是RSA,非对称加密的原理是生成的时候有一个公钥还有一个私钥,数据被公钥加密只能被私钥解密,反之则不可以。原理是素数相乘。
125 | 这里需要讨论PKCS(公钥密码学标准,具体也是根据加密数据是不是8的倍数进行填充,就不再深入讨论)。还有一个是没有填充的算法,这里py不能复现,必须使用JS。主要的流程为
126 | 服务端生成一对公钥和私钥(必须同时生成),加密的时候客户端使用公钥,并对数据进行填充,然后加密,解密的时候服务端找到私钥进行解密(公钥和私钥长度可以指定,一般是1024/2048)
127 | """
128 | # # 创建私钥和公钥
129 | # gen_random = Random.new # 获取一个伪随机数生成器,用于最后解密时使用
130 | # print(gen_random)
131 | # rsa = RSA.generate(1024)
132 | # with open('rsa.publickey.pem', mode='wb') as f:
133 | # f.write(rsa.public_key().exportKey())
134 | # with open('rsa.privatekey.pem', mode='wb') as f:
135 | # f.write(rsa.exportKey())
136 | #
137 | # # 进行加密
138 | # data = "张狗蛋很帅"
139 | # with open("rsa.publickey.pem", mode='r') as f:
140 | # pk = f.read()
141 | # rsa_pk = RSA.importKey(pk)
142 | # rsa = PKCS1_v1_5.new(rsa_pk) # 生成一个RSA对象
143 | #
144 | # rsa_encrypt_data = rsa.encrypt(data.encode("utf-8")) # 进行加密
145 | # b64_rsa_encrypt_data = base64.b64encode(rsa_encrypt_data).decode("utf-8") # 处理成字符串格式
146 | # print(b64_rsa_encrypt_data)
147 | # # 进行解密
148 | # with open("rsa.privatekey.pem", mode='r') as f:
149 | # privatekey = f.read()
150 | # rsa_pk = RSA.importKey(privatekey)
151 | # rsa = PKCS1_v1_5.new(rsa_pk)
152 | #
153 | # result = rsa.decrypt(base64.b64decode(b64_rsa_encrypt_data), gen_random)
154 | # print(result.decode("utf-8"))
155 |
--------------------------------------------------------------------------------
/第8章-JS补环境/抖音/分析移民问题/分析分数线.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from pymongo import MongoClient
3 | import matplotlib.pyplot as plt
4 |
5 | # 指定中文字体,使用 'SimHei'
6 | plt.rcParams['font.sans-serif'] = ['SimHei']
7 | # 为了正常显示负号
8 | plt.rcParams['axes.unicode_minus'] = False
9 |
10 | """
11 | 抖音分析评论涉及到NLP和LDA,这里简单分析一下。从下面这几个维度分析
12 | 1:根据ip归属地聚类,看看是哪些省的更关注这些问题
13 | 2: 词云图,哪些词最关键
14 | # 这个其实也不太准确,GPT4还是不够本土化,有的完全看不出来是冷嘲热讽的,还是用模式匹配吧。
15 | 3:调用AI大模型接口,让他帮我们分析这个评论是什么方向。
16 | 1:支持河南考生移民
17 | 2:认为不能抢占黑龙江考生名额
18 |
19 | 3:认为河南高考分数太高
20 | 4:认为教育不平等,支持全国统一分数线
21 | 5:认为复读生太多
22 | 6: 认为河南大学少
23 | 7:认为两者教育资源有差距
24 |
25 | 8:认为河南生的多
26 | 9:攻击河南人
27 |
28 | 10:认为有人搞阴谋论,抹黑黑龙江和河南
29 | 11:其他原因
30 |
31 | 至于对分数线的分析就比较简单了:找出分数线最高的4个省和最低的4个省。但这又衍生出一个问题:每年这8个省未必固定,难道每一年都要单独画一张图吗?
32 | 事实证明不用,分数线最高的省基本每年都最高,也不会出现哪个省突然"飞上枝头"的情况,所以我直接按各省多年的平均分来排名
33 |
34 | 因此分析结果就可以出来了,就分析本科一批(一批),本科二批(二批)这两类数据,京津沪这些教育改革的快,基本好久都没有本科一二批了,现在好多省份也是新高考,
35 | 主选历史和地理那种,这里也不进行分析了。
36 |
37 | 筛选完成之后得到15个省份,这里其实还需要考虑全国1卷、2卷、3卷。但是一般用2卷和3卷的分数线近几年也相对比较低,这里就直接跟1卷的一起比了。
38 | 分析完成之后再单独分析一下黑龙江和河南。
39 | """
40 |
41 |
42 | # 筛选出那些不是一本二本招生的
43 | def filter_province():
44 | list1 = []
45 | for doc in total_list:
46 | dict1 = {}
47 | # print(doc, type(doc))
48 | # 开始筛选,首先是第一轮筛选,筛选出没有本科一批......这些数据
49 | isContainsDirChar = False
50 | isDirProvince = True
51 | for data in doc['data']:
52 | # data,许多年的数据
53 | for key, values in data.items():
54 | # print(key, values) # 2023:[2023的数据]
55 | for value in values: # 遍历每一年的
56 | # 使用普通字符串筛选就行
57 | if '\n' in value and (
58 | '本科一批' == value.split('\n')[0] or '一批' == value.split('\n')[0] or '本科一段' ==
59 | value.split('\n')[0]
60 | or '本科二批' == value.split('\n')[0] or '本科二段' == value.split('\n')[0] or '二批' ==
61 | value.split('\n')[0]):
62 | isContainsDirChar = True # 标记这次通过了
63 | if isContainsDirChar: # 其实这里还有第一批,但是23年都是一本二本的,前几年肯定也是这个制度,直接break
64 | break
65 | if not isContainsDirChar: # 表明某一年不是一本或二本招生,这个省份直接排除
66 | isDirProvince = False
67 | if not isDirProvince: # 已经不是符合要求的省份了,没必要再遍历其他年
68 | break
69 | if not isDirProvince:
70 | break
71 | if isDirProvince: # 经过了重重考验,符合条件的省份
72 | dict1['city'] = doc['city']
73 | print(dict1['city'], end='、')
74 | dict1['data'] = doc['data']
75 | list1.append(dict1)
76 | return list1
77 |
78 |
79 | # 绘制过滤完成的所有省份的折线图
80 | def plot_line_chart(datas_y, front_regions, end_regions, title):
81 | data_x = ['2018年', '2019年', '2020年', '2021年', '2022年', '2023年']
82 |
83 | plt.figure(figsize=(12, 8))
84 | # 画前4名和后4名
85 | for region, _ in front_regions + end_regions:
86 | scores = datas_y[region][::-1]
87 | plt.plot(data_x, scores, marker='o', label=region)
88 | # 添加数值标记
89 | for x, y in zip(data_x, scores):
90 | plt.text(x, y * 1.002, y, va='bottom', fontsize=10)
91 | plt.title(title, fontsize=14)
92 | plt.xlabel('年份', fontsize=12)
93 | plt.ylabel('分数', rotation=360, labelpad=20, fontsize=12)
94 | # 移动图例到图表外面
95 | plt.legend(loc='upper left', bbox_to_anchor=(-0.15, 1))
96 |
97 | plt.show()
98 |
99 |
100 | # 画前四个和后四个,文科本一、文科本二、理科本一、理科本二。
101 | # @params string batch_type:标识是一批还是二批,一为一批、二为二批
102 | # @params int subject_type:标识是文科还是理科,文科1理科2
103 | def draw_all_scoreline(batch_type, subject_type):
104 | provinces = [] # 存储所有省份
105 | datas_y = {}
106 | for province in list1:
107 | data_y = [] # 存储每个城市的本科数据的列表
108 | provinces.append(province['city'])
109 | for data in province['data']:
110 | for key, values in data.items():
111 | for value in values:
112 | if ('\n' in value) and (
113 | value.split('\n')[0] == f'本科{batch_type}批'
114 | or value.split('\n')[0] == f'{batch_type}批'
115 | or value.split('\n')[0] == f'第{batch_type}批'
116 | or value.split('\n')[0] == f'{batch_type}本'
117 | or value.split('\n')[0] == f'本科{batch_type}段'
118 | or value.split('\n')[0] == f'汉语言本科{batch_type}批'):
119 | data_y.append(int(value.split('\n')[subject_type].replace('分', ""))) # 取的是文科或者理科的分数
120 | datas_y[province['city']] = data_y # 把每个省的信息存一下,等下要排名
121 |
122 | # 计算每个地区的平均分数
123 | average_scores = {region: np.mean(scores) for region, scores in datas_y.items()}
124 |
125 | # 根据平均分数排序
126 | sorted_regions = sorted(average_scores.items(), key=lambda x: x[1])
127 |
128 | # 选择排名前4的和排名后4的
129 |     top_4_regions = sorted_regions[-4:]
130 |     bottom_4_regions = sorted_regions[:4]
131 |
132 | # 这个处理之后是一个列表包含元组,这点要注意 [('河南', 545.1666666666666), ('江西', 549.0), ('贵州', 552.5), ('云南', 560.0)]
133 |
134 |     plot_line_chart(datas_y, top_4_regions, bottom_4_regions, f"{'文科' if subject_type == 1 else '理科'}"
135 | f"{'一本' if batch_type == '一' else '二本'}分数线排名前四和排名后四的省份")
136 |
137 |
138 | # 解析数据的函数,解析成组合条形图可以识别的数据格式
139 | def parse_scores(data):
140 | scores = {'本科一批': {'文科': [], '理科': []}, '本科二批': {'文科': [], '理科': []}}
141 | for entry in data:
142 | # print(entry.values())
143 | year_scores = (list(entry.values()))[0]
144 | # ['批次\n文科\n理科', '本科一批\n547\n514', '本科二批\n465\n409', '高职专科批\n185\n185', '查看详情\n查看详情']
145 | for score in year_scores:
146 | # print(score)
147 | parts = score.split('\n')
148 | if '本科一批' in parts[0] or '一本' in parts[0] or '一批' in parts[0]:
149 | scores['本科一批']['文科'].append(int(parts[1]))
150 | scores['本科一批']['理科'].append(int(parts[2]))
151 | elif '本科二批' in parts[0] or '二本' in parts[0] or '二批' in parts[0]:
152 | scores['本科二批']['文科'].append(int(parts[1]))
153 | scores['本科二批']['理科'].append(int(parts[2]))
154 | return scores
155 |
156 |
157 | # 绘制两个组合条形图的函数
158 | # @params type 标识要绘制的是文科还是理科的组合条形图 1文科,2理科
159 | def plot_combined_bar_chart(henan_score, hlj_score, title, type):
160 | years = ['2018年', '2019年', '2020年', '2021年', '2022年', '2023年']
161 | bar_width = 0.15
162 | index = np.arange(len(years)) # 生成等差数列
163 |
164 | fig, ax = plt.subplots(figsize=(18, 10)) # 生成轴对象和图标对象
165 | subject = '文科' if type == 1 else '理科'
166 |
167 | # 画henan的本科一批和hlj本科一批
168 | ax.bar(index - bar_width, henan_score['本科一批'][subject][::-1], bar_width, label='河南本科一批')
169 | ax.bar(index, hlj_score['本科一批'][subject][::-1], bar_width, label='黑龙江本科一批')
170 |
171 | # 画henan的本科二批和hlj二批
172 | ax.bar(index + 1.5 * bar_width, henan_score['本科二批'][subject][::-1], bar_width, label='河南本科二批')
173 | ax.bar(index + 2.5 * bar_width, hlj_score['本科二批'][subject][::-1], bar_width, label='黑龙江本科二批')
174 |
175 | # 添加文字标注
176 | for i in index:
177 | # 河南本科一批
178 | henan_one = henan_score['本科一批'][subject][::-1][i]
179 | ax.text(i - bar_width, henan_one + 1, str(henan_one) + "分", ha='center', va='bottom', fontsize=10)
180 |
181 | # 黑龙江本科一批
182 | hlj_one = hlj_score['本科一批'][subject][::-1][i]
183 | ax.text(i, hlj_one + 1, str(hlj_one) + "分", ha='center', va='bottom', fontsize=10)
184 |
185 | # 河南本科二批
186 | henan_two = henan_score['本科二批'][subject][::-1][i]
187 | ax.text(i + 1.5 * bar_width, henan_two + 1, str(henan_two) + "分", ha='center', va='bottom', fontsize=10)
188 |
189 | # 黑龙江本科二批
190 | hlj_two = hlj_score['本科二批'][subject][::-1][i]
191 | ax.text(i + 2.5 * bar_width, hlj_two + 1, str(hlj_two) + "分", ha='center', va='bottom', fontsize=10)
192 |
193 | ax.set_xlabel('年份', fontsize=12)
194 | ax.set_ylabel('分数', fontsize=12)
195 | ax.set_title(title, fontsize=14)
196 | ax.set_xticks(index + bar_width * 0.75)
197 | ax.set_xticklabels(years)
198 | ax.legend()
199 |
200 | plt.show()
201 |
202 |
203 | # 分析河南和黑龙江两个省份
204 | def analysis_certain_province():
205 | henan_data = []
206 | hlj_data = []
207 | # 获取详细数据
208 | for province in list1:
209 | # 注意,这里的data是一个数组包对象的形式
210 | if province['city'] == '河南':
211 | henan_data = province['data']
212 | elif province['city'] == '黑龙江':
213 | hlj_data = province['data']
214 | print(henan_data)
215 | # 解析并翻转数据
216 | henan_scores = parse_scores(henan_data)
217 | hlj_scores = parse_scores(hlj_data)
218 |
219 | # 绘制文科的组合条形图
220 | plot_combined_bar_chart(henan_scores, hlj_scores, '文科两省一本和二本分数线对比', 1)
221 | # 绘制理科的组合条形图
222 | plot_combined_bar_chart(henan_scores, hlj_scores, '理科两省本科一批和二本分数线对比', 2)
223 |
224 |
225 | if __name__ == '__main__':
226 | # 创建连接
227 | conn = MongoClient(host="localhost", port=27017)
228 | collection = conn.test.scoreline
229 |
230 | # 读取数据,不查id
231 |     total_list = collection.find({}, {"_id": 0})  # 不要用list这种内置名当变量名,之前就因为把它命名成list找了10分钟的bug
232 | list1 = filter_province()
233 | # # 分别画四个图,对整体的分数线有个大概的认识
234 | draw_all_scoreline("一", 1)
235 | draw_all_scoreline("一", 2)
236 | draw_all_scoreline("二", 1)
237 | draw_all_scoreline("二", 2)
238 | # 请出主角,河南和黑龙江的详细对比,绘制组合条形图
239 | analysis_certain_province()
240 |
--------------------------------------------------------------------------------
/第8章-JS补环境/抖音/分析移民问题/分析评论.py:
--------------------------------------------------------------------------------
1 | import jieba
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | from wordcloud import WordCloud
5 | import re
6 |
7 | # 指定中文字体,使用 'SimHei'
8 | plt.rcParams['font.sans-serif'] = ['SimHei']
9 | # 为了正常显示负号
10 | plt.rcParams['axes.unicode_minus'] = False
11 |
12 |
13 | # 分析参与讨论的人们的ip分布
14 | def draw_pie_with_ip(data):
15 | # 按IP分组并计数
16 | ip_counts = data['ip_label'].value_counts()
17 |
18 | # 选择前8个最常见的IP
19 | top_8_ips = ip_counts.head(8)
20 |
21 | # 计算“其他”类别的评论数
22 | total_comments = ip_counts.sum() # 计算总的评论数
23 | top_8_sum = top_8_ips.sum() # 计算前8个评论数
24 | other_count = total_comments - top_8_sum
25 |
26 | other_series = pd.Series({'其他': other_count}) # 创建包含“其他”类别的新Series
27 |
28 | all_ips = pd.concat([top_8_ips, other_series]) # 将“其他”类别的Series和前8个的IP合并
29 |
30 | # 绘制饼状图
31 | plt.figure(figsize=(8, 8))
32 | plt.pie(all_ips, labels=all_ips.index, autopct='%1.1f%%', startangle=140)
33 | plt.title('评论分布的地域情况', fontsize=18)
34 | plt.show()
35 |
36 |
37 | # 发布评论的时间分布
38 | def draw_pie_with_time(data):
39 |     # 确保 'create_time' 列中的值是数字,并用copy()创建一个副本,避免pandas以为我们在修改原DataFrame的切片而告警(SettingWithCopyWarning)
40 | data_clean = data[pd.to_numeric(data['create_time'], errors='coerce').notna()].copy()
41 |
42 | # 将 'create_time' 列从 Unix 时间戳转换为 datetime 对象
43 | data_clean['create_time'] = pd.to_datetime(data_clean['create_time'].astype(int), unit='s')
44 |
45 | # 从 datetime 中提取日期
46 | data_clean['date'] = data_clean['create_time'].dt.date
47 |
48 | # 按日期分组并计算每天的评论数
49 | comments_per_day = data_clean['date'].value_counts().sort_index()
50 |
51 | # 为饼状图准备数据和标签
52 | pie_labels = [f"{date} ({count}人)" for date, count in comments_per_day.items()]
53 |
54 | # 绘制饼状图
55 | plt.figure(figsize=(8, 8))
56 | plt.pie(comments_per_day, labels=pie_labels, autopct='%1.1f%%', startangle=140)
57 | plt.title('日期的评论分布', fontsize=18)
58 | plt.show()
59 |
60 |
61 | # 进行分词,绘制评论的词云图
62 | def draw_word_cloud_with_comment(text):
63 | # print(text, type(text))
64 | joined_text = ' '.join(text) # 转换成字符串
65 | # 使用jieba进行中文分词,并过滤掉单个字的词和一些不想要的词
66 | words = jieba.cut(joined_text)
67 |     stop_words = {'不是', '捂脸', '黑龙江', '河南', '东北', '我们', '你们', '什么'}  # 用集合做停用词,避免 word not in 字符串 变成子串匹配误伤其他词
68 |     filtered_words = [word for word in words if len(word) > 1 and word not in stop_words]
69 |
70 | # 将过滤后的词用空格连接成一个字符串
71 | filtered_text = ' '.join(filtered_words)
72 | # 再次生成词云图
73 | wordcloud_cn = WordCloud(width=800, height=400, background_color='white',
74 | font_path='simhei.ttf').generate(filtered_text)
75 |
76 | # 显示词云图
77 | plt.figure(figsize=(16, 10))
78 | plt.imshow(wordcloud_cn, interpolation='bilinear')
79 | plt.axis('off')
80 | plt.title('词云图分析', fontsize=50, y=1.1)
81 | plt.show()
82 |
83 |
84 | # 简单的模式匹配,分析大家评论的倾向频率
85 | def draw_bar_plot_with_comment(comments_df):
86 | # 定义不同的匹配模式
87 | patterns = {
88 | 1: r"支持河南考生移民|支持河南学生迁户口|赞成河南学生迁移",
89 | 2: r"占用黑龙江考生名额|抢占黑龙江学生的机会|河南考生侵占黑龙江名额|抢教育资源",
90 | 3: r"河南分数线太高|河南高考难|河南分数线高|分数线高",
91 | 4: r"教育不平等|全国统一分数线|统一分数线|教育公平|统一录取率|统一试卷",
92 | 5: r"复读生.*?多|复读生|复读|复读现象严重",
93 | 6: r"河南大学少|河南高校不够多",
94 | 7: r"黑龙江教育资源差|河南教育资源号|河南和黑龙江教育差距|教育资源差异|教育资源",
95 | 8: r"河南人多|河南学生多|河南考生多|河南生的孩子多|人多",
96 | 9: r"这就是河南人|河南人爱钻空子|地图炮",
97 | 10: r"去年西安|之前西安|移民西安|西安",
98 | 11: r"阴谋|国外分子"
99 | }
100 |
101 | # 判断当前这个评论属于什么分类
102 | def classify_comment(text):
103 | for category, pattern in patterns.items():
104 | if re.search(fr".*?{pattern}.*?", text):
105 | print(text, category)
106 | return category
107 | return 12 # 默认的原因
108 |
109 | # 对每个评论都进行分类
110 | comments_df['type'] = comments_df['text'].apply(classify_comment)
111 | print(len(comments_df['text']))
112 |
113 | # 计算分类的数量
114 | category_counts = comments_df['type'].value_counts().sort_index()
115 |
116 |     # 排除分类12(没有命中任何模式的默认分类)
117 | category_counts = category_counts[category_counts.index != 12]
118 | top_5_categories = category_counts.nlargest(5)
119 |
120 | plt.figure(figsize=(10, 6))
121 | bars = plt.bar(top_5_categories.index.astype(str), top_5_categories.values, color='skyblue')
122 |
123 | # 添加每个柱子的个数标记
124 | for bar in bars:
125 | yval = bar.get_height()
126 | plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.5, int(yval), ha='center', va='bottom')
127 |
128 | plt.xlabel('分类')
129 | plt.ylabel('评论数量')
130 | plt.title('评论分类的前5个')
131 | plt.xticks(rotation=45)
132 | plt.grid(axis='y', linestyle='--', alpha=0.7)
133 | plt.show()
134 |
135 |
136 | if __name__ == '__main__':
137 | # 读取CSV文件
138 | file_path = './comments.csv'
139 | data = pd.read_csv(file_path)
140 | draw_pie_with_ip(data) # 绘制ip分布的饼状图
141 | draw_pie_with_time(data) # 绘制时间分布的饼状图
142 | draw_word_cloud_with_comment(data['text'].tolist()) # 绘制评论分布的词云图
143 | draw_bar_plot_with_comment(data) # 绘制评论分布的柱状图
144 |
--------------------------------------------------------------------------------
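A quick, self-contained illustration of how the regex buckets in `draw_bar_plot_with_comment()` above decide a comment's category. The two patterns are excerpted from the `patterns` dict in the file; the sample comments and the `classify` helper name are made up for this sketch.

```python
import re

# Two buckets excerpted from the patterns dict above (3: score line too high, 4: education fairness)
patterns = {
    3: r"河南分数线太高|河南高考难|河南分数线高|分数线高",
    4: r"教育不平等|全国统一分数线|统一分数线|教育公平|统一录取率|统一试卷",
}

def classify(text, default=12):
    # First pattern that matches wins; 12 is the fallback bucket, as in the file above
    for category, pattern in patterns.items():
        if re.search(pattern, text):
            return category
    return default

print(classify("还是全国统一分数线最公平"))  # -> 4
print(classify("今天天气不错"))              # -> 12 (nothing matched)
```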
/第8章-JS补环境/抖音/分析移民问题/存储到mysql数据库.py:
--------------------------------------------------------------------------------
1 | import time
2 | from datetime import datetime
3 |
4 | import pymysql
5 |
6 |
7 | # 这个类是用来把数据存储到Mysql数据库的
8 | class MyDatabase:
9 |
10 | # 初始化数据库连接环境
11 | def __init__(self):
12 | self.db = pymysql.connect(host='localhost', user='root', password='123456', database='spidertestdb')
13 | self.cursor = self.db.cursor()
14 | self.create_table()
15 |
16 |     # The tiktok_comments table holds all scraped comment data
17 | def create_table(self):
18 | create_table_sql = """
19 | CREATE TABLE IF NOT EXISTS tiktok_comments (
20 | id BIGINT PRIMARY KEY NOT NULL AUTO_INCREMENT,
21 | aweme_id BIGINT COMMENT '视频id',
22 | cid BIGINT comment '评论id',
23 | comment_text text comment '评论内容',
24 | digg_count int comment '点赞数',
25 | reply_comment_total int comment '回应的评论数',
26 | nickname varchar(100) comment '昵称',
27 | ip_label VARCHAR(100) comment 'ip属地',
28 | create_time datetime comment '评论发送时间'
29 | )
30 | """
31 | self.cursor.execute(create_table_sql)
32 |
33 | def save_data(self, datas):
34 | insert_sql = """
35 | INSERT INTO tiktok_comments (cid, aweme_id,comment_text, digg_count, reply_comment_total, nickname, ip_label, create_time)
36 | VALUES (%s,%s, %s, %s, %s, %s, %s, %s)
37 | """
38 | try:
39 | for data in datas:
40 | values = (
41 | data.get("cid", ""),
42 | data.get('aweme_id', ""),
43 | data.get("text", 0),
44 | data.get("digg_count", 0),
45 | data.get("reply_comment_total", 0),
46 | data.get("nickname", 0),
47 | data.get("ip_label", ""),
48 | datetime.fromtimestamp(data.get("create_time", int(time.time()))).strftime(
49 | '%Y-%m-%d %H:%M:%S'),
50 | )
51 | self.cursor.execute(insert_sql, values)
52 | self.db.commit()
53 | except Exception as e:
54 | print("插入数据时候出错了:", e)
55 | self.db.rollback()
56 |
57 | # 关闭数据库连接
58 | def close(self):
59 | self.cursor.close()
60 | self.db.close()
61 |
--------------------------------------------------------------------------------
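The `save_data` method above inserts one row per `execute` call. Below is a minimal sketch of the same insert done as a batch with pymysql's `executemany`; `save_data_batch` and its `db` argument (an already-initialized `MyDatabase`) are illustrative names, and the field defaults mirror the method above.

```python
import time
from datetime import datetime

def save_data_batch(db, datas):
    """Batch variant of MyDatabase.save_data: one executemany call instead of a loop of execute calls."""
    insert_sql = """
        INSERT INTO tiktok_comments (cid, aweme_id, comment_text, digg_count,
                                     reply_comment_total, nickname, ip_label, create_time)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """
    rows = [
        (
            d.get("cid", ""), d.get("aweme_id", ""), d.get("text", ""),
            d.get("digg_count", 0), d.get("reply_comment_total", 0),
            d.get("nickname", ""), d.get("ip_label", ""),
            datetime.fromtimestamp(d.get("create_time", int(time.time()))).strftime('%Y-%m-%d %H:%M:%S'),
        )
        for d in datas
    ]
    try:
        db.cursor.executemany(insert_sql, rows)  # one round trip for the whole page of comments
        db.db.commit()
    except Exception as e:
        print("插入数据时候出错了:", e)
        db.db.rollback()
```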
/第8章-JS补环境/抖音/分析移民问题/测试.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | # 指定中文字体,使用 'SimHei'
5 | plt.rcParams['font.sans-serif'] = ['SimHei']
6 | # 为了正常显示负号
7 | plt.rcParams['axes.unicode_minus'] = False
8 |
9 | # 河南和黑龙江的数据
10 | henan_data = [{'2023年': ['批次\n文科\n理科', '本科一批\n547\n514', '本科二批\n465\n409', '高职专科批\n185\n185',
11 | '查看详情\n查看详情']}, {
12 | '2022年': ['批次\n文科\n理科', '本科一批\n527\n509', '本科二批\n445\n405', '高职专科批\n190\n190',
13 | '查看更多\n查看更多']}, {
14 | '2021年': ['批次\n文科\n理科', '一本\n558\n518', '二本\n466\n400', '高职高专\n200\n200',
15 | '艺术类\n点击查看\n点击查看', '体育类\n点击查看\n点击查看']},
16 | {'2020年': ['批次\n文科\n理科', '一本\n556\n544', '二本\n465\n418', '高职高专\n180\n180']},
17 | {'2019年': ['批次\n文科\n理科', '一批\n536\n502', '二批\n447\n385', '专科\n160\n160']},
18 | {'2018年': ['批次\n文科\n理科', '一批\n547\n499', '二批\n436\n374', '专科\n200\n200']}]
19 | hlj_data = [{'2023年': ['批次\n文科\n理科', '本科一批\n430\n408', '本科二批\n341\n287', '高职(专科)\n160\n160',
20 | '查看详情\n查看详情']}, {
21 | '2022年': ['批次\n文科\n理科', '本科一批\n463\n429', '本科二批\n365\n308', '高职(专科)\n160\n160',
22 | '查看更多\n查看更多']}, {
23 | '2021年': ['批次\n文科\n理科', '本科一批\n472\n415', '本科二批\n354\n280', '高职专科\n160\n160',
24 | '艺术体育类\n点击查看\n点击查看']}, {
25 | '2020年': ['批次\n文科\n理科', '本科一批\n483\n455', '本科二批\n356\n301', '高职专科\n160\n160',
26 | '艺术体育类\n点击查看\n点击查看']}, {
27 | '2019年': ['批次\n文科\n理科', '一批\n500\n477', '二批\n424\n372', '艺术类本科\n254\n236',
28 | '体育类本科\n254\n223', '本科三批\n348\n324', '高职专科\n160\n160']}, {
29 | '2018年': ['批次\n文科\n理科', '一批\n490\n472', '二批\n406\n353', '高职\n160\n160', '艺术类本科\n-\n-',
30 | '体育类本科\n-\n-']}]
31 |
32 |
33 | # 解析数据的函数
34 | def parse_scores(data):
35 | scores = {'本科一批': {'文科': [], '理科': []}, '本科二批': {'文科': [], '理科': []}}
36 | for entry in data:
37 | # print(entry)
38 | year_scores = list(entry.values())[0]
39 | # ['批次\n文科\n理科', '本科一批\n547\n514', '本科二批\n465\n409', '高职专科批\n185\n185', '查看详情\n查看详情']
40 | for score in year_scores:
41 | # print(score)
42 | parts = score.split('\n')
43 | if '本科一批' in parts[0] or '一本' in parts[0] or '一批' in parts[0]:
44 | scores['本科一批']['文科'].append(int(parts[1]))
45 | scores['本科一批']['理科'].append(int(parts[2]))
46 | elif '本科二批' in parts[0] or '二本' in parts[0] or '二批' in parts[0]:
47 | scores['本科二批']['文科'].append(int(parts[1]))
48 | scores['本科二批']['理科'].append(int(parts[2]))
49 | return scores
50 |
51 |
52 | # 绘制组合条形图的函数
53 | def plot_combined_bar_chart(henan_score, hlj_score, title):
54 | print(henan_score)
55 | print(hlj_score)
56 | years = ['2018年', '2019年', '2020年', '2021年', '2022年', '2023年']
57 | bar_width = 0.15
58 | index = np.arange(len(years))
59 |
60 | fig, ax = plt.subplots(figsize=(18, 10))
61 |
62 |     # Henan vs. Heilongjiang tier-1 (本科一批) science bars
63 | ax.bar(index - bar_width, henan_score['本科一批']['理科'][::-1], bar_width, label='河南本科一批')
64 | ax.bar(index, hlj_score['本科一批']['理科'][::-1], bar_width, label='黑龙江本科一批')
65 |
66 |     # Henan vs. Heilongjiang tier-2 (本科二批) science bars
67 | ax.bar(index + 1.5 * bar_width, henan_score['本科二批']['理科'][::-1], bar_width, label='河南本科二批')
68 | ax.bar(index + 2.5 * bar_width, hlj_score['本科二批']['理科'][::-1], bar_width, label='黑龙江本科二批')
69 |
70 | # 添加文字标注
71 | for i in index:
72 | # 河南本科一批
73 | henan_one = henan_score['本科一批']['理科'][::-1][i]
74 | ax.text(i - bar_width, henan_one + 1, henan_one, ha='center', va='bottom')
75 |
76 | # 黑龙江本科一批
77 | hlj_one = hlj_score['本科一批']['理科'][::-1][i]
78 | ax.text(i, hlj_one + 1, hlj_one, ha='center', va='bottom')
79 |
80 | # 河南本科二批
81 | henan_two = henan_score['本科二批']['理科'][::-1][i]
82 | ax.text(i + 1.5 * bar_width, henan_two + 1, henan_two, ha='center', va='bottom')
83 |
84 | # 黑龙江本科二批
85 | hlj_two = hlj_score['本科二批']['理科'][::-1][i]
86 | ax.text(i + 2.5 * bar_width, hlj_two + 1, hlj_two, ha='center', va='bottom')
87 |
88 | ax.set_xlabel('年份')
89 | ax.set_ylabel('分数')
90 | ax.set_title(title)
91 | ax.set_xticks(index + bar_width * 0.75)
92 | ax.set_xticklabels(years)
93 | ax.legend()
94 |
95 | plt.show()
96 |
97 |
98 | # Parse the raw score data (the plotting function reverses the year order itself)
99 | henan_scores = parse_scores(henan_data)
100 | hlj_scores = parse_scores(hlj_data)
101 |
102 | # 绘制理科的组合条形图
103 | plot_combined_bar_chart(henan_scores, hlj_scores, '理科本科一批和本科二批分数线对比')
104 |
--------------------------------------------------------------------------------
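A minimal, self-contained illustration of the parsing idea in `parse_scores()` above: each cell is a `批次\n文科分\n理科分` string, so splitting on `\n` yields the batch name and the two scores. The `entry` below is the first Henan record from the data above, and the matching is simplified to exact batch names.

```python
entry = {'2023年': ['批次\n文科\n理科', '本科一批\n547\n514', '本科二批\n465\n409']}

scores = {'本科一批': {'文科': [], '理科': []}, '本科二批': {'文科': [], '理科': []}}
for cell in list(entry.values())[0]:
    batch, *nums = cell.split('\n')          # e.g. '本科一批', ['547', '514']
    if batch in scores and len(nums) == 2:
        scores[batch]['文科'].append(int(nums[0]))
        scores[batch]['理科'].append(int(nums[1]))

print(scores)
# {'本科一批': {'文科': [547], '理科': [514]}, '本科二批': {'文科': [465], '理科': [409]}}
```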
/第8章-JS补环境/抖音/分析移民问题/爬取分数线.py:
--------------------------------------------------------------------------------
1 | # 数据来源:高考网:https://college.gaokao.com/。中国教育在线:https://www.eol.cn/e_html/gk/fsx/index.shtml
2 | # 看着ui,感觉还是第二个靠谱一点,应该是官方的吧?我们也不需要近20年的数据,参考价值也不大
3 | import requests
4 | from pyquery import PyQuery as pq
5 | from pymongo import MongoClient
6 |
7 | headers = {
8 | 'authority': 'www.eol.cn',
9 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
10 | 'accept-language': 'zh-CN,zh;q=0.9',
11 | 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
12 | 'sec-ch-ua-mobile': '?0',
13 | 'sec-ch-ua-platform': '"Windows"',
14 | 'sec-fetch-dest': 'document',
15 | 'sec-fetch-mode': 'navigate',
16 | 'sec-fetch-site': 'none',
17 | 'sec-fetch-user': '?1',
18 | 'upgrade-insecure-requests': '1',
19 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
20 | }
21 |
22 | response = requests.get('https://www.eol.cn/e_html/gk/fsx/index.shtml', headers=headers)
23 |
24 | html = response.content.decode("utf-8")
25 | # print(html)
26 | doc1 = pq(html)
27 | # 首先明确我们需要什么?要采集的数据有:每个省份的名称,年份,每个批次招收的最低分数线
28 | # 提取数据
29 | province_items = doc1(".fsshowli.hascore") # 提取出所有的小方格,所有的省份
30 | # print(province_items)
31 | data = [] # 存储所有省份的数据
32 |
33 | for city_index, province in enumerate(province_items.items()):
34 | province_data = {}
35 | # 提取每个省份的数据
36 | province_data['city'] = province('.topline .city').text() # 提取省份名称
37 | # 这里得提取每个年份的数据,year:2023.score_line:['批次,分数线','普通本科批,463','艺术类本科,点击查看']
38 | # 其实筛选一下,本科一批,本科二批就可以了,但是爬取完后面再数据处理也可以,更多的扩展空间
39 | years = province('.sline .year').text().split(' ') # 提取年份名称
40 | score_data = [] # 存储每年数据的数组
41 | for year_index, line in enumerate(province('.tline div').items()):
42 | # 存储每一行的数据
43 | score_line = [tr.text() for tr in line('tr').items()]
44 | # 把这一行数据映射到当年
45 | score_data.append({years[year_index]: score_line})
46 | province_data['data'] = score_data # 把数据复制给对象
47 | print(province_data)
48 | data.append(province_data)
49 | print(data)
50 | # 由于刚才用到了mysql数据库,这个数据结构也不是很规则,这次我们练习一下mongodb数据库
51 | # 创建连接
52 | conn = MongoClient(host="localhost", port=27017)
53 | collection = conn.test.scoreline
54 | # 存储数据,没错,就一行命令
55 | collection.insert_many(data)
56 |
--------------------------------------------------------------------------------
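Once the scraper above has run, the documents sit in MongoDB exactly as printed: `{'city': ..., 'data': [{year: [rows]}]}`. A small sketch of reading one province back out with pymongo; it assumes the same local MongoDB instance, and '河南' is only an example of how the province name may appear on the site.

```python
from pymongo import MongoClient

conn = MongoClient(host="localhost", port=27017)
collection = conn.test.scoreline

doc = collection.find_one({"city": "河南"})  # look up one province by the name stored above
if doc:
    for year_block in doc["data"]:
        # each element looks like {'2023年': ['批次\n文科\n理科', '本科一批\n547\n514', ...]}
        print(year_block)
```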
/第8章-JS补环境/抖音/分析移民问题/爬取评论.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import random
3 | import time
4 |
5 | import requests
6 | # 固定格式,用于解决windows执行js文件输出汉字乱码问题
7 | from functools import partial # 锁定参数
8 | import subprocess
9 |
10 | subprocess.Popen = partial(subprocess.Popen, encoding="utf-8")
11 | import execjs # 此时再引入execjs的时候. 里面就可以自动使用你的subprocess.Popen
12 |
13 | from 存储到mysql数据库 import MyDatabase
14 |
15 | # Fetch a single, arbitrary page of comments; used mainly to test the signed request
16 | def fake_get_data():
17 | # time.sleep(random.randint(1, 3)) # 每次睡几秒
18 | cursor = 587 * 20
19 | count = 20
20 | params = f"device_platform=webapp&aid=6383&channel=channel_pc_web&aweme_id=7323036659463785791&cursor={cursor}&count={count}&item_type=0&insert_ids=&whale_cut_token=&cut_version=1&rcFT=&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1440&screen_height=900&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=120.0.0.0&browser_online=true&engine_name=Blink&engine_version=120.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=1.45&effective_type=3g&round_trip_time=300&webid=7322737813169997322&msToken=X-9sXWNcpTGrbIJ0La4G7SuEFTfKEvsl9OSplNoSfm-xiqor6oqsZI1HlDy9WSyXRxUP5HENnRfeXFtkPEiuf4WgmvrU1BujPNtJcg-kKZfoQNNAQQDoGg=="
21 | # 读取js代码
22 | with open('../接单/扣代码.js', mode='r', encoding='utf-8') as f:
23 | js_code = f.read()
24 |
25 | # 加载代码
26 | js = execjs.compile(js_code)
27 |
28 | # 执行js代码中的函数,传递url字符串
29 | X_Bogus = js.call("fn", params)
30 | params += f"&X-Bogus={X_Bogus}"
31 | response = requests.get(
32 | base_url + params,
33 | headers=headers,
34 | )
35 | print(response.json())
36 |
37 | # 开始爬取所有数据的函数
38 | def get_all_data():
39 | global page
40 | global total_page
41 | with open(filename, 'a', newline='', encoding='utf-8') as csvfile: # mode一定是a
42 | for item_id in item_id_list: # 遍历不同视频
43 | is_begin = True # 是不是刚开始
44 | is_continue = 0 # 还要不要继续爬
45 | page = 1 # 重置页数
46 | while is_begin or is_continue == 1: # 循环爬取
47 | time.sleep(random.randint(1, 3)) # 每次睡几秒
48 | cursor = page * 20
49 | count = 20
50 | params = f"device_platform=webapp&aid=6383&channel=channel_pc_web&aweme_id={item_id}&cursor={cursor}&count={count}&item_type=0&insert_ids=&whale_cut_token=&cut_version=1&rcFT=&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1440&screen_height=900&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=120.0.0.0&browser_online=true&engine_name=Blink&engine_version=120.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=1.45&effective_type=3g&round_trip_time=300&webid=7322737813169997322&msToken=X-9sXWNcpTGrbIJ0La4G7SuEFTfKEvsl9OSplNoSfm-xiqor6oqsZI1HlDy9WSyXRxUP5HENnRfeXFtkPEiuf4WgmvrU1BujPNtJcg-kKZfoQNNAQQDoGg=="
51 | # 读取js代码
52 | with open('../接单/扣代码.js', mode='r', encoding='utf-8') as f:
53 | js_code = f.read()
54 |
55 | # 加载代码
56 | js = execjs.compile(js_code)
57 |
58 | # 执行js代码中的函数,传递url字符串
59 | X_Bogus = js.call("fn", params)
60 | params += f"&X-Bogus={X_Bogus}"
61 | response = requests.get(
62 | base_url + params,
63 | headers=headers,
64 | )
65 | # print(response.json())
66 |                 # has_more marks whether more comments are available: 0 = no, 1 = yes
67 |                 is_continue = response.json()['has_more']
68 |                 if is_continue == 0:
69 |                     break  # this video has no more comments; move on to the next item_id
70 | comments = response.json()['comments']
71 | # 这里我们存储什么字段?
72 | # cid 【评论id?】 text:评论内容 digg_count:点赞数 reply_comment_total:评论回复数 nickname:用户昵称 ip_label:所在地域 create_time:创建时间
73 |                 # The list comprehension below keeps only the fields named in fields_ for each comment
74 | fields_ = ['cid', 'aweme_id', 'text', 'digg_count', 'reply_comment_total', 'nickname', 'ip_label',
75 | 'create_time']
76 | data = [{k: v for k, v in d.items() if k in fields_} for d in comments]
77 |
78 | print(f"当前爬取的是第{page}页数据,itemid为{item_id},具体数据是{data}")
79 | # 1:存储到csv文件:2:存储到mysql数据库
80 |
81 | # 写入CSV
82 | try:
83 | writer = csv.DictWriter(csvfile, fieldnames=data[0].keys())
84 | if is_begin:
85 | writer.writeheader()
86 | for comment in data:
87 | writer.writerow(comment)
88 | except Exception as e:
89 | print("写入csv文件时候出错了,错误信息是:", e)
90 |
91 |                 # Write to MySQL (note: a new connection is opened per page here; creating one MyDatabase outside the loop would be cheaper)
92 |                 my_database = MyDatabase()
93 |                 my_database.save_data(data)
94 | page += 1 # 更新页的参数
95 | total_page += 1 # 更新全部页的参数
96 | is_begin = False # 更新不是开始了
97 |
98 |
99 | if __name__ == '__main__':
100 | base_url = "https://www.douyin.com/aweme/v1/web/comment/list/?"
101 | headers = {
102 | 'authority': 'www.douyin.com',
103 | 'accept': 'application/json, text/plain, */*',
104 | 'accept-language': 'zh-CN,zh;q=0.9',
105 | 'cookie': 'ttwid=1%7C3RF4jSjUuHRykLjmCE0XvH22oXsLrJjAd_1rhwea81A%7C1704957772%7Cd5de9e3ac1a04f5f28de1740078ca4cde389bb9154032b370fd72447f2eef79b; dy_swidth=1440; dy_sheight=900; FORCE_LOGIN=%7B%22videoConsumedRemainSeconds%22%3A180%7D; ttcid=52612352430d49f998d97c8b38ae9b4736; s_v_web_id=verify_lr8vvkte_XEcA0Jys_0GbJ_4kkB_AryH_1UhpKssCsoj7; passport_csrf_token=52440d3ed22f14c0098d58eee34df89d; passport_csrf_token_default=52440d3ed22f14c0098d58eee34df89d; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Atrue%2C%22volume%22%3A0.5%7D; bd_ticket_guard_client_web_domain=2; download_guide=%223%2F20240111%2F0%22; passport_assist_user=CkGcEizHYrICAOhlighTQykMuNrH8FVFExZHqq6O8jY4q1ilhCa3qitzfdnSGu3p6Nf6hkmIiG7lRTqtKQ-12PL9-RpKCjzGNSUcJhoarxi4_C5pXOMzO0vo0ET2IEB3mNaXs61TxkF9o9_PDVa00RdALKvWE6spDmtCvf5TcazzrU8QorDGDRiJr9ZUIAEiAQMT9w93; n_mh=13zwzL9MgH4qiZZNvz6VgUrKf7141N7fMjaTafXwvSg; sso_uid_tt=9b8b28c3cf5c814d355a84ef25c89de2; sso_uid_tt_ss=9b8b28c3cf5c814d355a84ef25c89de2; toutiao_sso_user=1f1edda202f5d36d8185131094a4f82e; toutiao_sso_user_ss=1f1edda202f5d36d8185131094a4f82e; sid_ucp_sso_v1=1.0.0-KDIwZTYwODkxOGNmYTM1ZWZhM2Q2MzI2YTgyNDMzMmNhOThkMmFkZGQKHwjk0cCAiozlBhDn0v6sBhjvMSAMMNKEqJAGOAZA9AcaAmxxIiAxZjFlZGRhMjAyZjVkMzZkODE4NTEzMTA5NGE0ZjgyZQ; ssid_ucp_sso_v1=1.0.0-KDIwZTYwODkxOGNmYTM1ZWZhM2Q2MzI2YTgyNDMzMmNhOThkMmFkZGQKHwjk0cCAiozlBhDn0v6sBhjvMSAMMNKEqJAGOAZA9AcaAmxxIiAxZjFlZGRhMjAyZjVkMzZkODE4NTEzMTA5NGE0ZjgyZQ; passport_auth_status=1ffd90169b48b20fc8b94e7edbd9862a%2C; passport_auth_status_ss=1ffd90169b48b20fc8b94e7edbd9862a%2C; uid_tt=8090c4b439be6386ae8c25d643cd00f5; uid_tt_ss=8090c4b439be6386ae8c25d643cd00f5; sid_tt=69eeab161bc0c2fec010a37e04326d84; sessionid=69eeab161bc0c2fec010a37e04326d84; sessionid_ss=69eeab161bc0c2fec010a37e04326d84; LOGIN_STATUS=1; store-region=cn-hl; store-region-src=uid; _bd_ticket_crypt_doamin=2; _bd_ticket_crypt_cookie=9f92a67df42160bb656dd3dd86775d1a; __security_server_data_status=1; sid_guard=69eeab161bc0c2fec010a37e04326d84%7C1704962427%7C5183983%7CMon%2C+11-Mar-2024+08%3A40%3A10+GMT; sid_ucp_v1=1.0.0-KGY1YmVmOTA0ZmQ1ZGQxOGE0MTEzOTJhM2Q1YWFkMzY0MTEzY2E2MmUKGwjk0cCAiozlBhD70v6sBhjvMSAMOAZA9AdIBBoCaGwiIDY5ZWVhYjE2MWJjMGMyZmVjMDEwYTM3ZTA0MzI2ZDg0; ssid_ucp_v1=1.0.0-KGY1YmVmOTA0ZmQ1ZGQxOGE0MTEzOTJhM2Q1YWFkMzY0MTEzY2E2MmUKGwjk0cCAiozlBhD70v6sBhjvMSAMOAZA9AdIBBoCaGwiIDY5ZWVhYjE2MWJjMGMyZmVjMDEwYTM3ZTA0MzI2ZDg0; pwa2=%220%7C0%7C3%7C0%22; my_rd=2; EnhanceDownloadGuide=%221_1704966403_0_0_0_0%22; FOLLOW_LIVE_POINT_INFO=%22MS4wLjABAAAAJGPAk8yXuLh_sfymNjO_NFrV1fK1u09aBJxPgjJcdQ43_-Beejmxgaf2Mo-80NUp%2F1704988800000%2F0%2F1704966489201%2F0%22; douyin.com; xg_device_score=7.627371509122499; device_web_cpu_core=16; device_web_memory_size=8; architecture=amd64; stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1440%2C%5C%22screen_height%5C%22%3A900%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A16%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A7.5%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A100%7D%22; csrf_session_id=bccebc3fbe81580f677a46ad97cfcd99; strategyABtestKey=%221705122861.306%22; FOLLOW_NUMBER_YELLOW_POINT_INFO=%22MS4wLjABAAAAJGPAk8yXuLh_sfymNjO_NFrV1fK1u09aBJxPgjJcdQ43_-Beejmxgaf2Mo-80NUp%2F1705161600000%2F0%2F1705122861742%2F0%22; passport_fe_beating_status=true; __ac_nonce=065a21c4800cdd38db5e2; 
__ac_signature=_02B4Z6wo00f01eLlXCgAAIDClMqBZyPmRa3ixViAAB0dcnzK1sJ2X72-3s4aCoADPsuK7-LC1kGnREdKtLNa4HOH6XsmqdjaV6r05Wo2SaunYf3mmsndjs2t2IK61Xabo16db8WYPmUtLyI5a9; SEARCH_RESULT_LIST_TYPE=%22single%22; stream_player_status_params=%22%7B%5C%22is_auto_play%5C%22%3A0%2C%5C%22is_full_screen%5C%22%3A0%2C%5C%22is_full_webscreen%5C%22%3A0%2C%5C%22is_mute%5C%22%3A1%2C%5C%22is_speed%5C%22%3A1%2C%5C%22is_visible%5C%22%3A0%7D%22; IsDouyinActive=true; home_can_add_dy_2_desktop=%221%22; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCRnMwSWZRUTZUeXFrenpzbU1Zdk1CVHVDTllITVpXMW43L3VQVGRLWFlsbGRHMG4yaS9uT1ZJcXphVG9QVzhGVGFtT3oxeDZBSHQ0NURNSTdNYndhbUk9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoxfQ%3D%3D; publish_badge_show_info=%220%2C0%2C0%2C1705122897180%22; msToken=k3zUhX77Lp2s8G-gHCdqPiA5LTTTcJreSaGrlUdaQ4YloO3q2AzE_dduIMuOr7yKDyG4gZvsOtqIy2WOSekze7b-dr1YLHOvq0fUsjDWHqS3iTu8vwDUnw==; tt_scid=j8-WPk42cgZtl1U0R6P5p.c1xhG3O4ejJ7jVYiB5bPJK6dRjEVVyinMNdT1LkYQXbe95; odin_tt=704f9f9eb371476ddc6ee5a1b88d44542688ad53451b9e481979b521fb169beb67148c143f9abd0976e6ec6c4a3e8436; msToken=X-9sXWNcpTGrbIJ0La4G7SuEFTfKEvsl9OSplNoSfm-xiqor6oqsZI1HlDy9WSyXRxUP5HENnRfeXFtkPEiuf4WgmvrU1BujPNtJcg-kKZfoQNNAQQDoGg==',
106 | 'referer': 'https://www.douyin.com/search/%E5%A4%A7%E5%BA%86%E5%9B%9E%E5%BA%94%E6%B2%B3%E5%8D%97%E4%B8%AD%E8%80%83%E7%94%9F%E7%A7%BB%E6%B0%91?aid=65a94252-11ae-427d-a91d-f75f7a134446&publish_time=0&sort_type=0&source=recom_search&type=general',
107 | 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
108 | 'sec-ch-ua-mobile': '?0',
109 | 'sec-ch-ua-platform': '"Windows"',
110 | 'sec-fetch-dest': 'empty',
111 | 'sec-fetch-mode': 'cors',
112 | 'sec-fetch-site': 'same-origin',
113 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
114 | }
115 | # 7323036659463785791
116 | # 指定爬取的视频id列表
117 | item_id_list = ["7323036659463785791", "7323096163404565786", "7323115639957163279","7323051287136734479","7323066454583135498","7323054311179603238"]  # scrape the comments of these six videos
118 | # cursor = 40 # 偏移量
119 | # count = 20
120 | total_page = 1
121 | page = 1
122 | start_time = time.time() # 开始时间
123 | # 指定CSV文件名
124 | filename = 'comments.csv'
125 | try:
126 | get_all_data() # 爬取评论数据
127 | # fake_get_data()
128 | except Exception as e:
129 | print("爬取数据发生了异常,异常信息为", e)
130 | end_time = time.time()
131 |
132 | print(f"数据爬取完成,爬取{total_page}页数据,数据共{20 * total_page}条,花费时间为{int((end_time - start_time))}秒")
133 |
--------------------------------------------------------------------------------
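Both request functions above re-read and re-compile `扣代码.js` on every page. A small sketch of doing that work once and reusing the compiled context; `sign_params` is an illustrative name, and it assumes the same `fn(params)` entry point the file calls.

```python
from functools import partial
import subprocess

subprocess.Popen = partial(subprocess.Popen, encoding="utf-8")  # same Windows-encoding workaround as above
import execjs

# Compile the reverse-engineered JS once at import time
with open('../接单/扣代码.js', mode='r', encoding='utf-8') as f:
    _XBOGUS_JS = execjs.compile(f.read())

def sign_params(params: str) -> str:
    """Return the query string with the X-Bogus value appended, reusing the compiled JS context."""
    return params + "&X-Bogus=" + _XBOGUS_JS.call("fn", params)
```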
/第9章-APP逆向/代练通/Heroes.py:
--------------------------------------------------------------------------------
1 | from selenium.webdriver import Chrome
2 | from selenium.webdriver.common.by import By
3 | from selenium.webdriver.chrome.options import Options
4 |
5 | # This module scrapes the Honor of Kings hero list (hero names only)
6 | heroes = []
7 |
8 |
9 | def get_heros_order():
10 | opt = Options()
11 | opt.add_argument("--headless")
12 | opt.add_argument('--disable-gpu')
13 | opt.add_argument("--window-size=4000,1600") # 设置窗口大小
14 |
15 | driver = Chrome(options=opt)
16 | driver.get("https://pvp.qq.com/web201605/herolist.shtml")
17 | driver.implicitly_wait(10)
18 | lis = driver.find_elements(By.CSS_SELECTOR, ".herolist li")
19 | for li in lis:
20 | # 把数据存储到列表中
21 |         heroes.append(li.text)
22 |     driver.quit()  # release the headless browser once the hero names are collected
23 |
24 | get_heros_order()
25 |
--------------------------------------------------------------------------------
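`Heroes.py` runs `get_heros_order()` at import time, so downstream scripts only need to import the populated list (as 爬取数据.py later in this chapter does). A usage sketch; the printout is illustrative.

```python
from Heroes import heroes  # importing triggers the headless scrape above

print(f"共{len(heroes)}个英雄")
print(heroes[:5])  # the first few hero names as plain text
```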
/第9章-APP逆向/代练通/数据库操作.py:
--------------------------------------------------------------------------------
1 | import time
2 | from datetime import datetime
3 |
4 | import pymysql
5 |
6 | # 这个类是用来把数据存储到Mysql数据库的
7 | class MyDatabase:
8 |
9 | # 初始化数据库连接环境
10 | def __init__(self):
11 | self.db = pymysql.connect(host='localhost', user='root', password='123456', database='spidertestdb')
12 | self.cursor = self.db.cursor()
13 | self.create_table()
14 |
15 |     # The dailiantong_base table holds every raw order scraped; duplicates are allowed (cleaning happens in a separate table)
16 | def create_table(self):
17 | create_table_sql = """
18 | CREATE TABLE IF NOT EXISTS dailiantong_base (
19 | id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
20 | Title VARCHAR(10000) comment '标题',
21 | Price int comment '价格',
22 | Ensure1 int comment '安全保证金',
23 | Ensure2 int comment '效率保证金',
24 | TimeLimit int comment '时间限制',
25 | Creater VARCHAR(100) comment '发单人',
26 | Stamp DATETIME comment '发布时间',
27 | Zone VARCHAR(100) comment '游戏大区',
28 | UnitPrice int comment '单价',
29 | UserID VARCHAR(100) comment '发单人ID',
30 | SerialNo VARCHAR(100) comment '订单ID'
31 | )
32 | """
33 |         # The heroes_table holds orders scraped per hero (keyed by the hero search string)
34 | create_table_sql2="""
35 | CREATE TABLE IF NOT EXISTS heroes_table(
36 | id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
37 | hero VARCHAR(100) COMMENT '英雄名称',
38 | Title VARCHAR(10000) comment '标题',
39 | Price int comment '价格',
40 | Ensure1 int comment '安全保证金',
41 | Ensure2 int comment '效率保证金',
42 | TimeLimit int comment '时间限制',
43 | Creater VARCHAR(100) comment '发单人',
44 | Stamp DATETIME comment '发布时间',
45 | Zone VARCHAR(100) comment '游戏大区',
46 | UnitPrice int comment '单价',
47 | UserID VARCHAR(100) comment '发单人ID',
48 | SerialNo VARCHAR(100) comment '订单ID'
49 |
50 | )
51 | """
52 | self.cursor.execute(create_table_sql)
53 | self.cursor.execute(create_table_sql2)
54 | # 还有个表base_dailiantong用来装清洗过后的数据
55 |
56 |
57 | def save_data(self, datas):
58 | insert_sql = """
59 | INSERT INTO dailiantong_base (Title, Price, Ensure1, Ensure2, TimeLimit, Creater, Stamp, Zone, UnitPrice,UserID,SerialNo)
60 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s,%s,%s)
61 | """
62 | try:
63 | for data in datas["LevelOrderList"]:
64 | values = (
65 | data.get("Title", ""),
66 | data.get("Price", 0),
67 | data.get("Ensure1", 0),
68 | data.get("Ensure2", 0),
69 | data.get("TimeLimit", 0),
70 | data.get("Create", ""),
71 |                     # Stamp lags the real epoch by 255845872s, so add it back (fall back to "now" when the field is missing)
72 |                     datetime.fromtimestamp(data.get("Stamp", int(time.time()) - 255845872) + 255845872).strftime('%Y-%m-%d %H:%M:%S'),
73 | data.get("Zone", ""),
74 | data.get("UnitPrice", 0),
75 | data.get("UserID", ""),
76 | data.get("SerialNo", "")
77 | )
78 | self.cursor.execute(insert_sql, values)
79 | self.db.commit()
80 | except Exception as e:
81 | print("插入数据时候出错了:", e)
82 | self.db.rollback()
83 |
84 | # 把英雄数据保存到数据库中
85 | def save_heroes_data(self, datas,search_str):
86 | insert_sql = """
87 | INSERT INTO heroes_table (hero,Title, Price, Ensure1, Ensure2, TimeLimit, Creater, Stamp, Zone, UnitPrice,UserID,SerialNo)
88 | VALUES (%s,%s, %s, %s, %s, %s, %s, %s, %s, %s,%s,%s)
89 | """
90 | try:
91 | for data in datas["LevelOrderList"]:
92 | values = (
93 | search_str,
94 | data.get("Title", ""),
95 | data.get("Price", 0),
96 | data.get("Ensure1", 0),
97 | data.get("Ensure2", 0),
98 | data.get("TimeLimit", 0),
99 | data.get("Create", ""),
100 |                     # Stamp lags the real epoch by 255845872s, so add it back (fall back to "now" when the field is missing)
101 |                     datetime.fromtimestamp(data.get("Stamp", int(time.time()) - 255845872) + 255845872).strftime('%Y-%m-%d %H:%M:%S'),
102 | data.get("Zone", ""),
103 | data.get("UnitPrice", 0),
104 | data.get("UserID", ""),
105 | data.get("SerialNo", "")
106 | )
107 | self.cursor.execute(insert_sql, values)
108 | self.db.commit()
109 | except Exception as e:
110 | print("插入数据时候出错了:", e)
111 | self.db.rollback()
112 |
113 | # 关闭数据库连接
114 | def close(self):
115 | self.cursor.close()
116 | self.db.close()
117 |
--------------------------------------------------------------------------------
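With `heroes_table` filled by the per-hero scraping in 爬取数据.py below, a quick aggregate query gives the order count and average price per hero. A sketch using the same local MySQL credentials as `MyDatabase` above; adjust them to your own setup.

```python
import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456', database='spidertestdb')
with db.cursor() as cursor:
    cursor.execute("""
        SELECT hero, COUNT(*) AS order_cnt, AVG(Price) AS avg_price
        FROM heroes_table
        GROUP BY hero
        ORDER BY order_cnt DESC
        LIMIT 10
    """)
    for hero, order_cnt, avg_price in cursor.fetchall():
        print(hero, order_cnt, round(float(avg_price), 2))
db.close()
```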
/第9章-APP逆向/代练通/爬取数据.py:
--------------------------------------------------------------------------------
1 | import pprint
2 | import time
3 | import requests
4 | from urllib.parse import parse_qs, urlparse, parse_qsl
5 | from hashlib import md5
6 |
7 | from 数据库操作 import MyDatabase
8 | from Heroes import heroes # 会自动运行Heroes里面的代码
9 |
10 | """
11 | 这个逆向其实并不难,但是为了对代练通的尊重,我还是该hook就hook,其实就是一个简单的请求参数+时间戳+signkey的md5加密,作为app逆向的入门案例还是挺不错的。
12 | 主要的思路就是逆向出来sign,把代练通app反编译出来搜索LevelOrderList,话说代练通是DCloud旗下的吗,还是外包给DCloud了,怎么包名都不是代练通。
13 | """
14 |
15 |
16 | # 这些参数的含义分别是,分页索引,分页长度,接单状态(1普通接单,9已被接单),平台(1安卓全区,0安卓ios都有),查询字符串
17 | def get_logined_data(page_index, page_size, pub_type, pg_type, search_str):
18 | # 伪造时间戳
19 | time_stamp = int(time.time())
20 | # 准备url
21 |     UserID = 11340745  # With a real UserID the server validates the matching UserToken (hooked out of the app via getUserToken); both are hard-coded below
22 | url = f"https://server.dailiantong.com.cn/API/APPService.ashx?Action=LevelOrderList&IsPub={pub_type}&GameID=107&ZoneID=0&ServerID=0&SearchStr={search_str}&Sort_Str=&PageIndex={page_index}&PageSize={page_size}&Price_Str=&PubCancel=0&SettleHour=0&FilterType=0&PGType={pg_type}&Focused=-1&STier=&ETier=&Score1=0&Score2=0&UserID={UserID}&TimeStamp={time_stamp}&Ver=1.0&AppOS=Android&AppID=DLTAndroid&AppVer=4.3.1&Sign=3d19d7bfd9b74e4dc6c913105ed3bf88"
23 | base_url = "https://server.dailiantong.com.cn/API/APPService.ashx" # 这个是发请求的url
24 |
25 | query = urlparse(url).query # 提取出查询字符串
26 | params = dict(parse_qsl(query, keep_blank_values=True)) # 把查询参数转成字典方便处理
27 | signKey = "9c7b9399680658d308691f2acad58c0a" # app里面的salt
28 | UserToken = "F92BDB2597884E0A8547CC5907E932BE" # UserToken,这个没有就不能有UserID,UserID也得用那个0的
29 |     # Trade-off: unauthenticated requests return less accurate data, while logged-in requests return fewer records; choose whichever suits you
30 | # 获取用于加密md5的字典
31 | value_dict = dict(parse_qsl(query[:query.rfind("&")]))
32 | ValueStr = "" # 模仿加密时的拼接字符串
33 | # 循环读取
34 | for key, value in value_dict.items():
35 | ValueStr += value
36 |
37 | sign = md5((signKey + ValueStr + UserToken).encode("utf-8")).hexdigest() # 获取sign值
38 | params['Sign'] = sign # 拼接到查询参数里面
39 | headers = {
40 | "User-Agent": "Dalvik/2.1.0 (Linux; U; Android 9; ASUS_I003DD Build/PI)",
41 | "Host": "server.dailiantong.com.cn",
42 | }
43 | # 发送请求
44 | res = requests.get(base_url, headers=headers, params=params)
45 | return res.json()
46 |
47 |
48 | # 这个函数是不登录的时候发请求
49 | # 这些参数的含义分别是,分页索引,分页长度,接单状态(1普通接单,9已被接单),平台(1安卓全区,0安卓ios都有),查询字符串
50 | def get_data(page_index, page_size, pub_type, pg_type, search_str):
51 | # 伪造时间戳
52 | time_stamp = int(time.time())
53 | # 准备url
54 | UserID = 0 # 这个使用UserID的时候会从本地里面拿出来token校验,这里我们没有把token拿出来,所以就不搞这个了。如果想的话,去hook getUserToken这个方法
55 | url = f"https://server.dailiantong.com.cn/API/APPService.ashx?Action=LevelOrderList&IsPub={pub_type}&GameID=107&ZoneID=0&ServerID=0&SearchStr={search_str}&Sort_Str=&PageIndex={page_index}&PageSize={page_size}&Price_Str=&PubCancel=0&SettleHour=0&FilterType=0&PGType={pg_type}&Focused=-1&STier=&ETier=&Score1=0&Score2=0&UserID={UserID}&TimeStamp={time_stamp}&Ver=1.0&AppOS=Android&AppID=DLTAndroid&AppVer=4.3.1&Sign=3d19d7bfd9b74e4dc6c913105ed3bf88"
56 | base_url = "https://server.dailiantong.com.cn/API/APPService.ashx" # 这个是发请求的url
57 |
58 | query = urlparse(url).query # 提取出查询字符串
59 | params = dict(parse_qsl(query, keep_blank_values=True)) # 把查询参数转成字典方便处理
60 | signKey = "9c7b9399680658d308691f2acad58c0a" # app里面的salt
61 |     # Trade-off: unauthenticated requests return less accurate data, while logged-in requests return fewer records; choose whichever suits you
62 | # 获取用于加密md5的字典
63 | value_dict = dict(parse_qsl(query[:query.rfind("&")]))
64 | ValueStr = "" # 模仿加密时的拼接字符串
65 | # 循环读取
66 | for key, value in value_dict.items():
67 | ValueStr += value
68 |
69 | sign = md5((signKey + ValueStr).encode("utf-8")).hexdigest() # 获取sign值
70 | params['Sign'] = sign # 拼接到查询参数里面
71 | headers = {
72 | "User-Agent": "Dalvik/2.1.0 (Linux; U; Android 9; ASUS_I003DD Build/PI)",
73 | "Host": "server.dailiantong.com.cn",
74 | "Cookie": "SERVERID=e99d03716a9aa7fd702a811546645e6b|1700577296|1700576638; SERVERCORSID=e99d03716a9aa7fd702a811546645e6b|1700577296|1700576638"
75 | # cookie带不带都行
76 | }
77 | # 发送请求
78 | res = requests.get(base_url, headers=headers, params=params)
79 | return res.json()
80 |
81 |
82 | def get_all_data(pub_type, pg_type):
83 | count = 0
84 | # 创建类实例
85 | my_database = MyDatabase()
86 | start_time = time.time()
87 | for i in range(0, 2000): # 爬取前2000页的
88 | res = get_data(i + 1, 20, pub_type, pg_type, "")
89 | # 存储数据
90 | count += len(res["LevelOrderList"])
91 | print(f"第{i + 1}页,爬取数据{res['RecordCount']}条")
92 | if len(res["LevelOrderList"]) == 0:
93 | break
94 | my_database.save_data(res)
95 | time.sleep(5) # 每次睡5秒
96 | my_database.close()
97 | print("----------------------------------------")
98 | print(f"本次数据爬取完成,共爬取{count}条数据,花费{time.time() - start_time}秒")
99 |
100 |
101 | def get_data_by_search_str(search_str):
102 | count = 0
103 | # 创建类实例
104 | my_database = MyDatabase()
105 | start_time = time.time()
106 | for i in range(0, 2000):
107 | res = get_logined_data(i + 1, 20, 1, 0, search_str)
108 | # 存储数据
109 | count += len(res["LevelOrderList"])
110 | print(f"第{i + 1}次请求,总数据还有{res['RecordCount']}条")
111 |
112 | if len(res["LevelOrderList"]) == 0:
113 | break
114 | my_database.save_heroes_data(res, search_str)
115 | time.sleep(5) # 每次睡5秒
116 | my_database.close()
117 | print(f"本次数据爬取完成,共爬取{count}条数据,花费{time.time() - start_time}秒")
118 | print("----------------------------------------------------")
119 | return count
120 |
121 |
122 | if __name__ == '__main__':
123 | """
124 |     一共27个字段,其实我主要想分析的是单价,但是没有真机,用模拟器升一下新版本直接闪退,也不知道UnitPrice这个字段怎么才能有值,感觉是查询参数这里设置的,
125 | 但是也抓不到包,很烦。因此这里就简单分析一下,以后有真机了再重新搞这个。还有这个筛选巅峰赛荣耀战力功能,是通过levelType实现的,但是也很奇怪,筛选出来的根本不对。
126 | 这里就简单一点,只提取:
127 | 标题(Title)、价格(Price)、安全保证金(Ensure1)、效率保证金(Ensure2)、时间限制(TimeLimit)、发单人(Create)、发布时间(Stamp)、游戏大区(Zone)
128 | ! Stamp比真实时间戳少了255845872秒,到时候记得加上去。加上三个字段 单价UnitPrice、发单者ID(UserID)、订单号(SerialNo)
129 | 第二个分析的点是指定英雄的所有订单
130 | 2023-11-21:update 这次我把UserToken和UserID加上去了,用来爬取指定英雄的所有订单(不登录好东西不让看)。
131 | 新的表字段我还是用的原来的字段,只不过加了一个hero方便分组
132 |
133 |
134 | """
135 | # get_all_data(1,1) # 这里爬取未被抢的订单,安卓的,失误
136 | # get_all_data(9,1) # 这里爬取已被抢的订单,安卓的,失误
137 | # 这里爬取所有订单(包括安卓和ios)
138 | # get_all_data(1, 0)
139 | # 2023-11-28日继续爬取,4361条数据,共7688条有效数据 记录表为dailiantong_base id6224->10584,实际时间2023-11-27 23:10 订单时间:2024-01-04 11:10:14。TODO:矫正时间
140 | # 2023-12-06日继续爬取,2800条数据,共10488条有效数据 记录表为dailiantong_base id10584->13384。
141 | # 2024-01-01日继续爬取,5041条数据,共15529条有效数据 记录表为dailiantong_base id13384->18425。
142 |
143 |
144 | # 这里对每一个英雄进行搜索爬取
145 | # 2023-11-28日继续爬取,2428条数据,共4542条有效数据,记录表为heros_table id2114->4542
146 | # 2023-12-06日继续爬取,1620条数据,共6162条有效数据,记录表为heros_table id4542->6162
147 | # 这里时间也不是重点分析的点,就不再分析了,感觉应该是时间戳+一定的offset+代练时限。有点莫名其妙
148 | total_count = 0
149 | start_time = time.time()
150 | for index, hero in enumerate(heroes):
151 | print(f"第{index + 1}个英雄,英雄是{hero}")
152 | count = get_data_by_search_str(hero)
153 | total_count += count
154 |
155 | print(f"总数据爬取完成,共爬取{total_count}条数据,耗时{time.time() - start_time}秒")
156 |
--------------------------------------------------------------------------------
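`get_logined_data` and `get_data` above duplicate the same signing logic: md5(signKey + the concatenated parameter values with Sign excluded + optional UserToken). A compact helper extracted from that logic as a sketch; `make_sign` is an illustrative name, and it assumes the Sign parameter is the last one in the URL, as in both functions above.

```python
from hashlib import md5
from urllib.parse import parse_qsl, urlparse

def make_sign(url: str, sign_key: str, user_token: str = "") -> str:
    """md5(signKey + every parameter value in order (Sign excluded) + UserToken), mirroring the functions above."""
    query = urlparse(url).query
    value_dict = dict(parse_qsl(query[:query.rfind("&")]))  # everything before the trailing &Sign=...
    value_str = "".join(value_dict.values())
    return md5((sign_key + value_str + user_token).encode("utf-8")).hexdigest()

# Usage sketch for the logged-in variant:
# params['Sign'] = make_sign(url, "9c7b9399680658d308691f2acad58c0a", UserToken)
```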