├── .gitattributes
├── .gitignore
├── B站模拟扫码登录
│   └── demo.py
├── README.md
├── requirements.txt
├── 下载小鹅通视频
│   ├── 2021年12月
│   │   ├── 1.前置知识
│   │   │   ├── 1.AES-CBC解密
│   │   │   │   ├── CBC解密.py
│   │   │   │   ├── after.ts
│   │   │   │   └── before.ts
│   │   │   ├── 2.m3u8解析
│   │   │   │   ├── demo.m3u8
│   │   │   │   └── parse.py
│   │   │   ├── 3.HTML注入JS
│   │   │   │   ├── after.html
│   │   │   │   ├── before.html
│   │   │   │   └── demo.py
│   │   │   └── 4.ffmpeg合成ts视频
│   │   │       ├── demo.py
│   │   │       ├── ffmpeg.exe
│   │   │       ├── file.txt
│   │   │       ├── out.mp4
│   │   │       └── 素材
│   │   │           ├── 1.ts
│   │   │           ├── 10.ts
│   │   │           ├── 11.ts
│   │   │           ├── 12.ts
│   │   │           ├── 13.ts
│   │   │           ├── 14.ts
│   │   │           ├── 15.ts
│   │   │           ├── 2.ts
│   │   │           ├── 3.ts
│   │   │           ├── 4.ts
│   │   │           ├── 5.ts
│   │   │           ├── 6.ts
│   │   │           ├── 7.ts
│   │   │           ├── 8.ts
│   │   │           └── 9.ts
│   │   ├── 2.自动合并版本
│   │   │   ├── ffmpeg.exe
│   │   │   ├── request_demo.py
│   │   │   ├── requirements.txt
│   │   │   ├── selenium启动
│   │   │   │   ├── chromedriver.exe
│   │   │   │   ├── selenium_start.py
│   │   │   │   └── 谷歌驱动下载地址.txt
│   │   │   ├── 启动程序指令.txt
│   │   │   └── 安装环境指令.txt
│   │   └── 3.手动合并版本
│   │       ├── ffmpeg.exe
│   │       ├── request_demo.py
│   │       ├── requirements.txt
│   │       ├── selenium启动
│   │       │   ├── chromedriver.exe
│   │       │   ├── selenium_start.py
│   │       │   └── 谷歌驱动下载地址.txt
│   │       ├── 启动程序指令.txt
│   │       └── 安装环境指令.txt
│   └── 2022年12月
│       └── 1.自动合并版本
│           ├── N_m3u8DL-CLI_v3.0.2.exe
│           ├── ffmpeg.exe
│           ├── request_demo.py
│           ├── requirements.txt
│           ├── selenium启动
│           │   ├── chromedriver.exe
│           │   ├── selenium_start.py
│           │   └── 谷歌驱动下载地址.txt
│           ├── 启动程序指令.txt
│           └── 安装环境指令.txt
├── 下载荔枝微课
│   ├── ffmpeg.exe
│   ├── request_demo.py
│   ├── requirements.txt
│   ├── selenium启动
│   │   ├── chromedriver.exe
│   │   ├── selenium_start.py
│   │   └── 谷歌驱动下载地址.txt
│   ├── 启动程序指令.txt
│   └── 安装环境指令.txt
├── 京东商品信息
│   └── crawl.py
├── 房天下
│   ├── crawl.py
│   └── db.py
├── 新版QQ音乐
│   ├── README.md
│   ├── crawl.py
│   ├── db.py
│   ├── demo.py
│   └── get_sign.js
├── 旧版QQ音乐(仍可用)
│   ├── README.md
│   ├── crawl.py
│   ├── db.py
│   └── demo.py
├── 有道翻译
│   └── crawl.py
├── 构建代理池
│   ├── crawl.py
│   └── ip_pool.json
├── 百度图片
│   └── crawl.py
├── 破解有道翻译
│   └── crawl.py
├── 破解网易登录
│   ├── crawl.py
│   ├── pw.js
│   └── rtid.js
└── 豆瓣读书
    ├── 入库版
    │   ├── book.py
    │   └── boook_db.py
    └── 分类实现版
        ├── requirements.txt
        ├── 【bs4实现】豆瓣读书爬虫.py
        ├── 【re实现】豆瓣读书爬虫.py
        └── 【xpath实现】豆瓣读书爬虫.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=python
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 |
--------------------------------------------------------------------------------
/B站模拟扫码登录/demo.py:
--------------------------------------------------------------------------------
1 | # Python 3.7
2 | # encoding=utf-8
3 | 
4 | import requests, time, json, os
5 | import qrcode  # generates the QR code
6 | import cv2 as cv  # displays the QR code image
7 | from concurrent.futures import ThreadPoolExecutor
8 | 
9 | '''
10 | Third-party dependencies:
11 | pip install qrcode==7.3
12 | pip install opencv-python==4.5.3.56
13 | '''
14 | 
15 | headers = {
16 |     'referer': 'https://passport.bilibili.com/login',
17 |     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
18 |     'x-requested-with': 'XMLHttpRequest'
19 | }
20 | 
21 | class Login():
22 | 
23 |     def __init__(self):
24 |         self.oauthKey = ''
25 |         self.qrcodeURL = ''
26 |         self.session = requests.Session()
27 |         self.session.headers = headers
28 | 
29 |     # Fetch the QR code login URL
30 |     def getQRcode(self):
31 | 
32 |         html = self.session.get('https://passport.bilibili.com/qrcode/getLoginUrl')
33 |         if html.json()['status'] == True:
34 |             self.oauthKey = html.json()['data']['oauthKey']
35 |             self.qrcodeURL = html.json()['data']['url']
36 |             return True
37 |         return False
38 | 
39 |     # Render the QR code and display it with OpenCV
40 |     @staticmethod
41 |     def showQRCode(url):
42 |         qrCode = qrcode.QRCode()
43 |         qrCode.add_data(url)
44 |         qrCode = qrCode.make_image()
45 |         qrCode.save('qrCode.png')
46 |         img = cv.imread('qrCode.png', 1)
47 |         cv.imshow('Login', img)
48 |         cv.waitKey()
49 | 
50 |     # Run the login flow
51 |     def login(self):
52 | 
53 |         # Show the QR code image on a second thread
54 |         thread_pool = ThreadPoolExecutor(max_workers=2)
55 |         if self.getQRcode():
56 |             thread_pool.submit(self.showQRCode, self.qrcodeURL)
57 | 
58 |         # Poll until the QR code login is confirmed
59 |         while True:
60 |             time.sleep(1)
61 |             data = {
62 |                 'oauthKey': self.oauthKey,
63 |                 'gourl': 'https://www.bilibili.com/'
64 |             }
65 | 
66 |             html = self.session.post('https://passport.bilibili.com/qrcode/getLoginInfo', headers=headers, data=data)
67 | 
68 |             if html.json()['data'] == -4:    # not scanned yet
69 |                 pass
70 |             elif html.json()['data'] == -2:  # QR code expired, generate a new one
71 |                 self.getQRcode()
72 |                 thread_pool.submit(self.showQRCode, self.qrcodeURL)
73 |             elif html.json()['data'] == -5:  # scanned, waiting for confirmation
74 |                 pass
75 |             else:                            # confirmed: 'data' is now a dict carrying the redirect URL
76 |                 break
77 | 
78 |         # Parse the cookies out of the redirect URL
79 |         cookieRaw = html.json()['data']['url'].split('?')[1].split('&')
80 |         cookies = {}
81 |         for cookie in cookieRaw:
82 |             key, value = cookie.split('=')
83 |             if key != 'gourl' and key != 'Expires':
84 |                 cookies[key] = value
85 |         print(json.dumps(cookies))
86 |         os._exit(0)  # hard exit so the blocking OpenCV window thread does not hang shutdown
87 | 
88 | if __name__ == '__main__':
89 |     login = Login()
90 |     login.login()
91 | 
--------------------------------------------------------------------------------
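
The cookie parsing above splits the redirect URL by hand, which breaks if a value is URL-encoded or ever contains an extra '='. A minimal sketch of the same step using only the standard library (the redirect format is the one demo.py receives; the function name is illustrative):

from urllib.parse import urlsplit, parse_qsl

def cookies_from_redirect(url):
    # The confirmed-login response carries the cookies as query parameters
    # on the redirect URL, alongside bookkeeping keys like gourl and Expires
    pairs = parse_qsl(urlsplit(url).query)
    # Keep everything except the non-cookie parameters, as demo.py does
    return {k: v for k, v in pairs if k not in ('gourl', 'Expires')}
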
/README.md:
--------------------------------------------------------------------------------
1 | # **Python3Webcrawler**
2 | ## **[Bilibili author: -相依-](https://space.bilibili.com/343154012)** **UPDATE 2023-02-15**
3 | 
4 | ---
5 | 
6 | > **A hand-picked set of crawlers to build your fundamentals before learning the Scrapy framework.**
7 | >> **This project is for learning and discussion only. Do not use it for commercial purposes; in case of infringement, contact us for removal.**
8 | 
9 | ---
10 | 
11 | |**Runtime**|**Version used**|
12 | |:----:|:--------:|
13 | |**python**|**3.7.9**|
14 | |**NodeJS**|**14.6.0**|
15 | 
16 | ---
17 | 
18 | |**Dependency**|**Install command**|**Version used**|
19 | |:----:|:--------:|:--------:|
20 | |**lxml**|**pip install lxml**|**4.6.2**|
21 | |**aiohttp**|**pip install aiohttp**|**3.7.4**|
22 | |**requests**|**pip install requests**|**2.25.1**|
23 | |**PyExecJS**|**pip install PyExecJS**|**1.5.1**|
24 | |**sqlalchemy**|**pip install sqlalchemy**|**1.3.23**|
25 | |**beautifulsoup4**|**pip install beautifulsoup4**|**4.9.3**|
26 | |**mysqlconnector**|**pip install mysql-connector-python**|**8.0.23**|
27 | |**qrcode**|**pip install qrcode**|**7.3**|
28 | |**opencv-python**|**pip install opencv-python**|**4.5.3.56**|
29 | |**m3u8**|**pip install m3u8**|**0.9.0**|
30 | |**mitmproxy**|**pip install mitmproxy**|**5.3.0**|
31 | |**selenium**|**pip install selenium**|**3.141.0**|
32 | |**pycryptodome**|**pip install pycryptodome**|**3.10.1**|
33 | ---
34 | 
35 | * ### **京东 (JD) [official site](https://item.jd.com)**
36 | * ### **网易 (NetEase) [official site](https://www.163.com/)**
37 | * ### **房天下 (Fang.com) [official site](https://www.fang.com)**
38 | * ### **快代理 (Kuaidaili) [official site](https://www.kuaidaili.com)**
39 | * ### **QQ音乐 (QQ Music) [official site](https://y.qq.com)**
40 | * ### **百度图片 (Baidu Images) [official site](https://image.baidu.com)**
41 | * ### **豆瓣读书 (Douban Books) [official site](https://book.douban.com)**
42 | * ### **有道翻译 (Youdao Translate) [official site](http://fanyi.youdao.com)**
43 | * ### **哔哩哔哩 (Bilibili) [official site](https://bilibili.com)**
44 | * ### **小鹅通 (Xiaoe-tech) [official site](https://www.xiaoe-tech.com)**
45 | * ### **荔枝微课 (Lizhi Weike) [official site](https://m.lizhiweike.com)**
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.7.4
2 | async-timeout==3.0.1
3 | attrs==20.3.0
4 | beautifulsoup4==4.9.3
5 | certifi==2020.12.5
6 | chardet==3.0.4
7 | idna==2.10
8 | lxml==4.6.2
9 | multidict==5.1.0
10 | mysql-connector-python==8.0.23
11 | protobuf==3.13.0
12 | PyExecJS==1.5.1
13 | requests==2.25.1
14 | six==1.15.0
15 | soupsieve==2.2
16 | SQLAlchemy==1.3.23
17 | typing-extensions==3.7.4.3
18 | urllib3==1.26.3
19 | yarl==1.6.3
20 | qrcode==7.3
21 | opencv-python==4.5.3.56
22 | selenium==3.141.0
23 | m3u8==0.9.0
24 | mitmproxy==5.3.0
25 | pycryptodome==3.10.1
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/CBC解密.py:
--------------------------------------------------------------------------------
1 | from Crypto.Cipher import AES
2 | 
3 | # Cipher mode
4 | mode = AES.MODE_CBC
5 | 
6 | key = b'V\x9dH\x1e:\xe6g\x10\x11l\xd7\xab\xd5\xd3\xc1\xbc'
7 | 
8 | '''
9 | Build the decryption object
10 | key: the AES key
11 | mode: the cipher mode
12 | iv: the initialization vector
13 | '''
14 | cryptos = AES.new(key=key, mode=mode, iv=b'0000000000000000')
15 | 
16 | with open('before.ts', 'rb') as f:        # encrypted input
17 |     with open('after.ts', 'wb') as f2:    # decrypted output
18 |         f2.write(cryptos.decrypt(f.read()))
19 | 
--------------------------------------------------------------------------------
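
Before running a key over a whole .ts file, it is worth sanity-checking the key/iv handling with a self-contained round trip. A minimal sketch using the same pycryptodome API as CBC解密.py (the key below is a throwaway value, not the one above):

from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad

key = b'0123456789abcdef'   # throwaway 16-byte key (AES-128)
iv = b'\x00' * 16           # zero IV, as the EXT-X-KEY lines in demo.m3u8 advertise

plaintext = b'hello ts segment'
ct = AES.new(key, AES.MODE_CBC, iv=iv).encrypt(pad(plaintext, AES.block_size))

# CBC cipher objects keep chaining state, so build a fresh one to decrypt
pt = unpad(AES.new(key, AES.MODE_CBC, iv=iv).decrypt(ct), AES.block_size)
assert pt == plaintext
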
/下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/after.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/after.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/before.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/before.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/2.m3u8解析/demo.m3u8:
--------------------------------------------------------------------------------
1 | #EXTM3U
2 | #EXT-X-VERSION:3
3 | #EXT-X-TARGETDURATION:11
4 | #EXT-X-MEDIA-SEQUENCE:0
5 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
6 | #EXTINF:2.000000,
7 | v.f230.ts?start=0&end=68063&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
8 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
9 | #EXTINF:2.000000,
10 | v.f230.ts?start=68064&end=130671&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
11 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
12 | #EXTINF:1.708333,
13 | v.f230.ts?start=130672&end=190847&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
14 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
15 | #EXTINF:2.000000,
16 | v.f230.ts?start=190848&end=281471&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
17 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
18 | #EXTINF:2.000000,
19 | v.f230.ts?start=281472&end=369471&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
20 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
21 | #EXTINF:2.000000,
22 | v.f230.ts?start=369472&end=457647&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
23 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
24 | #EXTINF:2.000000,
25 | v.f230.ts?start=457648&end=742095&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
26 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
27 | #EXTINF:4.291667,
28 | v.f230.ts?start=742096&end=1186719&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
29 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
30 | #EXTINF:4.000000,
31 | v.f230.ts?start=1186720&end=1413087&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
32 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
33 | #EXTINF:4.000000,
34 | v.f230.ts?start=1413088&end=1776687&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
35 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
36 | #EXTINF:5.791667,
37 | v.f230.ts?start=1776688&end=2031631&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
38 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
39 | #EXTINF:6.000000,
40 | v.f230.ts?start=2031632&end=2294271&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
41 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
42 | #EXTINF:6.000000,
43 | v.f230.ts?start=2294272&end=2535679&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
44 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
45 | #EXTINF:10.208333,
46 | v.f230.ts?start=2535680&end=3179583&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
47 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
48 | #EXTINF:9.666667,
49 | v.f230.ts?start=3179584&end=3695279&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
50 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
51 | #EXTINF:10.000000,
52 | v.f230.ts?start=3695280&end=3994207&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
53 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
54 | #EXTINF:10.750000,
55 | v.f230.ts?start=3994208&end=4735695&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
56 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
57 | #EXTINF:9.333333,
58 | v.f230.ts?start=4735696&end=5240671&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
59 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
60 | #EXTINF:9.583333,
61 | v.f230.ts?start=5240672&end=5551439&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
62 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
63 | #EXTINF:10.416667,
64 | v.f230.ts?start=5551440&end=5820671&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
65 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
66 | #EXTINF:6.416667,
67 | v.f230.ts?start=5820672&end=5890239&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
68 | #EXT-X-ENDLIST
69 |
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/2.m3u8解析/parse.py:
--------------------------------------------------------------------------------
1 | import m3u8
2 | 
3 | '''
4 | m3u8 documentation: https://github.com/globocom/m3u8
5 | '''
6 | with open(r'demo.m3u8', 'r', encoding='utf-8') as f:
7 | 
8 |     # Parse the m3u8 playlist
9 |     dict_data = m3u8.parse(f.read())
10 |     print(dict_data)
11 | 
12 |     # Inspect the available keys
13 |     # print(dict_data.keys())
14 | 
15 |     # Iterate over the m3u8 segment URIs
16 |     # for data in dict_data['segments']:
17 |     #     print(data['uri'])
18 |     #     start = data['uri'].split('?')[1].split('&')[0]
19 |     #     end = data['uri'].split('?')[1].split('&')[1]
20 |     #     print(start + end)
21 | 
22 | 
23 |     # Iterate over the m3u8 key (encryption) URIs
24 |     # for data in dict_data['keys']:
25 |     #     print(data['uri'])
26 | 
--------------------------------------------------------------------------------
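
The segment URIs in demo.m3u8 are relative (v.f230.ts?start=...), so before downloading they must be joined against the URL the playlist itself was fetched from. A sketch with urllib.parse.urljoin (the base URL here is a placeholder):

import m3u8
from urllib.parse import urljoin

M3U8_URL = 'https://example.com/path/playlist.m3u8'  # placeholder playlist URL

with open('demo.m3u8', 'r', encoding='utf-8') as f:
    dict_data = m3u8.parse(f.read())

# Relative segment URIs resolve against the playlist's own URL
segment_urls = [urljoin(M3U8_URL, seg['uri']) for seg in dict_data['segments']]
print(segment_urls[0])
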
/下载小鹅通视频/2021年12月/1.前置知识/3.HTML注入JS/after.html:
--------------------------------------------------------------------------------
[markup lost in extraction: demo.py's output page, titled 模拟多个script节点, re-serialized with soup.prettify() after a <script type="text/javascript">alert('靓仔')</script> node was appended after the last existing script node]
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/3.HTML注入JS/before.html:
--------------------------------------------------------------------------------
[markup lost in extraction: the input page, titled 模拟多个script节点 ("simulate multiple script nodes"), containing several empty <script> nodes for demo.py to inject after]
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/3.HTML注入JS/demo.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | 
3 | '''
4 | BeautifulSoup "modifying the tree" docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id45
5 | '''
6 | 
7 | with open('before.html', 'r', encoding='utf-8') as f:
8 | 
9 |     soup = BeautifulSoup(f.read(), 'lxml')
10 | 
11 | 
12 |     # Create a new HTML script node
13 |     script_tag = soup.new_tag('script', type='text/javascript')
14 |     script_tag.string = "alert('靓仔')"
15 |     # print(script_tag)
16 | 
17 |     # Take the last script node and insert the new one after it
18 |     print('[before insert] last node: {}'.format(soup.select('script')[-1]))
19 |     soup.select('script')[-1].insert_after(script_tag)
20 |     print('[after insert] last node: {}'.format(soup.select('script')[-1]))
21 | 
22 | with open('after.html', 'w', encoding='utf-8') as f:
23 |     f.write(soup.prettify())  # pretty-print the modified tree to disk
24 | 
25 | # print(soup)
--------------------------------------------------------------------------------
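
The same insert_after technique works on markup held in memory, which is exactly how the mitmproxy scripts later in this repo rewrite flow.response.text on the fly. A minimal sketch:

from bs4 import BeautifulSoup

html = '<html><head></head><body><script>var a = 1;</script></body></html>'
soup = BeautifulSoup(html, 'lxml')

# Build a script node and append it after the last existing one
tag = soup.new_tag('script', type='text/javascript')
tag.string = "console.log('injected');"
soup.select('script')[-1].insert_after(tag)

new_html = str(soup)  # in the mitmproxy addons this string is assigned back to flow.response.text
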
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/demo.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | '''
4 | Path basics:
5 | \  does not work on Linux
6 | /  works on every platform
7 | Prefer / in paths
8 | '''
9 | 
10 | print('\\')
11 | print('/')
12 | 
13 | # Walk the folder
14 | for dirpath, dirnames, files in os.walk('./素材'):
15 | 
16 |     # List the files found
17 |     print(files)
18 | 
19 |     # Sort the file names numerically
20 |     # list_data = [int(data.replace('.ts', '')) for data in files]
21 |     # list_data.sort()
22 |     # print(list_data)
23 | 
24 |     # Write the concat list
25 |     # for index in list_data:
26 | 
27 |     #     # open for appending
28 |     #     with open('file.txt', 'a+', encoding='utf-8') as f1:
29 | 
30 |     #         # open for reading, to avoid duplicate entries
31 |     #         with open('file.txt', 'r', encoding='utf-8') as f2:
32 | 
33 |     #             # absolute path of the working directory
34 |     #             current_filename = os.getcwd().replace('\\', '/')
35 | 
36 |     #             # full path of this segment
37 |     #             filename = current_filename + '/素材/{}.ts'.format(index)
38 | 
39 |     #             # only write the entry if it is not already present
40 |     #             if filename not in f2.read():
41 |     #                 f1.write("file '{}'\n".format(filename))
42 | 
43 | 
44 | # Switch the console to UTF-8 so the command line can handle Chinese paths
45 | # cmd = 'ffmpeg.exe -f concat -safe 0 -i file.txt -c copy out.mp4'
46 | # os.system('CHCP 65001')
47 | # os.system(cmd.replace('/', '\\'))
48 | 
--------------------------------------------------------------------------------
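
Putting the commented steps above together: a sketch that writes file.txt with a numeric sort and runs ffmpeg through subprocess, whose argument list sidesteps the CHCP 65001 workaround that os.system needs for non-ASCII paths. It assumes this folder's layout (素材/ holding 1.ts through 15.ts, ffmpeg on the PATH or in the current directory):

import subprocess
from pathlib import Path

# Sort 1.ts, 2.ts, ..., 10.ts numerically instead of lexically
segments = sorted(Path('素材').glob('*.ts'), key=lambda p: int(p.stem))

# One "file '<path>'" line per segment, forward slashes throughout
with open('file.txt', 'w', encoding='utf-8') as f:
    for seg in segments:
        f.write("file '{}'\n".format(seg.resolve().as_posix()))

# concat demuxer: stream-copy the segments into a single mp4
subprocess.run(['ffmpeg', '-f', 'concat', '-safe', '0',
                '-i', 'file.txt', '-c', 'copy', 'out.mp4'], check=True)
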
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/ffmpeg.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/ffmpeg.exe
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/file.txt:
--------------------------------------------------------------------------------
1 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/1.ts'
2 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/2.ts'
3 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/3.ts'
4 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/4.ts'
5 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/5.ts'
6 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/6.ts'
7 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/7.ts'
8 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/8.ts'
9 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/9.ts'
10 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/10.ts'
11 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/11.ts'
12 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/12.ts'
13 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/13.ts'
14 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/14.ts'
15 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/15.ts'
16 |
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/out.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/out.mp4
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/1.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/1.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/10.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/10.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/11.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/11.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/12.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/12.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/13.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/13.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/14.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/14.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/15.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/15.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/2.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/2.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/3.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/3.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/4.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/4.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/5.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/5.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/6.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/6.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/7.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/7.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/8.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/8.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/9.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/9.ts
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/ffmpeg.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/2.自动合并版本/ffmpeg.exe
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/request_demo.py:
--------------------------------------------------------------------------------
1 | # python 3.7
2 | import mitmproxy.http,json,os,m3u8,requests,shutil
3 | from bs4 import BeautifulSoup
4 | from mitmproxy import ctx
5 | from pathlib import Path
6 | from Crypto.Cipher import AES
7 |
8 | '''
9 | Course crawled here: https://xueyuan.xiaoe-tech.com/detail/p_5e269260ab5c5_O25BMaat/6
10 | '''
11 |
12 | # Template for the generated Python repair script (repair.py)
13 | repair_file_py = r'''
14 | import os
15 |
16 | from Crypto.Cipher import AES
17 |
18 | # 获取当前路径
19 | current_filename = os.getcwd().replace('\\','/')
20 |
21 | # 修复文件连接
22 | new_repair_file_txt = current_filename + '/修复文件/' + 'repair_file.txt'
23 |
24 | # 开始修复文件
25 | def decrypt_file():
26 |
27 | global new_repair_file_txt
28 |
29 | before_content = None
30 |
31 |     key = {}  # format placeholder: replaced with the AES key bytes when this template is rendered
32 |
33 | mode = AES.MODE_CBC
34 |
35 | # 获取 AES 解密对象
36 | cryptos = AES.new(key, mode)
37 |
38 | # 创建修复文件
39 | repair_filename = current_filename + '/修复文件'
40 | if not os.path.exists(repair_filename):
41 | os.makedirs(repair_filename)
42 |
43 | with open('not_finish_file.txt','r',encoding='utf-8') as f1:
44 |
45 | # 读取第一行
46 | line = f1.readline()
47 |
48 | # 逐行读取
49 | while line:
50 | # 获取 还没被解密的 ts 视频的路径
51 | not_finish_file_line = line.split(' ')[1].replace('\n','').replace("'",'').replace('\\','/')
52 | print(not_finish_file_line)
53 |
54 | with open(not_finish_file_line,'rb') as f: # 解密之前
55 | before_content = f.read()
56 |
57 | # 写入 修复文件
58 | new_repair_filename = repair_filename + '/' + not_finish_file_line.split('/')[-1]
59 | print(new_repair_filename)
60 | with open(new_repair_filename,'wb') as f: # 解密之后
61 | f.write(cryptos.decrypt(before_content))
62 |
63 | new_repair_file_txt = repair_filename + '/' + 'repair_file.txt'
64 |
65 | # 确保不重复
66 | with open(new_repair_file_txt,'a+',encoding='utf-8') as f3: # 解密之后
67 | with open(new_repair_file_txt,'r',encoding='utf-8') as f4:
68 | if str(new_repair_filename) not in f4.read():
69 | f3.write("file '%s'\n" % str(new_repair_filename))
70 |
71 | line = f1.readline()
72 |
73 | # 使用 not_finish_file.txt 合成视频
74 | def compose_file():
75 |
76 | cmd = "ffmpeg.exe -f concat -safe 0 -i " + new_repair_file_txt + " -c copy 1.修复视频.mp4"
77 | print(cmd)
78 | # 设置UTF-8编码
79 | os.system('CHCP 65001')
80 | os.system(cmd.replace('/','\\'))
81 |
82 | decrypt_file()
83 | compose_file()
84 | '''
85 |
86 | cryptos = None # AES decryption object
87 | m3u8_data = None # segment URIs parsed from the m3u8
88 | filename = None # download folder for the current video
89 | current_filename = os.getcwd().replace('\\','/') # current working directory
90 | result_filename = current_filename + '/合成的视频' # output folder for the ffmpeg-merged videos
91 | title = None # video title
92 | finish_file_flag = False # flags whether any ts segments are still undecrypted
93 |
94 | class Counter:
95 |
96 | def __init__(self):
97 |         self.Referer = 'https://xueyuan.xiaoe-tech.com/'
98 |         self.Cookie = 'paste your cookie here'
99 |         self.UserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'
100 |         self.headers = {
101 |             'Referer': self.Referer,
102 |             'Cookie': self.Cookie,
103 |             'User-Agent': self.UserAgent
104 |         }
105 |
106 | def request(self, flow: mitmproxy.http.HTTPFlow):
107 |
108 | # 所有请求插入协议头
109 | flow.request.headers['Referer'] = self.Referer
110 | flow.request.headers['Cookie'] = self.Cookie
111 |
112 | def response(self, flow: mitmproxy.http.HTTPFlow):
113 |
114 | # 导入全局变量
115 | global cryptos,filename,m3u8_data,result_filename,repair_file_py,title,finish_file_flag
116 |
117 | # 注入 JavaScript
118 | # 启动就能点击播放器
119 | if 'detail' in flow.request.url:
120 |
121 | # 确保匹配 HTML
122 | if 'text/html' in flow.response.headers['Content-Type']:
123 |
124 | javascript_text = '''
125 | // 视频播放速度
126 | const playbackRate = 16;
127 |
128 | function start_video(){
129 |
130 | // 确保修改了视频播放速度
131 | while(document.querySelector('video').playbackRate != playbackRate ){
132 |
133 | // 点击播放器
134 | document.querySelector('div.iconfont.playButton.icon-icon_play').click();
135 |
136 | // 设置视频重头播放
137 | document.querySelector('video').currentTime = 0;
138 |
139 | // 设置视频自动播放
140 | document.querySelector('video').autoplay = true;
141 |
142 | // 设置视频播放速度
143 | document.querySelector('video').playbackRate = playbackRate;
144 |
145 | // 设置视频静音
146 | document.querySelector('video').muted = true
147 |
148 | // 开始播放
149 | document.querySelector('video').play();
150 | }
151 | };
152 |
153 | // 使用递归,异步等待,确保video标签会出现
154 | function waitForElementToDisplay(selector, time) {
155 |
156 | // video标签出现后,异步等待 1 秒
157 | if(document.querySelector(selector)!=null) {
158 |
159 | console.log('获取成功video');
160 | setTimeout(
161 | ()=>{
162 | start_video();
163 | },1000
164 | );
165 |
166 | return;
167 | }
168 | else {
169 | setTimeout( ()=> {
170 | waitForElementToDisplay(selector, time);
171 | }, time);
172 | }
173 | }
174 |
175 | // 每过 1 秒检查video标签 是否出现
176 | waitForElementToDisplay('video',1000)
177 | '''
178 |
179 | # 获取 BeautifulSoup 对象
180 | soup = BeautifulSoup(flow.response.text, 'lxml')
181 |
182 | # 生成一个script节点
183 | script_tag = soup.new_tag('script', type='text/javascript')
184 |
185 | # 往script节点写入内容
186 | script_tag.string = javascript_text
187 |
188 | # 在当前 HTML 最后一个script节点 向后插入一个节点
189 | soup.select('script')[-1].insert_after(script_tag)
190 |
191 | # 修改当前 HTML 全部内容
192 | flow.response.text = str(soup)
193 |
194 | # 设置 AES解密模式
195 | mode = AES.MODE_CBC
196 |
197 | # 获取课程标题
198 | if 'xe.goods.detail.get' in flow.request.url:
199 |
200 | # 加载 JSON 对象
201 | json_data = json.loads(flow.response.text)
202 |
203 | # 获取当前视频标题
204 | title = json_data['data']['title'].replace(' ','')
205 |
206 | # 如果没有文件夹,就创建文件夹
207 | filename = current_filename + '/下载成功的视频/{}'.format(title)
208 | if not os.path.exists(filename):
209 | os.makedirs(filename)
210 |
211 | if not os.path.exists(result_filename):
212 | os.makedirs(result_filename)
213 |
214 | # 匹配 m3u8
215 | if '.m3u8' in flow.request.url:
216 |
217 | # 加载 m3u8 对象
218 | dict_data = m3u8.parse(flow.response.text)
219 |
220 | # 获取 m3u8 全部分片链接
221 | m3u8_data = [ data['uri'] for data in dict_data['segments']]
222 | print(m3u8_data)
223 |
224 | # 获取解密参数
225 | m3u8_content = requests.get(url=dict_data['keys'][0]['uri'],headers=self.headers).content
226 | cryptos = AES.new(m3u8_content,mode)
227 |
228 | # 将密钥 写入 修复文件
229 | repair_file_py = repair_file_py.format(str(m3u8_content))
230 | print('\n' + '-'*50)
231 | print('\n当前密钥:{}'.format(str(m3u8_content)))
232 |
233 |
234 | # 匹配密钥
235 | if 'get_video_key.php' in flow.request.url:
236 |
237 | print('\n当前密钥:{}'.format(str(flow.response.content)))
238 |
239 | # 将密钥 写入 修复文件
240 | repair_file_py = repair_file_py.format(str(flow.response.content))
241 | cryptos = AES.new(flow.response.content, mode)
242 |
243 | # 解密 ts 文件
244 | if '.ts' in flow.request.url:
245 |
246 | print('-'*50)
247 | print('\n[当前解密对象]:{}\n'.format(cryptos))
248 |
249 | # 拼接当前视频保存路径
250 | m3u8_ts_filename = filename + '/start={}end={}.ts'.format(flow.request.query.get('start'),flow.request.query.get('end'))
251 | print('[当前视频]:{} [保存路径]:{}\n'.format(title,m3u8_ts_filename))
252 |
253 | # 用于合成
254 | m3u8_finish_file_filename = filename + '/finish_file.txt'
255 |
256 | # 确定最后一个分片
257 | start_data = m3u8_data[-1].split('?')[1].split('&')[0]
258 | end_data = m3u8_data[-1].split('?')[1].split('&')[1]
259 | result_data = start_data + end_data
260 |
261 | # 获取成功密钥,再解密
262 | if cryptos != None:
263 |
264 | # 保存 解密好的 ts
265 | with open(m3u8_ts_filename,'wb') as f:
266 | f.write(cryptos.decrypt(flow.response.content))
267 |
268 |
269 | # 写入 解密成功 标记文件
270 | with open(m3u8_finish_file_filename,'a+',encoding='utf-8') as f1:
271 | with open(m3u8_finish_file_filename,'r',encoding='utf-8') as f2:
272 |
273 | # 如果文件为空,同时又存在最后一片,将不写入
274 | if result_data in m3u8_ts_filename and f2.read()=='':
275 | pass
276 |
277 | # 防止重复,确保路径没问题
278 | elif m3u8_ts_filename not in f2.read():
279 | f1.write("file '{}'\n".format(m3u8_ts_filename))
280 |
281 | # 如果是最后一个分片,开始合成视频
282 | if result_data in m3u8_ts_filename:
283 |
284 | # 拷贝 ffmpeg.exe 写入指定目录
285 | ffmpeg_filename = filename + '/ffmpeg.exe'
286 | shutil.copyfile('ffmpeg.exe', ffmpeg_filename)
287 |
288 | # 如果 存在 还没被解密的 ts 视频
289 | if finish_file_flag:
290 |
291 | # 生成修复python
292 | repair_file = filename + '/repair.py'
293 | with open(repair_file,'w',encoding='utf-8') as f:
294 | f.write(repair_file_py)
295 |
296 | # 合成视频
297 | cmd = 'ffmpeg.exe -f concat -safe 0 -i "' + m3u8_finish_file_filename + '" -c copy "' + result_filename + '/' + filename.split('/')[-1] + '.mp4"'
298 |
299 |
300 | # 读取 解密成功 标记文件
301 | with open(m3u8_finish_file_filename,'r',encoding='utf-8') as f:
302 |
303 | # 确保文件不为空
304 | if f.read()!='':
305 |
306 | mp4_filename = result_filename + '/' + filename.split('/')[-1] + '.mp4'
307 |
308 | # 如果合成的视频已经存在,先删除,再执行
309 | if os.path.exists(mp4_filename):
310 | os.remove(mp4_filename)
311 |
312 | # 设置UTF-8编码
313 | os.system('CHCP 65001')
314 | os.system(cmd.replace('/','\\'))
315 | print('[警告]:文件路径 {}'.format(mp4_filename))
316 | print('[警告]:文件被覆盖了,由于该文件之前已存在过')
317 | else:
318 | os.system('CHCP 65001')
319 | os.system(cmd.replace('/','\\'))
320 | print('[成功]:文件路径 {}'.format(mp4_filename))
321 | print('[成功]:合并完毕')
322 | else:
323 | print(os.path.exists(result_filename + '/' + filename.split('/')[-1] + '.mp4"'))
324 | print(result_filename + '/' + filename.split('/')[-1] + '.mp4')
325 | print('[异常]:当前视频只下载最后一片,将不会合成视频')
326 | else:
327 |
328 | # 标记是否存在 还没被解密的 ts 视频
329 | finish_file_flag = True
330 |
331 | # 保存 还没被解密的 ts 视频
332 | with open(m3u8_ts_filename,'wb') as f:
333 | f.write(flow.response.content)
334 |
335 | # 用于合成
336 | m3u8_not_finish_file__filename = filename + '/not_finish_file.txt'
337 | with open(m3u8_not_finish_file__filename,'a+',encoding='utf-8') as f:
338 | f.write("file '{}'\n".format(m3u8_ts_filename))
339 |
340 |
341 | addons = [
342 | Counter()
343 | ]
--------------------------------------------------------------------------------
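
One pycryptodome detail worth flagging in the addon above: AES.new(key, AES.MODE_CBC) with no iv argument makes the library generate a random IV, whereas the playlists captured here advertise IV=0x00000000000000000000000000000000 on their EXT-X-KEY lines. A hedged sketch of per-segment decryption with the IV passed explicitly (key and segment bytes are placeholders for what get_video_key.php and the .ts response provide):

from Crypto.Cipher import AES

def decrypt_segment(key: bytes, data: bytes, iv: bytes = b'\x00' * 16) -> bytes:
    # Pass the playlist's zero IV explicitly; omitting iv gives a random one
    # and garbles the first 16-byte block of every segment
    return AES.new(key, AES.MODE_CBC, iv=iv).decrypt(data)

# plain = decrypt_segment(key_bytes, flow.response.content)
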
/下载小鹅通视频/2021年12月/2.自动合并版本/requirements.txt:
--------------------------------------------------------------------------------
1 | asgiref==3.3.4
2 | beautifulsoup4==4.9.3
3 | blinker==1.4
4 | Brotli==1.0.9
5 | bs4==0.0.1
6 | certifi==2021.5.30
7 | cffi==1.14.6
8 | charset-normalizer==2.0.3
9 | click==7.1.2
10 | cryptography==3.2.1
11 | Flask==1.1.4
12 | h11==0.12.0
13 | h2==4.0.0
14 | hpack==4.0.0
15 | hyperframe==6.0.1
16 | idna==3.2
17 | iso8601==0.1.14
18 | itsdangerous==1.1.0
19 | Jinja2==2.11.3
20 | kaitaistruct==0.9
21 | ldap3==2.8.1
22 | m3u8==0.9.0
23 | MarkupSafe==2.0.1
24 | mitmproxy==5.3.0
25 | msgpack==1.0.2
26 | passlib==1.7.4
27 | protobuf==3.13.0
28 | publicsuffix2==2.20191221
29 | pyasn1==0.4.8
30 | pycparser==2.20
31 | pycryptodome==3.10.1
32 | pydivert==2.1.0
33 | pyOpenSSL==19.1.0
34 | pyparsing==2.4.7
35 | pyperclip==1.8.2
36 | requests==2.26.0
37 | ruamel.yaml==0.16.13
38 | ruamel.yaml.clib==0.2.6
39 | selenium==3.141.0
40 | six==1.16.0
41 | sortedcontainers==2.2.2
42 | soupsieve==2.2.1
43 | tornado==6.1
44 | typing-extensions==3.10.0.0
45 | urllib3==1.26.6
46 | urwid==2.1.2
47 | Werkzeug==1.0.1
48 | wsproto==0.15.0
49 | zstandard==0.14.1
50 |
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/selenium启动/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/2.自动合并版本/selenium启动/chromedriver.exe
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/selenium启动/selenium_start.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | 
3 | PROXY = 'http://127.0.0.1:8080'  # mitmproxy listen address
4 | 
5 | chrome_options = webdriver.ChromeOptions()
6 | chrome_options.add_argument('--proxy-server=' + PROXY)
7 | chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide the "controlled by automated software" bar
8 | 
9 | 
10 | browser = webdriver.Chrome(executable_path=r'chromedriver.exe', options=chrome_options)  # r'' keeps the path raw (no escape processing)
11 | 
12 | '''
13 | Course crawled here: https://xueyuan.xiaoe-tech.com/detail/p_5e269260ab5c5_O25BMaat/6
14 | '''
15 | 
16 | url = 'https://xueyuan.xiaoe-tech.com/detail/p_5e269260ab5c5_O25BMaat/6'
17 | browser.get(url)  # open the course page
18 | 
--------------------------------------------------------------------------------
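
On HTTPS pages Chrome will reject mitmproxy's man-in-the-middle certificate unless the mitmproxy CA is installed. A common testing workaround is the --ignore-certificate-errors Chromium flag, sketched here on top of the options used above:

from selenium import webdriver

PROXY = 'http://127.0.0.1:8080'  # mitmproxy listen address

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=' + PROXY)
# Accept mitmproxy's certificate without installing its CA (testing only)
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])

browser = webdriver.Chrome(executable_path=r'chromedriver.exe', options=chrome_options)
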
/下载小鹅通视频/2021年12月/2.自动合并版本/selenium启动/谷歌驱动下载地址.txt:
--------------------------------------------------------------------------------
1 | Mirror 1:
2 | http://chromedriver.storage.googleapis.com/index.html
3 | 
4 | Mirror 2:
5 | https://registry.npmmirror.com/binary.html?path=chromedriver/
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/启动程序指令.txt:
--------------------------------------------------------------------------------
1 | First open a cmd window and run: mitmweb -s request_demo.py
2 | Then cd into the selenium启动 folder and run: python selenium_start.py
--------------------------------------------------------------------------------
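
The two commands above can also be wrapped into a single launcher. A sketch using subprocess, assuming mitmweb is on the PATH and this folder's layout:

import subprocess, time

# Start the mitmweb proxy with the capture addon
proxy = subprocess.Popen(['mitmweb', '-s', 'request_demo.py'])

# Give the proxy a moment to bind 127.0.0.1:8080 before Chrome connects
time.sleep(5)

# Launch the proxied browser session from the selenium启动 folder
subprocess.run(['python', 'selenium_start.py'], cwd='selenium启动')
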
/下载小鹅通视频/2021年12月/2.自动合并版本/安装环境指令.txt:
--------------------------------------------------------------------------------
1 | Install everything at once:
2 | pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple
3 | 
4 | Or install the packages one by one:
5 | pip install mitmproxy
6 | pip install selenium
7 | pip install m3u8
8 | pip install requests
9 | pip install bs4
10 | pip install pycryptodome
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/ffmpeg.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/3.手动合并版本/ffmpeg.exe
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/request_demo.py:
--------------------------------------------------------------------------------
1 | # python 3.7
2 | import mitmproxy.http,json,os,shutil
3 | from bs4 import BeautifulSoup
4 | from mitmproxy import ctx
5 | from pathlib import Path
6 | from Crypto.Cipher import AES
7 |
8 | '''
9 | Course crawled here: https://appdgjqmn6j1714.h5.xiaoeknow.com/v1/course/video/v_61ceb0f8e4b05006f9c4214e
10 | '''
11 |
12 | # Template for the generated Python repair script (repair.py)
13 | repair_file_py = r'''
14 |
15 | "此文件用于保存密钥,请不要执行代码"
16 |
17 | import os
18 |
19 | from Crypto.Cipher import AES
20 |
21 | # 获取当前路径
22 | current_filename = os.getcwd().replace('\\','/')
23 |
24 | # 修复文件连接
25 | new_repair_file_txt = current_filename + '/修复文件/' + 'repair_file.txt'
26 |
27 | # 开始修复文件
28 | def decrypt_file():
29 |
30 | global new_repair_file_txt
31 |
32 | before_content = None
33 |
34 |     key = {}  # format placeholder: replaced with the AES key bytes when this template is rendered
35 |
36 | mode = AES.MODE_CBC
37 |
38 | # 获取 AES 解密对象
39 | cryptos = AES.new(key, mode)
40 |
41 | # 创建修复文件
42 | repair_filename = current_filename + '/修复文件'
43 | if not os.path.exists(repair_filename):
44 | os.makedirs(repair_filename)
45 |
46 | with open('not_finish_file.txt','r',encoding='utf-8') as f1:
47 |
48 | # 读取第一行
49 | line = f1.readline()
50 |
51 | # 逐行读取
52 | while line:
53 | # 获取 还没被解密的 ts 视频的路径
54 | not_finish_file_line = line.split(' ')[1].replace('\n','').replace("'",'').replace('\\','/')
55 | print(not_finish_file_line)
56 |
57 | with open(not_finish_file_line,'rb') as f: # 解密之前
58 | before_content = f.read()
59 |
60 | # 写入 修复文件
61 | new_repair_filename = repair_filename + '/' + not_finish_file_line.split('/')[-1]
62 | print(new_repair_filename)
63 | with open(new_repair_filename,'wb') as f: # 解密之后
64 | f.write(cryptos.decrypt(before_content))
65 |
66 | new_repair_file_txt = repair_filename + '/' + 'repair_file.txt'
67 |
68 | # 确保不重复
69 | with open(new_repair_file_txt,'a+',encoding='utf-8') as f3: # 解密之后
70 | with open(new_repair_file_txt,'r',encoding='utf-8') as f4:
71 | if str(new_repair_filename) not in f4.read():
72 | f3.write("file '%s'\n" % str(new_repair_filename))
73 |
74 | line = f1.readline()
75 |
76 | # 使用 not_finish_file.txt 合成视频
77 | def compose_file():
78 |
79 | cmd = "ffmpeg.exe -f concat -safe 0 -i " + new_repair_file_txt + " -c copy 1.修复视频.mp4"
80 | print(cmd)
81 | # 设置UTF-8编码
82 | os.system('CHCP 65001')
83 | os.system(cmd.replace('/','\\'))
84 |
85 | decrypt_file()
86 | compose_file()
87 | '''
88 |
89 | # Template for the generated Python merge script (merge.py)
90 | merge_file_py = r'''
91 |
92 | "此文件用于合成视频"
93 |
94 | import os
95 |
96 | mp4_filename = '%s'
97 |
98 | cmd = '%s'
99 |
100 | # 如果合成的视频已经存在,先删除,再执行
101 | if os.path.exists(mp4_filename):
102 | os.remove(mp4_filename)
103 |
104 | # 设置UTF-8编码
105 | os.system('CHCP 65001')
106 | os.system(cmd.replace('/','\\'))
107 | print('[警告]:文件路径 {}'.format(mp4_filename))
108 | print('[警告]:文件被覆盖了,由于该文件之前已存在过')
109 | else:
110 | os.system('CHCP 65001')
111 | os.system(cmd.replace('/','\\'))
112 | print('[成功]:文件路径 {}'.format(mp4_filename))
113 | print('[成功]:合并完毕')
114 | '''
115 |
116 | cryptos = None # AES decryption object
117 | filename = None # download folder for the current video
118 | current_filename = os.getcwd().replace('\\','/') # current working directory
119 | result_filename = current_filename + '/合成的视频' # output folder for the ffmpeg-merged videos
120 | title = None # video title
121 | finish_file_flag = False # flags whether any ts segments are still undecrypted
122 |
123 | class Counter:
124 |
125 | def __init__(self):
126 |         self.Referer = 'https://appdgjqmn6j1714.h5.xiaoeknow.com'
127 |         self.Cookie = 'paste your cookie here'
128 |         self.UserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'
129 |         self.headers = {
130 |             'Referer': self.Referer,
131 |             'Cookie': self.Cookie,
132 |             'User-Agent': self.UserAgent
133 |         }
134 |
135 | def request(self, flow: mitmproxy.http.HTTPFlow):
136 |
137 | # 所有请求插入协议头
138 | flow.request.headers['Referer'] = self.Referer
139 | flow.request.headers['Cookie'] = self.Cookie
140 |
141 | def response(self, flow: mitmproxy.http.HTTPFlow):
142 |
143 | # 导入全局变量
144 | global cryptos,filename,result_filename,repair_file_py,title,finish_file_flag,merge_file_py
145 |
146 | # 注入 JavaScript
147 | # 启动就能点击播放器
148 | if 'v_61ceb0f8e4b05006f9c4214e' in flow.request.url:
149 |
150 | # 确保匹配 HTML
151 | if 'text/html' in flow.response.headers.get('content-type'):
152 |
153 | try:
154 | print('尝试执行JS控制播放器代码')
155 | javascript_text = '''
156 | // 视频播放速度
157 | const playbackRate = 16;
158 |
159 | function start_video(){
160 |
161 | // 确保修改了视频播放速度
162 | while(document.querySelector('video').playbackRate != playbackRate ){
163 |
164 | // 点击播放器
165 | document.querySelector('div.iconfont.playButton.icon-icon_play').click();
166 |
167 | // 设置视频重头播放
168 | document.querySelector('video').currentTime = 0;
169 |
170 | // 设置视频自动播放
171 | document.querySelector('video').autoplay = true;
172 |
173 | // 设置视频播放速度
174 | document.querySelector('video').playbackRate = playbackRate;
175 |
176 | // 设置视频静音
177 | document.querySelector('video').muted = true
178 |
179 | // 开始播放
180 | document.querySelector('video').play();
181 | }
182 | };
183 |
184 | // 使用递归,异步等待,确保video标签会出现
185 | function waitForElementToDisplay(selector, time) {
186 |
187 | // video标签出现后,异步等待 1 秒
188 | if(document.querySelector(selector)!=null) {
189 |
190 | console.log('获取成功video');
191 | setTimeout(
192 | ()=>{
193 | start_video();
194 | },1000
195 | );
196 |
197 | return;
198 | }
199 | else {
200 | setTimeout( ()=> {
201 | waitForElementToDisplay(selector, time);
202 | }, time);
203 | }
204 | }
205 |
206 | // 每过 1 秒检查video标签 是否出现
207 | waitForElementToDisplay('video',1000)
208 | '''
209 |
210 | # 获取 BeautifulSoup 对象
211 | soup = BeautifulSoup(flow.response.text, 'lxml')
212 |
213 | # 生成一个script节点
214 | script_tag = soup.new_tag('script', type='text/javascript')
215 |
216 | # 往script节点写入内容
217 | script_tag.string = javascript_text
218 |
219 | # 在当前 HTML 最后一个script节点 向后插入一个节点
220 | soup.select('script')[-1].insert_after(script_tag)
221 |
222 | # 修改当前 HTML 全部内容
223 | flow.response.text = str(soup)
224 | except:
225 | pass
226 |
227 | # 设置 AES解密模式
228 | mode = AES.MODE_CBC
229 |
230 | # 获取课程标题
231 | if 'get_goods_info_business' in flow.request.url:
232 |
233 | # 加载 JSON 对象
234 | json_data = json.loads(flow.response.text)
235 |
236 | # 获取当前视频标题
237 | title = json_data['data']['goods_name'].replace(' ','')
238 |
239 | # 如果没有文件夹,就创建文件夹
240 | filename = current_filename + '/下载成功的视频/{}'.format(title)
241 | if not os.path.exists(filename):
242 | os.makedirs(filename)
243 |
244 | if not os.path.exists(result_filename):
245 | os.makedirs(result_filename)
246 |
247 | # 匹配密钥
248 | if 'get_video_key.php' in flow.request.url:
249 |
250 | print('\n当前密钥:{}'.format(str(flow.response.content)))
251 |
252 | # 将密钥 写入 修复文件
253 | repair_file_py = repair_file_py.format(str(flow.response.content))
254 | cryptos = AES.new(flow.response.content, mode)
255 |
256 | # 解密 ts 文件
257 | if '.ts' in flow.request.url:
258 |
259 | print('-'*50)
260 | print('\n[当前解密对象]:{}\n'.format(cryptos))
261 |
262 | # 拼接当前视频保存路径
263 | m3u8_ts_filename = filename + '/start={}end={}.ts'.format(flow.request.query.get('start'),flow.request.query.get('end'))
264 | print('[当前视频]:{} [保存路径]:{}\n'.format(title,m3u8_ts_filename))
265 |
266 | # 用于合成
267 | m3u8_finish_file_filename = filename + '/finish_file.txt'
268 |
269 | # 确定最后一个分片
270 | start_data = flow.request.query.get('start')
271 | end_data = flow.request.query.get('end')
272 | result_data = start_data + end_data
273 |
274 | # 获取成功密钥,再解密
275 | if cryptos != None:
276 |
277 |
278 | # 保存 解密好的 ts
279 | with open(m3u8_ts_filename,'wb') as f:
280 | f.write(cryptos.decrypt(flow.response.content))
281 |
282 |
283 | # 写入 解密成功 标记文件
284 | with open(m3u8_finish_file_filename,'a+',encoding='utf-8') as f1:
285 | with open(m3u8_finish_file_filename,'r',encoding='utf-8') as f2:
286 |
287 | # 如果文件为空,同时又存在最后一片,将不写入
288 | if result_data in m3u8_ts_filename and f2.read()=='':
289 | pass
290 |
291 | # 防止重复,确保路径没问题
292 | elif m3u8_ts_filename not in f2.read():
293 | f1.write("file '{}'\n".format(m3u8_ts_filename))
294 |
295 | ffmpeg_filename = filename + '/ffmpeg.exe'
296 | shutil.copyfile('ffmpeg.exe', ffmpeg_filename)
297 |
298 | # 优化版 生成python合成文件
299 | mp4_filename = result_filename + '/' + filename.split('/')[-1] + '.mp4'
300 | cmd = 'ffmpeg.exe -f concat -safe 0 -i "' + m3u8_finish_file_filename + '" -c copy "' + result_filename + '/' + filename.split('/')[-1] + '.mp4"'
301 |
302 | if mp4_filename and cmd:
303 |
304 | try:
305 | merge_file_py = merge_file_py % (str(mp4_filename),str(cmd))
306 | except:
307 | pass
308 |
309 | # 开始生成python合成文件
310 | merge_file = filename + '/merge.py'
311 | with open(merge_file,'w',encoding='utf-8') as f:
312 | f.write(merge_file_py)
313 |
314 | # 生成修复python文件
315 | repair_file = filename + '/repair.py'
316 | with open(repair_file,'w',encoding='utf-8') as f:
317 | f.write(repair_file_py)
318 | else:
319 |
320 | # 标记是否存在 还没被解密的 ts 视频
321 | finish_file_flag = True
322 |
323 | # 保存 还没被解密的 ts 视频
324 | with open(m3u8_ts_filename,'wb') as f:
325 | f.write(flow.response.content)
326 |
327 | # 用于合成
328 | m3u8_not_finish_file__filename = filename + '/not_finish_file.txt'
329 | with open(m3u8_not_finish_file__filename,'a+',encoding='utf-8') as f:
330 | f.write("file '{}'\n".format(m3u8_ts_filename))
331 |
332 |
333 | addons = [
334 | Counter()
335 | ]
--------------------------------------------------------------------------------
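
A templating note on the script above: merge_file_py is rendered with %-formatting because its body contains {} placeholders of its own, while repair_file_py is rendered with str.format and so must contain no braces besides the key slot. string.Template avoids both constraints, since only $identifiers are substituted; a sketch of the merge-script generation under that scheme, reduced to the core logic (the substituted paths are illustrative):

from string import Template

merge_template = Template('''
import os

mp4_filename = '$mp4'
cmd = '$cmd'

# Overwrite any previous merge result, then run ffmpeg
if os.path.exists(mp4_filename):
    os.remove(mp4_filename)

os.system('CHCP 65001')  # switch the console to UTF-8
os.system(cmd.replace('/', '\\\\'))
''')

script = merge_template.substitute(
    mp4='合成的视频/demo.mp4',
    cmd='ffmpeg.exe -f concat -safe 0 -i finish_file.txt -c copy 合成的视频/demo.mp4')
print(script)
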
/下载小鹅通视频/2021年12月/3.手动合并版本/requirements.txt:
--------------------------------------------------------------------------------
1 | asgiref==3.3.4
2 | beautifulsoup4==4.9.3
3 | blinker==1.4
4 | Brotli==1.0.9
5 | bs4==0.0.1
6 | certifi==2021.5.30
7 | cffi==1.14.6
8 | charset-normalizer==2.0.3
9 | click==7.1.2
10 | cryptography==3.2.1
11 | Flask==1.1.4
12 | h11==0.12.0
13 | h2==4.0.0
14 | hpack==4.0.0
15 | hyperframe==6.0.1
16 | idna==3.2
17 | iso8601==0.1.14
18 | itsdangerous==1.1.0
19 | Jinja2==2.11.3
20 | kaitaistruct==0.9
21 | ldap3==2.8.1
22 | m3u8==0.9.0
23 | MarkupSafe==2.0.1
24 | mitmproxy==5.3.0
25 | msgpack==1.0.2
26 | passlib==1.7.4
27 | protobuf==3.13.0
28 | publicsuffix2==2.20191221
29 | pyasn1==0.4.8
30 | pycparser==2.20
31 | pycryptodome==3.10.1
32 | pydivert==2.1.0
33 | pyOpenSSL==19.1.0
34 | pyparsing==2.4.7
35 | pyperclip==1.8.2
36 | requests==2.26.0
37 | ruamel.yaml==0.16.13
38 | ruamel.yaml.clib==0.2.6
39 | selenium==3.141.0
40 | six==1.16.0
41 | sortedcontainers==2.2.2
42 | soupsieve==2.2.1
43 | tornado==6.1
44 | typing-extensions==3.10.0.0
45 | urllib3==1.26.6
46 | urwid==2.1.2
47 | Werkzeug==1.0.1
48 | wsproto==0.15.0
49 | zstandard==0.14.1
50 |
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/selenium启动/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/3.手动合并版本/selenium启动/chromedriver.exe
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/selenium启动/selenium_start.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | 
3 | PROXY = 'http://127.0.0.1:8080'  # mitmproxy listen address
4 | 
5 | chrome_options = webdriver.ChromeOptions()
6 | chrome_options.add_argument('--proxy-server=' + PROXY)
7 | chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide the "controlled by automated software" bar
8 | 
9 | 
10 | browser = webdriver.Chrome(executable_path=r'chromedriver.exe', options=chrome_options)  # r'' keeps the path raw (no escape processing)
11 | 
12 | '''
13 | Course crawled here: https://appdgjqmn6j1714.h5.xiaoeknow.com/v1/course/video/v_61ceb0f8e4b05006f9c4214e
14 | '''
15 | 
16 | url = 'https://appdgjqmn6j1714.h5.xiaoeknow.com/v1/course/video/v_61ceb0f8e4b05006f9c4214e'
17 | browser.get(url)  # open the course page
18 | 
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/selenium启动/谷歌驱动下载地址.txt:
--------------------------------------------------------------------------------
1 | Mirror 1:
2 | http://chromedriver.storage.googleapis.com/index.html
3 | 
4 | Mirror 2:
5 | https://registry.npmmirror.com/binary.html?path=chromedriver/
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/启动程序指令.txt:
--------------------------------------------------------------------------------
1 | First open a cmd window and run: mitmweb -s request_demo.py
2 | Then cd into the selenium启动 folder and run: python selenium_start.py
--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/安装环境指令.txt:
--------------------------------------------------------------------------------
1 | Install everything at once:
2 | pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple
3 |
4 | Or install the packages one by one:
5 | pip install mitmproxy
6 | pip install selenium
7 | pip install m3u8
8 | pip install requests
9 | pip install bs4
10 | pip install pycryptodome
--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/N_m3u8DL-CLI_v3.0.2.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2022年12月/1.自动合并版本/N_m3u8DL-CLI_v3.0.2.exe
--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/ffmpeg.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2022年12月/1.自动合并版本/ffmpeg.exe
--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/request_demo.py:
--------------------------------------------------------------------------------
1 | # python 3.7.9
2 | import mitmproxy.http,json,os,m3u8,requests,base64
3 | from mitmproxy import ctx
4 | from pathlib import Path
5 |
6 | '''
7 | Decryption write-up: https://www.52pojie.cn/thread-1689801-1-1.html
8 | m3u8 downloader on GitHub: https://github.com/nilaoda/N_m3u8DL-CLI
9 |
10 | Old-style course: https://appjkyl58fl2930.h5.xiaoeknow.com/p/course/column/p_5c483e6305292_C1LfcA9T?type=3
11 | New-style course: https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3
12 |
13 | Course URL crawled this time: https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3
14 | '''
15 |
16 | userid = None # user uid
17 | filename = None # directory the video is saved to
18 | current_filename = os.getcwd().replace('\\','/') # current working directory
19 | ts_url = None # ts segment URL template
20 | title = None # course title
21 | m3u8_obj = None # parsed m3u8 object
22 | m3u8_content = None # m3u8 key
23 |
24 | class Counter:
25 |
26 | def __init__(self):
27 | self.Referer = 'https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3'
28 | self.Cookie = '请填写你的Cookie' # fill in your Cookie here
29 | self.UserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'
30 | self.headers = {
31 | 'Referer':self.Referer,
32 | 'Cookie':self.Cookie,
33 | 'User-Agent':self.UserAgent # the correct header name is 'User-Agent'
34 | }
35 |
36 | def request(self, flow: mitmproxy.http.HTTPFlow):
37 |
38 | # Inject the Referer/Cookie headers into every outgoing request
39 | flow.request.headers['Referer'] = self.Referer
40 | flow.request.headers['Cookie'] = self.Cookie
41 |
42 | def response(self, flow: mitmproxy.http.HTTPFlow):
43 |
44 | # Bind the module-level globals updated below
45 | global filename,title,userid,ts_url,m3u8_obj,m3u8_content
46 |
47 |
48 | # Grab the course title
49 | if 'xe.course.business.core.info.get' in flow.request.url:
50 |
51 | # Parse the JSON body
52 | json_data = json.loads(flow.response.text)
53 |
54 | # Title of the current video
55 | title = json_data['data']['resource_name'].replace(' ','')
56 |
57 | print(f'[Title] {title}')
58 |
59 | # Create the output folders if they do not exist yet
60 | filename = current_filename + '/下载成功的视频/{}'.format(title)
61 | if not os.path.exists(filename):
62 | os.makedirs(filename)
63 |
64 | if not os.path.exists(current_filename+'/m3u8'):
65 | os.makedirs(current_filename+'/m3u8')
66 |
67 | if 'xe.course.business.composite_info.get' in flow.request.url:
68 |
69 | # Parse the JSON body
70 | json_data = json.loads(flow.response.text)
71 |
72 | # Grab the userid
73 | userid = json_data['data']['user_info']['user_id'].replace(' ','')
74 |
75 | print(f'[User ID] {userid}')
76 |
77 | # Match the m3u8 playlist
78 | if '.m3u8' in flow.request.url:
79 |
80 | if userid != None and filename != None:
81 |
82 | # Parse the m3u8 object
83 | m3u8_obj = m3u8.loads(flow.response.text)
84 |
85 | # Append the user's uid to the key URI
86 | m3u8_obj.keys[0].uri = m3u8_obj.keys[0].uri + f'&uid={userid}'
87 |
88 | # Key URL of the m3u8
89 | m3u8_key_url = m3u8_obj.keys[0].uri
90 |
91 | # Fetch the key material (decryption stage one)
92 | # print(m3u8_key_url)
93 | try:
94 | m3u8_content = requests.get(
95 | url=m3u8_key_url,
96 | headers=self.headers,
97 | proxies={ "http": None, "https": None} # bypass the system proxy so clash/v2ray-style tools cannot intercept the request
98 | ).content
99 | except:
100 | print('-'*25 + 'm3u8_content request failed' + '-'*25)
101 | print(f'[m3u8 URL] = {m3u8_key_url}')
102 | print(f'[headers] = {self.headers}')
103 | print('-'*50)
104 | raise RuntimeError('[ERROR] m3u8_content request failed') # raising a bare string is invalid in Python 3
105 |
106 | # XOR the key bytes with the userid (decryption stage two)
107 | rsp_data = m3u8_content
108 | userid_bytes = bytes(userid.encode(encoding='utf-8'))
109 | result_list = []
110 | for index in range(0, len(rsp_data)):
111 | result_list.append(
112 | rsp_data[index] ^ userid_bytes[index])
113 | m3u8_content = bytes(result_list)
114 |
115 | # Final key, base64-encoded for N_m3u8DL-CLI
116 | m3u8_content = base64.b64encode(bytes(result_list)).decode()
117 | print(f'[m3u8 key] {m3u8_content}')
118 |
119 | else:
120 | print(f'[Title] {title}')
121 | print(f'[User ID] {userid}')
122 | print('[ERROR] missing user id or title')
123 |
124 |
125 | if '.ts' in flow.request.url:
126 |
127 | video_url = flow.request.url
128 |
129 | print('[Start downloading video]------------------')
130 | # print(f'video_url: {video_url}')
131 |
132 | # Split off the ts download URL prefix (domain and path)
133 | start_url = video_url.split('/')[:-1]
134 |
135 | # Split off the suffix (file name and query string)
136 | end_url = video_url.split('/')[-1].split('?')
137 | end_url[0] = '{ts_url}'
138 |
139 | # Re-attach the suffix to the prefix
140 | start_url.append('&'.join(end_url))
141 |
142 | # Build the ts download URL template
143 | ts_url = '/'.join(start_url)
144 |
145 | # Rewrite every segment URI in the m3u8
146 | for tmp_data in m3u8_obj.segments:
147 |
148 | # Substitute the segment name into the template
149 | if ts_url != None:
150 | tmp_data.uri = ts_url.format(ts_url=tmp_data.uri)
151 | else:
152 | print('[ERROR] ts_url is None')
153 |
154 | m3u8_filename = f'./m3u8/{title}.m3u8'
155 | m3u8_obj.dump(m3u8_filename)
156 |
157 | # Make sure the m3u8 file exists
158 | if os.path.exists(m3u8_filename):
159 |
160 | if os.path.exists(f'{filename}/{title}.mp4'):
161 | print(f'[Skip warning] {filename}/{title}.mp4 already exists')
162 |
163 | elif m3u8_content == None:
164 | print(f'[m3u8 key] {m3u8_content}')
165 | print('[ERROR] no m3u8 key')
166 |
167 | else:
168 | cmd = f'N_m3u8DL-CLI_v3.0.2.exe "{m3u8_filename}" --workDir "{filename}" --saveName "{title}" --useKeyBase64 "{m3u8_content}"'
169 | print(cmd)
170 | os.system('CHCP 65001')
171 | os.system(cmd)
172 |
173 | else:
174 | print('[ERROR] failed to write the m3u8 file')
175 |
176 |
177 | addons = [
178 | Counter()
179 | ]
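180 |
181 | # A standalone sketch of the two-stage key recovery performed in response()
182 | # above; `key_bytes` (the raw body fetched from the key URL) and `userid`
183 | # are illustrative parameter names.
184 | def recover_key(key_bytes, userid):
185 |     import base64
186 |     uid = userid.encode('utf-8')
187 |     # stage two: XOR each key byte with the userid byte at the same index
188 |     plain = bytes(b ^ uid[i] for i, b in enumerate(key_bytes))
189 |     # N_m3u8DL-CLI takes the key base64-encoded via --useKeyBase64
190 |     return base64.b64encode(plain).decode()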
--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/requirements.txt:
--------------------------------------------------------------------------------
1 | asgiref==3.3.4
2 | beautifulsoup4==4.9.3
3 | blinker==1.4
4 | Brotli==1.0.9
5 | bs4==0.0.1
6 | certifi==2021.5.30
7 | cffi==1.14.6
8 | charset-normalizer==2.0.3
9 | click==7.1.2
10 | cryptography==3.2.1
11 | Flask==1.1.4
12 | h11==0.12.0
13 | h2==4.0.0
14 | hpack==4.0.0
15 | hyperframe==6.0.1
16 | idna==3.2
17 | iso8601==0.1.14
18 | itsdangerous==1.1.0
19 | Jinja2==2.11.3
20 | kaitaistruct==0.9
21 | ldap3==2.8.1
22 | m3u8==0.9.0
23 | MarkupSafe==2.0.1
24 | mitmproxy==5.3.0
25 | msgpack==1.0.2
26 | passlib==1.7.4
27 | protobuf==3.13.0
28 | publicsuffix2==2.20191221
29 | pyasn1==0.4.8
30 | pycparser==2.20
31 | pycryptodome==3.10.1
32 | pydivert==2.1.0
33 | pyOpenSSL==19.1.0
34 | pyparsing==2.4.7
35 | pyperclip==1.8.2
36 | requests==2.26.0
37 | ruamel.yaml==0.16.13
38 | ruamel.yaml.clib==0.2.6
39 | selenium==3.141.0
40 | six==1.16.0
41 | sortedcontainers==2.2.2
42 | soupsieve==2.2.1
43 | tornado==6.1
44 | typing-extensions==3.10.0.0
45 | urllib3==1.26.6
46 | urwid==2.1.2
47 | Werkzeug==1.0.1
48 | wsproto==0.15.0
49 | zstandard==0.14.1
50 |
--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/selenium启动/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2022年12月/1.自动合并版本/selenium启动/chromedriver.exe
--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/selenium启动/selenium_start.py:
--------------------------------------------------------------------------------
1 | # python 3.7.9
2 | from selenium import webdriver
3 |
4 | PROXY = 'http://127.0.0.1:8080'
5 |
6 | chrome_options = webdriver.ChromeOptions()
7 | chrome_options.add_argument('--proxy-server=' + PROXY)
8 | chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) # hide the "controlled by automated software" banner
9 |
10 |
11 | browser = webdriver.Chrome(executable_path=r'chromedriver.exe',options=chrome_options) # r prefix: raw string, no escape processing
12 |
13 | '''
14 | Course URL crawled this time: https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3
15 | '''
16 |
17 | url = 'https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3'
18 | browser.get(url) # open the page
19 |
20 |
--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/selenium启动/谷歌驱动下载地址.txt:
--------------------------------------------------------------------------------
1 | Download address 1:
2 | http://chromedriver.storage.googleapis.com/index.html
3 |
4 | Download address 2:
5 | https://registry.npmmirror.com/binary.html?path=chromedriver/
--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/启动程序指令.txt:
--------------------------------------------------------------------------------
1 | First open a cmd window and run: mitmweb -s request_demo.py
2 | Then cd into the selenium启动 folder and run: python selenium_start.py
--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/安装环境指令.txt:
--------------------------------------------------------------------------------
1 | Install everything at once:
2 | pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple
3 |
4 | Or install the packages one by one:
5 | pip install mitmproxy
6 | pip install selenium
7 | pip install m3u8
8 | pip install requests
9 | pip install bs4
10 | pip install pycryptodome
--------------------------------------------------------------------------------
/下载荔枝微课/ffmpeg.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载荔枝微课/ffmpeg.exe
--------------------------------------------------------------------------------
/下载荔枝微课/request_demo.py:
--------------------------------------------------------------------------------
1 | # python 3.7
2 | import mitmproxy.http,json,os,requests
3 | from mitmproxy import ctx
4 | from pathlib import Path
5 |
6 | '''
7 | Course URL crawled this time: https://m.lizhiweike.com/channel2/1192275
8 | '''
9 |
10 | cookie = '请填写你的Cookie' # fill in your Cookie here
11 | filename = None # directory the video is saved to
12 | current_filename = os.getcwd().replace('\\','/') # current working directory
13 | title = None # video title
14 |
15 | class Counter:
16 |
17 | def __init__(self):
18 | self.Referer = 'https://m.lizhiweike.com/channel2/1192275'
19 | self.UserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'
20 | self.headers = {
21 | 'Referer':self.Referer,
22 | 'Host':'m.lizhiweike.com',
23 | 'User-Agent':self.UserAgent
24 | }
25 |
26 | def request(self, flow: mitmproxy.http.HTTPFlow):
27 |
28 | # Inject the Referer header into every outgoing request
29 | flow.request.headers['Referer'] = self.Referer
30 |
31 | def response(self, flow: mitmproxy.http.HTTPFlow):
32 |
33 | # Bind the module-level globals updated below
34 | global filename,title,current_filename,cookie
35 |
36 | if 'lecture' in flow.request.url and 'info' in flow.request.url:
37 |
38 | # Parse the JSON body
39 | json_data = json.loads(flow.response.text)
40 |
41 | try:
42 | # Title of the current video
43 | title = json_data['data']['share_info']['share_title'].replace(' ','')
44 | except:
45 | pass
46 |
47 | # Grab the actual video URL
48 | if 'qcvideo' in flow.request.url:
49 |
50 | # Parse the JSON body
51 | json_data = json.loads(flow.response.text)
52 |
53 | # Video URL
54 | video_url = json_data['data']['play_list'][0]['url']
55 |
56 | print(f'[INFO] title: {title}, mp4 URL: {video_url}')
57 |
58 | # Create the output folder if it does not exist yet
59 | filename = current_filename + '/下载成功的视频/'
60 | if not os.path.exists(filename):
61 | os.makedirs(filename)
62 |
63 | # Build the mp4 output path
64 | mp4_filename_path = f'{filename}{title}.mp4'
65 |
66 | headers = {
67 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
68 | 'referer':'https://m.lizhiweike.com/channel2/1192275',
69 | 'Cookie':cookie
70 | }
71 |
72 | # Download the video
73 | html = requests.get(url=video_url,headers=headers)
74 | with open(mp4_filename_path,'wb') as f:
75 | f.write(html.content)
76 |
77 | addons = [
78 | Counter()
79 | ]
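80 |
81 | # The one-shot requests.get above holds the whole mp4 in memory; for long
82 | # lectures a chunked download is safer. A minimal sketch, using the same
83 | # headers/URL values as above (function name is illustrative):
84 | def download_streaming(video_url, headers, mp4_filename_path):
85 |     with requests.get(video_url, headers=headers, stream=True) as resp:
86 |         resp.raise_for_status()
87 |         with open(mp4_filename_path, 'wb') as f:
88 |             for chunk in resp.iter_content(chunk_size=1 << 20): # 1 MiB chunks
89 |                 f.write(chunk)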
--------------------------------------------------------------------------------
/下载荔枝微课/requirements.txt:
--------------------------------------------------------------------------------
1 | asgiref==3.3.4
2 | beautifulsoup4==4.9.3
3 | blinker==1.4
4 | Brotli==1.0.9
5 | bs4==0.0.1
6 | certifi==2021.5.30
7 | cffi==1.14.6
8 | charset-normalizer==2.0.3
9 | click==7.1.2
10 | cryptography==3.2.1
11 | Flask==1.1.4
12 | h11==0.12.0
13 | h2==4.0.0
14 | hpack==4.0.0
15 | hyperframe==6.0.1
16 | idna==3.2
17 | iso8601==0.1.14
18 | itsdangerous==1.1.0
19 | Jinja2==2.11.3
20 | kaitaistruct==0.9
21 | ldap3==2.8.1
22 | m3u8==0.9.0
23 | MarkupSafe==2.0.1
24 | mitmproxy==5.3.0
25 | msgpack==1.0.2
26 | passlib==1.7.4
27 | protobuf==3.13.0
28 | publicsuffix2==2.20191221
29 | pyasn1==0.4.8
30 | pycparser==2.20
31 | pycryptodome==3.10.1
32 | pydivert==2.1.0
33 | pyOpenSSL==19.1.0
34 | pyparsing==2.4.7
35 | pyperclip==1.8.2
36 | requests==2.26.0
37 | ruamel.yaml==0.16.13
38 | ruamel.yaml.clib==0.2.6
39 | selenium==3.141.0
40 | six==1.16.0
41 | sortedcontainers==2.2.2
42 | soupsieve==2.2.1
43 | tornado==6.1
44 | typing-extensions==3.10.0.0
45 | urllib3==1.26.6
46 | urwid==2.1.2
47 | Werkzeug==1.0.1
48 | wsproto==0.15.0
49 | zstandard==0.14.1
50 |
--------------------------------------------------------------------------------
/下载荔枝微课/selenium启动/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载荔枝微课/selenium启动/chromedriver.exe
--------------------------------------------------------------------------------
/下载荔枝微课/selenium启动/selenium_start.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 |
3 | PROXY = 'http://127.0.0.1:8080'
4 |
5 | chrome_options = webdriver.ChromeOptions()
6 | chrome_options.add_argument('--proxy-server=' + PROXY)
7 | chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) # hide the "controlled by automated software" banner
8 |
9 |
10 | browser = webdriver.Chrome(executable_path=r'chromedriver.exe',options=chrome_options) # r prefix: raw string, no escape processing
11 |
12 | '''
13 | Course URL crawled this time: https://m.lizhiweike.com/channel2/1192275
14 | '''
15 |
16 | url = 'https://m.lizhiweike.com/channel2/1192275'
17 | browser.get(url) # open the page
18 |
19 |
--------------------------------------------------------------------------------
/下载荔枝微课/selenium启动/谷歌驱动下载地址.txt:
--------------------------------------------------------------------------------
1 | Download address 1:
2 | http://chromedriver.storage.googleapis.com/index.html
3 |
4 | Download address 2:
5 | https://registry.npmmirror.com/binary.html?path=chromedriver/
--------------------------------------------------------------------------------
/下载荔枝微课/启动程序指令.txt:
--------------------------------------------------------------------------------
1 | First open a cmd window and run: mitmweb -s request_demo.py
2 | Then cd into the selenium启动 folder and run: python selenium_start.py
--------------------------------------------------------------------------------
/下载荔枝微课/安装环境指令.txt:
--------------------------------------------------------------------------------
1 | Install everything at once:
2 | pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple
3 |
4 | Or install the packages one by one:
5 | pip install mitmproxy
6 | pip install selenium
7 | pip install m3u8
8 | pip install requests
9 | pip install bs4
10 | pip install pycryptodome
--------------------------------------------------------------------------------
/京东商品信息/crawl.py:
--------------------------------------------------------------------------------
1 | #Python3.7
2 | #encoding = utf-8
3 |
4 | import requests,re,json
5 | from bs4 import BeautifulSoup
6 | from urllib import parse
7 |
8 | KEYWORD = parse.quote('python')
9 |
10 | base = 'https://item.jd.com'
11 | headers = {
12 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
13 | 'Connection':'keep-alive',
14 | # reference: https://search.jd.com/Search?keyword=python&enc=utf-8&wq=python
15 | }
16 |
17 |
18 | def get_index(url):
19 | # Fetch the initial search page
20 |
21 | session = requests.Session()
22 | session.headers = headers
23 | html = session.get(url)
24 | html.encoding = 'GBK'
25 | soup = BeautifulSoup(html.text,'lxml')
26 | items = soup.select('li.gl-item')
27 |
28 |
29 | for item in items:
30 | inner_url = item.select('.gl-i-wrap .p-img a')[0].get('href') # item is already the li.gl-item, so select inside it
31 | print(inner_url)
32 | inner_url = parse.urljoin(base,inner_url) # normalize to an absolute URL
33 |
34 | item_id = get_id(inner_url)
35 |
36 | # Number of comments
37 | comm_num = get_comm_num(inner_url)
38 | inner_url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv6501&productId=11993134&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
39 |
40 | # Fetch the comments
41 | if comm_num>0:
42 | get_comm(inner_url,comm_num,item_id)
43 |
44 |
45 |
46 |
47 | def get_comm(url,comm_num,item_id ):
48 |
49 | headers = {
50 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
51 | }
52 | good_comments = '' # accumulated comment text
53 | # Fetch the comments page by page
54 |
55 | pages = comm_num//10
56 | if pages>99:
57 | pages = 99
58 |
59 | for page in range(0,pages):
60 | comment_url = 'https://sclub.jd.com/comment/productPageComments.action?'\
61 | 'callback=fetchJSON_comment98vv4&productId={}&score=0'\
62 | '&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1'.format(item_id,page)
63 |
64 | json_decoder = requests.get(comment_url,headers=headers).text
65 | try:
66 | if json_decoder:
67 | start = json_decoder.find('{"productAttr":null,')
68 |
69 | end = json_decoder.find(',"afterDays":0}]}')+len(',"afterDays":0}]}')
70 |
71 | content = json.loads(json_decoder[start:end])
72 |
73 | comments = content['comments']
74 |
75 | for c in comments:
76 | comm = c['content']
77 | good_comments+="{}|".format(comm)
78 |
79 | print(good_comments)
80 | except Exception as e:
81 | print(e)
82 |
83 | print(item_id,good_comments)
84 |
85 | def get_shop_info(url): # fetch shop info (currently only selects the shop name)
86 | shop_data = {}
87 | html = requests.get(url,headers = headers)
88 | soup = BeautifulSoup(html.text,'lxml')
89 | try:
90 | shop_name = soup.select('div.mt h3 a')
91 | except Exception as e:
92 | raise e
93 |
94 | def get_index_lists(html): # list the comment counts on an index page
95 | html.encoding = 'utf8'
96 | soup = BeautifulSoup(html.text,'lxml')
97 | lis = soup.find_all('li',attrs = {"class":"gl-item"})
98 | for li in lis:
99 | number = li.find('div',attrs = {"class":"p-commit"}).strong
100 | print(number)
101 |
102 | def get_comm_num(url): # number of comments for a product
103 |
104 | item_id = get_id(url)
105 | comm_url = 'https://club.jd.com/comment/productCommentSummaries.action?'\
106 | 'referenceIds={}&callback=jQuery3096445'.format(item_id)
107 | comment = requests.get(comm_url,headers = headers).text
108 | start = comment.find('{"CommentsCount":') # start of the JSON inside the JSONP wrapper
109 | end = comment.find('"PoorRateStyle":0}]}')+len('"PoorRateStyle":0}]}') # end of the JSON
110 | try:
111 | content = json.loads(comment[start:end])['CommentsCount'] # parse the JSON slice
112 | except:
113 | return 0
114 | comm_num = content[0]['CommentCount']
115 | return comm_num
116 |
117 |
118 | def get_id(url): # extract the numeric product id from a URL
119 | pattern = re.compile(r'\d+') # raw string avoids the invalid-escape warning
120 | res = pattern.findall(url)
121 | return res[0]
122 |
123 |
124 | if __name__ == '__main__':
125 |
126 | for i in range(1,30,2):
127 | url = 'https://search.jd.com/Search?'\
128 | 'keyword={}&page={}'.format(KEYWORD,i)
129 | get_index(url)
130 |
131 |
132 |
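133 | # The find()-based slicing in get_comm and get_comm_num breaks whenever JD
134 | # reorders the JSON keys. A sketch of a more tolerant JSONP unwrap that
135 | # strips the "callback(...)" wrapper with a regex instead of fixed markers
136 | # (helper name is illustrative):
137 | def unwrap_jsonp(text):
138 |     m = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
139 |     return json.loads(m.group(1)) if m else None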
--------------------------------------------------------------------------------
/房天下/crawl.py:
--------------------------------------------------------------------------------
1 | #Python3.7
2 | #encoding = utf-8
3 |
4 | import requests, re
5 | from lxml import etree
6 | from urllib import parse
7 | from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
8 | from db import sess, House
9 |
10 | headers = {
11 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
12 | 'referer':'https://fs.zu.fang.com/house-a0617/i32/',
13 | # reference: https://zu.fang.com/house-a01
14 | # fill in your Cookie here
15 | }
16 |
17 | session = requests.session() # one shared session keeps cookies, no need to re-authenticate per request
18 | session.headers = headers
19 |
20 |
21 | # Pull the first run of digits out of a string
22 | def get_number(text):
23 | number = re.compile(r'\d+') # raw string avoids the invalid-escape warning
24 | return number.findall(text)[0]
25 |
26 |
27 | # Read the total page count from the pager
28 | def get_pages(html):
29 | soup = etree.HTML(html.text)
30 | pages = soup.xpath("//div[@class='fanye']/span/text()")
31 | number = get_number(pages[0])
32 | if number:
33 | return int(number)
34 | return None
35 |
36 |
37 | def get_house_data(url, *args):
38 | headers = {
39 | 'Connection': 'keep-alive', # persistent connection
40 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
41 | 'Referer': 'https://fs.zu.fang.com/house-a0617/i33/',
42 | # reference: https://zu.fang.com/house-a01
43 | # fill in your Cookie here
44 | }
45 |
46 | loca_url = re.compile("(.*?) | ") # redirect-link pattern (defined but never used)
47 | xiangqing_url = re.compile('location.href="(.*?)"') # target of the JS redirect
48 |
49 | session = requests.session() # per-call session: keep-alive and cookies across the redirect hops
50 | session.headers = headers
51 |
52 | url = 'http://search.fang.com/captcha-854085290c4833ba19/redirect?h=' + url
53 |
54 | html = session.get(url)
55 |
56 | one_url = xiangqing_url.findall(html.text)[-1] # first redirect hop
57 | html = session.get(one_url)
58 |
59 | two_url = xiangqing_url.findall(html.text)[-1] # second redirect hop
60 | html = session.get(two_url)
61 |
62 | soup = etree.HTML(html.text)
63 | xiangqing = soup.xpath('//div[@class="fyms_con floatl gray3"]/text()')
64 | xiangqing = '|'.join(xiangqing)
65 | print('block:{}\ttitle:{}\trent:{}\tdetails:{}'.format(args[0], args[2], args[1], xiangqing))
66 |
67 | s = sess()
68 | try:
69 | house = House(block=args[0],
70 | title=args[2],
71 | rent=args[1],
72 | data=xiangqing)
73 |
74 | s.add(house)
75 | s.commit()
76 | print('commit')
77 | except Exception as e:
78 | print('rollback', e)
79 | s.rollback()
80 |
81 |
82 | # Scrape every listing on one page
83 | def get_data_next(url):
84 | html = session.get(url)
85 | soup = etree.HTML(html.text)
86 | dls = soup.xpath("//div[@class='houseList']/dl")
87 | block = soup.xpath("//span[@class='selestfinds']/a/text()")
88 | rfss = soup.xpath("//input[@id='baidid']/@value")[0]
89 | for dl in dls:
90 | try:
91 | title = dl.xpath('dd/p/a/text()')[0]
92 | rent = dl.xpath("dd/div/p/span[@class='price']/text()")[0]
93 | href = parse.urljoin('https://zu.fang.com',
94 | dl.xpath('dd/p/a/@href')[0]) # join into an absolute URL
95 | get_house_data(href, block, rent, title)
96 | except IndexError as e:
97 | print('dl error', e)
98 |
99 |
100 | # Fan out over all result pages
101 | def get_data(html):
102 | pages = get_pages(html)
103 | if not pages:
104 | pages = 1
105 | urls = [
106 | 'https://zu.fang.com/house-a01/i3%d/' % i for i in range(1, pages + 1)
107 | ]
108 |
109 | with ProcessPoolExecutor(max_workers=2) as t:
110 |
111 | for url in urls:
112 | t.submit(get_data_next, url)
113 |
114 |
115 | # Entry point: fetch the district index page
116 | def get_index(url):
117 | html = session.get(url, headers=headers)
118 | if html.status_code == 200:
119 | get_data(html)
120 | else:
121 | print('Request for page {} failed'.format(url))
122 |
123 |
124 | def main():
125 | urls = ['https://zu.fang.com/house-a0{}/'.format(i) for i in range(1, 17)]
126 | with ProcessPoolExecutor(max_workers=2) as p:
127 | for url in urls:
128 | p.submit(get_index, url)
129 |
130 |
131 | if __name__ == '__main__':
132 | main()
133 | session.close()
134 |
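135 | # get_house_data above follows exactly two "location.href" hops; a sketch
136 | # that resolves a chain of any length instead (helper name and the max_hops
137 | # safety cap are illustrative):
138 | def resolve_js_redirects(session, url, max_hops=5):
139 |     hop = re.compile(r'location\.href="(.*?)"')
140 |     html = session.get(url)
141 |     for _ in range(max_hops):
142 |         targets = hop.findall(html.text)
143 |         if not targets:
144 |             break # no further redirect: this is the final page
145 |         html = session.get(targets[-1])
146 |     return html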
--------------------------------------------------------------------------------
/房天下/db.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import create_engine
2 | from sqlalchemy import Column,Integer,String,Text
3 | from sqlalchemy.orm import sessionmaker,scoped_session
4 | from sqlalchemy.ext.declarative import declarative_base
5 |
6 | BASE = declarative_base() # base class for all models
7 |
8 | # This project does not use the pymysql driver;
9 | # install the official one instead: pip install mysql-connector-python
10 | # "mysqlconnector" in the engine URL selects MySQL's official driver
11 | engine = create_engine(
12 | "mysql+mysqlconnector://root:root@127.0.0.1:3306/test?charset=utf8", # force the connection charset
13 | max_overflow = 500, # extra connections allowed beyond the pool size
14 | pool_size = 100, # connection pool size
15 | echo = False, # set True to log SQL for debugging
16 | )
17 |
18 | class House(BASE): # inherits the declarative base
19 | __tablename__ = 'house' # table name
20 | id = Column(Integer,primary_key = True,autoincrement = True)
21 | block = Column(String(125))
22 | title = Column(String(125))
23 | rent = Column(String(125))
24 | data = Column(Text())
25 |
26 | BASE.metadata.create_all(engine) # create the table from the model
27 | Session = sessionmaker(engine)
28 | sess = scoped_session(Session)
29 |
30 |
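31 | # A minimal usage sketch of the scoped_session factory above (assumes the
32 | # MySQL instance and credentials from create_engine are reachable; the row
33 | # values are illustrative):
34 | if __name__ == '__main__':
35 |     s = sess()
36 |     try:
37 |         s.add(House(block='demo', title='demo', rent='0', data='demo'))
38 |         s.commit()
39 |     except Exception:
40 |         s.rollback()
41 |     finally:
42 |         sess.remove() # hand the thread-local session back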
--------------------------------------------------------------------------------
/新版QQ音乐/README.md:
--------------------------------------------------------------------------------
1 | **Notes**
2 | - [Video explaining how the QQ Music crawler works](https://www.bilibili.com/video/BV1pk4y1m7TG)
3 | - `execjs` depends on `NodeJS`, so install it first (this project was developed against `NodeJS v14.6.0`); a minimal usage sketch follows this list.
4 | - `cd` into this folder and run `python demo.py`.
5 | - `demo.py` is the no-database version: it crawls a single category without multiprocessing, to keep it easy to follow.
6 | - Be sure to check the project-relative paths used for `filename` and in the `with open` calls in `demo.py`.
7 | - `get_singer_mid(index)` decides which category is crawled.
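8 |
9 | A minimal sketch of how `execjs` evaluates `get_sign.js` (paths assume you run it from this folder; the payload string is illustrative):
10 |
11 | ```python
12 | import execjs
13 |
14 | with open('./get_sign.js', 'r', encoding='utf-8') as f:
15 |     ctx = execjs.compile(f.read())
16 |
17 | # sign the request payload exactly as the site's JS would
18 | print(ctx.call('get_sign', '{"comm":{"ct":24,"cv":0}}'))
19 | ```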
--------------------------------------------------------------------------------
/新版QQ音乐/crawl.py:
--------------------------------------------------------------------------------
1 | #Python3.7
2 | #encoding = utf-8
3 |
4 | import execjs,requests,math,os,threading
5 | from urllib import parse
6 | from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
7 | from db import SQLsession,Song
8 |
9 | lock = threading.Lock()
10 |
11 | headers = {
12 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
13 | 'Referer':'https://y.qq.com/portal/singer_list.html',
14 | # reference: https://y.qq.com/portal/singer_list.html#page=1&index=1&
15 | }
16 |
17 | session = SQLsession()
18 |
19 | def get_sign(data):
20 |
21 | with open('./新版QQ音乐/get_sign.js','r',encoding='utf-8') as f:
22 | text = f.read()
23 |
24 | js_data = execjs.compile(text)
25 | sign = js_data.call('get_sign',data)
26 | return sign
27 |
28 |
29 | def myProcess():
30 | # Singers are split into 27 groups by first letter
31 | with ProcessPoolExecutor(max_workers = 2) as p: # 27 tasks, at most 2 worker processes
32 | for i in range(1,28): # groups 1..27
33 | p.submit(get_singer_mid,i)
34 |
35 |
36 | def get_singer_mid(index):
37 | # index runs from 1 to 27
38 | # Open the singer list, read the total singer count, and divide by 80 (the page size) to build the follow-up pages
39 | # Collect each singer's mid for the detail page
40 | data = '{"comm":{"ct":24,"cv":0},"singerList":'\
41 | '{"module":"Music.SingerListServer","method":"get_singer_list","param":'\
42 | '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":0,"cur_page":1}}}'%(str(index))
43 | sign = get_sign(data)
44 |
45 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'\
46 | '&sign={}'\
47 | '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
48 | '¬ice=0&platform=yqq.json&needNewCode=0'\
49 | '&data={}'.format(sign,parse.quote(data))
50 |
51 | html = requests.get(url,headers = headers).json()
52 |
53 | total = html['singerList']['data']['total'] # total number of singers in this group
54 |
55 | pages = int(math.floor(int(total)/80)) # 80 singers per page, rounded down
56 |
57 | thread_number = max(pages, 1) # at least one worker (pages can be 0 for small groups)
58 | Thread = ThreadPoolExecutor(max_workers = thread_number)
59 |
60 | sin = 0
61 | # Page through every singer under this letter
62 | for page in range(1,pages+2):
63 |
64 | data = '{"comm":{"ct":24,"cv":0},"singerList":'\
65 | '{"module":"Music.SingerListServer","method":"get_singer_list","param":'\
66 | '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":%s,"cur_page":%s}}}'%(str(index),str(sin),str(page))
67 | sign = get_sign(data)
68 |
69 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'\
70 | '&sign={}'\
71 | '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
72 | '¬ice=0&platform=yqq.json&needNewCode=0'\
73 | '&data={}'.format(sign,parse.quote(data))
74 |
75 | html = requests.get(url,headers = headers).json()
76 |
77 | sings = html['singerList']['data']['singerlist']
78 |
79 | for sing in sings:
80 |
81 | singer_name = sing['singer_name'] # singer's name
82 | mid = sing['singer_mid'] # singer's mid
83 |
84 | Thread.submit(get_singer_data,mid = mid,
85 | singer_name = singer_name,)
86 | sin+=80
87 |
88 | # Fetch one singer's song list
89 | def get_singer_data(mid,singer_name):
90 | # Use the singer's mid to open the detail page listing all of their songs
91 | # The first request asks for only 10 songs, just to learn the total count
92 |
93 | data = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'\
94 | '{"order":1,"singerMid":"%s","begin":0,"num":10}'\
95 | ',"module":"musichall.song_list_server"}}'%(str(mid))
96 |
97 | sign = get_sign(data)
98 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'\
99 | '&g_tk=5381&sign={}&loginUin=0'\
100 | '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq.json&needNewCode=0'\
101 | '&data={}'.format(sign,parse.quote(data))
102 |
103 | html = requests.get(url,headers = headers).json()
104 |
105 | songs_num = html['singerSongList']['data']['totalNum'] # total number of songs
106 |
107 |
108 | for number in range(0,songs_num,100):
109 |
110 | data = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'\
111 | '{"order":1,"singerMid":"%s","begin":%s,"num":%s}'\
112 | ',"module":"musichall.song_list_server"}}'%(str(mid),str(number),str(songs_num))
113 |
114 | sign = get_sign(data)
115 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'\
116 | '&g_tk=5381&sign={}&loginUin=0'\
117 | '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq.json&needNewCode=0'\
118 | '&data={}'.format(sign,parse.quote(data))
119 |
120 | html = requests.get(url,headers = headers).json()
121 |
122 | datas = html['singerSongList']['data']['songList']
123 |
124 | for d in datas:
125 | sing_name = d['songInfo']['title']
126 | song_mid = d['songInfo']['mid']
127 | try:
128 | lock.acquire() # serialize DB writes across threads
129 |
130 | session.add(Song(song_name = sing_name,
131 | song_singer = singer_name,
132 | song_mid = song_mid))
133 | session.commit()
134 |
135 | lock.release() # note: an exception above would leave the lock held
136 | print('commit')
137 | except:
138 | session.rollback()
139 | print('rollback')
140 |
141 |
142 | print('Singer: {}\tSong: {}\tSong ID: {}'.format(singer_name,sing_name,song_mid))
143 | download(song_mid,sing_name,singer_name)
144 |
145 |
146 | def download(song_mid,sing_name,singer_name):
147 |
148 | qq_number = '1641202711' # replace with your own QQ number
149 | data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch"'\
150 | ',"param":{"guid":"4803422090","calltype":0,"userip":""}},'\
151 | '"req_0":{"module":"vkey.GetVkeyServer","method":"CgiGetVkey",'\
152 | '"param":{"guid":"4803422090","songmid":["%s"],"songtype":[0],'\
153 | '"uin":"%s","loginflag":1,"platform":"20"}},"comm":{"uin":%s,"format":"json","ct":24,"cv":0}}'%(str(song_mid),str(qq_number),str(qq_number))
154 |
155 | sign = get_sign(data)
156 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getplaysongvkey27494207511290925'\
157 | '&g_tk=1291538537&sign={}&loginUin={}'\
158 | '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0'\
159 | '&platform=yqq.json&needNewCode=0&data={}'.format(sign,qq_number,parse.quote(data))
160 |
161 | html = requests.get(url,headers = headers).json()
162 |
163 | try:
164 | purl = html['req_0']['data']['midurlinfo'][0]['purl']
165 |
166 |
167 | url = 'https://dl.stream.qqmusic.qq.com/{}'.format(purl)
168 |
169 | html = requests.get(url,headers = headers,verify=False)
170 |
171 | html.encoding = 'utf-8'
172 |
173 | sing_file_name = '{} -- {}'.format(sing_name,singer_name)
174 |
175 | filename = './新版QQ音乐/歌曲'
176 |
177 | if html.status_code != 403:
178 | if not os.path.exists(filename):
179 | os.makedirs(filename)
180 |
181 | with open('./新版QQ音乐/歌曲/{}.m4a'.format(sing_file_name),'wb') as f:
182 | print('\nDownloading {} ...\n'.format(sing_file_name))
183 | f.write(html.content)
184 |
185 | except:
186 | print('Permission check failed, or no matching song was found')
187 |
188 |
189 |
190 |
191 |
192 | if __name__ == "__main__":
193 | myProcess()
194 |
195 |
196 |
197 |
--------------------------------------------------------------------------------
/新版QQ音乐/db.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import Column,Integer,String,create_engine
2 | from sqlalchemy.orm import sessionmaker,scoped_session
3 | from sqlalchemy.ext.declarative import declarative_base
4 |
5 | # This project does not use the pymysql driver;
6 | # install the official one instead: pip install mysql-connector-python
7 | # "mysqlconnector" in the engine URL selects MySQL's official driver
8 | engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/test?charset=utf8',
9 | max_overflow = 500, # extra connections allowed beyond the pool size
10 | pool_size = 100, # connection pool size
11 | echo = False, # set True to log SQL for debugging
12 | )
13 | Base = declarative_base()
14 |
15 | class Song(Base):
16 | __tablename__ = 'song'
17 | song_id = Column(Integer,primary_key = True,autoincrement = True)
18 | song_name = Column(String(64))
19 | song_ablum = Column(String(64))
20 | song_mid = Column(String(50))
21 | song_singer = Column(String(50))
22 | Base.metadata.create_all(engine)
23 |
24 | DBsession = sessionmaker(bind = engine)
25 |
26 | SQLsession = scoped_session(DBsession)
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/新版QQ音乐/demo.py:
--------------------------------------------------------------------------------
1 | #Python3.7
2 | #encoding = utf-8
3 |
4 | import execjs,requests,math,os,threading
5 | from urllib import parse
6 | from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
7 | from db import SQLsession,Song
8 |
9 | # lock = threading.Lock()
10 |
11 | headers = {
12 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
13 | 'Referer':'https://y.qq.com/portal/singer_list.html',
14 | # reference: https://y.qq.com/portal/singer_list.html#page=1&index=1&
15 | }
16 |
17 | # session = SQLsession()
18 |
19 | def get_sign(data):
20 |
21 | with open('./get_sign.js','r',encoding='utf-8') as f:
22 | text = f.read()
23 |
24 | js_data = execjs.compile(text)
25 | sign = js_data.call('get_sign',data)
26 | return sign
27 |
28 |
29 | def myProcess():
30 | # Singers are split into 27 groups by first letter
31 | with ProcessPoolExecutor(max_workers = 2) as p: # 27 tasks, at most 2 worker processes
32 | for i in range(1,28): # groups 1..27
33 | p.submit(get_singer_mid,i)
34 |
35 |
36 | def get_singer_mid(index):
37 | # index runs from 1 to 27
38 | # Open the singer list, read the total singer count, and divide by 80 (the page size) to build the follow-up pages
39 | # Collect each singer's mid for the detail page
40 | data = '{"comm":{"ct":24,"cv":0},"singerList":'\
41 | '{"module":"Music.SingerListServer","method":"get_singer_list","param":'\
42 | '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":0,"cur_page":1}}}'%(str(index))
43 | sign = get_sign(data)
44 |
45 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'\
46 | '&sign={}'\
47 | '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
48 | '¬ice=0&platform=yqq.json&needNewCode=0'\
49 | '&data={}'.format(sign,parse.quote(data))
50 |
51 | html = requests.get(url,headers = headers).json()
52 |
53 | total = html['singerList']['data']['total'] # total number of singers in this group
54 |
55 | pages = int(math.floor(int(total)/80)) # 80 singers per page, rounded down
56 |
57 | thread_number = max(pages, 1) # at least one worker (pages can be 0 for small groups)
58 | Thread = ThreadPoolExecutor(max_workers = thread_number)
59 |
60 | sin = 0
61 | # Page through every singer under this letter
62 | for page in range(1,pages+2):
63 |
64 | data = '{"comm":{"ct":24,"cv":0},"singerList":'\
65 | '{"module":"Music.SingerListServer","method":"get_singer_list","param":'\
66 | '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":%s,"cur_page":%s}}}'%(str(index),str(sin),str(page))
67 | sign = get_sign(data)
68 |
69 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'\
70 | '&sign={}'\
71 | '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
72 | '¬ice=0&platform=yqq.json&needNewCode=0'\
73 | '&data={}'.format(sign,parse.quote(data))
74 |
75 | html = requests.get(url,headers = headers).json()
76 |
77 | sings = html['singerList']['data']['singerlist']
78 |
79 | for sing in sings:
80 |
81 | singer_name = sing['singer_name'] # singer's name
82 | mid = sing['singer_mid'] # singer's mid
83 |
84 | Thread.submit(get_singer_data,mid = mid,
85 | singer_name = singer_name,)
86 | sin+=80
87 |
88 | # Fetch one singer's song list
89 | def get_singer_data(mid,singer_name):
90 | # Use the singer's mid to open the detail page listing all of their songs
91 | # The first request asks for only 10 songs, just to learn the total count
92 |
93 | data = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'\
94 | '{"order":1,"singerMid":"%s","begin":0,"num":10}'\
95 | ',"module":"musichall.song_list_server"}}'%(str(mid))
96 |
97 | sign = get_sign(data)
98 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'\
99 | '&g_tk=5381&sign={}&loginUin=0'\
100 | '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq.json&needNewCode=0'\
101 | '&data={}'.format(sign,parse.quote(data))
102 |
103 | html = requests.get(url,headers = headers).json()
104 |
105 | songs_num = html['singerSongList']['data']['totalNum'] # total number of songs
106 |
107 |
108 | for number in range(0,songs_num,100):
109 |
110 | data = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'\
111 | '{"order":1,"singerMid":"%s","begin":%s,"num":%s}'\
112 | ',"module":"musichall.song_list_server"}}'%(str(mid),str(number),str(songs_num))
113 |
114 | sign = get_sign(data)
115 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'\
116 | '&g_tk=5381&sign={}&loginUin=0'\
117 | '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq.json&needNewCode=0'\
118 | '&data={}'.format(sign,parse.quote(data))
119 |
120 | html = requests.get(url,headers = headers).json()
121 |
122 | datas = html['singerSongList']['data']['songList']
123 |
124 | for d in datas:
125 | sing_name = d['songInfo']['title']
126 | song_mid = d['songInfo']['mid']
127 | # try:
128 | # lock.acquire() # serialize DB writes across threads
129 | #
130 | # session.add(Song(song_name = sing_name,
131 | # song_singer = singer_name,
132 | # song_mid = song_mid))
133 | # session.commit()
134 | #
135 | # lock.release() # release the lock
136 | # print('commit')
137 | # except:
138 | # session.rollback()
139 | # print('rollback')
140 |
141 |
142 | print('Singer: {}\tSong: {}\tSong ID: {}'.format(singer_name,sing_name,song_mid))
143 | download(song_mid,sing_name,singer_name)
144 |
145 |
146 | def download(song_mid,sing_name,singer_name):
147 |
148 | qq_number = '1641202711' # replace with your own QQ number
149 | data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch"'\
150 | ',"param":{"guid":"4803422090","calltype":0,"userip":""}},'\
151 | '"req_0":{"module":"vkey.GetVkeyServer","method":"CgiGetVkey",'\
152 | '"param":{"guid":"4803422090","songmid":["%s"],"songtype":[0],'\
153 | '"uin":"%s","loginflag":1,"platform":"20"}},"comm":{"uin":%s,"format":"json","ct":24,"cv":0}}'%(str(song_mid),str(qq_number),str(qq_number))
154 |
155 | sign = get_sign(data)
156 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getplaysongvkey27494207511290925'\
157 | '&g_tk=1291538537&sign={}&loginUin={}'\
158 | '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0'\
159 | '&platform=yqq.json&needNewCode=0&data={}'.format(sign,qq_number,parse.quote(data))
160 |
161 | html = requests.get(url,headers = headers).json()
162 |
163 | try:
164 | purl = html['req_0']['data']['midurlinfo'][0]['purl']
165 |
166 |
167 | url = 'https://dl.stream.qqmusic.qq.com/{}'.format(purl)
168 |
169 | html = requests.get(url,headers = headers,verify=False)
170 |
171 | html.encoding = 'utf-8'
172 |
173 | sing_file_name = '{} -- {}'.format(sing_name,singer_name)
174 |
175 | filename = './歌曲'
176 |
177 | if html.status_code != 403:
178 | if not os.path.exists(filename):
179 | os.makedirs(filename)
180 |
181 | with open('./歌曲/{}.m4a'.format(sing_file_name),'wb') as f:
182 | print('\nDownloading {} ...\n'.format(sing_file_name))
183 | f.write(html.content)
184 |
185 | except:
186 | print('Permission check failed, or no matching song was found')
187 |
188 |
189 |
190 |
191 |
192 | if __name__ == "__main__":
193 | # myProcess()
194 | get_singer_mid(1)
195 |
196 |
197 |
--------------------------------------------------------------------------------
/新版QQ音乐/get_sign.js:
--------------------------------------------------------------------------------
1 |
2 | this.window = this;
3 | var sign = null;
4 |
5 | !function(n, t) {
6 | "object" == typeof exports && "undefined" != typeof module ? module.exports = t() : "function" == typeof define && define.amd ? define(t) : (n = n || self).getSecuritySign = t()
7 | } (this,
8 | function() {
9 | "use strict";
10 | var n = function() {
11 | if ("undefined" != typeof self) return self;
12 | if ("undefined" != typeof window) return window;
13 | if ("undefined" != typeof global) return global;
14 | throw new Error("unable to locate global object")
15 | } ();
16 | n.__sign_hash_20200305 = function(n) {
17 | function l(n, t) {
18 | var o = (65535 & n) + (65535 & t);
19 | return (n >> 16) + (t >> 16) + (o >> 16) << 16 | 65535 & o
20 | }
21 | function r(n, t, o, e, u, p) {
22 | return l((i = l(l(t, n), l(e, p))) << (r = u) | i >>> 32 - r, o);
23 | var i, r
24 | }
25 | function g(n, t, o, e, u, p, i) {
26 | return r(t & o | ~t & e, n, t, u, p, i)
27 | }
28 | function a(n, t, o, e, u, p, i) {
29 | return r(t & e | o & ~e, n, t, u, p, i)
30 | }
31 | function s(n, t, o, e, u, p, i) {
32 | return r(t ^ o ^ e, n, t, u, p, i)
33 | }
34 | function v(n, t, o, e, u, p, i) {
35 | return r(o ^ (t | ~e), n, t, u, p, i)
36 | }
37 | function t(n) {
38 | return function(n) {
39 | var t, o = "";
40 | for (t = 0; t < 32 * n.length; t += 8) o += String.fromCharCode(n[t >> 5] >>> t % 32 & 255);
41 | return o
42 | } (function(n, t) {
43 | n[t >> 5] |= 128 << t % 32,
44 | n[14 + (t + 64 >>> 9 << 4)] = t;
45 | var o, e, u, p, i, r = 1732584193,
46 | f = -271733879,
47 | h = -1732584194,
48 | c = 271733878;
49 | for (o = 0; o < n.length; o += 16) r = g(e = r, u = f, p = h, i = c, n[o], 7, -680876936),
50 | c = g(c, r, f, h, n[o + 1], 12, -389564586),
51 | h = g(h, c, r, f, n[o + 2], 17, 606105819),
52 | f = g(f, h, c, r, n[o + 3], 22, -1044525330),
53 | r = g(r, f, h, c, n[o + 4], 7, -176418897),
54 | c = g(c, r, f, h, n[o + 5], 12, 1200080426),
55 | h = g(h, c, r, f, n[o + 6], 17, -1473231341),
56 | f = g(f, h, c, r, n[o + 7], 22, -45705983),
57 | r = g(r, f, h, c, n[o + 8], 7, 1770035416),
58 | c = g(c, r, f, h, n[o + 9], 12, -1958414417),
59 | h = g(h, c, r, f, n[o + 10], 17, -42063),
60 | f = g(f, h, c, r, n[o + 11], 22, -1990404162),
61 | r = g(r, f, h, c, n[o + 12], 7, 1804603682),
62 | c = g(c, r, f, h, n[o + 13], 12, -40341101),
63 | h = g(h, c, r, f, n[o + 14], 17, -1502002290),
64 | r = a(r, f = g(f, h, c, r, n[o + 15], 22, 1236535329), h, c, n[o + 1], 5, -165796510),
65 | c = a(c, r, f, h, n[o + 6], 9, -1069501632),
66 | h = a(h, c, r, f, n[o + 11], 14, 643717713),
67 | f = a(f, h, c, r, n[o], 20, -373897302),
68 | r = a(r, f, h, c, n[o + 5], 5, -701558691),
69 | c = a(c, r, f, h, n[o + 10], 9, 38016083),
70 | h = a(h, c, r, f, n[o + 15], 14, -660478335),
71 | f = a(f, h, c, r, n[o + 4], 20, -405537848),
72 | r = a(r, f, h, c, n[o + 9], 5, 568446438),
73 | c = a(c, r, f, h, n[o + 14], 9, -1019803690),
74 | h = a(h, c, r, f, n[o + 3], 14, -187363961),
75 | f = a(f, h, c, r, n[o + 8], 20, 1163531501),
76 | r = a(r, f, h, c, n[o + 13], 5, -1444681467),
77 | c = a(c, r, f, h, n[o + 2], 9, -51403784),
78 | h = a(h, c, r, f, n[o + 7], 14, 1735328473),
79 | r = s(r, f = a(f, h, c, r, n[o + 12], 20, -1926607734), h, c, n[o + 5], 4, -378558),
80 | c = s(c, r, f, h, n[o + 8], 11, -2022574463),
81 | h = s(h, c, r, f, n[o + 11], 16, 1839030562),
82 | f = s(f, h, c, r, n[o + 14], 23, -35309556),
83 | r = s(r, f, h, c, n[o + 1], 4, -1530992060),
84 | c = s(c, r, f, h, n[o + 4], 11, 1272893353),
85 | h = s(h, c, r, f, n[o + 7], 16, -155497632),
86 | f = s(f, h, c, r, n[o + 10], 23, -1094730640),
87 | r = s(r, f, h, c, n[o + 13], 4, 681279174),
88 | c = s(c, r, f, h, n[o], 11, -358537222),
89 | h = s(h, c, r, f, n[o + 3], 16, -722521979),
90 | f = s(f, h, c, r, n[o + 6], 23, 76029189),
91 | r = s(r, f, h, c, n[o + 9], 4, -640364487),
92 | c = s(c, r, f, h, n[o + 12], 11, -421815835),
93 | h = s(h, c, r, f, n[o + 15], 16, 530742520),
94 | r = v(r, f = s(f, h, c, r, n[o + 2], 23, -995338651), h, c, n[o], 6, -198630844),
95 | c = v(c, r, f, h, n[o + 7], 10, 1126891415),
96 | h = v(h, c, r, f, n[o + 14], 15, -1416354905),
97 | f = v(f, h, c, r, n[o + 5], 21, -57434055),
98 | r = v(r, f, h, c, n[o + 12], 6, 1700485571),
99 | c = v(c, r, f, h, n[o + 3], 10, -1894986606),
100 | h = v(h, c, r, f, n[o + 10], 15, -1051523),
101 | f = v(f, h, c, r, n[o + 1], 21, -2054922799),
102 | r = v(r, f, h, c, n[o + 8], 6, 1873313359),
103 | c = v(c, r, f, h, n[o + 15], 10, -30611744),
104 | h = v(h, c, r, f, n[o + 6], 15, -1560198380),
105 | f = v(f, h, c, r, n[o + 13], 21, 1309151649),
106 | r = v(r, f, h, c, n[o + 4], 6, -145523070),
107 | c = v(c, r, f, h, n[o + 11], 10, -1120210379),
108 | h = v(h, c, r, f, n[o + 2], 15, 718787259),
109 | f = v(f, h, c, r, n[o + 9], 21, -343485551),
110 | r = l(r, e),
111 | f = l(f, u),
112 | h = l(h, p),
113 | c = l(c, i);
114 | return [r, f, h, c]
115 | } (function(n) {
116 | var t, o = [];
117 | for (o[(n.length >> 2) - 1] = void 0, t = 0; t < o.length; t += 1) o[t] = 0;
118 | for (t = 0; t < 8 * n.length; t += 8) o[t >> 5] |= (255 & n.charCodeAt(t / 8)) << t % 32;
119 | return o
120 | } (n), 8 * n.length))
121 | }
122 | function o(n) {
123 | return t(unescape(encodeURIComponent(n)))
124 | }
125 | return function(n) {
126 | var t, o, e = "0123456789abcdef",
127 | u = "";
128 | for (o = 0; o < n.length; o += 1) t = n.charCodeAt(o),
129 | u += e.charAt(t >>> 4 & 15) + e.charAt(15 & t);
130 | return u
131 | } (o(n))
132 | },
133 | function r(f, h, c, l, g) {
134 | g = g || [[this], [{}]];
135 | for (var t = [], o = null, n = [function() {
136 | return ! 0
137 | },
138 | function() {},
139 | function() {
140 | g.length = c[h++]
141 | },
142 | function() {
143 | g.push(c[h++])
144 | },
145 | function() {
146 | g.pop()
147 | },
148 | function() {
149 | var n = c[h++],
150 | t = g[g.length - 2 - n];
151 | g[g.length - 2 - n] = g.pop(),
152 | g.push(t)
153 | },
154 | function() {
155 | g.push(g[g.length - 1])
156 | },
157 | function() {
158 | g.push([g.pop(), g.pop()].reverse())
159 | },
160 | function() {
161 | g.push([l, g.pop()])
162 | },
163 | function() {
164 | g.push([g.pop()])
165 | },
166 | function() {
167 | var n = g.pop();
168 | g.push(n[0][n[1]])
169 | },
170 | function() {
171 | g.push(g[g.pop()[0]][0])
172 | },
173 | function() {
174 | var n = g[g.length - 2];
175 | n[0][n[1]] = g[g.length - 1]
176 | },
177 | function() {
178 | g[g[g.length - 2][0]][0] = g[g.length - 1]
179 | },
180 | function() {
181 | var n = g.pop(),
182 | t = g.pop();
183 | g.push([t[0][t[1]], n])
184 | },
185 | function() {
186 | var n = g.pop();
187 | g.push([g[g.pop()][0], n])
188 | },
189 | function() {
190 | var n = g.pop();
191 | g.push(delete n[0][n[1]])
192 | },
193 | function() {
194 | var n = [];
195 | for (var t in g.pop()) n.push(t);
196 | g.push(n)
197 | },
198 | function() {
199 | g[g.length - 1].length ? g.push(g[g.length - 1].shift(), !0) : g.push(void 0, !1)
200 | },
201 | function() {
202 | var n = g[g.length - 2],
203 | t = Object.getOwnPropertyDescriptor(n[0], n[1]) || {
204 | configurable: !0,
205 | enumerable: !0
206 | };
207 | t.get = g[g.length - 1],
208 | Object.defineProperty(n[0], n[1], t)
209 | },
210 | function() {
211 | var n = g[g.length - 2],
212 | t = Object.getOwnPropertyDescriptor(n[0], n[1]) || {
213 | configurable: !0,
214 | enumerable: !0
215 | };
216 | t.set = g[g.length - 1],
217 | Object.defineProperty(n[0], n[1], t)
218 | },
219 | function() {
220 | h = c[h++]
221 | },
222 | function() {
223 | var n = c[h++];
224 | g[g.length - 1] && (h = n)
225 | },
226 | function() {
227 | throw g[g.length - 1]
228 | },
229 | function() {
230 | var n = c[h++],
231 | t = n ? g.slice( - n) : [];
232 | g.length -= n,
233 | g.push(g.pop().apply(l, t))
234 | },
235 | function() {
236 | var n = c[h++],
237 | t = n ? g.slice( - n) : [];
238 | g.length -= n;
239 | var o = g.pop();
240 | g.push(o[0][o[1]].apply(o[0], t))
241 | },
242 | function() {
243 | var n = c[h++],
244 | t = n ? g.slice( - n) : [];
245 | g.length -= n,
246 | t.unshift(null),
247 | g.push(new(Function.prototype.bind.apply(g.pop(), t)))
248 | },
249 | function() {
250 | var n = c[h++],
251 | t = n ? g.slice( - n) : [];
252 | g.length -= n,
253 | t.unshift(null);
254 | var o = g.pop();
255 | g.push(new(Function.prototype.bind.apply(o[0][o[1]], t)))
256 | },
257 | function() {
258 | g.push(!g.pop())
259 | },
260 | function() {
261 | g.push(~g.pop())
262 | },
263 | function() {
264 | g.push(typeof g.pop())
265 | },
266 | function() {
267 | g[g.length - 2] = g[g.length - 2] == g.pop()
268 | },
269 | function() {
270 | g[g.length - 2] = g[g.length - 2] === g.pop()
271 | },
272 | function() {
273 | g[g.length - 2] = g[g.length - 2] > g.pop()
274 | },
275 | function() {
276 | g[g.length - 2] = g[g.length - 2] >= g.pop()
277 | },
278 | function() {
279 | g[g.length - 2] = g[g.length - 2] << g.pop()
280 | },
281 | function() {
282 | g[g.length - 2] = g[g.length - 2] >> g.pop()
283 | },
284 | function() {
285 | g[g.length - 2] = g[g.length - 2] >>> g.pop()
286 | },
287 | function() {
288 | g[g.length - 2] = g[g.length - 2] + g.pop()
289 | },
290 | function() {
291 | g[g.length - 2] = g[g.length - 2] - g.pop()
292 | },
293 | function() {
294 | g[g.length - 2] = g[g.length - 2] * g.pop()
295 | },
296 | function() {
297 | g[g.length - 2] = g[g.length - 2] / g.pop()
298 | },
299 | function() {
300 | g[g.length - 2] = g[g.length - 2] % g.pop()
301 | },
302 | function() {
303 | g[g.length - 2] = g[g.length - 2] | g.pop()
304 | },
305 | function() {
306 | g[g.length - 2] = g[g.length - 2] & g.pop()
307 | },
308 | function() {
309 | g[g.length - 2] = g[g.length - 2] ^ g.pop()
310 | },
311 | function() {
312 | g[g.length - 2] = g[g.length - 2] in g.pop()
313 | },
314 | function() {
315 | g[g.length - 2] = g[g.length - 2] instanceof g.pop()
316 | },
317 | function() {
318 | g[g[g.length - 1][0]] = void 0 === g[g[g.length - 1][0]] ? [] : g[g[g.length - 1][0]]
319 | },
320 | function() {
321 | for (var e = c[h++], u = [], n = c[h++], t = c[h++], p = [], o = 0; o < n; o++) u[c[h++]] = g[c[h++]];
322 | for (var i = 0; i < t; i++) p[i] = c[h++];
323 | g.push(function n() {
324 | var t = u.slice(0);
325 | t[0] = [this],
326 | t[1] = [arguments],
327 | t[2] = [n];
328 | for (var o = 0; o < p.length && o < arguments.length; o++) 0 < p[o] && (t[p[o]] = [arguments[o]]);
329 | return r(f, e, c, l, t)
330 | })
331 | },
332 | function() {
333 | t.push([c[h++], g.length, c[h++]])
334 | },
335 | function() {
336 | t.pop()
337 | },
338 | function() {
339 | return !! o
340 | },
341 | function() {
342 | o = null
343 | },
344 | function() {
345 | g[g.length - 1] += String.fromCharCode(c[h++])
346 | },
347 | function() {
348 | g.push("")
349 | },
350 | function() {
351 | g.push(void 0)
352 | },
353 | function() {
354 | g.push(null)
355 | },
356 | function() {
357 | g.push(!0)
358 | },
359 | function() {
360 | g.push(!1)
361 | },
362 | function() {
363 | g.length -= c[h++]
364 | },
365 | function() {
366 | g[g.length - 1] = c[h++]
367 | },
368 | function() {
369 | var n = g.pop(),
370 | t = g[g.length - 1];
371 | t[0][t[1]] = g[n[0]][0]
372 | },
373 | function() {
374 | var n = g.pop(),
375 | t = g[g.length - 1];
376 | t[0][t[1]] = n[0][n[1]]
377 | },
378 | function() {
379 | var n = g.pop(),
380 | t = g[g.length - 1];
381 | g[t[0]][0] = g[n[0]][0]
382 | },
383 | function() {
384 | var n = g.pop(),
385 | t = g[g.length - 1];
386 | g[t[0]][0] = n[0][n[1]]
387 | },
388 | function() {
389 | g[g.length - 2] = g[g.length - 2] < g.pop()
390 | },
391 | function() {
392 | g[g.length - 2] = g[g.length - 2] <= g.pop()
393 | }];;) try {
394 | for (; ! n[c[h++]](););
395 | if (o) throw o;
396 | return g.pop()
397 | } catch(n) {
398 | var e = t.pop();
399 | if (void 0 === e) throw n;
400 | o = n,
401 | h = e[0],
402 | g.length = e[1],
403 | e[2] && (g[e[2]][0] = o)
404 | }
405 | } (120731, 0, [21, 34, 50, 100, 57, 50, 102, 50, 98, 99, 101, 52, 54, 97, 52, 99, 55, 56, 52, 49, 57, 54, 57, 49, 56, 98, 102, 100, 100, 48, 48, 55, 55, 102, 2, 10, 3, 2, 9, 48, 61, 3, 9, 48, 61, 4, 9, 48, 61, 5, 9, 48, 61, 6, 9, 48, 61, 7, 9, 48, 61, 8, 9, 48, 61, 9, 9, 48, 4, 21, 427, 54, 2, 15, 3, 2, 9, 48, 61, 3, 9, 48, 61, 4, 9, 48, 61, 5, 9, 48, 61, 6, 9, 48, 61, 7, 9, 48, 61, 8, 9, 48, 61, 9, 9, 48, 61, 10, 9, 48, 61, 11, 9, 48, 61, 12, 9, 48, 61, 13, 9, 48, 61, 14, 9, 48, 61, 10, 9, 55, 54, 97, 54, 98, 54, 99, 54, 100, 54, 101, 54, 102, 54, 103, 54, 104, 54, 105, 54, 106, 54, 107, 54, 108, 54, 109, 54, 110, 54, 111, 54, 112, 54, 113, 54, 114, 54, 115, 54, 116, 54, 117, 54, 118, 54, 119, 54, 120, 54, 121, 54, 122, 54, 48, 54, 49, 54, 50, 54, 51, 54, 52, 54, 53, 54, 54, 54, 55, 54, 56, 54, 57, 13, 4, 61, 11, 9, 55, 54, 77, 54, 97, 54, 116, 54, 104, 8, 55, 54, 102, 54, 108, 54, 111, 54, 111, 54, 114, 14, 55, 54, 77, 54, 97, 54, 116, 54, 104, 8, 55, 54, 114, 54, 97, 54, 110, 54, 100, 54, 111, 54, 109, 14, 25, 0, 3, 4, 9, 11, 3, 3, 9, 11, 39, 3, 1, 38, 40, 3, 3, 9, 11, 38, 25, 1, 13, 4, 61, 12, 9, 55, 13, 4, 61, 13, 9, 3, 0, 13, 4, 4, 3, 13, 9, 11, 3, 11, 9, 11, 66, 22, 306, 4, 21, 422, 24, 4, 3, 14, 9, 55, 54, 77, 54, 97, 54, 116, 54, 104, 8, 55, 54, 102, 54, 108, 54, 111, 54, 111, 54, 114, 14, 55, 54, 77, 54, 97, 54, 116, 54, 104, 8, 55, 54, 114, 54, 97, 54, 110, 54, 100, 54, 111, 54, 109, 14, 25, 0, 3, 10, 9, 55, 54, 108, 54, 101, 54, 110, 54, 103, 54, 116, 54, 104, 15, 10, 40, 25, 1, 13, 4, 61, 12, 9, 6, 11, 3, 10, 9, 3, 14, 9, 11, 15, 10, 38, 13, 4, 61, 13, 9, 6, 11, 6, 5, 1, 5, 0, 3, 1, 38, 13, 4, 61, 0, 5, 0, 43, 4, 21, 291, 61, 3, 12, 9, 11, 0, 3, 9, 9, 49, 72, 0, 2, 3, 4, 13, 4, 61, 8, 9, 21, 721, 3, 2, 8, 3, 2, 9, 48, 61, 3, 9, 48, 61, 4, 9, 48, 61, 5, 9, 48, 61, 6, 9, 48, 61, 7, 9, 48, 4, 55, 54, 115, 54, 101, 54, 108, 54, 102, 8, 10, 30, 55, 54, 117, 54, 110, 54, 100, 54, 101, 54, 102, 54, 105, 54, 110, 54, 101, 54, 100, 32, 28, 22, 510, 4, 21, 523, 22, 4, 55, 54, 115, 54, 101, 54, 108, 54, 102, 8, 10, 0, 55, 54, 119, 54, 105, 54, 110, 54, 100, 54, 111, 54, 119, 8, 10, 30, 55, 54, 117, 54, 110, 54, 100, 54, 101, 54, 102, 54, 105, 54, 110, 54, 101, 54, 100, 32, 28, 22, 566, 4, 21, 583, 3, 4, 55, 54, 119, 54, 105, 54, 110, 54, 100, 54, 111, 54, 119, 8, 10, 0, 55, 54, 103, 54, 108, 54, 111, 54, 98, 54, 97, 54, 108, 8, 10, 30, 55, 54, 117, 54, 110, 54, 100, 54, 101, 54, 102, 54, 105, 54, 110, 54, 101, 54, 100, 32, 28, 22, 626, 4, 21, 643, 25, 4, 55, 54, 103, 54, 108, 54, 111, 54, 98, 54, 97, 54, 108, 8, 10, 0, 55, 54, 69, 54, 114, 54, 114, 54, 111, 54, 114, 8, 55, 54, 117, 54, 110, 54, 97, 54, 98, 54, 108, 54, 101, 54, 32, 54, 116, 54, 111, 54, 32, 54, 108, 54, 111, 54, 99, 54, 97, 54, 116, 54, 101, 54, 32, 54, 103, 54, 108, 54, 111, 54, 98, 54, 97, 54, 108, 54, 32, 54, 111, 54, 98, 54, 106, 54, 101, 54, 99, 54, 116, 27, 1, 23, 56, 0, 49, 444, 0, 0, 24, 0, 13, 4, 61, 8, 9, 55, 54, 95, 54, 95, 54, 103, 54, 101, 54, 116, 54, 83, 54, 101, 54, 99, 54, 117, 54, 114, 54, 105, 54, 116, 54, 121, 54, 83, 54, 105, 54, 103, 54, 110, 15, 21, 1126, 49, 2, 14, 3, 2, 9, 48, 61, 3, 9, 48, 61, 4, 9, 48, 61, 5, 9, 48, 61, 6, 9, 48, 61, 7, 9, 48, 61, 8, 9, 48, 61, 9, 9, 48, 61, 10, 9, 48, 61, 11, 9, 48, 61, 9, 9, 55, 54, 108, 54, 111, 54, 99, 54, 97, 54, 116, 54, 105, 54, 111, 54, 110, 8, 10, 30, 55, 54, 117, 54, 110, 54, 100, 54, 101, 54, 102, 54, 105, 54, 110, 54, 101, 54, 100, 32, 28, 22, 862, 21, 932, 21, 4, 55, 54, 108, 54, 111, 54, 99, 54, 97, 54, 116, 54, 105, 54, 111, 54, 110, 8, 55, 
54, 104, 54, 111, 54, 115, 54, 116, 14, 55, 54, 105, 54, 110, 54, 100, 54, 101, 54, 120, 54, 79, 54, 102, 14, 55, 54, 121, 54, 46, 54, 113, 54, 113, 54, 46, 54, 99, 54, 111, 54, 109, 25, 1, 3, 0, 3, 1, 39, 32, 22, 963, 4, 55, 54, 67, 54, 74, 54, 66, 54, 80, 54, 65, 54, 67, 54, 114, 54, 82, 54, 117, 54, 78, 54, 121, 54, 55, 21, 974, 50, 4, 3, 12, 9, 11, 3, 8, 3, 10, 24, 2, 13, 4, 61, 10, 9, 3, 13, 9, 55, 54, 95, 54, 95, 54, 115, 54, 105, 54, 103, 54, 110, 54, 95, 54, 104, 54, 97, 54, 115, 54, 104, 54, 95, 54, 50, 54, 48, 54, 50, 54, 48, 54, 48, 54, 51, 54, 48, 54, 53, 15, 10, 22, 1030, 21, 1087, 22, 4, 3, 13, 9, 55, 54, 95, 54, 95, 54, 115, 54, 105, 54, 103, 54, 110, 54, 95, 54, 104, 54, 97, 54, 115, 54, 104, 54, 95, 54, 50, 54, 48, 54, 50, 54, 48, 54, 48, 54, 51, 54, 48, 54, 53, 15, 3, 9, 9, 11, 3, 3, 9, 11, 38, 25, 1, 13, 4, 61, 11, 9, 3, 12, 9, 11, 3, 10, 3, 53, 3, 37, 39, 24, 2, 13, 4, 4, 55, 54, 122, 54, 122, 54, 97, 3, 11, 9, 11, 38, 3, 10, 9, 11, 38, 0, 49, 771, 2, 1, 12, 9, 13, 8, 3, 12, 4, 4, 56, 0], n);
406 | var t = n.__getSecuritySign;
407 | sign = t;
408 | return t;
409 | });
410 |
411 | function get_sign(data){
412 | return sign(data)
413 | };
414 |
--------------------------------------------------------------------------------
/旧版QQ音乐(仍可用)/README.md:
--------------------------------------------------------------------------------
1 | **Notes**
2 | - `cd` into this folder and run `python demo.py`
3 | - `demo.py` is the version without database storage: it crawls a single category and skips multiprocessing, to keep the logic easy to follow
4 | - Be sure to check the `filename` and `with open` paths in `demo.py` against your project layout
5 | - The `get_singer_mid(index)` call decides which category gets crawled
--------------------------------------------------------------------------------
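Editor's note on the path caveat in the README above: a minimal sketch (the `song_path` helper is hypothetical, not part of the repo) of resolving the download folder relative to the script itself, so `python demo.py` behaves the same from any working directory.

import os

def song_path(*parts):
    # Resolve against this file's folder instead of the CWD, so the
    # '歌曲' folder is created next to demo.py no matter where you run it.
    base = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(base, *parts)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return path

# Mirrors demo.py's filename / with open usage:
with open(song_path('歌曲', 'demo.m4a'), 'wb') as f:
    f.write(b'')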
/旧版QQ音乐(仍可用)/crawl.py:
--------------------------------------------------------------------------------
1 | #Python3.7
2 | #encoding = utf-8
3 |
4 | import requests,os,json,math,threading
5 | from urllib import parse
6 | from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
7 | from db import SQLsession,Song
8 |
9 | headers = {
10 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
11 | 'referer':'https://y.qq.com/portal/singer_list.html',
12 | #参考链接 https://y.qq.com/portal/singer_list.html#page=1&index=1&
13 | }
14 |
15 | lock = threading.Lock()
16 | session = SQLsession()
17 |
18 | def myProcess():
19 |     # Singers are bucketed into 27 categories by initial letter
20 |     with ProcessPoolExecutor(max_workers = 2) as p:# a small pool working through the 27 categories
21 |         for i in range(1,28):# index runs 1..27
22 |             p.submit(get_singer_mid,i)
23 | 
24 | def get_singer_mid(index):
25 |     # index = 1..27
26 |     # Open the singer list, read singerList, divide the total singer count by 80 to get the page count,
27 |     # then walk the pages and collect each singer's mid for the detail page
28 |
29 | data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer"'\
30 | ',"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,'\
31 | '"index":%s,"sin":0,"cur_page":1}}}'%(str(index))
32 |
33 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\
34 | '&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\
35 |           'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
36 | '&data={}'.format(parse.quote(data))
37 |
38 | html = requests.get(url).json()
39 |     total = html['singerList']['data']['total']# total number of singers
40 |     pages = int(math.floor(int(total)/80))
41 |     thread_number = max(pages,1)# ThreadPoolExecutor needs at least one worker
42 |
43 | Thread = ThreadPoolExecutor(max_workers = thread_number)
44 |
45 | sin = 0
46 |     # Page through every singer under this initial letter
47 | for page in range(1,pages+2):
48 | data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer",'\
49 | '"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,"'\
50 | 'index":%s,"sin":%d,"cur_page":%s}}}'%(str(index),sin,str(page))
51 |
52 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\
53 | '&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\
54 |               'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
55 | '&data={}'.format(parse.quote(data))
56 |
57 | html = requests.get(url,headers = headers).json()
58 |
59 | sings = html['singerList']['data']['singerlist']
60 |
61 | for sing in sings:
62 |
63 | singer_name = sing['singer_name']
64 | mid = sing['singer_mid']
65 |
66 | Thread.submit(get_singer_data,mid = mid,
67 | singer_name = singer_name,)
68 | sin+=80
69 |
70 |
71 |
72 | # Fetch a singer's songs
73 | def get_singer_data(mid,singer_name):
74 |     # Use the singer's mid to open the detail page, i.e. the page listing that singer's songs
75 |     # and read the song list information from it
76 |
77 | params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",'\
78 | '"param":{"order":1,"singerMid":"%s","begin":0,"num":10},'\
79 | '"module":"musichall.song_list_server"}}'%str(mid)
80 |
81 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&'\
82 | 'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
83 |           '&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params))
84 |
85 | html = requests.session()
86 | content = html.get(url,headers = headers).json()
87 |
88 | songs_num = content['singerSongList']['data']['totalNum']
89 |
90 |
91 | for a in range(0,songs_num,100):
92 |
93 | params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",' \
94 | '"param":{"order":1,"singerMid":"%s","begin":%s,"num":%s},' \
95 | '"module":"musichall.song_list_server"}}' % (str(mid), int(a),int(songs_num))
96 |
97 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&' \
98 | 'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8' \
99 |               '&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params))
100 |
101 | html = requests.session()
102 | content = html.get(url, headers=headers).json()
103 |
104 | datas = content['singerSongList']['data']['songList']
105 |
106 | for d in datas:
107 | sing_name = d['songInfo']['title']
108 | songmid = d['songInfo']['mid']
109 |             lock.acquire()# take the lock before touching the shared session
110 |             try:
111 |                 session.add(Song(song_name = sing_name,
112 |                                  song_singer = singer_name,
113 |                                  song_mid = songmid))
114 |                 session.commit()
115 |                 print('commit')
116 |             except Exception:
117 |                 session.rollback()
118 |                 print('rollback')
119 |             finally:
120 |                 lock.release()# always release, even when the commit fails
121 |             print('Singer: {}\tSong: {}\tSong mid: {}'.format(singer_name,sing_name,songmid))
122 | download(songmid,sing_name,singer_name)
123 |
124 | def download(songmid,sing_name,singer_name):
125 | headers = {
126 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
127 | 'Referer':'https://y.qq.com/n/yqq/singer/000aHmbL2aPXWH.html',
128 | }
129 |
130 |
131 | data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch",'\
132 | '"param":{"guid":"5746584900","calltype":0,"userip":""}},"req_0":{"module":"vkey.GetVkeyServer",'\
133 | '"method":"CgiGetVkey","param":{"guid":"5746584900","songmid":["%s"],"songtype":[0],'\
134 | '"uin":"3262637034","loginflag":1,"platform":"20"}},"comm":{"uin":3262637034,"format":"json","ct":24,"cv":0}}'%str(songmid)
135 |
136 |
137 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getplaysongvkey17693804549459324'\
138 | '&g_tk=5381&loginUin=3262637034&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
139 |           '&notice=0&platform=yqq.json&needNewCode=0&data={}'.format(parse.quote(data))
140 |
141 | html = requests.get(url,headers = headers)
142 |
143 | try:
144 | purl = html.json()['req_0']['data']['midurlinfo'][0]['purl']
145 |
146 | url = 'http://dl.stream.qqmusic.qq.com/{}'.format(purl)
147 |
148 | html = requests.get(url,headers = headers)
149 | html.encoding = 'utf-8'
150 |
151 | sing_file_name = '{} -- {}'.format(sing_name,singer_name)
152 |
153 | filename = './旧版QQ音乐(仍可用)/歌曲'
154 |
155 | if not os.path.exists(filename):
156 | os.makedirs(filename)
157 |
158 | with open('./旧版QQ音乐(仍可用)/歌曲/{}.m4a'.format(sing_file_name),'wb') as f:
159 |             print('\nDownloading song: {} ...\n'.format(sing_file_name))
160 | f.write(html.content)
161 |
162 |     except Exception:
163 |         print('Failed to fetch the play URL, or no matching song was found')
164 |
165 |
166 |
167 | if __name__ == '__main__':
168 | myProcess()
--------------------------------------------------------------------------------
/旧版QQ音乐(仍可用)/db.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import Column,Integer,String,create_engine
2 | from sqlalchemy.orm import sessionmaker,scoped_session
3 | from sqlalchemy.ext.declarative import declarative_base
4 |
5 | # This module does not use the pymysql driver
6 | # Install the official one instead: pip install mysql-connector-python
7 | # 'mysqlconnector' in the engine URL selects MySQL's official driver
8 | engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/test?charset=utf8',
9 |                        max_overflow = 500,# connections allowed beyond the pool size
10 |                        pool_size = 100,# connection pool size
11 |                        echo = False,# SQL debug output
12 | )
13 | Base = declarative_base()
14 |
15 | class Song(Base):
16 | __tablename__ = 'song'
17 | song_id = Column(Integer,primary_key = True,autoincrement = True)
18 | song_name = Column(String(64))
19 | song_ablum = Column(String(64))
20 | song_mid = Column(String(50))
21 | song_singer = Column(String(50))
22 | Base.metadata.create_all(engine)
23 |
24 | DBsession = sessionmaker(bind = engine)
25 |
26 | SQLsession = scoped_session(DBsession)
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
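Side note on the `scoped_session` above: calling `SQLsession()` from inside each worker thread yields a thread-local session, which removes the need for the explicit `threading.Lock` that crawl.py wraps around its single shared session. A minimal sketch of that alternative, assuming the MySQL instance from db.py is reachable:

from concurrent.futures import ThreadPoolExecutor
from db import SQLsession, Song

def insert_song(name):
    sess = SQLsession()  # thread-local session from the scoped_session registry
    try:
        sess.add(Song(song_name=name, song_singer='demo', song_mid='demo'))
        sess.commit()
    except Exception:
        sess.rollback()

with ThreadPoolExecutor(max_workers=4) as pool:
    for n in ('song-a', 'song-b', 'song-c'):
        pool.submit(insert_song, n)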
/旧版QQ音乐(仍可用)/demo.py:
--------------------------------------------------------------------------------
1 | #Python3.7
2 | #encoding = utf-8
3 |
4 | import requests,os,json,math,threading
5 | from urllib import parse
6 | from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
7 | from db import SQLsession,Song
8 |
9 | headers = {
10 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
11 | 'referer':'https://y.qq.com/portal/singer_list.html',
12 | #参考链接 https://y.qq.com/portal/singer_list.html#page=1&index=1&
13 | }
14 |
15 | lock = threading.Lock()
16 | session = SQLsession()
17 |
18 | def myProcess():
19 |     # Singers are bucketed into 27 categories by initial letter
20 |     with ProcessPoolExecutor(max_workers = 2) as p:# a small pool working through the 27 categories
21 |         for i in range(1,28):# index runs 1..27
22 |             p.submit(get_singer_mid,i)
23 | 
24 | def get_singer_mid(index):
25 |     # index = 1..27
26 |     # Open the singer list, read singerList, divide the total singer count by 80 to get the page count,
27 |     # then walk the pages and collect each singer's mid for the detail page
28 |
29 | data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer"'\
30 | ',"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,'\
31 | '"index":%s,"sin":0,"cur_page":1}}}'%(str(index))
32 |
33 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\
34 | '&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\
35 |           'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
36 | '&data={}'.format(parse.quote(data))
37 |
38 | html = requests.get(url).json()
39 |     total = html['singerList']['data']['total']# total number of singers
40 |     pages = int(math.floor(int(total)/80))
41 |     thread_number = max(pages,1)# ThreadPoolExecutor needs at least one worker
42 |
43 | Thread = ThreadPoolExecutor(max_workers = thread_number)
44 |
45 | sin = 0
46 |     # Page through every singer under this initial letter
47 | for page in range(1,pages+2):
48 | data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer",'\
49 | '"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,"'\
50 | 'index":%s,"sin":%d,"cur_page":%s}}}'%(str(index),sin,str(page))
51 |
52 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\
53 | '&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\
54 |               'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
55 | '&data={}'.format(parse.quote(data))
56 |
57 | html = requests.get(url,headers = headers).json()
58 |
59 | sings = html['singerList']['data']['singerlist']
60 |
61 | for sing in sings:
62 |
63 | singer_name = sing['singer_name']
64 | mid = sing['singer_mid']
65 |
66 | Thread.submit(get_singer_data,mid = mid,
67 | singer_name = singer_name,)
68 | sin+=80
69 |
70 |
71 |
72 | # Fetch a singer's songs
73 | def get_singer_data(mid,singer_name):
74 |     # Use the singer's mid to open the detail page, i.e. the page listing that singer's songs
75 |     # and read the song list information from it
76 |
77 | params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",'\
78 | '"param":{"order":1,"singerMid":"%s","begin":0,"num":10},'\
79 | '"module":"musichall.song_list_server"}}'%str(mid)
80 |
81 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&'\
82 | 'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
83 |           '&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params))
84 |
85 | html = requests.session()
86 | content = html.get(url,headers = headers).json()
87 |
88 | songs_num = content['singerSongList']['data']['totalNum']
89 |
90 |
91 | for a in range(0,songs_num,100):
92 |
93 | params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",' \
94 | '"param":{"order":1,"singerMid":"%s","begin":%s,"num":%s},' \
95 | '"module":"musichall.song_list_server"}}' % (str(mid), int(a),int(songs_num))
96 |
97 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&' \
98 | 'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8' \
99 |               '&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params))
100 |
101 | html = requests.session()
102 | content = html.get(url, headers=headers).json()
103 |
104 | datas = content['singerSongList']['data']['songList']
105 |
106 | for d in datas:
107 | sing_name = d['songInfo']['title']
108 | songmid = d['songInfo']['mid']
109 |             lock.acquire()# take the lock before touching the shared session
110 |             try:
111 |                 session.add(Song(song_name = sing_name,
112 |                                  song_singer = singer_name,
113 |                                  song_mid = songmid))
114 |                 session.commit()
115 |                 print('commit')
116 |             except Exception:
117 |                 session.rollback()
118 |                 print('rollback')
119 |             finally:
120 |                 lock.release()# always release, even when the commit fails
121 |             print('Singer: {}\tSong: {}\tSong mid: {}'.format(singer_name,sing_name,songmid))
122 | download(songmid,sing_name,singer_name)
123 |
124 | def download(songmid,sing_name,singer_name):
125 | headers = {
126 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
127 | 'Referer':'https://y.qq.com/n/yqq/singer/000aHmbL2aPXWH.html',
128 | }
129 |
130 |
131 | data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch",'\
132 | '"param":{"guid":"5746584900","calltype":0,"userip":""}},"req_0":{"module":"vkey.GetVkeyServer",'\
133 | '"method":"CgiGetVkey","param":{"guid":"5746584900","songmid":["%s"],"songtype":[0],'\
134 | '"uin":"3262637034","loginflag":1,"platform":"20"}},"comm":{"uin":3262637034,"format":"json","ct":24,"cv":0}}'%str(songmid)
135 |
136 |
137 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getplaysongvkey17693804549459324'\
138 | '&g_tk=5381&loginUin=3262637034&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
139 |           '&notice=0&platform=yqq.json&needNewCode=0&data={}'.format(parse.quote(data))
140 |
141 | html = requests.get(url,headers = headers)
142 |
143 | try:
144 | purl = html.json()['req_0']['data']['midurlinfo'][0]['purl']
145 |
146 | url = 'http://dl.stream.qqmusic.qq.com/{}'.format(purl)
147 |
148 | html = requests.get(url,headers = headers)
149 | html.encoding = 'utf-8'
150 |
151 | sing_file_name = '{} -- {}'.format(sing_name,singer_name)
152 |
153 | filename = './歌曲'
154 |
155 | if not os.path.exists(filename):
156 | os.makedirs(filename)
157 |
158 | with open('./歌曲/{}.m4a'.format(sing_file_name),'wb') as f:
159 |             print('\nDownloading song: {} ...\n'.format(sing_file_name))
160 | f.write(html.content)
161 |
162 |     except Exception:
163 |         print('Failed to fetch the play URL, or no matching song was found')
164 |
165 |
166 |
167 | if __name__ == '__main__':
168 | # myProcess()
169 | get_singer_mid(1)
--------------------------------------------------------------------------------
/有道翻译/crawl.py:
--------------------------------------------------------------------------------
1 | #Python3.7
2 | #encoding = utf-8
3 |
4 | import time, math,random,hashlib
5 | import requests
6 |
7 | def get_html(name):
8 |
9 | url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
10 |
11 |
12 | ts = math.floor(time.time() * 1000)
13 | salt = ts + int(random.random() * 10)
14 |
15 | sign = hashlib.md5(("fanyideskweb" + name + str(salt) +"Nw(nmmbP%A-r6U3EUn]Aj").encode('utf-8')).hexdigest()
16 | bv = hashlib.md5(("5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36").encode('utf-8')).hexdigest()
17 |
18 | data = {
19 | 'i': name,
20 | 'from': 'AUTO',
21 | 'to': 'AUTO',
22 | 'smartresult': 'dict',
23 | 'client': 'fanyideskweb',
24 | 'salt': salt,
25 | 'sign': sign,
26 | 'ts': ts,
27 | 'bv': bv,
28 | 'doctype': 'json',
29 | 'version': '2.1',
30 | 'keyfrom': 'fanyi.web',
31 | 'action': 'FY_BY_CLICKBUTTION',
32 | }
33 |
34 |     headers = {
35 |         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
36 |         'Referer': 'http://fanyi.youdao.com/',
37 |         # Reference page: http://fanyi.youdao.com/
38 |         # Put your own Cookie here
39 |     }
40 |
41 |
42 |     html = requests.post(url, headers=headers, data=data)# switch to a session if you need one
43 |     # print(html.json())
44 |     print('Running the Youdao translation demo:')
45 |     print('Source text: {}'.format(html.json()['translateResult'][0][0]['src']))
46 |     print('Translation: {}'.format(html.json()['translateResult'][0][0]['tgt']))
47 |
48 | if __name__ == "__main__":
49 |
50 | name = '靓仔'
51 |
52 | get_html(name)
--------------------------------------------------------------------------------
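The request above stands or falls on the `salt`/`sign` pair, so here is that recipe isolated into a helper (a sketch only; the trailing secret is the constant hard-coded in the script above and may be rotated by Youdao at any time):

import time, random, hashlib

def youdao_sign(word, secret="Nw(nmmbP%A-r6U3EUn]Aj"):
    # salt = millisecond timestamp plus one pseudo-random digit
    ts = int(time.time() * 1000)
    salt = ts + int(random.random() * 10)
    # sign = md5("fanyideskweb" + word + salt + secret)
    raw = "fanyideskweb" + word + str(salt) + secret
    return ts, salt, hashlib.md5(raw.encode("utf-8")).hexdigest()

ts, salt, sign = youdao_sign("hello")
print(ts, salt, sign)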
/构建代理池/crawl.py:
--------------------------------------------------------------------------------
1 | #Python3.7
2 | #encoding = utf-8
3 |
4 | import requests,time,json
5 | from bs4 import BeautifulSoup
6 |
7 | headers ={
8 |     'Referer':'https://www.kuaidaili.com/free/inha/1/',
9 |     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
10 |     # Reference page: https://www.kuaidaili.com/free/inha/1/
11 | }
12 |
13 |
14 | def get_ip(url):# fetch a listing page
15 |     html = requests.get(url,headers = headers)
16 |     if html.status_code==200:
17 |         time.sleep(2)
18 |         print('[INFO]crawling...')
19 |         parse_html(html.text)
20 |     else:
21 |         print("[ERROR]request failed",url)
22 | 
23 | def parse_html(html):# extract the IP entries
24 |     soup = BeautifulSoup(html,'lxml')
25 |     ips = soup.select('.table tbody tr')
26 |     for line in ips:
27 |         ip = line.select_one('td').text
28 |         port = line.select('td')[1].text
29 |         print('[INFO]got IP:{} Port:{}'.format(ip,port))
30 | 
31 |         address = 'http://{}:{}'.format(ip,port)# build the proxy address
32 |         proxies = {
33 |             'http':address,
34 |             'https':address,
35 |         }
36 |         verify_ip(proxies)
37 | 
38 | def verify_ip(proxies):# check whether the proxy is usable
39 | 
40 |     try:
41 |         html = requests.get('http://www.baidu.com',proxies = proxies,timeout = 3)# connectivity test
42 |         print('[SUCC]working proxy:{}'.format(proxies))
43 |         write_json(proxies)
44 |     except Exception:
45 |         print("[ERROR]proxy timed out / unusable:{}".format(proxies))
46 |
47 |
48 | def write_json(row):# append one proxy record to the file
49 |
50 | with open('ip_pool.json','a+',encoding='utf-8') as f:
51 | json.dump(row,f)
52 | f.write('\n')
53 |
54 |
55 | def read_json():# read the pool back
56 |
57 | with open('ip_pool.json','r',encoding='utf-8') as f:
58 |
59 | for i in f.readlines():
60 | content = json.loads(i.strip())
61 | print(content)
62 |
63 |
64 | if __name__ == '__main__':
65 |
66 | for i in range(15,25):
67 | url = 'https://www.kuaidaili.com/free/inha/{}/'.format(i)
68 | get_ip(url)
69 |
70 |     print('Proxies verified so far:')
71 | read_json()
--------------------------------------------------------------------------------
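A natural follow-up, sketched under the assumption that `ip_pool.json` sits next to the script: pick a random verified proxy from the pool for an outgoing request. Each line of the file is one `proxies` dict exactly as `write_json()` wrote it.

import json, random, requests

def load_pool(path='ip_pool.json'):
    # one JSON object per line, as produced by write_json()
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

proxies = random.choice(load_pool())
html = requests.get('http://www.baidu.com', proxies=proxies, timeout=3)
print(html.status_code, proxies)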
/构建代理池/ip_pool.json:
--------------------------------------------------------------------------------
1 | {"http": "http://183.164.239.153:9999", "https": "http://183.164.239.153:9999"}
2 | {"http": "http://49.235.69.138:8118", "https": "http://49.235.69.138:8118"}
3 | {"http": "http://111.38.91.99:8060", "https": "http://111.38.91.99:8060"}
4 | {"http": "http://47.107.160.99:8118", "https": "http://47.107.160.99:8118"}
5 |
--------------------------------------------------------------------------------
/百度图片/crawl.py:
--------------------------------------------------------------------------------
1 | #Python3.7
2 | #encoding = utf-8
3 |
4 | import requests,json,re,os,traceback,datetime,aiohttp,asyncio
5 | from uuid import uuid4
6 | from urllib import parse
7 | from concurrent.futures import ThreadPoolExecutor
8 |
9 | headers = {
10 | 'Accept':'text/plain, */*; q=0.01',
11 | 'Accept-Encoding':'gzip, deflate, br',
12 | 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
13 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
14 |     'Referer':'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&pv=&ic=0&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&sid=&word=%E5%88%9D%E9%9F%B3%E6%9C%AA%E6%9D%A5',
15 |     # Reference page: https://image.baidu.com/
16 |     # Put your own Cookie here
17 | }
18 |
19 | tasks = []
20 |
21 | def get_html(url):
22 | 
23 |     try:
24 |         html = requests.get(url,headers=headers)
25 |         json_data = html.text.replace('\\','')# strip the extra backslashes from the JSON payload
26 |         json_data = json.loads(json_data)
27 |         parse_json(json_data)
28 | 
29 |     except json.decoder.JSONDecodeError:
30 | 
31 |         # repair unescaped quotes inside the "fromPageTitle" value
32 |         fromPageTitle = r'"fromPageTitle":"(.*?)",'
33 |         json_data = replace_data(fromPageTitle,json_data)
34 | 
35 |         # repair unescaped quotes inside the "fromPageTitleEnc" value
36 |         fromPageTitle = r'"fromPageTitleEnc":"(.*?)",'
37 |         json_data = replace_data(fromPageTitle,json_data)
38 | 
39 |         json_data = json.loads(json_data)
40 |         write_error(url,flag='handled successfully')
41 |         parse_json(json_data)
42 | 
43 |     except Exception:
44 |         write_error(url,flag='could not be handled')
45 |
46 | # Parse the JSON and pull out image URLs
47 | def parse_json(json_data):
48 | list_data = json_data['data']
49 | for data in list_data[:-1]:
50 | image_name = data["fromPageTitleEnc"]
51 | for image_data in data["replaceUrl"]:
52 | image_url = image_data['ObjURL']
53 | tasks.append(download(image_url,image_name))
54 |
55 | # Download one image
56 | async def download(image_url,image_name):
57 |
58 | black_image = b'GIF89a\x04\x00\x08\x00\x91\x02\x00\xff\xff\xff\x00\x00\x00\xff\xff\xff\x00\x00\x00!\xf9\x04\x01\x00\x00\x02\x00,\x00\x00\x00\x00\x04\x00\x08\x00\x00\x02\x05\x94\x8f\xa9\x8b\x05\x00;'
59 |
60 | filename = './百度图片/下载好的图片'
61 | if not os.path.exists(filename):
62 | os.makedirs(filename)
63 |
64 |     print("[INFO]{} downloading image: {}".format(datetime.datetime.now(),image_name))
65 |
66 | async with aiohttp.ClientSession(headers = headers) as session:
67 | async with session.get(image_url) as html:
68 |
69 | uuid_id = uuid4()
70 | image_file_name = '{}/{}.jpg'.format(filename,uuid_id)
71 |
72 |             content = await html.read()# read once; skip the black placeholder image and empty responses
73 |             if black_image not in content and content:
74 | 
75 |                 with open(image_file_name,'wb') as f:
76 |                     f.write(content)
77 |
78 | with open('./百度图片/图片映射表.json','a+',encoding='utf-8') as f:
79 | json_data = json.dumps(dict(image_name = image_name,id=str(uuid_id)),ensure_ascii=False)
80 | f.write(json_data + '\n')
81 |
82 | # Strip the stray double quotes with a regex
83 | def replace_data(re_compile,json_data):
84 | re_data = re.compile(re_compile)
85 | for i in re_data.findall(json_data):
86 | data = i.replace('"','').replace("\\'",'')
87 | json_data = json_data.replace(i,data)
88 | return json_data
89 |
90 | # Log an exception
91 | def write_error(url,flag=None):
92 | 
93 |     with open('./百度图片/错误日志.txt','a+',encoding='utf-8') as f:
94 |         f.write('JSON error handled: {}\n'.format(flag))
95 |         f.write('Time: {}\n'.format(datetime.datetime.now()))
96 |         f.write('URL: {}\n'.format(url))
97 |         f.write(traceback.format_exc() + '\n')
98 |
99 | if __name__ == "__main__":
100 |
101 |     loop = asyncio.get_event_loop()# set up the event loop
102 | name = parse.quote('初音未来')
103 |
104 | with ThreadPoolExecutor(max_workers = 2) as t:
105 |         # page size is 30
106 | for i in range(30,120,30):
107 | url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592'\
108 | '&is=&fp=result&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest='\
109 |                   '&copyright=&word={}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1'\
110 | '&fr=&expermode=&force=&pn={}&rn=30'.format(name,name,i)
111 | t.submit(get_html,url)
112 |
113 | loop.run_until_complete(asyncio.wait(tasks))
114 |     loop.close()# shut down
--------------------------------------------------------------------------------
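The control flow above is easy to miss: the worker threads only *create* coroutines and stash them in `tasks`; the event loop then drains them all in one `asyncio.wait`. A stripped-down sketch of that same pattern with the Baidu-specific parsing removed (URL is hypothetical):

import asyncio, aiohttp
from concurrent.futures import ThreadPoolExecutor

tasks = []

async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            print(url, resp.status)

def producer(i):
    # Runs in a thread: building the coroutine is cheap and thread-safe,
    # nothing is awaited here.
    tasks.append(fetch('https://example.com/?page={}'.format(i)))

loop = asyncio.get_event_loop()
with ThreadPoolExecutor(max_workers=2) as t:
    for i in range(3):
        t.submit(producer, i)
loop.run_until_complete(asyncio.wait(tasks))  # drain everything at once
loop.close()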
/破解有道翻译/crawl.py:
--------------------------------------------------------------------------------
1 | #Python3.7
2 | #encoding = utf-8
3 |
4 | import time, math,random,hashlib
5 | import requests
6 |
7 | def get_html(name):
8 |
9 | url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
10 |
11 |
12 | ts = math.floor(time.time() * 1000)
13 | salt = ts + int(random.random() * 10)
14 |
15 | sign = hashlib.md5(("fanyideskweb" + name + str(salt) +"Nw(nmmbP%A-r6U3EUn]Aj").encode('utf-8')).hexdigest()
16 | bv = hashlib.md5(("5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36").encode('utf-8')).hexdigest()
17 |
18 | data = {
19 | 'i': name,
20 | 'from': 'AUTO',
21 | 'to': 'AUTO',
22 | 'smartresult': 'dict',
23 | 'client': 'fanyideskweb',
24 | 'salt': salt,
25 | 'sign': sign,
26 | 'ts': ts,
27 | 'bv': bv,
28 | 'doctype': 'json',
29 | 'version': '2.1',
30 | 'keyfrom': 'fanyi.web',
31 | 'action': 'FY_BY_CLICKBUTTION',
32 | }
33 |
34 |     headers = {
35 |         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
36 |         'Referer': 'http://fanyi.youdao.com/',
37 |         # Reference page: http://fanyi.youdao.com/
38 |         # Put your own Cookie here
39 |     }
40 |
41 |
42 |     html = requests.post(url, headers=headers, data=data)# switch to a session if you need one
43 | 
44 |     print('Running the Youdao translation demo:')
45 |     print('Source text: {}'.format(html.json()['translateResult'][0][0]['src']))
46 |     print('Translation: {}'.format(html.json()['translateResult'][0][0]['tgt']))
47 |
48 | if __name__ == "__main__":
49 |
50 | name = '靓仔'
51 |
52 | get_html(name)
--------------------------------------------------------------------------------
/破解网易登录/crawl.py:
--------------------------------------------------------------------------------
1 | #Python3.7
2 | #encoding = utf-8
3 |
4 | import execjs,requests,time
5 |
6 | class User():# handles the password encryption and the login flow
7 |
8 | def __init__(self,user_id,user_password):
9 |
10 | self.user_id = user_id
11 | self.user_password = user_password
12 | self.session = requests.session()
13 | self.session.headers = {
14 | 'Referer':'https://dl.reg.163.com/webzj/v1.0.1/pub/index_dl2_new.html?cd=https%3A%2F%2Ftemp.163.com%2Fspecial%2F00804C4H%2F&cf=urs_style_2019.css%3Ft%3D20190527&MGID=1590637061742.5342&wdaId=&pkid=MODXOXd&product=163',
15 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
16 |             # Put your own Cookie here
17 |             # Reference page: https://www.163.com/
18 | }
19 |
20 | def get_pw(self):
21 |
22 | with open('pw.js','r',encoding='utf-8') as f:
23 | content = f.read()
24 |
25 |         js_data = execjs.compile(content)# compile the JS
26 |         pw = js_data.call('get_pw',self.user_password)# call its get_pw function
27 | return pw
28 |
29 | def get_rtid(self):
30 |
31 | with open('rtid.js','r',encoding='utf-8') as f:
32 | content = f.read()
33 |
34 |         js_data = execjs.compile(content)# compile the JS
35 |         rtid = js_data.call('get_rtid')# call its get_rtid function
36 | return rtid
37 |
38 | def get_tk(self,rtid):
39 |
40 | url = 'https://dl.reg.163.com/dl/gt'
41 |
42 | params = {
43 | 'un':self.user_id,
44 | 'pkid':'MODXOXd',
45 | 'pd':'163',
46 | 'channel':'0',
47 | 'topURL':'https://www.163.com/',
48 | 'rtid':rtid,
49 | 'nocache':int(time.time()*1000),
50 | }
51 |
52 | html = self.session.get(url,params = params).json()
53 | return html['tk']
54 |
55 | def get_login(self,pw,rtid,tk):
56 |
57 | url = 'https://dl.reg.163.com/dl/l'
58 |
59 |
60 | data = {
61 | 'channel':'0',
62 | 'd':'10',
63 | 'domains':"163.com",
64 | 'l':'0',
65 | 'pd':"163",
66 | 'pkid':"MODXOXd",
67 | 'pw':pw,
68 | 'pwdKeyUp':'1',
69 | 'rtid':rtid,
70 | 't':int(time.time()*1000),
71 | 'tk':tk,
72 | 'topURL':"https://www.163.com/",
73 | 'un':self.user_id,
74 | }
75 |
76 |         html = self.session.post(url,json = data).json()# send the payload as JSON
77 | return html
78 |
79 |
80 | if __name__ == "__main__":
81 |
82 |     user = User('your account here','your password here')
83 |     pw = user.get_pw()# password ciphertext
84 |     rtid = user.get_rtid()# random request id
85 | 
86 |     tk = user.get_tk(rtid)# login token
87 |
88 | login = user.get_login(pw,rtid,tk)
89 | print(login)
90 |
91 |
92 |
--------------------------------------------------------------------------------
/破解网易登录/rtid.js:
--------------------------------------------------------------------------------
1 | function t() {
2 | var e = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
3 | , t = 32
4 | , i = [];
5 | for (; t-- > 0; )
6 | i[t] = e.charAt(Math.random() * e.length);
7 | return i.join("")
8 | };
9 |
10 | function get_rtid(){
11 | return t()
12 | }
--------------------------------------------------------------------------------
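rtid.js is simple enough that execjs is arguably optional here; a pure-Python equivalent of the same 32-character draw, shown only for illustration (the repo itself deliberately routes through execjs):

import random, string

def get_rtid():
    # Mirrors rtid.js: 32 characters drawn from [0-9A-Za-z]
    alphabet = string.digits + string.ascii_uppercase + string.ascii_lowercase
    return ''.join(random.choice(alphabet) for _ in range(32))

print(get_rtid())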
/豆瓣读书/入库版/book.py:
--------------------------------------------------------------------------------
1 | #Python3.7
2 | #encoding = utf-8
3 |
4 | from urllib import parse
5 | import asyncio,aiohttp,os,time,requests
6 | from bs4 import BeautifulSoup# HTML parsing library
7 | from boook_db import Book,sess
8 | from concurrent.futures import ThreadPoolExecutor
9 |
10 | tasks = []
11 |
12 | headers = {
13 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
14 | 'Referer':'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T',
15 |     # Reference page: https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T
16 | }
17 |
18 |
19 | def get_html(url):
20 |
21 |
22 | html = requests.get(url,headers = headers)
23 |
24 | if html.status_code==200:
25 |
26 | parse_html(html.text)
27 | else:
28 |         print('request failed')
29 |
30 | def parse_html(html):
31 |
32 |     soup =BeautifulSoup(html,'lxml')# pick the parser
33 |     books = soup.select('li.subject-item')# select the book entries
34 | 
35 |     for book in books:
36 | 
37 |         try:# guard against missing fields
38 | 
39 |             title = book.select_one('.info h2 a').text.strip().replace(' ','').replace('\n','')# book title, whitespace removed
40 |             info = book.select_one('.subject-item .info div.pub').text.strip().replace(' ','').replace('\n','')# author / publisher line
41 |             star = book.select_one('.rating_nums').text.strip().replace(' ','').replace('\n','')# rating
42 |             pl = book.select_one('.pl').text.strip().replace(' ','').replace('\n','')# rating count
43 |             introduce = book.select_one('.info p').text.strip().replace(' ','').replace('\n','')# book blurb
44 |             img = book.select_one('.nbg img')['src']# cover image url
45 | 
46 |             tasks.append(download(title,img))# queue the async download
47 | print(title,info,star,pl,img)
48 | print(introduce)
49 | print('-'*50)
50 |
51 |             # insert into the database
52 | book_data = Book(
53 | title = title,
54 | info = info,
55 | star = star,
56 | pl = pl,
57 | introduce = introduce,
58 | )
59 | sess.add(book_data)
60 | sess.commit()
61 |         except Exception as e:# report any failure for this entry
62 |             print(e)
63 |             sess.rollback()# roll the transaction back
64 |
65 |
66 | async def download(title,url):# save the cover image
67 | 
68 |     if not os.path.exists('./豆瓣读书/doubanImg'):# create the folder if it is missing
69 |         os.makedirs('./豆瓣读书/doubanImg')
70 |
71 | async with aiohttp.ClientSession(headers = headers) as session:
72 | async with session.get(url) as html:
73 | with open('./豆瓣读书/doubanImg/{}.jpg'.format(title),'wb')as f:
74 | f.write(await html.content.read())
75 |
76 | if __name__ == '__main__':
77 |
78 | loop = asyncio.get_event_loop()
79 | with ThreadPoolExecutor(max_workers = 2) as t:
80 |         for i in range(0,100,20):# pages step by 20
81 | url = 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={}&type=T'.format(i)
82 | t.submit(get_html,url)
83 | loop.run_until_complete(asyncio.wait(tasks))
84 |     loop.close()# shut down
85 |
86 |
87 |
--------------------------------------------------------------------------------
/豆瓣读书/入库版/boook_db.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import create_engine
2 | from sqlalchemy import Column,String,Integer,Text
3 | from sqlalchemy.orm import sessionmaker
4 | from sqlalchemy.ext.declarative import declarative_base
5 |
6 | # declarative base class
7 | Base = declarative_base()
8 |
9 | # This module does not use the pymysql driver
10 | # Install the official one instead: pip install mysql-connector-python
11 | # 'mysqlconnector' in the engine URL selects MySQL's official driver
12 | engine = create_engine(
13 |     'mysql+mysqlconnector://root:root@127.0.0.1:3306/test?charset=utf8',# local instance
14 |     echo = True
15 | )
16 |
17 | class Book(Base):
18 | __tablename__ = 'book'
19 | id = Column('id',Integer(),primary_key = True,autoincrement = True)
20 | title = Column('title',String(20))
21 | info = Column('info',String(30))
22 | star = Column('star',String(10))
23 | pl = Column('pl',String(10))
24 | introduce = Column('introduce',Text())
25 |
26 | Base.metadata.create_all(engine)
27 |
28 | session = sessionmaker(engine)
29 | sess=session()
--------------------------------------------------------------------------------
/豆瓣读书/分类实现版/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.9.3
2 | certifi==2021.10.8
3 | chardet==4.0.0
4 | charset-normalizer==2.0.7
5 | idna==2.10
6 | lxml==4.6.2
7 | requests==2.25.1
8 | soupsieve==2.2.1
9 | urllib3==1.26.7
10 |
--------------------------------------------------------------------------------
/豆瓣读书/分类实现版/【bs4实现】豆瓣读书爬虫.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | import requests,json,csv,os
3 | from uuid import uuid4
4 | from bs4 import BeautifulSoup
5 | from urllib import parse
6 |
7 | '''Base domain'''
8 | DOMAIN_URL = 'https://book.douban.com'
9 | 
10 | '''
11 | Request headers
12 |     user-agent (required)
13 |     Referer (set it if you have one, otherwise omit)
14 |     Cookie (set it if you are logged in, otherwise omit)
15 | '''
16 | HEADERS = {
17 |     'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
18 |     'Referer':'https://book.douban.com/',
19 |     'Cookie':'put your Cookie here'
20 | }
21 | 
22 | '''Dedup set for results'''
23 | RESULT_SET_DATA = set()
24 |
25 | '''
26 | Get the book tag links
27 | params:
28 |     split_number: int --> how many tag links to crawl; default all
29 | 
30 | return: List[str] --> the selected tag links
31 | '''
32 | def get_book_tag_url(split_number:int=None) -> List[str]:
33 |
34 | html = requests.get(url=DOMAIN_URL,headers=HEADERS)
35 | soup = BeautifulSoup(html.text,'lxml')
36 |
37 | tag_url_list_data = [
38 | DOMAIN_URL+ parse.quote(tag_url['href'])
39 | for tag_url in soup.select('ul.hot-tags-col5.s ul a')
40 | ]
41 |
42 | if split_number:
43 | tag_url_list_data = tag_url_list_data[:split_number]
44 |
45 | return tag_url_list_data
46 |
47 |
48 | '''
49 | Walk each tag_url page by page and collect the book details
50 | params:
51 |     tag_url_list_data: List[str] --> the book tag links
52 |     parse_number: int --> paging depth; defaults to 3 pages
53 |     write_json_type / write_csv_type / write_image_type: bool --> which outputs to write
54 | return: List[dict] --> the books crawled successfully
55 | '''
56 | def parse_book_url_info(
57 | tag_url_list_data:List[str],
58 | parse_number:int=3,
59 | write_json_type:bool=True,
60 | write_csv_type:bool=True,
61 | write_image_type:bool=True
62 | ) -> List[dict]:
63 |
64 | book_info_list_data = []
65 |
66 | for tag_url in tag_url_list_data:
67 |
68 |         # start paging; 20 items per page
69 |         for parse in range(0,parse_number*20+1,20):
70 | 
71 |             # paged URL
72 |             parse_url = f'{tag_url}?start={parse}'
73 |
74 | html = requests.get(url=parse_url,headers=HEADERS)
75 | soup = BeautifulSoup(html.text,'lxml')
76 |
77 |             # select the book entries
78 | books = soup.select('li.subject-item')
79 |
80 | for book in books:
81 |
82 |                 # book link
83 |                 book_url = book.select_one('.info h2 a')['href']
84 | 
85 |                 # title
86 |                 title = book.select_one('.info h2 a').text.strip().replace(' ','').replace('\n','')
87 | 
88 |                 # author / publisher line
89 |                 info = book.select_one('.info div.pub').text.strip().replace(' ','').replace('\n','')
90 | 
91 |                 # rating
92 |                 star = book.select_one('.rating_nums').text.strip().replace(' ','').replace('\n','')
93 | 
94 |                 # rating count
95 |                 pl = book.select_one('.pl').text.strip().replace(' ','').replace('\n','')
96 | 
97 |                 # blurb
98 |                 introduce = book.select_one('.info p').text.strip().replace(' ','').replace('\n','')
99 | 
100 |                 # cover image URL
101 |                 image_url = book.select_one('.nbg img')['src']
102 |
103 | book_info_result = dict(
104 | 书本链接=book_url,
105 | 书名=title,
106 | 作者=info,
107 | 评分=star,
108 | 评价=pl,
109 | 书本简介=introduce,
110 | 图片链接=image_url
111 | )
112 |
113 |                 '''hash of the result, for dedup'''
114 | result_hash_data = hash(json.dumps(book_info_result,ensure_ascii=False))
115 |
116 | if result_hash_data not in RESULT_SET_DATA:
117 |
118 |                     '''record it in the dedup set'''
119 | RESULT_SET_DATA.add(result_hash_data)
120 |
121 | if write_image_type:
122 | write_image_book_info(
123 | image_url=image_url,
124 | image_name=title,
125 | headers=HEADERS
126 | )
127 |
128 |                     # optionally write the json file
129 | if write_json_type:
130 | write_json_book_info(book_info_result)
131 |
132 |                     # optionally write the csv file
133 | if write_csv_type:
134 | write_csv_book_info(
135 | headers=[key for key,value in book_info_result.items()],
136 | book_info=[value for key,value in book_info_result.items()]
137 | )
138 |
139 | print(book_info_result)
140 |
141 | book_info_list_data.append(book_info_result)
142 |
143 | return book_info_list_data
144 |
145 |
146 |
147 | '''
148 | Save the cover image and append a name-to-uuid mapping to a JSON file
149 | params:
150 |     image_url:str --> image link
151 |     image_name:str --> image name
152 |     headers: dict --> request headers
153 | '''
154 | def write_image_book_info(image_url:str,image_name:str,headers:dict):
155 |
156 |     '''uuid keeps image file names unique'''
157 | uuid_id = uuid4()
158 |
159 | filename = './保存图片/图片'
160 |
161 | image_file_name = f'{filename}/{uuid_id}.jpg'
162 |
163 | image_map_file_name = f'./保存图片/image_map_data.json'
164 |
165 |     '''create the folder if it does not exist'''
166 | if not os.path.exists(filename):
167 | os.makedirs(filename)
168 |
169 | html = requests.get(url=image_url,headers=headers)
170 |
171 |     '''write the image'''
172 | with open(image_file_name,'wb') as f:
173 |
174 | f.write(html.content)
175 |
176 |     '''append to the image-mapping JSON file'''
177 | with open(image_map_file_name,'a+',encoding='utf-8') as f:
178 |
179 | f.write(json.dumps(dict(image_name=image_name,uuid=str(uuid_id),image_url=image_url),ensure_ascii=False)+'\n')
180 |
181 |
182 |
183 | '''
184 | Append one book record to the json file
185 | params:
186 |     book_info: dict --> one crawled book
187 | '''
188 | def write_json_book_info(book_info:dict):
189 |
190 | with open('book_info.json','a+',encoding='utf-8') as f:
191 |
192 |         '''
193 |         json.dumps() turns the dict into a str, i.e. one JSON line
194 |         ensure_ascii=False keeps the Chinese readable
195 |         '''
196 | f.write(json.dumps(book_info,ensure_ascii=False)+'\n')
197 |
198 |
199 |
200 | '''
201 | Append one book record to the csv file (with a header row)
202 | params:
203 |     headers:list --> CSV header
204 |     book_info: list --> one crawled book
205 | '''
206 | def write_csv_book_info(headers:list,book_info:list):
207 |
208 |     '''
209 |     Cross-platform note:
210 |     on Windows the csv module writes an extra
211 |     blank line after each row unless the file
212 |     is opened with newline=''
213 |     (harmless to keep on other platforms)
214 |     '''
215 | 
216 |     '''
217 |     Create the CSV file with a header row
218 |     if it does not exist yet
219 |     '''
220 | if not os.path.exists('book_info.csv'):
221 |
222 | with open('book_info.csv','a+',encoding='utf-8',newline='') as f:
223 |
224 | f_csv = csv.writer(f)
225 | f_csv.writerow(headers)
226 |
227 |
228 |
229 |     '''
230 |     then append the rows one by one
231 |     '''
232 | with open('book_info.csv','a+',encoding='utf-8',newline='') as f:
233 |
234 | f_csv = csv.writer(f)
235 |         f_csv.writerow(book_info) # append one row
236 |
237 | if __name__ == '__main__':
238 |
239 | book_tag_url = get_book_tag_url(1)
240 |
241 | book_url_info = parse_book_url_info(book_tag_url)
--------------------------------------------------------------------------------
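On the `newline=''` caveat documented in `write_csv_book_info` above, a minimal standalone reproduction (file name hypothetical): without that argument, the csv module's `\r\n` row terminator gets translated to `\r\r\n` on Windows, which shows up as a blank line after every row.

import csv

rows = [['书名', '评分'], ['示例', '8.9']]

# newline='' hands line-ending control to the csv module itself
with open('demo.csv', 'w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerows(rows)  # no stray blank lines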
/豆瓣读书/分类实现版/【re实现】豆瓣读书爬虫.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | import requests,json,csv,os,re
3 | from uuid import uuid4
4 | from urllib import parse
5 |
6 | '''Base domain'''
7 | DOMAIN_URL = 'https://book.douban.com'
8 | 
9 | '''
10 | Request headers
11 |     user-agent (required)
12 |     Referer (set it if you have one, otherwise omit)
13 |     Cookie (set it if you are logged in, otherwise omit)
14 | '''
15 | HEADERS = {
16 |     'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
17 |     'Referer':'https://book.douban.com/',
18 |     'Cookie':'put your Cookie here'
19 | }
20 | 
21 | '''Dedup set for results'''
22 | RESULT_SET_DATA = set()
23 |
24 |
25 | class ReFind():
26 |
27 | def __init__(self,text):
28 |
29 |         '''strip all whitespace, including newlines'''
30 | self.text = re.sub('\s+','',text)
31 |
32 |
33 |
34 |     '''
35 |     [chainable] run re.search with the given pattern and keep the first match as the new text
36 |     params:
37 |         compile: str --> the regular expression
38 |         re_type: RegexFlag --> match flags
39 |     return: ReFind --> self, so calls can be chained
40 |     '''
41 |     def add_search(self,compile:str,re_type=re.I|re.S):
42 |
43 | self.text = re.compile(compile,re_type).search(self.text).group()
44 |
45 | return self
46 |
47 |     '''
48 |     run re.findall with the given pattern and return every match
49 |     params:
50 |         compile: str --> the regular expression
51 |         re_type: RegexFlag --> match flags
52 |     return: List[str] --> the matches
53 |     '''
54 | def find_all(self,compile:str,re_type=re.I|re.S) -> List[str]:
55 |
56 | return re.compile(compile,re_type).findall(self.text)
57 |
58 |
59 |
60 |     '''
61 |     print the current text
62 |     (prints only; nothing is returned)
63 |     '''
64 |     def print(self):
65 | print(self.text)
66 |
67 |
68 |
69 | '''
70 | Get the book tag links
71 | params:
72 |     split_number: int --> how many tag links to crawl; default all
73 | 
74 | return: List[str] --> the selected tag links
75 | '''
76 | def get_book_tag_url(split_number:int=None) -> List[str]:
77 |
78 | html = requests.get(url=DOMAIN_URL,headers=HEADERS)
79 |
80 |     tag_url_list_data = [
81 |         DOMAIN_URL+ parse.quote(tag_url)
82 |         for tag_url in (
83 |             ReFind(html.text)
84 |             .add_search(r'<ulclass="hot-tags-col5s">.*?</ul>')# spaceless pattern: ReFind stripped all whitespace
85 |             .find_all(r'<ahref="(.*?)">')
86 |         )
87 |     ]
88 |
89 | if split_number:
90 | tag_url_list_data = tag_url_list_data[:split_number]
91 |
92 | return tag_url_list_data
93 |
94 |
95 | '''
96 | Walk each tag_url page by page and collect the book details
97 | params:
98 |     tag_url_list_data: List[str] --> the book tag links
99 |     parse_number: int --> paging depth; defaults to 3 pages
100 |     write_json_type / write_csv_type / write_image_type: bool --> which outputs to write
101 | return: List[dict] --> the books crawled successfully
102 | '''
103 | def parse_book_url_info(
104 | tag_url_list_data:List[str],
105 | parse_number:int=3,
106 | write_json_type:bool=True,
107 | write_csv_type:bool=True,
108 | write_image_type:bool=True
109 | ) -> List[dict]:
110 |
111 | book_info_list_data = []
112 |
113 | for tag_url in tag_url_list_data:
114 |
115 |         # start paging; 20 items per page
116 |         for parse in range(0,parse_number*20+1,20):
117 | 
118 |             # paged URL
119 |             parse_url = f'{tag_url}?start={parse}'
120 |
121 | html = requests.get(url=parse_url,headers=HEADERS)
122 |
123 |             # select the book entries
124 |             books = (
125 |                 ReFind(html.text)
126 |                 .find_all(r'<liclass="subject-item">.*?</li>')
127 |             )
128 |
129 |             for book in books:
130 | 
131 |                 # book link
132 |                 book_url = (
133 |                     ReFind(book)
134 |                     .find_all(r'<ahref="(.*?)"title=')
135 |                 )[0]
136 | 
137 |                 # title
138 |                 title = (
139 |                     ReFind(book)
140 |                     .find_all(r'title="(.*?)"')
141 |                 )[0].strip().replace(' ','').replace('\n','')
142 | 
143 |                 # author / publisher line
144 |                 info = (
145 |                     ReFind(book)
146 |                     .find_all(r'<divclass="pub">(.*?)</div>')
147 |                 )[0].strip().replace(' ','').replace('\n','')
148 | 
149 |                 # rating
150 |                 star = (
151 |                     ReFind(book)
152 |                     .find_all(r'<spanclass="rating_nums">(.*?)</span>')
153 |                 )[0].strip().replace(' ','').replace('\n','')
154 | 
155 |                 # rating count
156 |                 pl = (
157 |                     ReFind(book)
158 |                     .find_all(r'<spanclass="pl">(.*?)</span>')
159 |                 )[0].strip().replace(' ','').replace('\n','')
160 | 
161 |                 # blurb
162 |                 introduce = (
163 |                     ReFind(book)
164 |                     .find_all(r'<p>(.*?)</p>')
165 |                 )[0].strip().replace(' ','').replace('\n','')
166 | 
167 | 
168 |                 # cover image URL
169 |                 image_url =(
170 |                     ReFind(book)
171 |                     .find_all(r'<img.*?src="(.*?)"')
172 |                 )[0]
173 |
174 | book_info_result = dict(
175 | 书本链接=book_url,
176 | 书名=title,
177 | 作者=info,
178 | 评分=star,
179 | 评价=pl,
180 | 书本简介=introduce,
181 | 图片链接=image_url
182 | )
183 |
184 |                 '''hash of the result, for dedup'''
185 | result_hash_data = hash(json.dumps(book_info_result,ensure_ascii=False))
186 |
187 | if result_hash_data not in RESULT_SET_DATA:
188 |
189 |                     '''record it in the dedup set'''
190 | RESULT_SET_DATA.add(result_hash_data)
191 |
192 | if write_image_type:
193 | write_image_book_info(
194 | image_url=image_url,
195 | image_name=title,
196 | headers=HEADERS
197 | )
198 |
199 |                     # optionally write the json file
200 | if write_json_type:
201 | write_json_book_info(book_info_result)
202 |
203 |                     # optionally write the csv file
204 | if write_csv_type:
205 | write_csv_book_info(
206 | headers=[key for key,value in book_info_result.items()],
207 | book_info=[value for key,value in book_info_result.items()]
208 | )
209 |
210 | print(book_info_result)
211 |
212 | book_info_list_data.append(book_info_result)
213 |
214 | return book_info_list_data
215 |
216 |
217 | '''
218 | Save the cover image and append a name-to-uuid mapping to a JSON file
219 | params:
220 |     image_url:str --> image link
221 |     image_name:str --> image name
222 |     headers: dict --> request headers
223 | '''
224 | def write_image_book_info(image_url:str,image_name:str,headers:dict):
225 |
226 |     '''uuid keeps image file names unique'''
227 | uuid_id = uuid4()
228 |
229 | filename = './保存图片/图片'
230 |
231 | image_file_name = f'{filename}/{uuid_id}.jpg'
232 |
233 | image_map_file_name = f'./保存图片/image_map_data.json'
234 |
235 |     '''create the folder if it does not exist'''
236 | if not os.path.exists(filename):
237 | os.makedirs(filename)
238 |
239 | html = requests.get(url=image_url,headers=headers)
240 |
241 |     '''write the image'''
242 | with open(image_file_name,'wb') as f:
243 |
244 | f.write(html.content)
245 |
246 |     '''append to the image-mapping JSON file'''
247 | with open(image_map_file_name,'a+',encoding='utf-8') as f:
248 |
249 | f.write(json.dumps(dict(image_name=image_name,uuid=str(uuid_id),image_url=image_url),ensure_ascii=False)+'\n')
250 |
251 |
252 |
253 | '''
254 | Append one book record to the json file
255 | params:
256 |     book_info: dict --> one crawled book
257 | '''
258 | def write_json_book_info(book_info:dict):
259 |
260 | with open('book_info.json','a+',encoding='utf-8') as f:
261 |
262 |         '''
263 |         json.dumps() turns the dict into a str, i.e. one JSON line
264 |         ensure_ascii=False keeps the Chinese readable
265 |         '''
266 | f.write(json.dumps(book_info,ensure_ascii=False)+'\n')
267 |
268 |
269 |
270 | '''
271 | Append one book record to the csv file (with a header row)
272 | params:
273 |     headers:list --> CSV header
274 |     book_info: list --> one crawled book
275 | '''
276 | def write_csv_book_info(headers:list,book_info:list):
277 |
278 |     '''
279 |     Cross-platform note:
280 |     on Windows the csv module writes an extra
281 |     blank line after each row unless the file
282 |     is opened with newline=''
283 |     (harmless to keep on other platforms)
284 |     '''
285 | 
286 |     '''
287 |     Create the CSV file with a header row
288 |     if it does not exist yet
289 |     '''
290 | if not os.path.exists('book_info.csv'):
291 |
292 | with open('book_info.csv','a+',encoding='utf-8',newline='') as f:
293 |
294 | f_csv = csv.writer(f)
295 | f_csv.writerow(headers)
296 |
297 |
298 |
299 |     '''
300 |     then append the rows one by one
301 |     '''
302 | with open('book_info.csv','a+',encoding='utf-8',newline='') as f:
303 |
304 | f_csv = csv.writer(f)
305 |         f_csv.writerow(book_info) # append one row
306 |
307 | if __name__ == '__main__':
308 |
309 | book_tag_url = get_book_tag_url(1)
310 |
311 | book_url_info = parse_book_url_info(book_tag_url)
--------------------------------------------------------------------------------
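A short usage example of the `ReFind` chaining above, against made-up markup (a trimmed copy of the class is inlined so the snippet runs on its own). It also shows why the crawler's patterns contain no spaces: the constructor strips all whitespace before any matching happens.

import re

class ReFind:
    # minimal copy of the class above, just enough for the demo
    def __init__(self, text):
        self.text = re.sub(r'\s+', '', text)
    def add_search(self, pattern, flags=re.I | re.S):
        self.text = re.compile(pattern, flags).search(self.text).group()
        return self
    def find_all(self, pattern, flags=re.I | re.S):
        return re.compile(pattern, flags).findall(self.text)

text = '<ul class="hot-tags-col5 s"><li><a href="/tag/小说">小说</a></li></ul>'

links = (
    ReFind(text)
    .add_search(r'<ulclass="hot-tags-col5s">.*?</ul>')  # narrow to the tag block
    .find_all(r'<ahref="(.*?)">')                       # then pull each href
)
print(links)  # ['/tag/小说']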
/豆瓣读书/分类实现版/【xpath实现】豆瓣读书爬虫.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | import requests,json,csv,os
3 | from uuid import uuid4
4 | from lxml import etree
5 | from urllib import parse
6 |
7 | '''Base domain'''
8 | DOMAIN_URL = 'https://book.douban.com'
9 | 
10 | '''
11 | Request headers
12 |     user-agent (required)
13 |     Referer (set it if you have one, otherwise omit)
14 |     Cookie (set it if you are logged in, otherwise omit)
15 | '''
16 | HEADERS = {
17 |     'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
18 |     'Referer':'https://book.douban.com/',
19 |     'Cookie':'put your Cookie here'
20 | }
21 | 
22 | '''Dedup set for results'''
23 | RESULT_SET_DATA = set()
24 |
25 | '''
26 | Get the book tag links
27 | params:
28 |     split_number: int --> how many tag links to crawl; default all
29 | 
30 | return: List[str] --> the selected tag links
31 | '''
32 | def get_book_tag_url(split_number:int=None) -> List[str]:
33 |
34 | html = requests.get(url=DOMAIN_URL,headers=HEADERS)
35 | soup = etree.HTML(html.text)
36 |
37 | tag_url_list_data = [
38 | DOMAIN_URL+ parse.quote(tag_url)
39 | for tag_url in soup.xpath('//ul[@class="hot-tags-col5 s"]//ul//a/@href')
40 | ]
41 |
42 | if split_number:
43 | tag_url_list_data = tag_url_list_data[:split_number]
44 |
45 | return tag_url_list_data
46 |
47 |
48 | '''
49 | Walk each tag_url page by page and collect the book details
50 | params:
51 |     tag_url_list_data: List[str] --> the book tag links
52 |     parse_number: int --> paging depth; defaults to 3 pages
53 |     write_json_type / write_csv_type / write_image_type: bool --> which outputs to write
54 | return: List[dict] --> the books crawled successfully
55 | '''
56 | def parse_book_url_info(
57 | tag_url_list_data:List[str],
58 | parse_number:int=3,
59 | write_json_type:bool=True,
60 | write_csv_type:bool=True,
61 | write_image_type:bool=True
62 | ) -> List[dict]:
63 |
64 | book_info_list_data = []
65 |
66 | for tag_url in tag_url_list_data:
67 |
68 |         # start paging; 20 items per page
69 |         for parse in range(0,parse_number*20+1,20):
70 | 
71 |             # paged URL
72 |             parse_url = f'{tag_url}?start={parse}'
73 |
74 | html = requests.get(url=parse_url,headers=HEADERS)
75 | soup = etree.HTML(html.text)
76 |
77 |             # select the book entries
78 | books = soup.xpath('//li[@class="subject-item"]')
79 |
80 | for book in books:
81 |
82 |                 # book link
83 |                 book_url = book.xpath('.//h2/a/@href')[0]
84 | 
85 |                 # title
86 |                 title = book.xpath('.//h2/a/text()')[0].strip().replace(' ','').replace('\n','')
87 | 
88 |                 # author / publisher line
89 |                 info = book.xpath('.//div[@class="pub"]/text()')[0].strip().replace(' ','').replace('\n','')
90 | 
91 |                 # rating
92 |                 star = book.xpath('.//span[@class="rating_nums"]/text()')[0].strip().replace(' ','').replace('\n','')
93 | 
94 |                 # rating count
95 |                 pl = book.xpath('.//span[@class="pl"]/text()')[0].strip().replace(' ','').replace('\n','')
96 | 
97 |                 # blurb
98 |                 introduce = book.xpath('.//p/text()')[0].strip().replace(' ','').replace('\n','')
99 | 
100 |                 # cover image URL
101 |                 image_url = book.xpath('.//img/@src')[0]
102 |
103 | book_info_result = dict(
104 | 书本链接=book_url,
105 | 书名=title,
106 | 作者=info,
107 | 评分=star,
108 | 评价=pl,
109 | 书本简介=introduce,
110 | 图片链接=image_url
111 | )
112 |
113 |                 '''hash of the result, for dedup'''
114 | result_hash_data = hash(json.dumps(book_info_result,ensure_ascii=False))
115 |
116 | if result_hash_data not in RESULT_SET_DATA:
117 |
118 |                     '''record it in the dedup set'''
119 | RESULT_SET_DATA.add(result_hash_data)
120 |
121 | if write_image_type:
122 | write_image_book_info(
123 | image_url=image_url,
124 | image_name=title,
125 | headers=HEADERS
126 | )
127 |
128 |                     # optionally write the json file
129 | if write_json_type:
130 | write_json_book_info(book_info_result)
131 |
132 |                     # optionally write the csv file
133 | if write_csv_type:
134 | write_csv_book_info(
135 | headers=[key for key,value in book_info_result.items()],
136 | book_info=[value for key,value in book_info_result.items()]
137 | )
138 |
139 | print(book_info_result)
140 |
141 | book_info_list_data.append(book_info_result)
142 |
143 | return book_info_list_data
144 |
145 |
146 | '''
147 | Save the cover image and append a name-to-uuid mapping to a JSON file
148 | params:
149 |     image_url:str --> image link
150 |     image_name:str --> image name
151 |     headers: dict --> request headers
152 | '''
153 | def write_image_book_info(image_url:str,image_name:str,headers:dict):
154 |
155 |     '''uuid keeps image file names unique'''
156 | uuid_id = uuid4()
157 |
158 | filename = './保存图片/图片'
159 |
160 | image_file_name = f'{filename}/{uuid_id}.jpg'
161 |
162 | image_map_file_name = f'./保存图片/image_map_data.json'
163 |
164 |     '''create the folder if it does not exist'''
165 | if not os.path.exists(filename):
166 | os.makedirs(filename)
167 |
168 | html = requests.get(url=image_url,headers=headers)
169 |
170 |     '''write the image'''
171 | with open(image_file_name,'wb') as f:
172 |
173 | f.write(html.content)
174 |
175 |     '''append to the image-mapping JSON file'''
176 | with open(image_map_file_name,'a+',encoding='utf-8') as f:
177 |
178 | f.write(json.dumps(dict(image_name=image_name,uuid=str(uuid_id),image_url=image_url),ensure_ascii=False)+'\n')
179 |
180 |
181 |
182 | '''
183 | Append one book record to the json file
184 | params:
185 |     book_info: dict --> one crawled book
186 | '''
187 | def write_json_book_info(book_info:dict):
188 |
189 | with open('book_info.json','a+',encoding='utf-8') as f:
190 |
191 |         '''
192 |         json.dumps() turns the dict into a str, i.e. one JSON line
193 |         ensure_ascii=False keeps the Chinese readable
194 |         '''
195 | f.write(json.dumps(book_info,ensure_ascii=False)+'\n')
196 |
197 |
198 |
199 | '''
200 | Append one book record to the csv file (with a header row)
201 | params:
202 |     headers:list --> CSV header
203 |     book_info: list --> one crawled book
204 | '''
205 | def write_csv_book_info(headers:list,book_info:list):
206 |
207 |     '''
208 |     Cross-platform note:
209 |     on Windows the csv module writes an extra
210 |     blank line after each row unless the file
211 |     is opened with newline=''
212 |     (harmless to keep on other platforms)
213 |     '''
214 | 
215 |     '''
216 |     Create the CSV file with a header row
217 |     if it does not exist yet
218 |     '''
219 | if not os.path.exists('book_info.csv'):
220 |
221 | with open('book_info.csv','a+',encoding='utf-8',newline='') as f:
222 |
223 | f_csv = csv.writer(f)
224 | f_csv.writerow(headers)
225 |
226 |
227 |
228 |     '''
229 |     then append the rows one by one
230 |     '''
231 | with open('book_info.csv','a+',encoding='utf-8',newline='') as f:
232 |
233 | f_csv = csv.writer(f)
234 |         f_csv.writerow(book_info) # append one row
235 |
236 | if __name__ == '__main__':
237 |
238 | book_tag_url = get_book_tag_url(1)
239 |
240 | book_url_info = parse_book_url_info(book_tag_url)
--------------------------------------------------------------------------------