├── .gitattributes
├── .gitignore
├── B站模拟扫码登录
    └── demo.py
├── README.md
├── requirements.txt
├── 下载小鹅通视频
    ├── 2021年12月
    │   ├── 1.前置知识
    │   │   ├── 1.AES-CBC解密
    │   │   │   ├── CBC解密.py
    │   │   │   ├── after.ts
    │   │   │   └── before.ts
    │   │   ├── 2.m3u8解析
    │   │   │   ├── demo.m3u8
    │   │   │   └── parse.py
    │   │   ├── 3.HTML注入JS
    │   │   │   ├── after.html
    │   │   │   ├── before.html
    │   │   │   └── demo.py
    │   │   └── 4.ffmpeg合成ts视频
    │   │   │   ├── demo.py
    │   │   │   ├── ffmpeg.exe
    │   │   │   ├── file.txt
    │   │   │   ├── out.mp4
    │   │   │   └── 素材
    │   │   │       ├── 1.ts
    │   │   │       ├── 10.ts
    │   │   │       ├── 11.ts
    │   │   │       ├── 12.ts
    │   │   │       ├── 13.ts
    │   │   │       ├── 14.ts
    │   │   │       ├── 15.ts
    │   │   │       ├── 2.ts
    │   │   │       ├── 3.ts
    │   │   │       ├── 4.ts
    │   │   │       ├── 5.ts
    │   │   │       ├── 6.ts
    │   │   │       ├── 7.ts
    │   │   │       ├── 8.ts
    │   │   │       └── 9.ts
    │   ├── 2.自动合并版本
    │   │   ├── ffmpeg.exe
    │   │   ├── request_demo.py
    │   │   ├── requirements.txt
    │   │   ├── selenium启动
    │   │   │   ├── chromedriver.exe
    │   │   │   ├── selenium_start.py
    │   │   │   └── 谷歌驱动下载地址.txt
    │   │   ├── 启动程序指令.txt
    │   │   └── 安装环境指令.txt
    │   └── 3.手动合并版本
    │   │   ├── ffmpeg.exe
    │   │   ├── request_demo.py
    │   │   ├── requirements.txt
    │   │   ├── selenium启动
    │   │       ├── chromedriver.exe
    │   │       ├── selenium_start.py
    │   │       └── 谷歌驱动下载地址.txt
    │   │   ├── 启动程序指令.txt
    │   │   └── 安装环境指令.txt
    └── 2022年12月
    │   └── 1.自动合并版本
    │       ├── N_m3u8DL-CLI_v3.0.2.exe
    │       ├── ffmpeg.exe
    │       ├── request_demo.py
    │       ├── requirements.txt
    │       ├── selenium启动
    │           ├── chromedriver.exe
    │           ├── selenium_start.py
    │           └── 谷歌驱动下载地址.txt
    │       ├── 启动程序指令.txt
    │       └── 安装环境指令.txt
├── 下载荔枝微课
    ├── ffmpeg.exe
    ├── request_demo.py
    ├── requirements.txt
    ├── selenium启动
    │   ├── chromedriver.exe
    │   ├── selenium_start.py
    │   └── 谷歌驱动下载地址.txt
    ├── 启动程序指令.txt
    └── 安装环境指令.txt
├── 京东商品信息
    └── crawl.py
├── 房天下
    ├── crawl.py
    └── db.py
├── 新版QQ音乐
    ├── README.md
    ├── crawl.py
    ├── db.py
    ├── demo.py
    └── get_sign.js
├── 旧版QQ音乐(仍可用)
    ├── README.md
    ├── crawl.py
    ├── db.py
    └── demo.py
├── 有道翻译
    └── crawl.py
├── 构建代理池
    ├── crawl.py
    └── ip_pool.json
├── 百度图片
    └── crawl.py
├── 破解有道翻译
    └── crawl.py
├── 破解网易登录
    ├── crawl.py
    ├── pw.js
    └── rtid.js
└── 豆瓣读书
    ├── 入库版
        ├── book.py
        └── boook_db.py
    └── 分类实现版
        ├── requirements.txt
        ├── 【bs4实现】豆瓣读书爬虫.py
        ├── 【re实现】豆瓣读书爬虫.py
        └── 【xpath实现】豆瓣读书爬虫.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=python


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | 


--------------------------------------------------------------------------------
/B站模拟扫码登录/demo.py:
--------------------------------------------------------------------------------
 1 | # Python3.7
 2 | # encoding=utf-8
 3 | 
 4 | import requests,time,json,os
 5 | import qrcode   # 生成二维码
 6 | import cv2 as cv # 读取二维码图片
 7 | from concurrent.futures import ThreadPoolExecutor
 8 | 
 9 | '''
10 |     需要安装第三方库：
11 |     pip install qrcode==7.3
12 |     pip install opencv-python==4.5.3.56
13 | '''
14 | 
15 | headers = {
16 |     'referer':'https://passport.bilibili.com/login',
17 |     'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36x-requested-with: XMLHttpRequest'
18 | }
19 | 
20 | class Login():
21 | 
22 |     def __init__(self):
23 |         self.oauthKey = ''
24 |         self.qrcodeURL = ''
25 |         self.session = requests.Session()
26 |         self.session.headers = headers
27 | 
28 |     # 获取二维码图片地址
29 |     def getQRcode(self):
30 |         
31 |         html = self.session.get('https://passport.bilibili.com/qrcode/getLoginUrl')
32 |         if html.json()['status'] == True:
33 |             self.oauthKey = html.json()['data']['oauthKey']
34 |             self.qrcodeURL = html.json()['data']['url']
35 |             return True
36 |         return False
37 | 
38 |     # 利用 opencv 读取图片
39 |     @staticmethod
40 |     def showQRCode(url):
41 |         qrCode = qrcode.QRCode()
42 |         qrCode.add_data(url)
43 |         qrCode = qrCode.make_image()
44 |         qrCode.save('qrCode.png')
45 |         img = cv.imread('qrCode.png',1)
46 |         cv.imshow('Login',img)
47 |         cv.waitKey()
48 | 
49 |     # 开始登录
50 |     def login(self):
51 | 
52 |         # 创建另一个线程，展示二维码图片
53 |         thread_pool = ThreadPoolExecutor(max_workers=2)
54 |         if self.getQRcode():
55 |             thread_pool.submit(self.showQRCode,self.qrcodeURL)
56 |         
57 |         # 不断检查二维码是否确认登录
58 |         while True:
59 |             time.sleep(1)
60 |             data = {
61 |                 'oauthKey':self.oauthKey,
62 |                 'gourl':'https://www.bilibili.com/'
63 |             }
64 | 
65 |             html = self.session.post('https://passport.bilibili.com/qrcode/getLoginInfo',headers=headers,data=data)
66 | 
67 |             if html.json()['data'] == -4: # 还没扫码
68 |                 pass
69 |             elif html.json()['data'] == -2: # 二维码过期，需要重新生成
70 |                 self.getQRcode()
71 |                 thread_pool.submit(self.showQRCode,self.qrcodeURL)
72 |             elif html.json()['data'] == -5: # 已经扫码，等待确认
73 |                 pass
74 |             else:
75 |                 break
76 |         
77 |         # 解析 cookie
78 |         cookieRaw = html.json()['data']['url'].split('?')[1].split('&')
79 |         cookies = {}
80 |         for cookie in cookieRaw:
81 |             key,value = cookie.split('=')
82 |             if key != 'gourl' and key != 'Expires':
83 |                 cookies[key] = value
84 |         print(json.dumps(cookies))
85 |         os._exit(0)
86 | 
87 | if __name__ == '__main__':
88 |     login = Login()
89 |     login.login()
90 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # **Python3Webcrawler**
 2 | ## **[哔哩哔哩作者：-相依-](https://space.bilibili.com/343154012)**  &emsp;**UPDATE 2023-02-15**
 3 | 
 4 | ---
 5 | 
 6 | > **精心挑选了几个爬虫，给大家在学Scrapy框架之前打基础。**
 7 | >> **该项目仅限学习交流，请勿用于商业用途，如有侵权，请联系删除。**
 8 | 
 9 | ---
10 | 
11 | |**运行环境**|**项目使用版本**|
12 | |:----:|:--------:|
13 | |**python**|**3.7.9**|
14 | |**NodeJS**|**14.6.0**|
15 | 
16 | ---
17 | 
18 | |**程序依赖**|**安装指令**|**项目使用版本**|
19 | |:----:|:--------:|:--------:|
20 | |**lxml**|**pip install lxml**|**4.6.2**|
21 | |**aiohttp**|**pip install aiohttp**|**3.7.4**|
22 | |**requests**|**pip install requests**|**2.25.1**|
23 | |**PyExecJS**|**pip install PyExecJS**|**1.5.1**|
24 | |**sqlalchemy**|**pip install sqlalchemy**|**1.3.23**|
25 | |**beautifulsoup4**|**pip install beautifulsoup4**|**4.9.3**|
26 | |**mysqlconnector**|**pip install mysql-connector-python**|**8.0.23**|
27 | |**qrcode**|**pip install qrcode**|**7.3**|
28 | |**opencv-python**|**pip install opencv-python**|**4.5.3.56**|
29 | |**m3u8**|**pip install m3u8**|**0.9.0**|
30 | |**mitmproxy**|**pip install mitmproxy**|**5.3.0**|
31 | |**selenium**|**pip install selenium**|**3.141.0**|
32 | |**pycryptodome**|**pip install pycryptodome**|**3.10.1**|
33 | ---
34 | 
35 |  * ### **京东&emsp;&emsp;&emsp;[官网地址](https://item.jd.com)** 
36 |  * ### **网易&emsp;&emsp;&emsp;[官网地址](https://www.163.com/)** 
37 |  * ### **房天下&emsp;&emsp;[官网地址](https://www.fang.com)** 
38 |  * ### **快代理&emsp;&emsp;[官网地址](https://www.kuaidaili.com)** 
39 |  * ### **QQ音乐 &emsp; [官网地址](https://y.qq.com)** 
40 |  * ### **百度图片&emsp;[官网地址](https://image.baidu.com)** 
41 |  * ### **豆瓣读书&emsp;[官网地址](https://book.douban.com)** 
42 |  * ### **有道翻译&emsp;[官网地址](http://fanyi.youdao.com)** 
43 |  * ### **哔哩哔哩&emsp;[官网地址](https://bilibili.com)** 
44 |  * ### **小鹅通&emsp;&emsp;[官网地址](https://www.xiaoe-tech.com)**
45 |  * ### **荔枝微课&emsp;[官网地址](https://m.lizhiweike.com)**


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | aiohttp==3.7.4
 2 | async-timeout==3.0.1
 3 | attrs==20.3.0
 4 | beautifulsoup4==4.9.3
 5 | certifi==2020.12.5
 6 | chardet==3.0.4
 7 | idna==2.10
 8 | lxml==4.6.2
 9 | multidict==5.1.0
10 | mysql-connector-python==8.0.23
11 | protobuf==3.13.0
12 | PyExecJS==1.5.1
13 | requests==2.25.1
14 | six==1.15.0
15 | soupsieve==2.2
16 | SQLAlchemy==1.3.23
17 | typing-extensions==3.7.4.3
18 | urllib3==1.26.3
19 | yarl==1.6.3
20 | qrcode==7.3
21 | opencv-python==4.5.3.56
22 | selenium==3.141.0
23 | m3u8==0.9.0
24 | mitmproxy==5.3.0
25 | pycryptodome==3.10.1


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/CBC解密.py:
--------------------------------------------------------------------------------
 1 | from Crypto.Cipher import AES
 2 | 
 3 | # 设置模式
 4 | mode = AES.MODE_CBC
 5 | 
 6 | key = b'V\x9dH\x1e:\xe6g\x10\x11l\xd7\xab\xd5\xd3\xc1\xbc'
 7 | 
 8 | '''
 9 |     生成解密对象
10 |     key：密钥
11 |     mode：解密模式
12 |     iv：偏移量
13 | '''
14 | cryptos = AES.new(key=key,mode=mode,iv=b'0000000000000000')
15 | 
16 | with open('before.ts','rb') as f:   # 解密前
17 |     with open('after.ts','wb') as f2:   # 解密后
18 |         f2.write(cryptos.decrypt(f.read()))
19 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/after.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/after.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/before.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/before.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/2.m3u8解析/demo.m3u8:
--------------------------------------------------------------------------------
 1 | #EXTM3U
 2 | #EXT-X-VERSION:3
 3 | #EXT-X-TARGETDURATION:11
 4 | #EXT-X-MEDIA-SEQUENCE:0
 5 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
 6 | #EXTINF:2.000000,
 7 | v.f230.ts?start=0&end=68063&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
 8 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
 9 | #EXTINF:2.000000,
10 | v.f230.ts?start=68064&end=130671&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
11 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
12 | #EXTINF:1.708333,
13 | v.f230.ts?start=130672&end=190847&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
14 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
15 | #EXTINF:2.000000,
16 | v.f230.ts?start=190848&end=281471&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
17 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
18 | #EXTINF:2.000000,
19 | v.f230.ts?start=281472&end=369471&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
20 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
21 | #EXTINF:2.000000,
22 | v.f230.ts?start=369472&end=457647&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
23 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
24 | #EXTINF:2.000000,
25 | v.f230.ts?start=457648&end=742095&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
26 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
27 | #EXTINF:4.291667,
28 | v.f230.ts?start=742096&end=1186719&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
29 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
30 | #EXTINF:4.000000,
31 | v.f230.ts?start=1186720&end=1413087&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
32 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
33 | #EXTINF:4.000000,
34 | v.f230.ts?start=1413088&end=1776687&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
35 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
36 | #EXTINF:5.791667,
37 | v.f230.ts?start=1776688&end=2031631&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
38 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
39 | #EXTINF:6.000000,
40 | v.f230.ts?start=2031632&end=2294271&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
41 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
42 | #EXTINF:6.000000,
43 | v.f230.ts?start=2294272&end=2535679&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
44 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
45 | #EXTINF:10.208333,
46 | v.f230.ts?start=2535680&end=3179583&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
47 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
48 | #EXTINF:9.666667,
49 | v.f230.ts?start=3179584&end=3695279&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
50 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
51 | #EXTINF:10.000000,
52 | v.f230.ts?start=3695280&end=3994207&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
53 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
54 | #EXTINF:10.750000,
55 | v.f230.ts?start=3994208&end=4735695&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
56 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
57 | #EXTINF:9.333333,
58 | v.f230.ts?start=4735696&end=5240671&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
59 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
60 | #EXTINF:9.583333,
61 | v.f230.ts?start=5240672&end=5551439&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
62 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
63 | #EXTINF:10.416667,
64 | v.f230.ts?start=5551440&end=5820671&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
65 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000
66 | #EXTINF:6.416667,
67 | v.f230.ts?start=5820672&end=5890239&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com
68 | #EXT-X-ENDLIST
69 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/2.m3u8解析/parse.py:
--------------------------------------------------------------------------------
 1 | import m3u8
 2 | 
 3 | '''
 4 |     m3u8 官方文档：https://github.com/globocom/m3u8
 5 | '''
 6 | with open(r'demo.m3u8','r',encoding='utf-8') as f:
 7 | 
 8 |     # 解析 m3u8
 9 |     dict_data = m3u8.parse(f.read())
10 |     print(dict_data)
11 | 
12 |     # 获取键值
13 |     # print(dict_data.keys())
14 | 
15 |     # 获取 m3u8 分片地址
16 |     # for data in dict_data['segments']:
17 |     #     print(data['uri'])
18 |     #     start = data['uri'].split('?')[1].split('&')[0]
19 |     #     end = data['uri'].split('?')[1].split('&')[1]
20 |     #     print(start + end)
21 | 
22 | 
23 |     # 获取 m3u8 加密地址
24 |     # for data in dict_data['keys']:
25 |     #     print(data['uri'])
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/3.HTML注入JS/after.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <!--[if lt IE 9]><html lang="en-US" class="lt-ie9 l-ie9" data-n-head-ssr><![endif]-->
 3 | <!--[if IE 9]><html lang="en-US" class="lt-ie9 ie9" data-n-head-ssr><![endif]-->
 4 | <!--[if (gt IE 9)|!(IE)]><!-->
 5 | <html data-n-head-ssr="">
 6 |  <!--<![endif]-->
 7 |  <head>
 8 |   <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
 9 |   <meta content="upgrade-insecure-requests" http-equiv="Content-Security-Policy"/>
10 |   <meta content="webkit" name="renderer"/>
11 |   <meta content="width=device-width,initial-scale=1" name="viewport"/>
12 |   <title>
13 |    模拟多个script节点
14 |   </title>
15 |   <script>
16 |   </script>
17 |   <script>
18 |   </script>
19 |  </head>
20 |  <body>
21 |   <script>
22 |   </script>
23 |  </body>
24 |  <script>
25 |  </script>
26 |  <script>
27 |  </script>
28 |  <script type="text/javascript">
29 |   alert('靓仔')
30 |  </script>
31 | </html>
32 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/3.HTML注入JS/before.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <!--[if lt IE 9]><html lang="en-US" class="lt-ie9 l-ie9" data-n-head-ssr><![endif]-->
 3 | <!--[if IE 9]><html lang="en-US" class="lt-ie9 ie9" data-n-head-ssr><![endif]-->
 4 | <!--[if (gt IE 9)|!(IE)]><!--><html data-n-head-ssr><!--<![endif]-->
 5 | <head >
 6 |   <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
 7 |   <meta http-equiv="Content-Security-Policy" content="upgrade-insecure-requests">
 8 |   <meta name="renderer" content="webkit">
 9 |   <meta name="viewport" content="width=device-width,initial-scale=1">
10 |   <title>模拟多个script节点</title>
11 | 
12 |   <script></script>
13 |   <script></script>
14 | 
15 | </head>
16 | <body>
17 | 
18 |       <script></script>
19 | 
20 | </body>
21 |       <script></script>
22 |       <script></script>
23 | </html>
24 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/3.HTML注入JS/demo.py:
--------------------------------------------------------------------------------
 1 | from bs4 import BeautifulSoup
 2 | 
 3 | '''
 4 |     BeautifulSoup修改文档树-官方文档：https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id45
 5 | '''
 6 | 
 7 | with open('before.html','r',encoding='utf-8') as f:
 8 |     
 9 |     soup = BeautifulSoup(f.read(), 'lxml')
10 | 
11 | 
12 |     # 创建 HTML 的 script 节点
13 |     script_tag = soup.new_tag('script', type='text/javascript')
14 |     script_tag.string = "alert('靓仔')"
15 |     # print(script_tag)
16 | 
17 |     # 获取最后一个 script 节点，向后插入
18 |     print('[插入前] 最后一个节点：{}'.format(soup.select('script')[-1]))
19 |     soup.select('script')[-1].insert_after(script_tag)
20 |     print('[插入后] 最后一个节点：{}'.format(soup.select('script')[-1]))
21 | 
22 |     with open('after.html','w',encoding='utf-8') as f:
23 |         f.write(soup.prettify()) # 格式化写入
24 |     
25 |     # print(soup)


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/demo.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | '''
 4 |     路径常识：
 5 |     \ 不能在Linux
 6 |     / 能够跨平台
 7 |     推荐使用 / 路径  
 8 | '''
 9 | 
10 | print('\\') 
11 | print('/')
12 | 
13 | # 遍历文件夹	
14 | for dirpath,dirnames,files in os.walk('./素材'):
15 | 
16 |     # 获取有多少个文件
17 |     print(files)
18 | 
19 |     # 将文件名排序好
20 |     # list_data = [ int(data.replace('.ts','')) for data in files]
21 |     # list_data.sort()
22 |     # print(list_data)
23 | 
24 |     # 开始写入文件
25 |     # for index in list_data:
26 | 
27 |     #     # 写入
28 |     #     with open('file.txt','a+',encoding='utf-8') as f1:
29 |             
30 |     #         # 读取
31 |     #         with open('file.txt','r',encoding='utf-8') as f2:
32 |                 
33 |     #             # 获取当前绝对路径
34 |     #             current_filename = os.getcwd().replace('\\','/')
35 | 
36 |     #             # 文件名
37 |     #             filename = current_filename + '/素材/{}.ts'.format(index)
38 | 
39 |     #             # 如果该文件名不在里面，就写入
40 |     #             if filename not in f2.read():
41 |     #                 f1.write("file '{}'\n".format(filename))
42 | 
43 | 
44 | # 设置UTF-8编码，让命令行支持中文编码
45 | # cmd = 'ffmpeg.exe -f concat -safe 0 -i file.txt -c copy out.mp4"'
46 | # os.system('CHCP 65001')
47 | # os.system(cmd.replace('/', '\\'))
48 | 
49 |     
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/ffmpeg.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/ffmpeg.exe


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/file.txt:
--------------------------------------------------------------------------------
 1 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/1.ts'
 2 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/2.ts'
 3 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/3.ts'
 4 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/4.ts'
 5 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/5.ts'
 6 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/6.ts'
 7 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/7.ts'
 8 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/8.ts'
 9 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/9.ts'
10 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/10.ts'
11 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/11.ts'
12 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/12.ts'
13 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/13.ts'
14 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/14.ts'
15 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/15.ts'
16 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/out.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/out.mp4


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/1.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/1.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/10.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/10.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/11.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/11.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/12.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/12.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/13.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/13.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/14.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/14.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/15.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/15.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/2.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/2.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/3.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/3.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/4.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/4.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/5.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/5.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/6.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/6.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/7.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/7.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/8.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/8.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/9.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/9.ts


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/ffmpeg.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/2.自动合并版本/ffmpeg.exe


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/request_demo.py:
--------------------------------------------------------------------------------
  1 | # python 3.7
  2 | import mitmproxy.http,json,os,m3u8,requests,shutil
  3 | from bs4 import BeautifulSoup
  4 | from mitmproxy import ctx
  5 | from pathlib import Path
  6 | from Crypto.Cipher import AES
  7 | 
  8 | '''
  9 | 本次爬取的课程地址：https://xueyuan.xiaoe-tech.com/detail/p_5e269260ab5c5_O25BMaat/6
 10 | '''
 11 | 
 12 | # 生成python修复文件
 13 | repair_file_py = r'''
 14 | import os
 15 | 
 16 | from Crypto.Cipher import AES
 17 | 
 18 | # 获取当前路径
 19 | current_filename = os.getcwd().replace('\\','/') 
 20 | 
 21 | # 修复文件连接
 22 | new_repair_file_txt = current_filename + '/修复文件/' + 'repair_file.txt'
 23 | 
 24 | # 开始修复文件
 25 | def decrypt_file():
 26 | 
 27 |     global new_repair_file_txt
 28 | 
 29 |     before_content = None
 30 | 
 31 |     key = {}
 32 | 
 33 |     mode = AES.MODE_CBC
 34 | 
 35 |     # 获取 AES 解密对象
 36 |     cryptos = AES.new(key, mode)
 37 | 
 38 |     # 创建修复文件
 39 |     repair_filename = current_filename + '/修复文件'
 40 |     if not os.path.exists(repair_filename):
 41 |         os.makedirs(repair_filename)
 42 | 
 43 |     with open('not_finish_file.txt','r',encoding='utf-8') as f1:
 44 | 
 45 |         # 读取第一行
 46 |         line = f1.readline()
 47 | 
 48 |         # 逐行读取
 49 |         while line:
 50 |             # 获取 还没被解密的 ts 视频的路径
 51 |             not_finish_file_line = line.split(' ')[1].replace('\n','').replace("'",'').replace('\\','/')
 52 |             print(not_finish_file_line)
 53 | 
 54 |             with open(not_finish_file_line,'rb') as f:  # 解密之前
 55 |                 before_content = f.read()
 56 | 
 57 |             # 写入 修复文件
 58 |             new_repair_filename = repair_filename + '/' + not_finish_file_line.split('/')[-1]
 59 |             print(new_repair_filename)
 60 |             with open(new_repair_filename,'wb') as f:  # 解密之后
 61 |                 f.write(cryptos.decrypt(before_content))
 62 | 
 63 |             new_repair_file_txt = repair_filename + '/' + 'repair_file.txt'
 64 | 
 65 |             # 确保不重复
 66 |             with open(new_repair_file_txt,'a+',encoding='utf-8') as f3:  # 解密之后
 67 |                 with open(new_repair_file_txt,'r',encoding='utf-8') as f4:
 68 |                     if str(new_repair_filename) not in f4.read():
 69 |                         f3.write("file '%s'\n" % str(new_repair_filename))
 70 | 
 71 |             line = f1.readline()
 72 | 
 73 | # 使用 not_finish_file.txt 合成视频
 74 | def compose_file():
 75 |     
 76 |     cmd = "ffmpeg.exe -f concat -safe 0 -i " + new_repair_file_txt  + " -c copy 1.修复视频.mp4"
 77 |     print(cmd)
 78 |     # 设置UTF-8编码
 79 |     os.system('CHCP 65001')
 80 |     os.system(cmd.replace('/','\\'))
 81 | 
 82 | decrypt_file()
 83 | compose_file()
 84 | '''
 85 | 
 86 | cryptos = None    # AES解密
 87 | m3u8_data = None  # 保存m3u8有多少个uri
 88 | filename = None   # 下载视频路径
 89 | current_filename = os.getcwd().replace('\\','/') # 获取当前路径
 90 | result_filename = current_filename + '/合成的视频' # 获取 ffmepg合成视频后的路径
 91 | title = None # 标题
 92 | finish_file_flag = False # 标记是否存在 还没被解密的 ts 视频
 93 | 
 94 | class Counter:
 95 | 
 96 |     def __init__(self):
 97 |         self.Referer = 'https://xueyuan.xiaoe-tech.com/'
 98 |         self.Cookie = '请填写你的cookie'
 99 |         self.UserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'
100 |         self.headers = {
101 |             'Referer':self.Referer,
102 |             'Cookie':self.Cookie,
103 |             'UserAgent':self.UserAgent
104 |         }
105 | 
106 |     def request(self, flow: mitmproxy.http.HTTPFlow):
107 | 
108 |         # 所有请求插入协议头
109 |         flow.request.headers['Referer'] = self.Referer
110 |         flow.request.headers['Cookie'] = self.Cookie
111 | 
112 |     def response(self, flow: mitmproxy.http.HTTPFlow):
113 |         
114 |         # 导入全局变量
115 |         global cryptos,filename,m3u8_data,result_filename,repair_file_py,title,finish_file_flag
116 | 
117 |         # 注入 JavaScript 
118 |         # 启动就能点击播放器
119 |         if 'detail' in flow.request.url:
120 |             
121 |             # 确保匹配 HTML
122 |             if 'text/html' in flow.response.headers['Content-Type']:
123 |                 
124 |                 javascript_text = '''
125 |                     // 视频播放速度
126 |                     const playbackRate = 16;
127 |                     
128 |                     function start_video(){
129 |                         
130 |                         // 确保修改了视频播放速度
131 |                         while(document.querySelector('video').playbackRate != playbackRate ){
132 |                             
133 |                             // 点击播放器
134 |                             document.querySelector('div.iconfont.playButton.icon-icon_play').click();
135 | 
136 |                             // 设置视频重头播放
137 |                             document.querySelector('video').currentTime = 0;
138 | 
139 |                             // 设置视频自动播放
140 |                             document.querySelector('video').autoplay = true;
141 | 
142 |                             // 设置视频播放速度
143 |                             document.querySelector('video').playbackRate = playbackRate;
144 | 
145 |                             // 设置视频静音
146 |                             document.querySelector('video').muted = true
147 | 
148 |                             // 开始播放
149 |                             document.querySelector('video').play();
150 |                         } 
151 |                     };
152 | 
153 |                     // 使用递归，异步等待，确保video标签会出现
154 |                     function waitForElementToDisplay(selector, time) {
155 | 
156 |                         // video标签出现后，异步等待 1 秒
157 |                         if(document.querySelector(selector)!=null) {
158 |                             
159 |                             console.log('获取成功video');
160 |                             setTimeout(
161 |                                 ()=>{
162 |                                     start_video();
163 |                                 },1000
164 |                             ); 
165 |                             
166 |                             return;
167 |                         }
168 |                         else {
169 |                             setTimeout( ()=> {
170 |                                 waitForElementToDisplay(selector, time);
171 |                             }, time);
172 |                         }
173 |                     }
174 | 
175 |                     // 每过 1 秒检查video标签 是否出现
176 |                     waitForElementToDisplay('video',1000)                            
177 |                 '''
178 | 
179 |                 # 获取 BeautifulSoup 对象
180 |                 soup = BeautifulSoup(flow.response.text, 'lxml')
181 | 
182 |                 # 生成一个script节点
183 |                 script_tag = soup.new_tag('script', type='text/javascript')
184 | 
185 |                 # 往script节点写入内容
186 |                 script_tag.string = javascript_text
187 | 
188 |                 # 在当前 HTML 最后一个script节点  向后插入一个节点
189 |                 soup.select('script')[-1].insert_after(script_tag)
190 | 
191 |                 # 修改当前 HTML 全部内容
192 |                 flow.response.text = str(soup)
193 | 
194 |         # 设置 AES解密模式
195 |         mode = AES.MODE_CBC
196 | 
197 |         # 获取课程标题
198 |         if 'xe.goods.detail.get' in flow.request.url:
199 |             
200 |             # 加载 JSON 对象
201 |             json_data = json.loads(flow.response.text)
202 | 
203 |             # 获取当前视频标题
204 |             title = json_data['data']['title'].replace(' ','')
205 | 
206 |             # 如果没有文件夹，就创建文件夹
207 |             filename = current_filename + '/下载成功的视频/{}'.format(title)
208 |             if not os.path.exists(filename):
209 |                 os.makedirs(filename)
210 |                 
211 |             if not os.path.exists(result_filename):
212 |                 os.makedirs(result_filename)
213 | 
214 |         # 匹配 m3u8
215 |         if '.m3u8' in flow.request.url:
216 |             
217 |             # 加载 m3u8 对象
218 |             dict_data = m3u8.parse(flow.response.text)
219 | 
220 |             # 获取 m3u8 全部分片链接
221 |             m3u8_data = [ data['uri'] for data in dict_data['segments']]
222 |             print(m3u8_data)
223 | 
224 |             # 获取解密参数
225 |             m3u8_content = requests.get(url=dict_data['keys'][0]['uri'],headers=self.headers).content
226 |             cryptos = AES.new(m3u8_content,mode)
227 | 
228 |             # 将密钥 写入 修复文件
229 |             repair_file_py = repair_file_py.format(str(m3u8_content))
230 |             print('\n' + '-'*50)
231 |             print('\n当前密钥：{}'.format(str(m3u8_content)))
232 | 
233 |                
234 |         # 匹配密钥
235 |         if 'get_video_key.php' in flow.request.url:
236 |             
237 |             print('\n当前密钥：{}'.format(str(flow.response.content)))
238 | 
239 |             # 将密钥 写入 修复文件
240 |             repair_file_py = repair_file_py.format(str(flow.response.content))
241 |             cryptos = AES.new(flow.response.content, mode)
242 | 
243 |         # 解密 ts 文件
244 |         if '.ts' in flow.request.url:
245 |             
246 |             print('-'*50)
247 |             print('\n[当前解密对象]：{}\n'.format(cryptos))
248 | 
249 |             # 拼接当前视频保存路径
250 |             m3u8_ts_filename = filename + '/start={}end={}.ts'.format(flow.request.query.get('start'),flow.request.query.get('end'))
251 |             print('[当前视频]：{} [保存路径]：{}\n'.format(title,m3u8_ts_filename))
252 | 
253 |             # 用于合成
254 |             m3u8_finish_file_filename = filename + '/finish_file.txt'
255 | 
256 |             # 确定最后一个分片
257 |             start_data = m3u8_data[-1].split('?')[1].split('&')[0]
258 |             end_data = m3u8_data[-1].split('?')[1].split('&')[1]
259 |             result_data = start_data + end_data
260 | 
261 |             # 获取成功密钥，再解密
262 |             if cryptos != None:
263 | 
264 |                 # 保存 解密好的 ts
265 |                 with open(m3u8_ts_filename,'wb') as f:
266 |                     f.write(cryptos.decrypt(flow.response.content))
267 | 
268 | 
269 |                 # 写入 解密成功 标记文件
270 |                 with open(m3u8_finish_file_filename,'a+',encoding='utf-8') as f1:
271 |                     with open(m3u8_finish_file_filename,'r',encoding='utf-8') as f2:
272 |                         
273 |                         # 如果文件为空，同时又存在最后一片，将不写入
274 |                         if result_data in m3u8_ts_filename and f2.read()=='':
275 |                             pass
276 | 
277 |                         # 防止重复，确保路径没问题
278 |                         elif m3u8_ts_filename not in f2.read():
279 |                             f1.write("file '{}'\n".format(m3u8_ts_filename))
280 | 
281 |                 # 如果是最后一个分片，开始合成视频
282 |                 if result_data in m3u8_ts_filename:
283 |                     
284 |                     # 拷贝 ffmpeg.exe 写入指定目录
285 |                     ffmpeg_filename = filename + '/ffmpeg.exe'
286 |                     shutil.copyfile('ffmpeg.exe', ffmpeg_filename)
287 | 
288 |                     # 如果 存在 还没被解密的 ts 视频
289 |                     if finish_file_flag:
290 | 
291 |                         # 生成修复python
292 |                         repair_file = filename + '/repair.py'
293 |                         with open(repair_file,'w',encoding='utf-8') as f:
294 |                             f.write(repair_file_py)
295 | 
296 |                     # 合成视频
297 |                     cmd = 'ffmpeg.exe -f concat -safe 0 -i "' + m3u8_finish_file_filename + '" -c copy "' + result_filename + '/' + filename.split('/')[-1] + '.mp4"'
298 | 
299 | 
300 |                     # 读取 解密成功 标记文件
301 |                     with open(m3u8_finish_file_filename,'r',encoding='utf-8') as f:
302 |                         
303 |                         # 确保文件不为空
304 |                         if f.read()!='':
305 |                             
306 |                             mp4_filename = result_filename + '/' + filename.split('/')[-1] + '.mp4'
307 | 
308 |                             # 如果合成的视频已经存在，先删除，再执行
309 |                             if os.path.exists(mp4_filename):
310 |                                 os.remove(mp4_filename)
311 | 
312 |                                 # 设置UTF-8编码
313 |                                 os.system('CHCP 65001')
314 |                                 os.system(cmd.replace('/','\\'))
315 |                                 print('[警告]：文件路径 {}'.format(mp4_filename))
316 |                                 print('[警告]：文件被覆盖了，由于该文件之前已存在过')
317 |                             else:
318 |                                 os.system('CHCP 65001')
319 |                                 os.system(cmd.replace('/','\\'))
320 |                                 print('[成功]：文件路径 {}'.format(mp4_filename))
321 |                                 print('[成功]：合并完毕')
322 |                         else:
323 |                             print(os.path.exists(result_filename + '/' + filename.split('/')[-1] + '.mp4"'))
324 |                             print(result_filename + '/' + filename.split('/')[-1] + '.mp4')
325 |                             print('[异常]：当前视频只下载最后一片，将不会合成视频')
326 |             else:
327 |                 
328 |                 # 标记是否存在 还没被解密的 ts 视频
329 |                 finish_file_flag = True
330 | 
331 |                 # 保存 还没被解密的 ts 视频
332 |                 with open(m3u8_ts_filename,'wb') as f:
333 |                     f.write(flow.response.content)
334 |                 
335 |                 # 用于合成
336 |                 m3u8_not_finish_file__filename = filename + '/not_finish_file.txt'
337 |                 with open(m3u8_not_finish_file__filename,'a+',encoding='utf-8') as f:
338 |                     f.write("file '{}'\n".format(m3u8_ts_filename))
339 |                 
340 | 
341 | addons = [
342 |     Counter()
343 | ]


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/requirements.txt:
--------------------------------------------------------------------------------
 1 | asgiref==3.3.4
 2 | beautifulsoup4==4.9.3
 3 | blinker==1.4
 4 | Brotli==1.0.9
 5 | bs4==0.0.1
 6 | certifi==2021.5.30
 7 | cffi==1.14.6
 8 | charset-normalizer==2.0.3
 9 | click==7.1.2
10 | cryptography==3.2.1
11 | Flask==1.1.4
12 | h11==0.12.0
13 | h2==4.0.0
14 | hpack==4.0.0
15 | hyperframe==6.0.1
16 | idna==3.2
17 | iso8601==0.1.14
18 | itsdangerous==1.1.0
19 | Jinja2==2.11.3
20 | kaitaistruct==0.9
21 | ldap3==2.8.1
22 | m3u8==0.9.0
23 | MarkupSafe==2.0.1
24 | mitmproxy==5.3.0
25 | msgpack==1.0.2
26 | passlib==1.7.4
27 | protobuf==3.13.0
28 | publicsuffix2==2.20191221
29 | pyasn1==0.4.8
30 | pycparser==2.20
31 | pycryptodome==3.10.1
32 | pydivert==2.1.0
33 | pyOpenSSL==19.1.0
34 | pyparsing==2.4.7
35 | pyperclip==1.8.2
36 | requests==2.26.0
37 | ruamel.yaml==0.16.13
38 | ruamel.yaml.clib==0.2.6
39 | selenium==3.141.0
40 | six==1.16.0
41 | sortedcontainers==2.2.2
42 | soupsieve==2.2.1
43 | tornado==6.1
44 | typing-extensions==3.10.0.0
45 | urllib3==1.26.6
46 | urwid==2.1.2
47 | Werkzeug==1.0.1
48 | wsproto==0.15.0
49 | zstandard==0.14.1
50 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/selenium启动/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/2.自动合并版本/selenium启动/chromedriver.exe


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/selenium启动/selenium_start.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver
 2 | 
 3 | PROXY='http://127.0.0.1:8080'
 4 | 
 5 | chrome_options = webdriver.ChromeOptions()
 6 | chrome_options.add_argument("--proxy-server=127.0.0.1:8080")
 7 | chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])  # 专业模式
 8 | 
 9 | 
10 | browser = webdriver.Chrome(executable_path=r'chromedriver.exe',options=chrome_options)#r 代表的是强制禁止转义
11 | 
12 | '''
13 | 本次爬取的课程地址：https://xueyuan.xiaoe-tech.com/detail/p_5e269260ab5c5_O25BMaat/6
14 | '''
15 | 
16 | url = 'https://xueyuan.xiaoe-tech.com/detail/p_5e269260ab5c5_O25BMaat/6'
17 | browser.get(url)#访问网站
18 | 
19 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/selenium启动/谷歌驱动下载地址.txt:
--------------------------------------------------------------------------------
1 | ﻿地址一：
2 | http://chromedriver.storage.googleapis.com/index.html
3 |  
4 | 地址二：
5 | https://registry.npmmirror.com/binary.html?path=chromedriver/


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/启动程序指令.txt:
--------------------------------------------------------------------------------
1 | 先打开一个cmd，输入mitmweb -s request_demo.py
2 | 然后再cd进入selenium启动文件夹，输入python selenium_start.py


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/2.自动合并版本/安装环境指令.txt:
--------------------------------------------------------------------------------
 1 | 一次性安装
 2 | pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple
 3 | 
 4 | 分开安装
 5 | pip install mitmproxy
 6 | pip install selenium
 7 | pip install m3u8
 8 | pip install requests
 9 | pip install bs4
10 | pip install pycryptodome


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/ffmpeg.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/3.手动合并版本/ffmpeg.exe


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/request_demo.py:
--------------------------------------------------------------------------------
  1 | # python 3.7
  2 | import mitmproxy.http,json,os,shutil
  3 | from bs4 import BeautifulSoup
  4 | from mitmproxy import ctx
  5 | from pathlib import Path
  6 | from Crypto.Cipher import AES
  7 | 
  8 | '''
  9 | 本次爬取的课程地址：https://appdgjqmn6j1714.h5.xiaoeknow.com/v1/course/video/v_61ceb0f8e4b05006f9c4214e
 10 | '''
 11 | 
 12 | # 生成python修复文件
 13 | repair_file_py = r'''
 14 | 
 15 | "此文件用于保存密钥，请不要执行代码"
 16 | 
 17 | import os
 18 | 
 19 | from Crypto.Cipher import AES
 20 | 
 21 | # 获取当前路径
 22 | current_filename = os.getcwd().replace('\\','/') 
 23 | 
 24 | # 修复文件连接
 25 | new_repair_file_txt = current_filename + '/修复文件/' + 'repair_file.txt'
 26 | 
 27 | # 开始修复文件
 28 | def decrypt_file():
 29 | 
 30 |     global new_repair_file_txt
 31 | 
 32 |     before_content = None
 33 | 
 34 |     key = {}
 35 | 
 36 |     mode = AES.MODE_CBC
 37 | 
 38 |     # 获取 AES 解密对象
 39 |     cryptos = AES.new(key, mode)
 40 | 
 41 |     # 创建修复文件
 42 |     repair_filename = current_filename + '/修复文件'
 43 |     if not os.path.exists(repair_filename):
 44 |         os.makedirs(repair_filename)
 45 | 
 46 |     with open('not_finish_file.txt','r',encoding='utf-8') as f1:
 47 | 
 48 |         # 读取第一行
 49 |         line = f1.readline()
 50 | 
 51 |         # 逐行读取
 52 |         while line:
 53 |             # 获取 还没被解密的 ts 视频的路径
 54 |             not_finish_file_line = line.split(' ')[1].replace('\n','').replace("'",'').replace('\\','/')
 55 |             print(not_finish_file_line)
 56 | 
 57 |             with open(not_finish_file_line,'rb') as f:  # 解密之前
 58 |                 before_content = f.read()
 59 | 
 60 |             # 写入 修复文件
 61 |             new_repair_filename = repair_filename + '/' + not_finish_file_line.split('/')[-1]
 62 |             print(new_repair_filename)
 63 |             with open(new_repair_filename,'wb') as f:  # 解密之后
 64 |                 f.write(cryptos.decrypt(before_content))
 65 | 
 66 |             new_repair_file_txt = repair_filename + '/' + 'repair_file.txt'
 67 | 
 68 |             # 确保不重复
 69 |             with open(new_repair_file_txt,'a+',encoding='utf-8') as f3:  # 解密之后
 70 |                 with open(new_repair_file_txt,'r',encoding='utf-8') as f4:
 71 |                     if str(new_repair_filename) not in f4.read():
 72 |                         f3.write("file '%s'\n" % str(new_repair_filename))
 73 | 
 74 |             line = f1.readline()
 75 | 
 76 | # 使用 not_finish_file.txt 合成视频
 77 | def compose_file():
 78 |     
 79 |     cmd = "ffmpeg.exe -f concat -safe 0 -i " + new_repair_file_txt  + " -c copy 1.修复视频.mp4"
 80 |     print(cmd)
 81 |     # 设置UTF-8编码
 82 |     os.system('CHCP 65001')
 83 |     os.system(cmd.replace('/','\\'))
 84 | 
 85 | decrypt_file()
 86 | compose_file()
 87 | '''
 88 | 
 89 | # 生成python合成文件
 90 | merge_file_py = r'''
 91 | 
 92 | "此文件用于合成视频"
 93 | 
 94 | import os
 95 | 
 96 | mp4_filename = '%s'
 97 | 
 98 | cmd = '%s'
 99 | 
100 | # 如果合成的视频已经存在，先删除，再执行
101 | if os.path.exists(mp4_filename):
102 |     os.remove(mp4_filename)
103 | 
104 |     # 设置UTF-8编码
105 |     os.system('CHCP 65001')
106 |     os.system(cmd.replace('/','\\'))
107 |     print('[警告]：文件路径 {}'.format(mp4_filename))
108 |     print('[警告]：文件被覆盖了，由于该文件之前已存在过')
109 | else:
110 |     os.system('CHCP 65001')
111 |     os.system(cmd.replace('/','\\'))
112 |     print('[成功]：文件路径 {}'.format(mp4_filename))
113 |     print('[成功]：合并完毕')
114 | '''
115 | 
116 | cryptos = None    # AES解密
117 | filename = None   # 下载视频路径
118 | current_filename = os.getcwd().replace('\\','/') # 获取当前路径
119 | result_filename = current_filename + '/合成的视频' # 获取 ffmepg合成视频后的路径
120 | title = None # 标题
121 | finish_file_flag = False # 标记是否存在 还没被解密的 ts 视频
122 | 
123 | class Counter:
124 | 
125 |     def __init__(self):
126 |         self.Referer = 'https://appdgjqmn6j1714.h5.xiaoeknow.com'
127 |         self.Cookie = '请填写你的Cooie'
128 |         self.UserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'
129 |         self.headers = {
130 |             'Referer':self.Referer,
131 |             'Cookie':self.Cookie,
132 |             'UserAgent':self.UserAgent
133 |         }
134 | 
135 |     def request(self, flow: mitmproxy.http.HTTPFlow):
136 | 
137 |         # 所有请求插入协议头
138 |         flow.request.headers['Referer'] = self.Referer
139 |         flow.request.headers['Cookie'] = self.Cookie
140 | 
141 |     def response(self, flow: mitmproxy.http.HTTPFlow):
142 |         
143 |         # 导入全局变量
144 |         global cryptos,filename,result_filename,repair_file_py,title,finish_file_flag,merge_file_py
145 | 
146 |         # 注入 JavaScript 
147 |         # 启动就能点击播放器
148 |         if 'v_61ceb0f8e4b05006f9c4214e' in flow.request.url:
149 |             
150 |             # 确保匹配 HTML
151 |             if 'text/html' in flow.response.headers.get('content-type'):
152 |                 
153 |                 try:
154 |                     print('尝试执行JS控制播放器代码')
155 |                     javascript_text = '''
156 |                         // 视频播放速度
157 |                         const playbackRate = 16;
158 |                         
159 |                         function start_video(){
160 |                             
161 |                             // 确保修改了视频播放速度
162 |                             while(document.querySelector('video').playbackRate != playbackRate ){
163 |                                 
164 |                                 // 点击播放器
165 |                                 document.querySelector('div.iconfont.playButton.icon-icon_play').click();
166 | 
167 |                                 // 设置视频重头播放
168 |                                 document.querySelector('video').currentTime = 0;
169 | 
170 |                                 // 设置视频自动播放
171 |                                 document.querySelector('video').autoplay = true;
172 | 
173 |                                 // 设置视频播放速度
174 |                                 document.querySelector('video').playbackRate = playbackRate;
175 | 
176 |                                 // 设置视频静音
177 |                                 document.querySelector('video').muted = true
178 | 
179 |                                 // 开始播放
180 |                                 document.querySelector('video').play();
181 |                             } 
182 |                         };
183 | 
184 |                         // 使用递归，异步等待，确保video标签会出现
185 |                         function waitForElementToDisplay(selector, time) {
186 | 
187 |                             // video标签出现后，异步等待 1 秒
188 |                             if(document.querySelector(selector)!=null) {
189 |                                 
190 |                                 console.log('获取成功video');
191 |                                 setTimeout(
192 |                                     ()=>{
193 |                                         start_video();
194 |                                     },1000
195 |                                 ); 
196 |                                 
197 |                                 return;
198 |                             }
199 |                             else {
200 |                                 setTimeout( ()=> {
201 |                                     waitForElementToDisplay(selector, time);
202 |                                 }, time);
203 |                             }
204 |                         }
205 | 
206 |                         // 每过 1 秒检查video标签 是否出现
207 |                         waitForElementToDisplay('video',1000)                            
208 |                     '''
209 | 
210 |                     # 获取 BeautifulSoup 对象
211 |                     soup = BeautifulSoup(flow.response.text, 'lxml')
212 | 
213 |                     # 生成一个script节点
214 |                     script_tag = soup.new_tag('script', type='text/javascript')
215 | 
216 |                     # 往script节点写入内容
217 |                     script_tag.string = javascript_text
218 | 
219 |                     # 在当前 HTML 最后一个script节点  向后插入一个节点
220 |                     soup.select('script')[-1].insert_after(script_tag)
221 | 
222 |                     # 修改当前 HTML 全部内容
223 |                     flow.response.text = str(soup)
224 |                 except:
225 |                     pass
226 | 
227 |         # 设置 AES解密模式
228 |         mode = AES.MODE_CBC
229 | 
230 |         # 获取课程标题
231 |         if 'get_goods_info_business' in flow.request.url:
232 |             
233 |             # 加载 JSON 对象
234 |             json_data = json.loads(flow.response.text)
235 | 
236 |             # 获取当前视频标题
237 |             title = json_data['data']['goods_name'].replace(' ','')
238 | 
239 |             # 如果没有文件夹，就创建文件夹
240 |             filename = current_filename + '/下载成功的视频/{}'.format(title)
241 |             if not os.path.exists(filename):
242 |                 os.makedirs(filename)
243 |                 
244 |             if not os.path.exists(result_filename):
245 |                 os.makedirs(result_filename)
246 |      
247 |         # 匹配密钥
248 |         if 'get_video_key.php' in flow.request.url:
249 |             
250 |             print('\n当前密钥：{}'.format(str(flow.response.content)))
251 | 
252 |             # 将密钥 写入 修复文件
253 |             repair_file_py = repair_file_py.format(str(flow.response.content))
254 |             cryptos = AES.new(flow.response.content, mode)
255 | 
256 |         # 解密 ts 文件
257 |         if '.ts' in flow.request.url:
258 |             
259 |             print('-'*50)
260 |             print('\n[当前解密对象]：{}\n'.format(cryptos))
261 | 
262 |             # 拼接当前视频保存路径
263 |             m3u8_ts_filename = filename + '/start={}end={}.ts'.format(flow.request.query.get('start'),flow.request.query.get('end'))
264 |             print('[当前视频]：{} [保存路径]：{}\n'.format(title,m3u8_ts_filename))
265 |          
266 |             # 用于合成
267 |             m3u8_finish_file_filename = filename + '/finish_file.txt'
268 | 
269 |             # 确定最后一个分片
270 |             start_data = flow.request.query.get('start')
271 |             end_data = flow.request.query.get('end')
272 |             result_data = start_data + end_data
273 | 
274 |             # 获取成功密钥，再解密
275 |             if cryptos != None:
276 |                 
277 |                 
278 |                 # 保存 解密好的 ts
279 |                 with open(m3u8_ts_filename,'wb') as f:
280 |                     f.write(cryptos.decrypt(flow.response.content))
281 | 
282 | 
283 |                 # 写入 解密成功 标记文件
284 |                 with open(m3u8_finish_file_filename,'a+',encoding='utf-8') as f1:
285 |                     with open(m3u8_finish_file_filename,'r',encoding='utf-8') as f2:
286 |                         
287 |                         # 如果文件为空，同时又存在最后一片，将不写入
288 |                         if result_data in m3u8_ts_filename and f2.read()=='':
289 |                             pass
290 | 
291 |                         # 防止重复，确保路径没问题
292 |                         elif m3u8_ts_filename not in f2.read():
293 |                             f1.write("file '{}'\n".format(m3u8_ts_filename))
294 | 
295 |                 ffmpeg_filename = filename + '/ffmpeg.exe'
296 |                 shutil.copyfile('ffmpeg.exe', ffmpeg_filename)
297 | 
298 |                 # 优化版 生成python合成文件
299 |                 mp4_filename = result_filename + '/' + filename.split('/')[-1] + '.mp4'
300 |                 cmd = 'ffmpeg.exe -f concat -safe 0 -i "' + m3u8_finish_file_filename + '" -c copy "' + result_filename + '/' + filename.split('/')[-1] + '.mp4"'
301 | 
302 |                 if mp4_filename and cmd:
303 |                     
304 |                     try:
305 |                         merge_file_py = merge_file_py % (str(mp4_filename),str(cmd)) 
306 |                     except:
307 |                         pass
308 | 
309 |                     # 开始生成python合成文件
310 |                     merge_file = filename + '/merge.py'
311 |                     with open(merge_file,'w',encoding='utf-8') as f:
312 |                         f.write(merge_file_py)
313 | 
314 |                 # 生成修复python文件
315 |                 repair_file = filename + '/repair.py'
316 |                 with open(repair_file,'w',encoding='utf-8') as f:
317 |                     f.write(repair_file_py)
318 |             else:
319 |                 
320 |                 # 标记是否存在 还没被解密的 ts 视频
321 |                 finish_file_flag = True
322 | 
323 |                 # 保存 还没被解密的 ts 视频
324 |                 with open(m3u8_ts_filename,'wb') as f:
325 |                     f.write(flow.response.content)
326 |                 
327 |                 # 用于合成
328 |                 m3u8_not_finish_file__filename = filename + '/not_finish_file.txt'
329 |                 with open(m3u8_not_finish_file__filename,'a+',encoding='utf-8') as f:
330 |                     f.write("file '{}'\n".format(m3u8_ts_filename))
331 |                 
332 | 
333 | addons = [
334 |     Counter()
335 | ]


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/requirements.txt:
--------------------------------------------------------------------------------
 1 | asgiref==3.3.4
 2 | beautifulsoup4==4.9.3
 3 | blinker==1.4
 4 | Brotli==1.0.9
 5 | bs4==0.0.1
 6 | certifi==2021.5.30
 7 | cffi==1.14.6
 8 | charset-normalizer==2.0.3
 9 | click==7.1.2
10 | cryptography==3.2.1
11 | Flask==1.1.4
12 | h11==0.12.0
13 | h2==4.0.0
14 | hpack==4.0.0
15 | hyperframe==6.0.1
16 | idna==3.2
17 | iso8601==0.1.14
18 | itsdangerous==1.1.0
19 | Jinja2==2.11.3
20 | kaitaistruct==0.9
21 | ldap3==2.8.1
22 | m3u8==0.9.0
23 | MarkupSafe==2.0.1
24 | mitmproxy==5.3.0
25 | msgpack==1.0.2
26 | passlib==1.7.4
27 | protobuf==3.13.0
28 | publicsuffix2==2.20191221
29 | pyasn1==0.4.8
30 | pycparser==2.20
31 | pycryptodome==3.10.1
32 | pydivert==2.1.0
33 | pyOpenSSL==19.1.0
34 | pyparsing==2.4.7
35 | pyperclip==1.8.2
36 | requests==2.26.0
37 | ruamel.yaml==0.16.13
38 | ruamel.yaml.clib==0.2.6
39 | selenium==3.141.0
40 | six==1.16.0
41 | sortedcontainers==2.2.2
42 | soupsieve==2.2.1
43 | tornado==6.1
44 | typing-extensions==3.10.0.0
45 | urllib3==1.26.6
46 | urwid==2.1.2
47 | Werkzeug==1.0.1
48 | wsproto==0.15.0
49 | zstandard==0.14.1
50 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/selenium启动/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/3.手动合并版本/selenium启动/chromedriver.exe


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/selenium启动/selenium_start.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver
 2 | 
 3 | PROXY='http://127.0.0.1:8080'
 4 | 
 5 | chrome_options = webdriver.ChromeOptions()
 6 | chrome_options.add_argument("--proxy-server=127.0.0.1:8080")
 7 | chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])  # 专业模式
 8 | 
 9 | 
10 | browser = webdriver.Chrome(executable_path=r'chromedriver.exe',options=chrome_options)#r 代表的是强制禁止转义
11 | 
12 | '''
13 | 本次爬取的课程地址：https://appdgjqmn6j1714.h5.xiaoeknow.com/v1/course/video/v_61ceb0f8e4b05006f9c4214e
14 | '''
15 | 
16 | url = 'https://appdgjqmn6j1714.h5.xiaoeknow.com/v1/course/video/v_61ceb0f8e4b05006f9c4214e'
17 | browser.get(url)#访问网站
18 | 
19 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/selenium启动/谷歌驱动下载地址.txt:
--------------------------------------------------------------------------------
1 | ﻿地址一：
2 | http://chromedriver.storage.googleapis.com/index.html
3 |  
4 | 地址二：
5 | https://registry.npmmirror.com/binary.html?path=chromedriver/


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/启动程序指令.txt:
--------------------------------------------------------------------------------
1 | 先打开一个cmd，输入mitmweb -s request_demo.py
2 | 然后再cd进入selenium启动文件夹，输入python selenium_start.py


--------------------------------------------------------------------------------
/下载小鹅通视频/2021年12月/3.手动合并版本/安装环境指令.txt:
--------------------------------------------------------------------------------
 1 | 一次性安装
 2 | pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple
 3 | 
 4 | 分开安装
 5 | pip install mitmproxy
 6 | pip install selenium
 7 | pip install m3u8
 8 | pip install requests
 9 | pip install bs4
10 | pip install pycryptodome


--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/N_m3u8DL-CLI_v3.0.2.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2022年12月/1.自动合并版本/N_m3u8DL-CLI_v3.0.2.exe


--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/ffmpeg.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2022年12月/1.自动合并版本/ffmpeg.exe


--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/request_demo.py:
--------------------------------------------------------------------------------
  1 | # python 3.7.9
  2 | import mitmproxy.http,json,os,m3u8,requests,base64
  3 | from mitmproxy import ctx
  4 | from pathlib import Path
  5 | 
  6 | '''
  7 |     解密参考文章: https://www.52pojie.cn/thread-1689801-1-1.html
  8 |     m3u8下载器GitHub地址: https://github.com/nilaoda/N_m3u8DL-CLI
  9 | 
 10 |     旧版课程: https://appjkyl58fl2930.h5.xiaoeknow.com/p/course/column/p_5c483e6305292_C1LfcA9T?type=3
 11 |     新版课程: https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3
 12 | 
 13 |     本次爬取的课程地址: https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3
 14 | '''
 15 | 
 16 | userid = None # 用户uid
 17 | filename = None   # 下载视频路径
 18 | current_filename = os.getcwd().replace('\\','/') # 获取当前路径
 19 | ts_url = None # ts文件下载地址
 20 | title = None # 标题
 21 | m3u8_obj = None # m3u8对象
 22 | m3u8_content = None # m3u8密钥
 23 | 
 24 | class Counter:
 25 | 
 26 |     def __init__(self):
 27 |         self.Referer = 'https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3'
 28 |         self.Cookie = '请填写你的Cooie'
 29 |         self.UserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'
 30 |         self.headers = {
 31 |             'Referer':self.Referer,
 32 |             'Cookie':self.Cookie,
 33 |             'UserAgent':self.UserAgent
 34 |         }
 35 | 
 36 |     def request(self, flow: mitmproxy.http.HTTPFlow):
 37 | 
 38 |         # 所有请求插入协议头
 39 |         flow.request.headers['Referer'] = self.Referer
 40 |         flow.request.headers['Cookie'] = self.Cookie
 41 | 
 42 |     def response(self, flow: mitmproxy.http.HTTPFlow):
 43 |         
 44 |         # 导入全局变量
 45 |         global filename,title,userid,ts_url,m3u8_obj,m3u8_content
 46 | 
 47 | 
 48 |         # 获取课程标题
 49 |         if 'xe.course.business.core.info.get' in flow.request.url:
 50 |             
 51 |             # 加载 JSON 对象
 52 |             json_data = json.loads(flow.response.text)
 53 | 
 54 |             # 获取当前视频标题
 55 |             title = json_data['data']['resource_name'].replace(' ','')
 56 | 
 57 |             print(f'[当前标题] {title}')
 58 | 
 59 |             # 如果没有文件夹，就创建文件夹
 60 |             filename = current_filename + '/下载成功的视频/{}'.format(title)
 61 |             if not os.path.exists(filename):
 62 |                 os.makedirs(filename)
 63 |             
 64 |             if not os.path.exists(current_filename+'/m3u8'):
 65 |                 os.makedirs(current_filename+'/m3u8')
 66 | 
 67 |         if 'xe.course.business.composite_info.get' in flow.request.url:
 68 |             
 69 |             # 加载 JSON 对象
 70 |             json_data = json.loads(flow.response.text)
 71 | 
 72 |             # 获取userid
 73 |             userid = json_data['data']['user_info']['user_id'].replace(' ','')
 74 | 
 75 |             print(f'[用户ID] {userid}')
 76 | 
 77 |         # 匹配 m3u8
 78 |         if '.m3u8' in flow.request.url:
 79 |             
 80 |             if userid != None and filename != None:
 81 | 
 82 |                 # 加载 m3u8 对象
 83 |                 m3u8_obj = m3u8.loads(flow.response.text)
 84 | 
 85 |                 # 添加用户userid
 86 |                 m3u8_obj.keys[0].uri = m3u8_obj.keys[0].uri + f'&uid={userid}'
 87 | 
 88 |                 # 获取m3u8密钥 URL
 89 |                 m3u8_key_url = m3u8_obj.keys[0].uri
 90 | 
 91 |                 # 获取解密参数（第一次解密）
 92 |                 # print(m3u8_key_url)
 93 |                 try:
 94 |                     m3u8_content = requests.get(
 95 |                         url=m3u8_key_url,
 96 |                         headers=self.headers,
 97 |                         proxies={ "http": None, "https": None} # 不走系统代理，防止clash、v2ray等代理软件拦截
 98 |                     ).content
 99 |                 except:
100 |                     print(f'-'*25 + 'm3u8_content连接失败' + '-'*25)
101 |                     print(f'[m3u8链接] = {m3u8_key_url}')
102 |                     print(f'[协议头] = {self.headers}')
103 |                     print(f'-'*50)
104 |                     raise (f'[ERROR]m3u8_content连接失败')
105 | 
106 |                 # 基于用户userid解密（第二次解密）
107 |                 rsp_data = m3u8_content
108 |                 userid_bytes = bytes(userid.encode(encoding='utf-8'))
109 |                 result_list = []
110 |                 for index in range(0, len(rsp_data)):
111 |                     result_list.append(
112 |                         rsp_data[index] ^ userid_bytes[index])
113 |                 m3u8_content = bytes(result_list)
114 | 
115 |                 # 最终密钥
116 |                 m3u8_content = base64.b64encode(bytes(result_list)).decode()
117 |                 print(f'[m3u8密钥] {m3u8_content}')
118 | 
119 |             else:
120 |                 print(f'[当前标题] {title}')
121 |                 print(f'[用户ID] {userid}')
122 |                 print('[错误] 没有用户id || 没有标题')
123 | 
124 | 
125 |         if '.ts' in flow.request.url:
126 |             
127 |             video_url = flow.request.url
128 | 
129 |             print('[开始下载视频]------------------')
130 |             # print(f'video_url: {video_url}')
131 | 
132 |             # 获取ts文件下载域名（前缀）
133 |             start_url = video_url.split('/')[:-1]
134 | 
135 |             # 获取ts文件下载域名（后缀）
136 |             end_url = video_url.split('/')[-1].split('?')
137 |             end_url[0] = '{ts_url}'
138 | 
139 |             # 后缀塞入前缀
140 |             start_url.append('&'.join(end_url))
141 |             
142 |             # 生成 ts文件下载地址
143 |             ts_url = '/'.join(start_url)
144 | 
145 |             # 添加 ts 链接地址
146 |             for tmp_data in m3u8_obj.segments:
147 | 
148 |                 # 插入
149 |                 if ts_url != None:
150 |                     tmp_data.uri = ts_url.format(ts_url=tmp_data.uri)
151 |                 else:
152 |                     print(f'[错误] ts_url is None')
153 | 
154 |             m3u8_filename = f'./m3u8/{title}.m3u8'
155 |             m3u8_obj.dump(m3u8_filename)
156 | 
157 |             # 确保m3u8文件存在
158 |             if os.path.exists(m3u8_filename):
159 | 
160 |                 if os.path.exists(f'{filename}/{title}.mp4'):
161 |                     print(f'[停止下载警告] 已经存在 {filename}/{title}.mp4')
162 | 
163 |                 elif m3u8_content == None:
164 |                     print(f'[m3u8密钥] {m3u8_content}')
165 |                     print('[错误] 没有m3u8密钥')
166 | 
167 |                 else:
168 |                     cmd = f'N_m3u8DL-CLI_v3.0.2.exe "{m3u8_filename}" --workDir "{filename}" --saveName "{title}" --useKeyBase64 "{m3u8_content}"'
169 |                     print(cmd)
170 |                     os.system('CHCP 65001')
171 |                     os.system(cmd)
172 |                 
173 |             else:
174 |                 print('[错误]m3u8文件生成失败')
175 | 
176 | 
177 | addons = [
178 |     Counter()
179 | ]


--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/requirements.txt:
--------------------------------------------------------------------------------
 1 | asgiref==3.3.4
 2 | beautifulsoup4==4.9.3
 3 | blinker==1.4
 4 | Brotli==1.0.9
 5 | bs4==0.0.1
 6 | certifi==2021.5.30
 7 | cffi==1.14.6
 8 | charset-normalizer==2.0.3
 9 | click==7.1.2
10 | cryptography==3.2.1
11 | Flask==1.1.4
12 | h11==0.12.0
13 | h2==4.0.0
14 | hpack==4.0.0
15 | hyperframe==6.0.1
16 | idna==3.2
17 | iso8601==0.1.14
18 | itsdangerous==1.1.0
19 | Jinja2==2.11.3
20 | kaitaistruct==0.9
21 | ldap3==2.8.1
22 | m3u8==0.9.0
23 | MarkupSafe==2.0.1
24 | mitmproxy==5.3.0
25 | msgpack==1.0.2
26 | passlib==1.7.4
27 | protobuf==3.13.0
28 | publicsuffix2==2.20191221
29 | pyasn1==0.4.8
30 | pycparser==2.20
31 | pycryptodome==3.10.1
32 | pydivert==2.1.0
33 | pyOpenSSL==19.1.0
34 | pyparsing==2.4.7
35 | pyperclip==1.8.2
36 | requests==2.26.0
37 | ruamel.yaml==0.16.13
38 | ruamel.yaml.clib==0.2.6
39 | selenium==3.141.0
40 | six==1.16.0
41 | sortedcontainers==2.2.2
42 | soupsieve==2.2.1
43 | tornado==6.1
44 | typing-extensions==3.10.0.0
45 | urllib3==1.26.6
46 | urwid==2.1.2
47 | Werkzeug==1.0.1
48 | wsproto==0.15.0
49 | zstandard==0.14.1
50 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/selenium启动/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2022年12月/1.自动合并版本/selenium启动/chromedriver.exe


--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/selenium启动/selenium_start.py:
--------------------------------------------------------------------------------
 1 | # python 3.7.9
 2 | from selenium import webdriver
 3 | 
 4 | PROXY='http://127.0.0.1:8080'
 5 | 
 6 | chrome_options = webdriver.ChromeOptions()
 7 | chrome_options.add_argument("--proxy-server=127.0.0.1:8080")
 8 | chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])  # 专业模式
 9 | 
10 | 
11 | browser = webdriver.Chrome(executable_path=r'chromedriver.exe',options=chrome_options)#r 代表的是强制禁止转义
12 | 
13 | '''
14 | 本次爬取的课程地址: https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3
15 | '''
16 | 
17 | url = 'https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3'
18 | browser.get(url)#访问网站
19 | 
20 | 


--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/selenium启动/谷歌驱动下载地址.txt:
--------------------------------------------------------------------------------
1 | ﻿地址一：
2 | http://chromedriver.storage.googleapis.com/index.html
3 |  
4 | 地址二：
5 | https://registry.npmmirror.com/binary.html?path=chromedriver/


--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/启动程序指令.txt:
--------------------------------------------------------------------------------
1 | 先打开一个cmd，输入mitmweb -s request_demo.py
2 | 然后再cd进入selenium启动文件夹，输入python selenium_start.py


--------------------------------------------------------------------------------
/下载小鹅通视频/2022年12月/1.自动合并版本/安装环境指令.txt:
--------------------------------------------------------------------------------
 1 | 一次性安装
 2 | pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple
 3 | 
 4 | 分开安装
 5 | pip install mitmproxy
 6 | pip install selenium
 7 | pip install m3u8
 8 | pip install requests
 9 | pip install bs4
10 | pip install pycryptodome


--------------------------------------------------------------------------------
/下载荔枝微课/ffmpeg.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载荔枝微课/ffmpeg.exe


--------------------------------------------------------------------------------
/下载荔枝微课/request_demo.py:
--------------------------------------------------------------------------------
 1 | # python 3.7
 2 | import mitmproxy.http,json,os,requests
 3 | from mitmproxy import ctx
 4 | from pathlib import Path
 5 | 
 6 | '''
 7 | 本次爬取的课程地址：https://m.lizhiweike.com/channel2/1192275
 8 | '''
 9 | 
10 | cookie = '请填写你的Cooie'
11 | filename = None   # 下载视频路径
12 | current_filename = os.getcwd().replace('\\','/') # 获取当前路径
13 | title = None # 标题
14 | 
15 | class Counter:
16 | 
17 |     def __init__(self):
18 |         self.Referer = 'https://m.lizhiweike.com/channel2/1192275'
19 |         self.UserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'
20 |         self.headers = {
21 |             'Referer':self.Referer,
22 |             'Host':'m.lizhiweike.com',
23 |             'User-Agent':self.UserAgent
24 |         }
25 | 
26 |     def request(self, flow: mitmproxy.http.HTTPFlow):
27 | 
28 |         # 所有请求插入协议头
29 |         flow.request.headers['Referer'] = self.Referer
30 |    
31 |     def response(self, flow: mitmproxy.http.HTTPFlow):
32 |         
33 |         # 导入全局变量
34 |         global filename,title,current_filename,cookie
35 | 
36 |         if 'lecture' in flow.request.url and 'info' in flow.request.url:
37 | 
38 |             # 加载 JSON 对象
39 |             json_data = json.loads(flow.response.text)
40 | 
41 |             try:
42 |                 # 获取当前视频标题
43 |                 title = json_data['data']['share_info']['share_title'].replace(' ','')
44 |             except:
45 |                 pass
46 | 
47 |         # 获取课程标题
48 |         if 'qcvideo' in flow.request.url:
49 |             
50 |             # 加载 JSON 对象
51 |             json_data = json.loads(flow.response.text)
52 | 
53 |             # 获取视频URL
54 |             video_url = json_data['data']['play_list'][0]['url']
55 | 
56 |             print(f'【信息】当前视频标题：{title}，视频mp4链接：{video_url}')
57 |             
58 |             # 如果没有文件夹，就创建文件夹
59 |             filename = current_filename + '/下载成功的视频/'
60 |             if not os.path.exists(filename):
61 |                 os.makedirs(filename)
62 |             
63 |             # 生产mp4存放路径
64 |             mp4_filename_path = f'{filename}{title}.mp4'
65 |             
66 |             headers = {
67 |                 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
68 |                 'referer':'https://m.lizhiweike.com/channel2/1192275',
69 |                 'Cookie':cookie
70 |             }
71 | 
72 |             # 下载视频
73 |             html = requests.get(url=video_url,headers=headers)
74 |             with open(mp4_filename_path,'wb') as f:
75 |                 f.write(html.content)
76 | 
77 | addons = [
78 |     Counter()
79 | ]


--------------------------------------------------------------------------------
/下载荔枝微课/requirements.txt:
--------------------------------------------------------------------------------
 1 | asgiref==3.3.4
 2 | beautifulsoup4==4.9.3
 3 | blinker==1.4
 4 | Brotli==1.0.9
 5 | bs4==0.0.1
 6 | certifi==2021.5.30
 7 | cffi==1.14.6
 8 | charset-normalizer==2.0.3
 9 | click==7.1.2
10 | cryptography==3.2.1
11 | Flask==1.1.4
12 | h11==0.12.0
13 | h2==4.0.0
14 | hpack==4.0.0
15 | hyperframe==6.0.1
16 | idna==3.2
17 | iso8601==0.1.14
18 | itsdangerous==1.1.0
19 | Jinja2==2.11.3
20 | kaitaistruct==0.9
21 | ldap3==2.8.1
22 | m3u8==0.9.0
23 | MarkupSafe==2.0.1
24 | mitmproxy==5.3.0
25 | msgpack==1.0.2
26 | passlib==1.7.4
27 | protobuf==3.13.0
28 | publicsuffix2==2.20191221
29 | pyasn1==0.4.8
30 | pycparser==2.20
31 | pycryptodome==3.10.1
32 | pydivert==2.1.0
33 | pyOpenSSL==19.1.0
34 | pyparsing==2.4.7
35 | pyperclip==1.8.2
36 | requests==2.26.0
37 | ruamel.yaml==0.16.13
38 | ruamel.yaml.clib==0.2.6
39 | selenium==3.141.0
40 | six==1.16.0
41 | sortedcontainers==2.2.2
42 | soupsieve==2.2.1
43 | tornado==6.1
44 | typing-extensions==3.10.0.0
45 | urllib3==1.26.6
46 | urwid==2.1.2
47 | Werkzeug==1.0.1
48 | wsproto==0.15.0
49 | zstandard==0.14.1
50 | 


--------------------------------------------------------------------------------
/下载荔枝微课/selenium启动/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载荔枝微课/selenium启动/chromedriver.exe


--------------------------------------------------------------------------------
/下载荔枝微课/selenium启动/selenium_start.py:
--------------------------------------------------------------------------------
 1 | from selenium import webdriver
 2 | 
 3 | PROXY='http://127.0.0.1:8080'
 4 | 
 5 | chrome_options = webdriver.ChromeOptions()
 6 | chrome_options.add_argument("--proxy-server=127.0.0.1:8080")
 7 | chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])  # 专业模式
 8 | 
 9 | 
10 | browser = webdriver.Chrome(executable_path=r'chromedriver.exe',options=chrome_options)#r 代表的是强制禁止转义
11 | 
12 | '''
13 | 本次爬取的课程地址：https://m.lizhiweike.com/channel2/1192275
14 | '''
15 | 
16 | url = 'https://m.lizhiweike.com/channel2/1192275'
17 | browser.get(url)#访问网站
18 | 
19 | 


--------------------------------------------------------------------------------
/下载荔枝微课/selenium启动/谷歌驱动下载地址.txt:
--------------------------------------------------------------------------------
1 | ﻿地址一：
2 | http://chromedriver.storage.googleapis.com/index.html
3 |  
4 | 地址二：
5 | https://registry.npmmirror.com/binary.html?path=chromedriver/


--------------------------------------------------------------------------------
/下载荔枝微课/启动程序指令.txt:
--------------------------------------------------------------------------------
1 | 先打开一个cmd，输入mitmweb -s request_demo.py
2 | 然后再cd进入selenium启动文件夹，输入python selenium_start.py


--------------------------------------------------------------------------------
/下载荔枝微课/安装环境指令.txt:
--------------------------------------------------------------------------------
 1 | 一次性安装
 2 | pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple
 3 | 
 4 | 分开安装
 5 | pip install mitmproxy
 6 | pip install selenium
 7 | pip install m3u8
 8 | pip install requests
 9 | pip install bs4
10 | pip install pycryptodome


--------------------------------------------------------------------------------
/京东商品信息/crawl.py:
--------------------------------------------------------------------------------
  1 | #Python3.7 
  2 | #encoding = utf-8
  3 | 
  4 | import requests,re,json
  5 | from bs4 import BeautifulSoup
  6 | from urllib import parse
  7 | 
  8 | KEYWORD = parse.quote('python')
  9 | 
 10 | base = 'https://item.jd.com'
 11 | headers = {
 12 | 	'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
 13 | 	'Connection':'keep-alive',
 14 | 	#参考链接:https://search.jd.com/Search?keyword=python&enc=utf-8&wq=python
 15 | }
 16 | 
 17 | 
 18 | def get_index(url):
 19 | 	#一开始的请求页面
 20 | 
 21 | 	session = requests.Session()
 22 | 	session.headers = headers
 23 | 	html = session.get(url)
 24 | 	html.encoding = 'GBK'
 25 | 	soup = BeautifulSoup(html.text,'lxml')
 26 | 	items = soup.select('li.gl-item')
 27 | 
 28 | 
 29 | 	for item in items:
 30 | 		inner_url = item.select('li.gl-item .gl-i-wrap .p-img a')[0].get('href')
 31 | 		print(inner_url)	
 32 | 		inner_url = parse.urljoin(base,inner_url)#转成URL格式
 33 | 	
 34 | 		item_id = get_id(inner_url)
 35 | 
 36 | 		#评论数
 37 | 		comm_num = get_comm_num(inner_url)
 38 | 		inner_url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv6501&productId=11993134&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
 39 | 
 40 | 		#获取评论
 41 | 		if comm_num>0:
 42 | 			get_comm(inner_url,comm_num,item_id)
 43 | 	
 44 | 
 45 | 
 46 | 
 47 | def get_comm(url,comm_num,item_id ):
 48 | 
 49 | 	headers = {
 50 | 		'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
 51 | 	}
 52 | 	good_comments = ''  #存放结果
 53 | 	#获取评论
 54 | 
 55 | 	pages = comm_num//10
 56 | 	if pages>99:
 57 | 		pages = 99
 58 | 
 59 | 	for page in range(0,pages):
 60 | 		comment_url = 'https://sclub.jd.com/comment/productPageComments.action?'\
 61 | 					'callback=fetchJSON_comment98vv4&productId={}&score=0'\
 62 | 					'&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1'.format(item_id,page)
 63 | 	
 64 | 		json_decoder = requests.get(comment_url,headers=headers).text
 65 | 		try:
 66 | 			if json_decoder:
 67 | 				start = json_decoder.find('{"productAttr":null,')
 68 | 
 69 | 				end = json_decoder.find(',"afterDays":0}]}')+len(',"afterDays":0}]}')
 70 | 			
 71 | 				content = json.loads(json_decoder[start:end])
 72 | 				
 73 | 				comments = content['comments']
 74 | 				
 75 | 				for c in comments:
 76 | 					comm = c['content']
 77 | 					good_comments+="{}|".format(comm)
 78 | 					
 79 | 				print(good_comments)
 80 | 		except Exception as e:
 81 | 			print(e)
 82 | 
 83 | 	print(item_id,good_comments)
 84 | 
 85 | def get_shop_info(url):#获取商品信息
 86 | 	shop_data = {}
 87 | 	html = requests.get(url,headers = headers)
 88 | 	soup = BeautifulSoup(html.text,'lxml')
 89 | 	try:
 90 | 		shop_name = soup.select('div.mt h3 a')
 91 | 	except Exception as e:
 92 | 		raise e
 93 | 
 94 | def get_index_lists(html):#获取索引列表
 95 | 	html.encoding = 'utf8'
 96 | 	soup = BeautifulSoup(html.text,'lxml')
 97 | 	lis = soup.find_all('li',attrs = {"class":"gl-item"})
 98 | 	for li in lis:
 99 | 		number = li.find('div',attrs = {"class":"p-commit"}).strong
100 | 		print(number)
101 | 
102 | def get_comm_num(url):#获取评论数量
103 | 	
104 | 	item_id = get_id(url)
105 | 	comm_url = 'https://club.jd.com/comment/productCommentSummaries.action?'\
106 | 			'referenceIds={}&callback=jQuery3096445'.format(item_id)
107 | 	comment = requests.get(comm_url,headers = headers).text
108 | 	start = comment.find('{"CommentsCount":')#起始
109 | 	end = comment.find('"PoorRateStyle":0}]}')+len('"PoorRateStyle":0}]}')#结尾
110 | 	try:
111 | 		content = json.loads(comment[start:end])['CommentsCount']#取出json
112 | 	except:
113 | 		return 0
114 | 	comm_num = content[0]['CommentCount']
115 | 	return comm_num
116 | 
117 | 
118 | def get_id(url):#匹配id
119 | 	id = re.compile('\d+')
120 | 	res = id.findall(url)
121 | 	return res[0]
122 | 
123 | 
124 | if __name__ == '__main__':
125 | 
126 | 	for i in range(1,30,2):
127 | 		url = 'https://search.jd.com/Search?'\
128 | 		'keyword={}&page={}'.format(KEYWORD,i)
129 | 		get_index(url)
130 | 
131 | 
132 | 


--------------------------------------------------------------------------------
/房天下/crawl.py:
--------------------------------------------------------------------------------
  1 | #Python3.7 
  2 | #encoding = utf-8
  3 | 
  4 | import requests, re
  5 | from lxml import etree
  6 | from urllib import parse
  7 | from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
  8 | from db import sess, House
  9 | 
 10 | headers = {
 11 |     'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
 12 |     'referer':'https://fs.zu.fang.com/house-a0617/i32/',
 13 |     #参考链接 https://zu.fang.com/house-a01
 14 |     #请填写你的Cookie
 15 | }
 16 | 
 17 | session = requests.session()  #保持会话状态，不必重复请求
 18 | session.headers = headers
 19 | 
 20 | 
 21 | #获取str中的数字
 22 | def get_number(text):
 23 |     number = re.compile('\d+')
 24 |     return number.findall(text)[0]
 25 | 
 26 | 
 27 | #获取页面的page数目
 28 | def get_pages(html):
 29 |     soup = etree.HTML(html.text)
 30 |     pages = soup.xpath("//div[@class='fanye']/span/text()")
 31 |     number = get_number(pages[0])
 32 |     if number:
 33 |         return int(number)
 34 |     return None
 35 | 
 36 | 
 37 | def get_house_data(url, *args):
 38 |     headers = {
 39 |         'Connection': 'keep-alive',  #常链接
 40 |         'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
 41 |         'Referer': 'https://fs.zu.fang.com/house-a0617/i33/',
 42 |         #参考链接 https://zu.fang.com/house-a01
 43 |         #请填写你的Cookie
 44 |     }
 45 | 
 46 |     loca_url = re.compile("<td>(.*?)</td>")  #获取跳转链接
 47 |     xiangqing_url = re.compile('location.href="(.*?)"')
 48 | 
 49 |     session = requests.session()  #长连接   保持会话
 50 |     session.headers = headers
 51 | 
 52 |     url = 'http://search.fang.com/captcha-854085290c4833ba19/redirect?h=' + url
 53 | 
 54 |     html = session.get(url)
 55 | 
 56 |     one_url = xiangqing_url.findall(html.text)[-1]  #第一次跳转
 57 |     html = session.get(one_url)
 58 | 
 59 |     two_url = xiangqing_url.findall(html.text)[-1]  #第二次跳转
 60 |     html = session.get(two_url)
 61 | 
 62 |     soup = etree.HTML(html.text)
 63 |     xiangqing = soup.xpath('//div[@class="fyms_con floatl gray3"]/text()')
 64 |     xiangqing = '|'.join(xiangqing)
 65 |     print('block:{}\t标题:{}\t租金:{}详情:{}'.format(args[0], args[2], args[1],xiangqing))
 66 | 
 67 |     s = sess()
 68 |     try:
 69 |         house = House(block=args[0],
 70 |                       title=args[2],
 71 |                       rent=args[1],
 72 |                       data=xiangqing)
 73 | 
 74 |         s.add(house)
 75 |         s.commit()
 76 |         print('commit')
 77 |     except Exception as e:
 78 |         print('rollback', e)
 79 |         s.rollback()
 80 | 
 81 | 
 82 | #获取页面信息
 83 | def get_data_next(url):
 84 |     html = session.get(url)
 85 |     soup = etree.HTML(html.text)
 86 |     dls = soup.xpath("//div[@class='houseList']/dl")
 87 |     block = soup.xpath("//span[@class='selestfinds']/a/text()")
 88 |     rfss = soup.xpath("//input[@id='baidid']/@value")[0]
 89 |     for dl in dls:
 90 |         try:
 91 |             title = dl.xpath('dd/p/a/text()')[0]
 92 |             rent = dl.xpath("dd/div/p/span[@class='price']/text()")[0]
 93 |             href = parse.urljoin('https://zu.fang.com',
 94 |                                  dl.xpath('dd/p/a/@href')[0])  #拼接链
 95 |             get_house_data(href, block, rent, title)
 96 |         except IndexError as e:
 97 |             print('dl error', e)
 98 | 
 99 | 
100 | #获取页面
101 | def get_data(html):
102 |     pages = get_pages(html)
103 |     if not pages:
104 |         pages = 1
105 |     urls = [
106 |         'https://zu.fang.com/house-a01/i3%d/' % i for i in range(1, pages + 1)
107 |     ]
108 | 
109 |     with ProcessPoolExecutor(max_workers=2) as t:
110 | 
111 |         for url in urls:
112 |             t.submit(get_data_next, url)
113 | 
114 | 
115 | #进入首页
116 | def get_index(url):
117 |     html = session.get(url, headers=headers)
118 |     if html.status_code == 200:
119 |         get_data(html)
120 |     else:
121 |         print('请求页面{}出错'.format(url))
122 | 
123 | 
124 | def main():
125 |     urls = ['https://zu.fang.com/house-a0{}/'.format(i) for i in range(1, 17)]
126 |     with ProcessPoolExecutor(max_workers=2) as p:
127 |         for url in urls:
128 |             p.submit(get_index, url)
129 | 
130 | 
131 | if __name__ == '__main__':
132 |     main()
133 |     session.close()
134 | 


--------------------------------------------------------------------------------
/房天下/db.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import create_engine
 2 | from sqlalchemy import Column,Integer,String,Text
 3 | from sqlalchemy.orm import sessionmaker,scoped_session
 4 | from sqlalchemy.ext.declarative import declarative_base
 5 | 
 6 | BASE = declarative_base()#创建基类
 7 | 
 8 | #此处没有使用pymysql的驱动
 9 | #请安装pip install mysql-connector-python
10 | #engine中的 mysqlconnector 为 mysql官网驱动
11 | engine = create_engine(
12 | 	"mysql+mysqlconnector://root:root@127.0.0.1:3306/test?charset=utf8",#确定编码格式
13 | 	max_overflow = 500,#超过连接池大小外最多可以创建的链接
14 | 	pool_size = 100,#连接池大小
15 | 	echo = False,#调试信息展示
16 | )
17 | 
18 | class House(BASE):#继承基类
19 | 	__tablename__ = 'house' #表名字
20 | 	id = Column(Integer,primary_key = True,autoincrement = True)
21 | 	block = Column(String(125))
22 | 	title = Column(String(125))
23 | 	rent = Column(String(125))
24 | 	data = Column(Text())
25 | 
26 | BASE.metadata.create_all(engine)#通过基类创建表
27 | Session = sessionmaker(engine)
28 | sess = scoped_session(Session)
29 | 
30 | 


--------------------------------------------------------------------------------
/新版QQ音乐/README.md:
--------------------------------------------------------------------------------
1 | **注意事项**
2 | - [QQ音乐爬虫原理视频](https://www.bilibili.com/video/BV1pk4y1m7TG)
3 | - `execjs`依赖于`NodeJS`，请务必提前安装（本项目的开发环境为`NodeJS v14.6.0`）。
4 | - `cd`到当前文件夹，执行`python demo.py`即可
5 | - `demo.py`为没有入库版，只爬取一个分类，没开多进程，方便大家理解
6 | - 请务必`demo.py` 的`filename`和`with open`项目路径问题
7 | - `get_singer_mid(index)` 方法决定分类爬取


--------------------------------------------------------------------------------
/新版QQ音乐/crawl.py:
--------------------------------------------------------------------------------
  1 | #Python3.7 
  2 | #encoding = utf-8
  3 | 
  4 | import execjs,requests,math,os,threading
  5 | from urllib import parse
  6 | from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
  7 | from db import SQLsession,Song
  8 | 
  9 | lock = threading.Lock()
 10 | 
 11 | headers = {
 12 |     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
 13 |     'Referer':'https://y.qq.com/portal/singer_list.html',
 14 |     #参考链接 https://y.qq.com/portal/singer_list.html#page=1&index=1&
 15 | }
 16 | 
 17 | session = SQLsession()
 18 | 
 19 | def get_sign(data):
 20 | 
 21 |     with open('./新版QQ音乐/get_sign.js','r',encoding='utf-8') as f:
 22 |         text = f.read()
 23 |     
 24 |     js_data = execjs.compile(text)
 25 |     sign = js_data.call('get_sign',data)
 26 |     return sign
 27 | 
 28 | 
 29 | def myProcess():
 30 |     #把歌手按照首字母分为27类
 31 |     with ProcessPoolExecutor(max_workers = 2) as p:#创建27个进程
 32 |         for i in range(1,28):#28
 33 |             p.submit(get_singer_mid,i)
 34 | 
 35 | 
 36 | def get_singer_mid(index):
 37 |     #index =  1-----27
 38 |     #打开歌手列表页面，找出singerList,找出所有歌手的数目,除于80,构造后续页面获取page歌手
 39 |     #找出mid, 用于歌手详情页
 40 |     data = '{"comm":{"ct":24,"cv":0},"singerList":'\
 41 |         '{"module":"Music.SingerListServer","method":"get_singer_list","param":'\
 42 |         '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":0,"cur_page":1}}}'%(str(index))
 43 |     sign = get_sign(data)
 44 | 
 45 |     url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'\
 46 |         '&sign={}'\
 47 |         '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
 48 |         '&notice=0&platform=yqq.json&needNewCode=0'\
 49 |         '&data={}'.format(sign,parse.quote(data)) 
 50 | 
 51 |     html = requests.get(url,headers = headers).json()
 52 |   
 53 |     total = html['singerList']['data']['total']#多少个歌手
 54 |     
 55 |     pages = int(math.floor(int(total)/80))#向下取整
 56 | 
 57 |     thread_number = pages
 58 |     Thread = ThreadPoolExecutor(max_workers = thread_number)
 59 | 
 60 |     sin = 0
 61 |     #分页迭代每一个字母下的所有页面歌手
 62 |     for page in range(1,pages+2):
 63 | 
 64 |         data = '{"comm":{"ct":24,"cv":0},"singerList":'\
 65 |             '{"module":"Music.SingerListServer","method":"get_singer_list","param":'\
 66 |             '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":%s,"cur_page":%s}}}'%(str(index),str(sin),str(page))
 67 |         sign = get_sign(data)
 68 | 
 69 |         url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'\
 70 |             '&sign={}'\
 71 |             '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
 72 |             '&notice=0&platform=yqq.json&needNewCode=0'\
 73 |             '&data={}'.format(sign,parse.quote(data)) 
 74 | 
 75 |         html = requests.get(url,headers = headers).json()
 76 | 
 77 |         sings = html['singerList']['data']['singerlist']
 78 | 
 79 |         for sing in sings:
 80 | 
 81 |             singer_name = sing['singer_name']   #获取歌手名字
 82 |             mid = sing['singer_mid']            #获取歌手mid
 83 |          
 84 |             Thread.submit(get_singer_data,mid = mid,
 85 |                             singer_name = singer_name,)
 86 |         sin+=80
 87 |   
 88 | #获取歌手信息
 89 | def get_singer_data(mid,singer_name):
 90 |     #获取歌手mid,进入歌手详情页，也就是每一个歌手歌曲所在页面
 91 |     #找出歌手的歌曲信息页
 92 | 
 93 |     data = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'\
 94 |             '{"order":1,"singerMid":"%s","begin":0,"num":10}'\
 95 |             ',"module":"musichall.song_list_server"}}'%(str(mid))
 96 | 
 97 |     sign = get_sign(data)
 98 |     url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'\
 99 |         '&g_tk=5381&sign={}&loginUin=0'\
100 |         '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
101 |         '&data={}'.format(sign,parse.quote(data))
102 |     
103 |     html = requests.get(url,headers = headers).json()
104 | 
105 |     songs_num = html['singerSongList']['data']['totalNum']#获取歌曲总数
106 |  
107 | 
108 |     for number in range(0,songs_num,100):
109 | 
110 |         data = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'\
111 |             '{"order":1,"singerMid":"%s","begin":%s,"num":%s}'\
112 |             ',"module":"musichall.song_list_server"}}'%(str(mid),str(number),str(songs_num))
113 | 
114 |         sign = get_sign(data)
115 |         url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'\
116 |             '&g_tk=5381&sign={}&loginUin=0'\
117 |             '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
118 |             '&data={}'.format(sign,parse.quote(data))
119 |         
120 |         html = requests.get(url,headers = headers).json()
121 |         
122 |         datas = html['singerSongList']['data']['songList']
123 | 
124 |         for d in datas:
125 |             sing_name = d['songInfo']['title']
126 |             song_mid = d['songInfo']['mid']
127 |             try:
128 |                 lock.acquire()#锁上
129 | 
130 |                 session.add(Song(song_name = sing_name,
131 |                                 song_singer = singer_name,
132 |                                 song_mid = song_mid))
133 |                 session.commit()
134 | 
135 |                 lock.release()#解锁
136 |                 print('commit')
137 |             except:                
138 |                 session.rollback()
139 |                 print('rollbeak')
140 |    
141 | 
142 |             print('歌手名字：{}\t歌曲名字：{}\t歌曲ID：{}'.format(singer_name,sing_name,song_mid))
143 |             download(song_mid,sing_name,singer_name)
144 | 
145 | 
146 | def download(song_mid,sing_name,singer_name):
147 | 
148 |     qq_number = '1641202711'#请修改你的QQ号  
149 |     data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch"'\
150 |         ',"param":{"guid":"4803422090","calltype":0,"userip":""}},'\
151 |         '"req_0":{"module":"vkey.GetVkeyServer","method":"CgiGetVkey",'\
152 |         '"param":{"guid":"4803422090","songmid":["%s"],"songtype":[0],'\
153 |         '"uin":"%s","loginflag":1,"platform":"20"}},"comm":{"uin":%s,"format":"json","ct":24,"cv":0}}'%(str(song_mid),str(qq_number),str(qq_number))
154 | 
155 |     sign = get_sign(data)    
156 |     url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getplaysongvkey27494207511290925'\
157 |         '&g_tk=1291538537&sign={}&loginUin={}'\
158 |         '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0'\
159 |         '&platform=yqq.json&needNewCode=0&data={}'.format(sign,qq_number,parse.quote(data))
160 | 
161 |     html = requests.get(url,headers = headers).json()
162 | 
163 |     try:
164 |         purl = html['req_0']['data']['midurlinfo'][0]['purl']
165 |     
166 | 
167 |         url = 'https://dl.stream.qqmusic.qq.com/{}'.format(purl)
168 | 
169 |         html = requests.get(url,headers = headers,verify=False)
170 | 
171 |         html.encoding = 'utf-8'
172 | 
173 |         sing_file_name = '{} -- {}'.format(sing_name,singer_name)
174 | 
175 |         filename = './新版QQ音乐/歌曲'
176 | 
177 |         if html.status_code != 403:
178 |             if not os.path.exists(filename):
179 |                 os.makedirs(filename)
180 |         
181 |             with open('./新版QQ音乐/歌曲/{}.m4a'.format(sing_file_name),'wb') as f:
182 |                 print('\n正在下载{}歌曲.....\n'.format(sing_file_name))
183 |                 f.write(html.content)
184 |             
185 |     except:
186 |         print('查询权限失败，或没有查到对应的歌曲')
187 | 
188 | 
189 | 
190 | 
191 | 
192 | if __name__ == "__main__":
193 |     myProcess()
194 | 
195 | 
196 | 
197 | 


--------------------------------------------------------------------------------
/新版QQ音乐/db.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import Column,Integer,String,create_engine
 2 | from sqlalchemy.orm import  sessionmaker,scoped_session
 3 | from sqlalchemy.ext.declarative import declarative_base
 4 | 
 5 | #此处没有使用pymysql的驱动
 6 | #请安装pip install mysql-connector-python
 7 | #engine中的 mysqlconnector 为 mysql官网驱动
 8 | engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/test?charset=utf8',
 9 |                        max_overflow = 500,#超过连接池大小外最多可以创建的链接
10 |                        pool_size = 100,#连接池大小
11 |                        echo = False,#调试信息展示
12 | )
13 | Base = declarative_base()
14 | 
15 | class Song(Base):
16 |     __tablename__ = 'song'
17 |     song_id = Column(Integer,primary_key = True,autoincrement = True)
18 |     song_name = Column(String(64))
19 |     song_ablum = Column(String(64))
20 |     song_mid = Column(String(50))
21 |     song_singer = Column(String(50))
22 | Base.metadata.create_all(engine)
23 | 
24 | DBsession = sessionmaker(bind = engine)
25 | 
26 | SQLsession = scoped_session(DBsession)
27 | 
28 | 
29 | 
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/新版QQ音乐/demo.py:
--------------------------------------------------------------------------------
  1 | #Python3.7 
  2 | #encoding = utf-8
  3 | 
  4 | import execjs,requests,math,os,threading
  5 | from urllib import parse
  6 | from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
  7 | from db import SQLsession,Song
  8 | 
  9 | # lock = threading.Lock()
 10 | 
 11 | headers = {
 12 |     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
 13 |     'Referer':'https://y.qq.com/portal/singer_list.html',
 14 |     #参考链接 https://y.qq.com/portal/singer_list.html#page=1&index=1&
 15 | }
 16 | 
 17 | # session = SQLsession()
 18 | 
 19 | def get_sign(data):
 20 | 
 21 |     with open('./get_sign.js','r',encoding='utf-8') as f:
 22 |         text = f.read()
 23 |     
 24 |     js_data = execjs.compile(text)
 25 |     sign = js_data.call('get_sign',data)
 26 |     return sign
 27 | 
 28 | 
 29 | def myProcess():
 30 |     #把歌手按照首字母分为27类
 31 |     with ProcessPoolExecutor(max_workers = 2) as p:#创建27个进程
 32 |         for i in range(1,28):#28
 33 |             p.submit(get_singer_mid,i)
 34 | 
 35 | 
 36 | def get_singer_mid(index):
 37 |     #index =  1-----27
 38 |     #打开歌手列表页面，找出singerList,找出所有歌手的数目,除于80,构造后续页面获取page歌手
 39 |     #找出mid, 用于歌手详情页
 40 |     data = '{"comm":{"ct":24,"cv":0},"singerList":'\
 41 |         '{"module":"Music.SingerListServer","method":"get_singer_list","param":'\
 42 |         '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":0,"cur_page":1}}}'%(str(index))
 43 |     sign = get_sign(data)
 44 | 
 45 |     url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'\
 46 |         '&sign={}'\
 47 |         '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
 48 |         '&notice=0&platform=yqq.json&needNewCode=0'\
 49 |         '&data={}'.format(sign,parse.quote(data)) 
 50 | 
 51 |     html = requests.get(url,headers = headers).json()
 52 |   
 53 |     total = html['singerList']['data']['total']#多少个歌手
 54 |     
 55 |     pages = int(math.floor(int(total)/80))#向下取整
 56 | 
 57 |     thread_number = pages
 58 |     Thread = ThreadPoolExecutor(max_workers = thread_number)
 59 | 
 60 |     sin = 0
 61 |     #分页迭代每一个字母下的所有页面歌手
 62 |     for page in range(1,pages+2):
 63 | 
 64 |         data = '{"comm":{"ct":24,"cv":0},"singerList":'\
 65 |             '{"module":"Music.SingerListServer","method":"get_singer_list","param":'\
 66 |             '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":%s,"cur_page":%s}}}'%(str(index),str(sin),str(page))
 67 |         sign = get_sign(data)
 68 | 
 69 |         url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'\
 70 |             '&sign={}'\
 71 |             '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
 72 |             '&notice=0&platform=yqq.json&needNewCode=0'\
 73 |             '&data={}'.format(sign,parse.quote(data)) 
 74 | 
 75 |         html = requests.get(url,headers = headers).json()
 76 | 
 77 |         sings = html['singerList']['data']['singerlist']
 78 | 
 79 |         for sing in sings:
 80 | 
 81 |             singer_name = sing['singer_name']   #获取歌手名字
 82 |             mid = sing['singer_mid']            #获取歌手mid
 83 |          
 84 |             Thread.submit(get_singer_data,mid = mid,
 85 |                             singer_name = singer_name,)
 86 |         sin+=80
 87 |   
 88 | #获取歌手信息
 89 | def get_singer_data(mid,singer_name):
 90 |     #获取歌手mid,进入歌手详情页，也就是每一个歌手歌曲所在页面
 91 |     #找出歌手的歌曲信息页
 92 | 
 93 |     data = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'\
 94 |             '{"order":1,"singerMid":"%s","begin":0,"num":10}'\
 95 |             ',"module":"musichall.song_list_server"}}'%(str(mid))
 96 | 
 97 |     sign = get_sign(data)
 98 |     url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'\
 99 |         '&g_tk=5381&sign={}&loginUin=0'\
100 |         '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
101 |         '&data={}'.format(sign,parse.quote(data))
102 |     
103 |     html = requests.get(url,headers = headers).json()
104 | 
105 |     songs_num = html['singerSongList']['data']['totalNum']#获取歌曲总数
106 |  
107 | 
108 |     for number in range(0,songs_num,100):
109 | 
110 |         data = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'\
111 |             '{"order":1,"singerMid":"%s","begin":%s,"num":%s}'\
112 |             ',"module":"musichall.song_list_server"}}'%(str(mid),str(number),str(songs_num))
113 | 
114 |         sign = get_sign(data)
115 |         url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'\
116 |             '&g_tk=5381&sign={}&loginUin=0'\
117 |             '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
118 |             '&data={}'.format(sign,parse.quote(data))
119 |         
120 |         html = requests.get(url,headers = headers).json()
121 |         
122 |         datas = html['singerSongList']['data']['songList']
123 | 
124 |         for d in datas:
125 |             sing_name = d['songInfo']['title']
126 |             song_mid = d['songInfo']['mid']
127 |             # try:
128 |             #     lock.acquire()#锁上
129 |             #
130 |             #     session.add(Song(song_name = sing_name,
131 |             #                     song_singer = singer_name,
132 |             #                     song_mid = song_mid))
133 |             #     session.commit()
134 |             #
135 |             #     lock.release()#解锁
136 |             #     print('commit')
137 |             # except:
138 |             #     session.rollback()
139 |             #     print('rollbeak')
140 |    
141 | 
142 |             print('歌手名字：{}\t歌曲名字：{}\t歌曲ID：{}'.format(singer_name,sing_name,song_mid))
143 |             download(song_mid,sing_name,singer_name)
144 | 
145 | 
146 | def download(song_mid,sing_name,singer_name):
147 | 
148 |     qq_number = '1641202711'#请修改你的QQ号  
149 |     data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch"'\
150 |         ',"param":{"guid":"4803422090","calltype":0,"userip":""}},'\
151 |         '"req_0":{"module":"vkey.GetVkeyServer","method":"CgiGetVkey",'\
152 |         '"param":{"guid":"4803422090","songmid":["%s"],"songtype":[0],'\
153 |         '"uin":"%s","loginflag":1,"platform":"20"}},"comm":{"uin":%s,"format":"json","ct":24,"cv":0}}'%(str(song_mid),str(qq_number),str(qq_number))
154 | 
155 |     sign = get_sign(data)    
156 |     url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getplaysongvkey27494207511290925'\
157 |         '&g_tk=1291538537&sign={}&loginUin={}'\
158 |         '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0'\
159 |         '&platform=yqq.json&needNewCode=0&data={}'.format(sign,qq_number,parse.quote(data))
160 | 
161 |     html = requests.get(url,headers = headers).json()
162 | 
163 |     try:
164 |         purl = html['req_0']['data']['midurlinfo'][0]['purl']
165 |     
166 | 
167 |         url = 'https://dl.stream.qqmusic.qq.com/{}'.format(purl)
168 | 
169 |         html = requests.get(url,headers = headers,verify=False)
170 | 
171 |         html.encoding = 'utf-8'
172 | 
173 |         sing_file_name = '{} -- {}'.format(sing_name,singer_name)
174 | 
175 |         filename = './歌曲'
176 | 
177 |         if html.status_code != 403:
178 |             if not os.path.exists(filename):
179 |                 os.makedirs(filename)
180 |         
181 |             with open('./歌曲/{}.m4a'.format(sing_file_name),'wb') as f:
182 |                 print('\n正在下载{}歌曲.....\n'.format(sing_file_name))
183 |                 f.write(html.content)
184 |             
185 |     except:
186 |         print('查询权限失败，或没有查到对应的歌曲')
187 | 
188 | 
189 | 
190 | 
191 | 
192 | if __name__ == "__main__":
193 |     # myProcess()
194 |     get_singer_mid(1)
195 | 
196 | 
197 | 


--------------------------------------------------------------------------------
/新版QQ音乐/get_sign.js:
--------------------------------------------------------------------------------
  1 | 
  2 | this.window = this;
  3 | var sign = null;
  4 | 
  5 | !function(n, t) {
  6 |     "object" == typeof exports && "undefined" != typeof module ? module.exports = t() : "function" == typeof define && define.amd ? define(t) : (n = n || self).getSecuritySign = t()
  7 | } (this,
  8 | function() {
  9 |     "use strict";
 10 |     var n = function() {
 11 |         if ("undefined" != typeof self) return self;
 12 |         if ("undefined" != typeof window) return window;
 13 |         if ("undefined" != typeof global) return global;
 14 |         throw new Error("unable to locate global object")
 15 |     } ();
 16 |     n.__sign_hash_20200305 = function(n) {
 17 |         function l(n, t) {
 18 |             var o = (65535 & n) + (65535 & t);
 19 |             return (n >> 16) + (t >> 16) + (o >> 16) << 16 | 65535 & o
 20 |         }
 21 |         function r(n, t, o, e, u, p) {
 22 |             return l((i = l(l(t, n), l(e, p))) << (r = u) | i >>> 32 - r, o);
 23 |             var i, r
 24 |         }
 25 |         function g(n, t, o, e, u, p, i) {
 26 |             return r(t & o | ~t & e, n, t, u, p, i)
 27 |         }
 28 |         function a(n, t, o, e, u, p, i) {
 29 |             return r(t & e | o & ~e, n, t, u, p, i)
 30 |         }
 31 |         function s(n, t, o, e, u, p, i) {
 32 |             return r(t ^ o ^ e, n, t, u, p, i)
 33 |         }
 34 |         function v(n, t, o, e, u, p, i) {
 35 |             return r(o ^ (t | ~e), n, t, u, p, i)
 36 |         }
 37 |         function t(n) {
 38 |             return function(n) {
 39 |                 var t, o = "";
 40 |                 for (t = 0; t < 32 * n.length; t += 8) o += String.fromCharCode(n[t >> 5] >>> t % 32 & 255);
 41 |                 return o
 42 |             } (function(n, t) {
 43 |                 n[t >> 5] |= 128 << t % 32,
 44 |                 n[14 + (t + 64 >>> 9 << 4)] = t;
 45 |                 var o, e, u, p, i, r = 1732584193,
 46 |                 f = -271733879,
 47 |                 h = -1732584194,
 48 |                 c = 271733878;
 49 |                 for (o = 0; o < n.length; o += 16) r = g(e = r, u = f, p = h, i = c, n[o], 7, -680876936),
 50 |                 c = g(c, r, f, h, n[o + 1], 12, -389564586),
 51 |                 h = g(h, c, r, f, n[o + 2], 17, 606105819),
 52 |                 f = g(f, h, c, r, n[o + 3], 22, -1044525330),
 53 |                 r = g(r, f, h, c, n[o + 4], 7, -176418897),
 54 |                 c = g(c, r, f, h, n[o + 5], 12, 1200080426),
 55 |                 h = g(h, c, r, f, n[o + 6], 17, -1473231341),
 56 |                 f = g(f, h, c, r, n[o + 7], 22, -45705983),
 57 |                 r = g(r, f, h, c, n[o + 8], 7, 1770035416),
 58 |                 c = g(c, r, f, h, n[o + 9], 12, -1958414417),
 59 |                 h = g(h, c, r, f, n[o + 10], 17, -42063),
 60 |                 f = g(f, h, c, r, n[o + 11], 22, -1990404162),
 61 |                 r = g(r, f, h, c, n[o + 12], 7, 1804603682),
 62 |                 c = g(c, r, f, h, n[o + 13], 12, -40341101),
 63 |                 h = g(h, c, r, f, n[o + 14], 17, -1502002290),
 64 |                 r = a(r, f = g(f, h, c, r, n[o + 15], 22, 1236535329), h, c, n[o + 1], 5, -165796510),
 65 |                 c = a(c, r, f, h, n[o + 6], 9, -1069501632),
 66 |                 h = a(h, c, r, f, n[o + 11], 14, 643717713),
 67 |                 f = a(f, h, c, r, n[o], 20, -373897302),
 68 |                 r = a(r, f, h, c, n[o + 5], 5, -701558691),
 69 |                 c = a(c, r, f, h, n[o + 10], 9, 38016083),
 70 |                 h = a(h, c, r, f, n[o + 15], 14, -660478335),
 71 |                 f = a(f, h, c, r, n[o + 4], 20, -405537848),
 72 |                 r = a(r, f, h, c, n[o + 9], 5, 568446438),
 73 |                 c = a(c, r, f, h, n[o + 14], 9, -1019803690),
 74 |                 h = a(h, c, r, f, n[o + 3], 14, -187363961),
 75 |                 f = a(f, h, c, r, n[o + 8], 20, 1163531501),
 76 |                 r = a(r, f, h, c, n[o + 13], 5, -1444681467),
 77 |                 c = a(c, r, f, h, n[o + 2], 9, -51403784),
 78 |                 h = a(h, c, r, f, n[o + 7], 14, 1735328473),
 79 |                 r = s(r, f = a(f, h, c, r, n[o + 12], 20, -1926607734), h, c, n[o + 5], 4, -378558),
 80 |                 c = s(c, r, f, h, n[o + 8], 11, -2022574463),
 81 |                 h = s(h, c, r, f, n[o + 11], 16, 1839030562),
 82 |                 f = s(f, h, c, r, n[o + 14], 23, -35309556),
 83 |                 r = s(r, f, h, c, n[o + 1], 4, -1530992060),
 84 |                 c = s(c, r, f, h, n[o + 4], 11, 1272893353),
 85 |                 h = s(h, c, r, f, n[o + 7], 16, -155497632),
 86 |                 f = s(f, h, c, r, n[o + 10], 23, -1094730640),
 87 |                 r = s(r, f, h, c, n[o + 13], 4, 681279174),
 88 |                 c = s(c, r, f, h, n[o], 11, -358537222),
 89 |                 h = s(h, c, r, f, n[o + 3], 16, -722521979),
 90 |                 f = s(f, h, c, r, n[o + 6], 23, 76029189),
 91 |                 r = s(r, f, h, c, n[o + 9], 4, -640364487),
 92 |                 c = s(c, r, f, h, n[o + 12], 11, -421815835),
 93 |                 h = s(h, c, r, f, n[o + 15], 16, 530742520),
 94 |                 r = v(r, f = s(f, h, c, r, n[o + 2], 23, -995338651), h, c, n[o], 6, -198630844),
 95 |                 c = v(c, r, f, h, n[o + 7], 10, 1126891415),
 96 |                 h = v(h, c, r, f, n[o + 14], 15, -1416354905),
 97 |                 f = v(f, h, c, r, n[o + 5], 21, -57434055),
 98 |                 r = v(r, f, h, c, n[o + 12], 6, 1700485571),
 99 |                 c = v(c, r, f, h, n[o + 3], 10, -1894986606),
100 |                 h = v(h, c, r, f, n[o + 10], 15, -1051523),
101 |                 f = v(f, h, c, r, n[o + 1], 21, -2054922799),
102 |                 r = v(r, f, h, c, n[o + 8], 6, 1873313359),
103 |                 c = v(c, r, f, h, n[o + 15], 10, -30611744),
104 |                 h = v(h, c, r, f, n[o + 6], 15, -1560198380),
105 |                 f = v(f, h, c, r, n[o + 13], 21, 1309151649),
106 |                 r = v(r, f, h, c, n[o + 4], 6, -145523070),
107 |                 c = v(c, r, f, h, n[o + 11], 10, -1120210379),
108 |                 h = v(h, c, r, f, n[o + 2], 15, 718787259),
109 |                 f = v(f, h, c, r, n[o + 9], 21, -343485551),
110 |                 r = l(r, e),
111 |                 f = l(f, u),
112 |                 h = l(h, p),
113 |                 c = l(c, i);
114 |                 return [r, f, h, c]
115 |             } (function(n) {
116 |                 var t, o = [];
117 |                 for (o[(n.length >> 2) - 1] = void 0, t = 0; t < o.length; t += 1) o[t] = 0;
118 |                 for (t = 0; t < 8 * n.length; t += 8) o[t >> 5] |= (255 & n.charCodeAt(t / 8)) << t % 32;
119 |                 return o
120 |             } (n), 8 * n.length))
121 |         }
122 |         function o(n) {
123 |             return t(unescape(encodeURIComponent(n)))
124 |         }
125 |         return function(n) {
126 |             var t, o, e = "0123456789abcdef",
127 |             u = "";
128 |             for (o = 0; o < n.length; o += 1) t = n.charCodeAt(o),
129 |             u += e.charAt(t >>> 4 & 15) + e.charAt(15 & t);
130 |             return u
131 |         } (o(n))
132 |     },
133 |     function r(f, h, c, l, g) {
134 |         g = g || [[this], [{}]];
135 |         for (var t = [], o = null, n = [function() {
136 |             return ! 0
137 |         },
138 |         function() {},
139 |         function() {
140 |             g.length = c[h++]
141 |         },
142 |         function() {
143 |             g.push(c[h++])
144 |         },
145 |         function() {
146 |             g.pop()
147 |         },
148 |         function() {
149 |             var n = c[h++],
150 |             t = g[g.length - 2 - n];
151 |             g[g.length - 2 - n] = g.pop(),
152 |             g.push(t)
153 |         },
154 |         function() {
155 |             g.push(g[g.length - 1])
156 |         },
157 |         function() {
158 |             g.push([g.pop(), g.pop()].reverse())
159 |         },
160 |         function() {
161 |             g.push([l, g.pop()])
162 |         },
163 |         function() {
164 |             g.push([g.pop()])
165 |         },
166 |         function() {
167 |             var n = g.pop();
168 |             g.push(n[0][n[1]])
169 |         },
170 |         function() {
171 |             g.push(g[g.pop()[0]][0])
172 |         },
173 |         function() {
174 |             var n = g[g.length - 2];
175 |             n[0][n[1]] = g[g.length - 1]
176 |         },
177 |         function() {
178 |             g[g[g.length - 2][0]][0] = g[g.length - 1]
179 |         },
180 |         function() {
181 |             var n = g.pop(),
182 |             t = g.pop();
183 |             g.push([t[0][t[1]], n])
184 |         },
185 |         function() {
186 |             var n = g.pop();
187 |             g.push([g[g.pop()][0], n])
188 |         },
189 |         function() {
190 |             var n = g.pop();
191 |             g.push(delete n[0][n[1]])
192 |         },
193 |         function() {
194 |             var n = [];
195 |             for (var t in g.pop()) n.push(t);
196 |             g.push(n)
197 |         },
198 |         function() {
199 |             g[g.length - 1].length ? g.push(g[g.length - 1].shift(), !0) : g.push(void 0, !1)
200 |         },
201 |         function() {
202 |             var n = g[g.length - 2],
203 |             t = Object.getOwnPropertyDescriptor(n[0], n[1]) || {
204 |                 configurable: !0,
205 |                 enumerable: !0
206 |             };
207 |             t.get = g[g.length - 1],
208 |             Object.defineProperty(n[0], n[1], t)
209 |         },
210 |         function() {
211 |             var n = g[g.length - 2],
212 |             t = Object.getOwnPropertyDescriptor(n[0], n[1]) || {
213 |                 configurable: !0,
214 |                 enumerable: !0
215 |             };
216 |             t.set = g[g.length - 1],
217 |             Object.defineProperty(n[0], n[1], t)
218 |         },
219 |         function() {
220 |             h = c[h++]
221 |         },
222 |         function() {
223 |             var n = c[h++];
224 |             g[g.length - 1] && (h = n)
225 |         },
226 |         function() {
227 |             throw g[g.length - 1]
228 |         },
229 |         function() {
230 |             var n = c[h++],
231 |             t = n ? g.slice( - n) : [];
232 |             g.length -= n,
233 |             g.push(g.pop().apply(l, t))
234 |         },
235 |         function() {
236 |             var n = c[h++],
237 |             t = n ? g.slice( - n) : [];
238 |             g.length -= n;
239 |             var o = g.pop();
240 |             g.push(o[0][o[1]].apply(o[0], t))
241 |         },
242 |         function() {
243 |             var n = c[h++],
244 |             t = n ? g.slice( - n) : [];
245 |             g.length -= n,
246 |             t.unshift(null),
247 |             g.push(new(Function.prototype.bind.apply(g.pop(), t)))
248 |         },
249 |         function() {
250 |             var n = c[h++],
251 |             t = n ? g.slice( - n) : [];
252 |             g.length -= n,
253 |             t.unshift(null);
254 |             var o = g.pop();
255 |             g.push(new(Function.prototype.bind.apply(o[0][o[1]], t)))
256 |         },
257 |         function() {
258 |             g.push(!g.pop())
259 |         },
260 |         function() {
261 |             g.push(~g.pop())
262 |         },
263 |         function() {
264 |             g.push(typeof g.pop())
265 |         },
266 |         function() {
267 |             g[g.length - 2] = g[g.length - 2] == g.pop()
268 |         },
269 |         function() {
270 |             g[g.length - 2] = g[g.length - 2] === g.pop()
271 |         },
272 |         function() {
273 |             g[g.length - 2] = g[g.length - 2] > g.pop()
274 |         },
275 |         function() {
276 |             g[g.length - 2] = g[g.length - 2] >= g.pop()
277 |         },
278 |         function() {
279 |             g[g.length - 2] = g[g.length - 2] << g.pop()
280 |         },
281 |         function() {
282 |             g[g.length - 2] = g[g.length - 2] >> g.pop()
283 |         },
284 |         function() {
285 |             g[g.length - 2] = g[g.length - 2] >>> g.pop()
286 |         },
287 |         function() {
288 |             g[g.length - 2] = g[g.length - 2] + g.pop()
289 |         },
290 |         function() {
291 |             g[g.length - 2] = g[g.length - 2] - g.pop()
292 |         },
293 |         function() {
294 |             g[g.length - 2] = g[g.length - 2] * g.pop()
295 |         },
296 |         function() {
297 |             g[g.length - 2] = g[g.length - 2] / g.pop()
298 |         },
299 |         function() {
300 |             g[g.length - 2] = g[g.length - 2] % g.pop()
301 |         },
302 |         function() {
303 |             g[g.length - 2] = g[g.length - 2] | g.pop()
304 |         },
305 |         function() {
306 |             g[g.length - 2] = g[g.length - 2] & g.pop()
307 |         },
308 |         function() {
309 |             g[g.length - 2] = g[g.length - 2] ^ g.pop()
310 |         },
311 |         function() {
312 |             g[g.length - 2] = g[g.length - 2] in g.pop()
313 |         },
314 |         function() {
315 |             g[g.length - 2] = g[g.length - 2] instanceof g.pop()
316 |         },
317 |         function() {
318 |             g[g[g.length - 1][0]] = void 0 === g[g[g.length - 1][0]] ? [] : g[g[g.length - 1][0]]
319 |         },
320 |         function() {
321 |             for (var e = c[h++], u = [], n = c[h++], t = c[h++], p = [], o = 0; o < n; o++) u[c[h++]] = g[c[h++]];
322 |             for (var i = 0; i < t; i++) p[i] = c[h++];
323 |             g.push(function n() {
324 |                 var t = u.slice(0);
325 |                 t[0] = [this],
326 |                 t[1] = [arguments],
327 |                 t[2] = [n];
328 |                 for (var o = 0; o < p.length && o < arguments.length; o++) 0 < p[o] && (t[p[o]] = [arguments[o]]);
329 |                 return r(f, e, c, l, t)
330 |             })
331 |         },
332 |         function() {
333 |             t.push([c[h++], g.length, c[h++]])
334 |         },
335 |         function() {
336 |             t.pop()
337 |         },
338 |         function() {
339 |             return !! o
340 |         },
341 |         function() {
342 |             o = null
343 |         },
344 |         function() {
345 |             g[g.length - 1] += String.fromCharCode(c[h++])
346 |         },
347 |         function() {
348 |             g.push("")
349 |         },
350 |         function() {
351 |             g.push(void 0)
352 |         },
353 |         function() {
354 |             g.push(null)
355 |         },
356 |         function() {
357 |             g.push(!0)
358 |         },
359 |         function() {
360 |             g.push(!1)
361 |         },
362 |         function() {
363 |             g.length -= c[h++]
364 |         },
365 |         function() {
366 |             g[g.length - 1] = c[h++]
367 |         },
368 |         function() {
369 |             var n = g.pop(),
370 |             t = g[g.length - 1];
371 |             t[0][t[1]] = g[n[0]][0]
372 |         },
373 |         function() {
374 |             var n = g.pop(),
375 |             t = g[g.length - 1];
376 |             t[0][t[1]] = n[0][n[1]]
377 |         },
378 |         function() {
379 |             var n = g.pop(),
380 |             t = g[g.length - 1];
381 |             g[t[0]][0] = g[n[0]][0]
382 |         },
383 |         function() {
384 |             var n = g.pop(),
385 |             t = g[g.length - 1];
386 |             g[t[0]][0] = n[0][n[1]]
387 |         },
388 |         function() {
389 |             g[g.length - 2] = g[g.length - 2] < g.pop()
390 |         },
391 |         function() {
392 |             g[g.length - 2] = g[g.length - 2] <= g.pop()
393 |         }];;) try {
394 |             for (; ! n[c[h++]](););
395 |             if (o) throw o;
396 |             return g.pop()
397 |         } catch(n) {
398 |             var e = t.pop();
399 |             if (void 0 === e) throw n;
400 |             o = n,
401 |             h = e[0],
402 |             g.length = e[1],
403 |             e[2] && (g[e[2]][0] = o)
404 |         }
405 |     } (120731, 0, [21, 34, 50, 100, 57, 50, 102, 50, 98, 99, 101, 52, 54, 97, 52, 99, 55, 56, 52, 49, 57, 54, 57, 49, 56, 98, 102, 100, 100, 48, 48, 55, 55, 102, 2, 10, 3, 2, 9, 48, 61, 3, 9, 48, 61, 4, 9, 48, 61, 5, 9, 48, 61, 6, 9, 48, 61, 7, 9, 48, 61, 8, 9, 48, 61, 9, 9, 48, 4, 21, 427, 54, 2, 15, 3, 2, 9, 48, 61, 3, 9, 48, 61, 4, 9, 48, 61, 5, 9, 48, 61, 6, 9, 48, 61, 7, 9, 48, 61, 8, 9, 48, 61, 9, 9, 48, 61, 10, 9, 48, 61, 11, 9, 48, 61, 12, 9, 48, 61, 13, 9, 48, 61, 14, 9, 48, 61, 10, 9, 55, 54, 97, 54, 98, 54, 99, 54, 100, 54, 101, 54, 102, 54, 103, 54, 104, 54, 105, 54, 106, 54, 107, 54, 108, 54, 109, 54, 110, 54, 111, 54, 112, 54, 113, 54, 114, 54, 115, 54, 116, 54, 117, 54, 118, 54, 119, 54, 120, 54, 121, 54, 122, 54, 48, 54, 49, 54, 50, 54, 51, 54, 52, 54, 53, 54, 54, 54, 55, 54, 56, 54, 57, 13, 4, 61, 11, 9, 55, 54, 77, 54, 97, 54, 116, 54, 104, 8, 55, 54, 102, 54, 108, 54, 111, 54, 111, 54, 114, 14, 55, 54, 77, 54, 97, 54, 116, 54, 104, 8, 55, 54, 114, 54, 97, 54, 110, 54, 100, 54, 111, 54, 109, 14, 25, 0, 3, 4, 9, 11, 3, 3, 9, 11, 39, 3, 1, 38, 40, 3, 3, 9, 11, 38, 25, 1, 13, 4, 61, 12, 9, 55, 13, 4, 61, 13, 9, 3, 0, 13, 4, 4, 3, 13, 9, 11, 3, 11, 9, 11, 66, 22, 306, 4, 21, 422, 24, 4, 3, 14, 9, 55, 54, 77, 54, 97, 54, 116, 54, 104, 8, 55, 54, 102, 54, 108, 54, 111, 54, 111, 54, 114, 14, 55, 54, 77, 54, 97, 54, 116, 54, 104, 8, 55, 54, 114, 54, 97, 54, 110, 54, 100, 54, 111, 54, 109, 14, 25, 0, 3, 10, 9, 55, 54, 108, 54, 101, 54, 110, 54, 103, 54, 116, 54, 104, 15, 10, 40, 25, 1, 13, 4, 61, 12, 9, 6, 11, 3, 10, 9, 3, 14, 9, 11, 15, 10, 38, 13, 4, 61, 13, 9, 6, 11, 6, 5, 1, 5, 0, 3, 1, 38, 13, 4, 61, 0, 5, 0, 43, 4, 21, 291, 61, 3, 12, 9, 11, 0, 3, 9, 9, 49, 72, 0, 2, 3, 4, 13, 4, 61, 8, 9, 21, 721, 3, 2, 8, 3, 2, 9, 48, 61, 3, 9, 48, 61, 4, 9, 48, 61, 5, 9, 48, 61, 6, 9, 48, 61, 7, 9, 48, 4, 55, 54, 115, 54, 101, 54, 108, 54, 102, 8, 10, 30, 55, 54, 117, 54, 110, 54, 100, 54, 101, 54, 102, 54, 105, 54, 110, 54, 101, 54, 100, 32, 28, 22, 510, 4, 21, 523, 22, 4, 55, 54, 115, 54, 101, 54, 108, 54, 102, 8, 10, 0, 55, 54, 119, 54, 105, 54, 110, 54, 100, 54, 111, 54, 119, 8, 10, 30, 55, 54, 117, 54, 110, 54, 100, 54, 101, 54, 102, 54, 105, 54, 110, 54, 101, 54, 100, 32, 28, 22, 566, 4, 21, 583, 3, 4, 55, 54, 119, 54, 105, 54, 110, 54, 100, 54, 111, 54, 119, 8, 10, 0, 55, 54, 103, 54, 108, 54, 111, 54, 98, 54, 97, 54, 108, 8, 10, 30, 55, 54, 117, 54, 110, 54, 100, 54, 101, 54, 102, 54, 105, 54, 110, 54, 101, 54, 100, 32, 28, 22, 626, 4, 21, 643, 25, 4, 55, 54, 103, 54, 108, 54, 111, 54, 98, 54, 97, 54, 108, 8, 10, 0, 55, 54, 69, 54, 114, 54, 114, 54, 111, 54, 114, 8, 55, 54, 117, 54, 110, 54, 97, 54, 98, 54, 108, 54, 101, 54, 32, 54, 116, 54, 111, 54, 32, 54, 108, 54, 111, 54, 99, 54, 97, 54, 116, 54, 101, 54, 32, 54, 103, 54, 108, 54, 111, 54, 98, 54, 97, 54, 108, 54, 32, 54, 111, 54, 98, 54, 106, 54, 101, 54, 99, 54, 116, 27, 1, 23, 56, 0, 49, 444, 0, 0, 24, 0, 13, 4, 61, 8, 9, 55, 54, 95, 54, 95, 54, 103, 54, 101, 54, 116, 54, 83, 54, 101, 54, 99, 54, 117, 54, 114, 54, 105, 54, 116, 54, 121, 54, 83, 54, 105, 54, 103, 54, 110, 15, 21, 1126, 49, 2, 14, 3, 2, 9, 48, 61, 3, 9, 48, 61, 4, 9, 48, 61, 5, 9, 48, 61, 6, 9, 48, 61, 7, 9, 48, 61, 8, 9, 48, 61, 9, 9, 48, 61, 10, 9, 48, 61, 11, 9, 48, 61, 9, 9, 55, 54, 108, 54, 111, 54, 99, 54, 97, 54, 116, 54, 105, 54, 111, 54, 110, 8, 10, 30, 55, 54, 117, 54, 110, 54, 100, 54, 101, 54, 102, 54, 105, 54, 110, 54, 101, 54, 100, 32, 28, 22, 862, 21, 932, 21, 4, 55, 54, 108, 54, 111, 54, 99, 54, 97, 54, 116, 54, 105, 54, 111, 54, 110, 8, 55, 54, 104, 54, 111, 54, 115, 54, 116, 14, 55, 54, 105, 54, 110, 54, 100, 54, 101, 54, 120, 54, 79, 54, 102, 14, 55, 54, 121, 54, 46, 54, 113, 54, 113, 54, 46, 54, 99, 54, 111, 54, 109, 25, 1, 3, 0, 3, 1, 39, 32, 22, 963, 4, 55, 54, 67, 54, 74, 54, 66, 54, 80, 54, 65, 54, 67, 54, 114, 54, 82, 54, 117, 54, 78, 54, 121, 54, 55, 21, 974, 50, 4, 3, 12, 9, 11, 3, 8, 3, 10, 24, 2, 13, 4, 61, 10, 9, 3, 13, 9, 55, 54, 95, 54, 95, 54, 115, 54, 105, 54, 103, 54, 110, 54, 95, 54, 104, 54, 97, 54, 115, 54, 104, 54, 95, 54, 50, 54, 48, 54, 50, 54, 48, 54, 48, 54, 51, 54, 48, 54, 53, 15, 10, 22, 1030, 21, 1087, 22, 4, 3, 13, 9, 55, 54, 95, 54, 95, 54, 115, 54, 105, 54, 103, 54, 110, 54, 95, 54, 104, 54, 97, 54, 115, 54, 104, 54, 95, 54, 50, 54, 48, 54, 50, 54, 48, 54, 48, 54, 51, 54, 48, 54, 53, 15, 3, 9, 9, 11, 3, 3, 9, 11, 38, 25, 1, 13, 4, 61, 11, 9, 3, 12, 9, 11, 3, 10, 3, 53, 3, 37, 39, 24, 2, 13, 4, 4, 55, 54, 122, 54, 122, 54, 97, 3, 11, 9, 11, 38, 3, 10, 9, 11, 38, 0, 49, 771, 2, 1, 12, 9, 13, 8, 3, 12, 4, 4, 56, 0], n);
406 |     var t = n.__getSecuritySign;
407 |     sign = t;
408 |     return t;
409 | });
410 | 
411 | function get_sign(data){
412 |     return sign(data)
413 | };
414 | 


--------------------------------------------------------------------------------
/旧版QQ音乐(仍可用)/README.md:
--------------------------------------------------------------------------------
1 | **注意事项**
2 | - `cd`到当前文件夹，执行`python demo.py`即可
3 | - `demo.py`为没有入库版，只爬取一个分类，没开多进程，方便大家理解
4 | - 请务必`demo.py` 的`filename`和`with open`项目路径问题
5 | - `get_singer_mid(index)` 方法决定分类爬取


--------------------------------------------------------------------------------
/旧版QQ音乐(仍可用)/crawl.py:
--------------------------------------------------------------------------------
  1 | #Python3.7 
  2 | #encoding = utf-8
  3 | 
  4 | import requests,os,json,math,threading
  5 | from urllib import parse
  6 | from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
  7 | from db import SQLsession,Song
  8 | 
  9 | headers = {
 10 | 	'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
 11 | 	'referer':'https://y.qq.com/portal/singer_list.html',
 12 | 	#参考链接 https://y.qq.com/portal/singer_list.html#page=1&index=1&
 13 | }
 14 | 
 15 | lock = threading.Lock()
 16 | session = SQLsession()
 17 | 
 18 | def myProcess():
 19 | 	#把歌手按照首字母分为27类
 20 | 	with ProcessPoolExecutor(max_workers = 2) as p:#创建27个进程
 21 | 		for i in range(1,28):#28
 22 | 			p.submit(get_singer_mid,i)
 23 | 	
 24 | def get_singer_mid(index):
 25 | 	#index =  1-----27
 26 | 	#打开歌手列表页面，找出singerList,找出所有歌手的数目,除于80,构造后续页面获取page歌手
 27 | 	#找出mid, 用于歌手详情页
 28 | 
 29 | 	data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer"'\
 30 | 			',"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,'\
 31 | 			'"index":%s,"sin":0,"cur_page":1}}}'%(str(index))
 32 | 
 33 | 	url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\
 34 | 		'&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\
 35 | 		'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
 36 | 		'&data={}'.format(parse.quote(data))
 37 | 	
 38 | 	html = requests.get(url).json()
 39 | 	total = html['singerList']['data']['total']#多少个歌手
 40 | 	pages = int(math.floor(int(total)/80))
 41 | 	thread_number = pages
 42 | 
 43 | 	Thread = ThreadPoolExecutor(max_workers = thread_number)
 44 | 
 45 | 	sin = 0
 46 | 	#分页迭代每一个字母下的所有页面歌手
 47 | 	for page in range(1,pages+2):
 48 | 		data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer",'\
 49 | 				'"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,"'\
 50 | 				'index":%s,"sin":%d,"cur_page":%s}}}'%(str(index),sin,str(page))
 51 | 		
 52 | 		url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\
 53 | 			'&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\
 54 | 			'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
 55 | 			'&data={}'.format(parse.quote(data))
 56 | 
 57 | 		html = requests.get(url,headers = headers).json()
 58 | 		
 59 | 		sings = html['singerList']['data']['singerlist']
 60 | 
 61 | 		for sing in sings:
 62 | 
 63 | 			singer_name = sing['singer_name']
 64 | 			mid = sing['singer_mid']
 65 | 
 66 | 			Thread.submit(get_singer_data,mid = mid,
 67 | 							singer_name = singer_name,)
 68 | 		sin+=80
 69 | 
 70 | 
 71 | 
 72 | #获取歌手信息
 73 | def get_singer_data(mid,singer_name):
 74 | 	#获取歌手mid,进入歌手详情页，也就是每一个歌手歌曲所在页面
 75 | 	#找出歌手的歌曲信息页
 76 | 
 77 | 	params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",'\
 78 | 			'"param":{"order":1,"singerMid":"%s","begin":0,"num":10},'\
 79 | 			'"module":"musichall.song_list_server"}}'%str(mid)
 80 | 
 81 | 	url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&'\
 82 | 			'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
 83 | 			'&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params))
 84 | 
 85 | 	html = requests.session()
 86 | 	content = html.get(url,headers = headers).json()
 87 | 
 88 | 	songs_num = content['singerSongList']['data']['totalNum']
 89 | 
 90 | 
 91 | 	for a in range(0,songs_num,100):
 92 | 
 93 | 		params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",' \
 94 | 					'"param":{"order":1,"singerMid":"%s","begin":%s,"num":%s},' \
 95 | 					'"module":"musichall.song_list_server"}}' % (str(mid), int(a),int(songs_num))
 96 | 
 97 | 		url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&' \
 98 | 				'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8' \
 99 | 				'&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params))
100 | 
101 | 		html = requests.session()
102 | 		content = html.get(url, headers=headers).json()
103 | 
104 | 		datas = content['singerSongList']['data']['songList']
105 | 
106 | 		for d in datas:
107 | 			sing_name = d['songInfo']['title']
108 | 			songmid = d['songInfo']['mid']
109 | 			try:
110 | 				lock.acquire()#锁上
111 | 				session.add(Song(song_name = sing_name,
112 | 									song_singer = singer_name,
113 | 									song_mid = songmid))
114 | 				session.commit()
115 | 				lock.release()#解锁
116 | 				print('commit')
117 | 			except:
118 | 				session.rollback()
119 | 				print('rollbeak')
120 | 
121 | 			print('歌手名字：{}\t歌曲名字：{}\t歌曲ID：{}'.format(singer_name,sing_name,mid))
122 | 			download(songmid,sing_name,singer_name)
123 | 
124 | def download(songmid,sing_name,singer_name):
125 | 	headers = {
126 | 		'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
127 | 		'Referer':'https://y.qq.com/n/yqq/singer/000aHmbL2aPXWH.html',
128 | 	}
129 | 
130 | 
131 | 	data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch",'\
132 | 			'"param":{"guid":"5746584900","calltype":0,"userip":""}},"req_0":{"module":"vkey.GetVkeyServer",'\
133 | 			'"method":"CgiGetVkey","param":{"guid":"5746584900","songmid":["%s"],"songtype":[0],'\
134 | 			'"uin":"3262637034","loginflag":1,"platform":"20"}},"comm":{"uin":3262637034,"format":"json","ct":24,"cv":0}}'%str(songmid)
135 | 
136 | 
137 | 	url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getplaysongvkey17693804549459324'\
138 | 		'&g_tk=5381&loginUin=3262637034&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
139 | 		'&notice=0&platform=yqq.json&needNewCode=0&data={}'.format(parse.quote(data))
140 | 
141 | 	html = requests.get(url,headers = headers)
142 | 
143 | 	try:
144 | 		purl = html.json()['req_0']['data']['midurlinfo'][0]['purl']
145 | 
146 | 		url = 'http://dl.stream.qqmusic.qq.com/{}'.format(purl)
147 | 
148 | 		html = requests.get(url,headers = headers)
149 | 		html.encoding = 'utf-8'
150 | 
151 | 		sing_file_name = '{} -- {}'.format(sing_name,singer_name)
152 | 
153 | 		filename = './旧版QQ音乐(仍可用)/歌曲'
154 | 
155 | 		if not os.path.exists(filename):
156 | 			os.makedirs(filename)
157 | 	
158 | 		with open('./旧版QQ音乐(仍可用)/歌曲/{}.m4a'.format(sing_file_name),'wb') as f:
159 | 			print('\n正在下载{}歌曲.....\n'.format(sing_file_name))
160 | 			f.write(html.content)
161 | 		
162 | 	except:
163 | 		print('查询权限失败，或没有查到对应的歌曲')
164 | 
165 | 
166 | 
167 | if __name__ == '__main__':
168 | 	myProcess()


--------------------------------------------------------------------------------
/旧版QQ音乐(仍可用)/db.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import Column,Integer,String,create_engine
 2 | from sqlalchemy.orm import  sessionmaker,scoped_session
 3 | from sqlalchemy.ext.declarative import declarative_base
 4 | 
 5 | #此处没有使用pymysql的驱动
 6 | #请安装pip install mysql-connector-python
 7 | #engine中的 mysqlconnector 为 mysql官网驱动
 8 | engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/test?charset=utf8',
 9 |                        max_overflow = 500,#超过连接池大小外最多可以创建的链接
10 |                        pool_size = 100,#连接池大小
11 |                        echo = False,#调试信息展示
12 | )
13 | Base = declarative_base()
14 | 
15 | class Song(Base):
16 |     __tablename__ = 'song'
17 |     song_id = Column(Integer,primary_key = True,autoincrement = True)
18 |     song_name = Column(String(64))
19 |     song_ablum = Column(String(64))
20 |     song_mid = Column(String(50))
21 |     song_singer = Column(String(50))
22 | Base.metadata.create_all(engine)
23 | 
24 | DBsession = sessionmaker(bind = engine)
25 | 
26 | SQLsession = scoped_session(DBsession)
27 | 
28 | 
29 | 
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/旧版QQ音乐(仍可用)/demo.py:
--------------------------------------------------------------------------------
  1 | #Python3.7 
  2 | #encoding = utf-8
  3 | 
  4 | import requests,os,json,math,threading
  5 | from urllib import parse
  6 | from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
  7 | from db import SQLsession,Song
  8 | 
  9 | headers = {
 10 | 	'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
 11 | 	'referer':'https://y.qq.com/portal/singer_list.html',
 12 | 	#参考链接 https://y.qq.com/portal/singer_list.html#page=1&index=1&
 13 | }
 14 | 
 15 | lock = threading.Lock()
 16 | session = SQLsession()
 17 | 
 18 | def myProcess():
 19 | 	#把歌手按照首字母分为27类
 20 | 	with ProcessPoolExecutor(max_workers = 2) as p:#创建27个进程
 21 | 		for i in range(1,28):#28
 22 | 			p.submit(get_singer_mid,i)
 23 | 	
 24 | def get_singer_mid(index):
 25 | 	#index =  1-----27
 26 | 	#打开歌手列表页面，找出singerList,找出所有歌手的数目,除于80,构造后续页面获取page歌手
 27 | 	#找出mid, 用于歌手详情页
 28 | 
 29 | 	data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer"'\
 30 | 			',"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,'\
 31 | 			'"index":%s,"sin":0,"cur_page":1}}}'%(str(index))
 32 | 
 33 | 	url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\
 34 | 		'&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\
 35 | 		'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
 36 | 		'&data={}'.format(parse.quote(data))
 37 | 	
 38 | 	html = requests.get(url).json()
 39 | 	total = html['singerList']['data']['total']#多少个歌手
 40 | 	pages = int(math.floor(int(total)/80))
 41 | 	thread_number = pages
 42 | 
 43 | 	Thread = ThreadPoolExecutor(max_workers = thread_number)
 44 | 
 45 | 	sin = 0
 46 | 	#分页迭代每一个字母下的所有页面歌手
 47 | 	for page in range(1,pages+2):
 48 | 		data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer",'\
 49 | 				'"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,"'\
 50 | 				'index":%s,"sin":%d,"cur_page":%s}}}'%(str(index),sin,str(page))
 51 | 		
 52 | 		url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\
 53 | 			'&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\
 54 | 			'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\
 55 | 			'&data={}'.format(parse.quote(data))
 56 | 
 57 | 		html = requests.get(url,headers = headers).json()
 58 | 		
 59 | 		sings = html['singerList']['data']['singerlist']
 60 | 
 61 | 		for sing in sings:
 62 | 
 63 | 			singer_name = sing['singer_name']
 64 | 			mid = sing['singer_mid']
 65 | 
 66 | 			Thread.submit(get_singer_data,mid = mid,
 67 | 							singer_name = singer_name,)
 68 | 		sin+=80
 69 | 
 70 | 
 71 | 
 72 | #获取歌手信息
 73 | def get_singer_data(mid,singer_name):
 74 | 	#获取歌手mid,进入歌手详情页，也就是每一个歌手歌曲所在页面
 75 | 	#找出歌手的歌曲信息页
 76 | 
 77 | 	params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",'\
 78 | 			'"param":{"order":1,"singerMid":"%s","begin":0,"num":10},'\
 79 | 			'"module":"musichall.song_list_server"}}'%str(mid)
 80 | 
 81 | 	url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&'\
 82 | 			'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
 83 | 			'&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params))
 84 | 
 85 | 	html = requests.session()
 86 | 	content = html.get(url,headers = headers).json()
 87 | 
 88 | 	songs_num = content['singerSongList']['data']['totalNum']
 89 | 
 90 | 
 91 | 	for a in range(0,songs_num,100):
 92 | 
 93 | 		params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",' \
 94 | 					'"param":{"order":1,"singerMid":"%s","begin":%s,"num":%s},' \
 95 | 					'"module":"musichall.song_list_server"}}' % (str(mid), int(a),int(songs_num))
 96 | 
 97 | 		url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&' \
 98 | 				'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8' \
 99 | 				'&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params))
100 | 
101 | 		html = requests.session()
102 | 		content = html.get(url, headers=headers).json()
103 | 
104 | 		datas = content['singerSongList']['data']['songList']
105 | 
106 | 		for d in datas:
107 | 			sing_name = d['songInfo']['title']
108 | 			songmid = d['songInfo']['mid']
109 | 			try:
110 | 				lock.acquire()#锁上
111 | 				session.add(Song(song_name = sing_name,
112 | 									song_singer = singer_name,
113 | 									song_mid = songmid))
114 | 				session.commit()
115 | 				lock.release()#解锁
116 | 				print('commit')
117 | 			except:
118 | 				session.rollback()
119 | 				print('rollbeak')
120 | 
121 | 			print('歌手名字：{}\t歌曲名字：{}\t歌曲ID：{}'.format(singer_name,sing_name,mid))
122 | 			download(songmid,sing_name,singer_name)
123 | 
124 | def download(songmid,sing_name,singer_name):
125 | 	headers = {
126 | 		'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
127 | 		'Referer':'https://y.qq.com/n/yqq/singer/000aHmbL2aPXWH.html',
128 | 	}
129 | 
130 | 
131 | 	data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch",'\
132 | 			'"param":{"guid":"5746584900","calltype":0,"userip":""}},"req_0":{"module":"vkey.GetVkeyServer",'\
133 | 			'"method":"CgiGetVkey","param":{"guid":"5746584900","songmid":["%s"],"songtype":[0],'\
134 | 			'"uin":"3262637034","loginflag":1,"platform":"20"}},"comm":{"uin":3262637034,"format":"json","ct":24,"cv":0}}'%str(songmid)
135 | 
136 | 
137 | 	url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getplaysongvkey17693804549459324'\
138 | 		'&g_tk=5381&loginUin=3262637034&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\
139 | 		'&notice=0&platform=yqq.json&needNewCode=0&data={}'.format(parse.quote(data))
140 | 
141 | 	html = requests.get(url,headers = headers)
142 | 
143 | 	try:
144 | 		purl = html.json()['req_0']['data']['midurlinfo'][0]['purl']
145 | 
146 | 		url = 'http://dl.stream.qqmusic.qq.com/{}'.format(purl)
147 | 
148 | 		html = requests.get(url,headers = headers)
149 | 		html.encoding = 'utf-8'
150 | 
151 | 		sing_file_name = '{} -- {}'.format(sing_name,singer_name)
152 | 
153 | 		filename = './歌曲'
154 | 
155 | 		if not os.path.exists(filename):
156 | 			os.makedirs(filename)
157 | 	
158 | 		with open('./歌曲/{}.m4a'.format(sing_file_name),'wb') as f:
159 | 			print('\n正在下载{}歌曲.....\n'.format(sing_file_name))
160 | 			f.write(html.content)
161 | 		
162 | 	except:
163 | 		print('查询权限失败，或没有查到对应的歌曲')
164 | 
165 | 
166 | 
167 | if __name__ == '__main__':
168 | 	# myProcess()
169 | 	get_singer_mid(1)


--------------------------------------------------------------------------------
/有道翻译/crawl.py:
--------------------------------------------------------------------------------
 1 | #Python3.7 
 2 | #encoding = utf-8
 3 | 
 4 | import time, math,random,hashlib
 5 | import requests
 6 | 
 7 | def get_html(name):
 8 | 
 9 |     url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
10 | 
11 | 
12 |     ts = math.floor(time.time() * 1000)
13 |     salt = ts + int(random.random() * 10)
14 | 
15 |     sign = hashlib.md5(("fanyideskweb" + name + str(salt) +"Nw(nmmbP%A-r6U3EUn]Aj").encode('utf-8')).hexdigest()
16 |     bv = hashlib.md5(("5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36").encode('utf-8')).hexdigest()
17 | 
18 |     data = {
19 |         'i': name,
20 |         'from': 'AUTO',
21 |         'to': 'AUTO',
22 |         'smartresult': 'dict',
23 |         'client': 'fanyideskweb',
24 |         'salt': salt,
25 |         'sign': sign,
26 |         'ts': ts,
27 |         'bv': bv,
28 |         'doctype': 'json',
29 |         'version': '2.1',
30 |         'keyfrom': 'fanyi.web',
31 |         'action': 'FY_BY_CLICKBUTTION',
32 |     }
33 | 
34 |     headers = {
35 |         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
36 |         'Referer': 'http://fanyi.youdao.com/',
37 |         #参考链接:http://fanyi.youdao.com/
38 |         #请在此处填写你的 Cookie
39 |     }
40 | 
41 | 
42 |     html = requests.post(url, headers=headers, data=data)#有需要的可以改成session写法
43 |     # print(html.json())
44 |     print('正在执行有道翻译程序:')
45 |     print('翻译的词:{}'.format(html.json()['translateResult'][0][0]['src']))
46 |     print('翻译结果:{}'.format(html.json()['translateResult'][0][0]['tgt']))
47 | 
48 | if __name__ == "__main__":
49 |     
50 |     name = '靓仔'
51 |     
52 |     get_html(name)


--------------------------------------------------------------------------------
/构建代理池/crawl.py:
--------------------------------------------------------------------------------
 1 | #Python3.7 
 2 | #encoding = utf-8
 3 | 
 4 | import requests,time,json
 5 | from bs4 import BeautifulSoup
 6 | 
 7 | headers ={
 8 | 	'Referer':'https://www.kuaidaili.com/free/inha/1/',
 9 | 	'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
10 | 	#参考链接:https://www.kuaidaili.com/free/inha/1/
11 | }
12 | 
13 | 
14 | def get_ip(url):#访问网站
15 | 	html = requests.get(url,headers = headers)
16 | 	if html.status_code==200:
17 | 		time.sleep(2)
18 | 		print('[INFO]正在爬取...')
19 | 		parse_html(html.text)
20 | 	else:
21 | 		print("[ERROR]错误",url)
22 | 
23 | def parse_html(html):#获取ip信息
24 | 	soup = BeautifulSoup(html,'lxml')
25 | 	ips = soup.select('.table tbody tr')
26 | 	for line in ips:
27 | 		ip = line.select_one('td').text
28 | 		port = line.select('td')[1].text
29 | 		print('[INFO]获取IP:{}  Port:{}'.format(ip,port))
30 | 
31 | 		address = 'http://{}:{}'.format(ip,port)#构造ip访问
32 | 		proxies = {
33 | 			'http':address,
34 | 			'https':address,
35 | 		}
36 | 		verify_ip(proxies)
37 | 
38 | def verify_ip(proxies):#验证ip能否被用
39 | 
40 | 	try:	
41 | 		html = requests.get('http://www.baidu.com',proxies = proxies,timeout = 3)#连接测试
42 | 		print('[SUCC]可用代理：{}'.format(proxies))
43 | 		write_json(proxies)
44 | 	except:
45 | 		print("[ERROR]代理超时不可用:{}".format(proxies))
46 | 
47 | 
48 | def write_json(row):#写入文本
49 | 
50 | 	with open('ip_pool.json','a+',encoding='utf-8') as f:
51 | 		json.dump(row,f)
52 | 		f.write('\n')
53 | 
54 | 
55 | def read_json():#读取文件
56 | 	
57 | 	with open('ip_pool.json','r',encoding='utf-8') as f:
58 | 	 	
59 | 	 	for i in f.readlines():
60 | 	 		content = json.loads(i.strip())
61 | 	 		print(content)		
62 | 
63 | 
64 | if __name__ == '__main__':
65 | 
66 | 	for i in range(15,25):
67 | 		url = 'https://www.kuaidaili.com/free/inha/{}/'.format(i)
68 | 		get_ip(url)
69 | 
70 | 	print('目前验证成功的IP')
71 | 	read_json()


--------------------------------------------------------------------------------
/构建代理池/ip_pool.json:
--------------------------------------------------------------------------------
1 | {"http": "http://183.164.239.153:9999", "https": "http://183.164.239.153:9999"}
2 | {"http": "http://49.235.69.138:8118", "https": "http://49.235.69.138:8118"}
3 | {"http": "http://111.38.91.99:8060", "https": "http://111.38.91.99:8060"}
4 | {"http": "http://47.107.160.99:8118", "https": "http://47.107.160.99:8118"}
5 | 


--------------------------------------------------------------------------------
/百度图片/crawl.py:
--------------------------------------------------------------------------------
  1 | #Python3.7 
  2 | #encoding = utf-8
  3 | 
  4 | import requests,json,re,os,traceback,datetime,aiohttp,asyncio
  5 | from uuid import uuid4
  6 | from urllib import parse
  7 | from concurrent.futures import ThreadPoolExecutor
  8 | 
  9 | headers = {
 10 |     'Accept':'text/plain, */*; q=0.01',
 11 |     'Accept-Encoding':'gzip, deflate, br',
 12 |     'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
 13 |     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
 14 |     'Referer':'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&pv=&ic=0&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&sid=&word=%E5%88%9D%E9%9F%B3%E6%9C%AA%E6%9D%A5',
 15 |     #参考链接:https://image.baidu.com/
 16 |     #请在此处填写你的 Cookie
 17 | }
 18 | 
 19 | tasks = []
 20 | 
 21 | def get_html(url):
 22 | 
 23 |     try:
 24 |         html = requests.get(url,headers=headers)
 25 |         json_data = html.text.replace('\\','')#去除JSON数据多余的\
 26 |         json_data = json.loads(json_data)
 27 |         parse_json(json_data)
 28 | 
 29 |     except json.decoder.JSONDecodeError:
 30 |      
 31 |         #去除"fromPageTitle"键值的双引号异常
 32 |         fromPageTitle = r'"fromPageTitle":"(.*?)",'
 33 |         json_data = replace_data(fromPageTitle,json_data)
 34 | 
 35 |         #去除"fromPageTitle"键值的双引号异常
 36 |         fromPageTitle = r'"fromPageTitleEnc":"(.*?)",'
 37 |         json_data = replace_data(fromPageTitle,json_data)
 38 | 
 39 |         json_data = json.loads(json_data)
 40 |         write_error(url,flag='已经成功处理')
 41 |         parse_json(json_data)
 42 | 
 43 |     except Exception:
 44 |         write_error(url,flag='未能成功处理')
 45 | 
 46 | #解析JSON获取图片URL
 47 | def parse_json(json_data):
 48 |     list_data = json_data['data']
 49 |     for data in list_data[:-1]:
 50 |         image_name = data["fromPageTitleEnc"]
 51 |         for image_data in data["replaceUrl"]:
 52 |             image_url = image_data['ObjURL']
 53 |             tasks.append(download(image_url,image_name))
 54 |             
 55 | #下载图片
 56 | async def download(image_url,image_name):
 57 | 
 58 |     black_image = b'GIF89a\x04\x00\x08\x00\x91\x02\x00\xff\xff\xff\x00\x00\x00\xff\xff\xff\x00\x00\x00!\xf9\x04\x01\x00\x00\x02\x00,\x00\x00\x00\x00\x04\x00\x08\x00\x00\x02\x05\x94\x8f\xa9\x8b\x05\x00;'
 59 | 
 60 |     filename = './百度图片/下载好的图片'
 61 |     if not os.path.exists(filename):
 62 |         os.makedirs(filename)
 63 | 
 64 |     print("[INFO]{} 正在下载图片：{}".format(datetime.datetime.now(),image_name))
 65 | 
 66 |     async with aiohttp.ClientSession(headers = headers) as session:
 67 |         async with session.get(image_url) as html:
 68 |             
 69 |             uuid_id = uuid4()
 70 |             image_file_name = '{}/{}.jpg'.format(filename,uuid_id)
 71 |             
 72 |             #筛选掉异常的黑色图片、查询不到的图片
 73 |             if black_image not in await html.read() and b'<!DOCTYPE html>' not in await html.read():
 74 |             
 75 |                 with open(image_file_name,'wb') as f:
 76 |                     f.write(await html.read())
 77 |         
 78 |                 with open('./百度图片/图片映射表.json','a+',encoding='utf-8') as f:
 79 |                     json_data = json.dumps(dict(image_name = image_name,id=str(uuid_id)),ensure_ascii=False)
 80 |                     f.write(json_data + '\n')
 81 | 
 82 | #用正则删除双引号异常
 83 | def replace_data(re_compile,json_data):
 84 |     re_data = re.compile(re_compile)
 85 |     for i in re_data.findall(json_data):
 86 |         data = i.replace('"','').replace("\\'",'')
 87 |         json_data = json_data.replace(i,data)
 88 |     return json_data
 89 | 
 90 | #写入异常
 91 | def write_error(url,flag=None):
 92 | 
 93 |     with open('./百度图片/错误日志.txt','a+',encoding='utf-8') as f:
 94 |         f.write('JSON异常是否处理成功：{}\n'.format(flag))
 95 |         f.write('异常时间：{}\n'.format(datetime.datetime.now()))
 96 |         f.write('异常URL：{}\n'.format(url))
 97 |         f.write(traceback.format_exc() + '\n')
 98 | 
 99 | if __name__ == "__main__":
100 | 
101 |     loop = asyncio.get_event_loop()#创建异步编程
102 |     name = parse.quote('初音未来')
103 |     
104 |     with ThreadPoolExecutor(max_workers = 2) as t:
105 |         #翻页30
106 |         for i in range(30,120,30):
107 |             url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592'\
108 |                     '&is=&fp=result&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest='\
109 |                     '&copyright=&word={}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1'\
110 |                     '&fr=&expermode=&force=&pn={}&rn=30'.format(name,name,i)
111 |             t.submit(get_html,url)
112 | 
113 |     loop.run_until_complete(asyncio.wait(tasks))
114 |     loop.close()#程序关闭


--------------------------------------------------------------------------------
/破解有道翻译/crawl.py:
--------------------------------------------------------------------------------
 1 | #Python3.7 
 2 | #encoding = utf-8
 3 | 
 4 | import time, math,random,hashlib
 5 | import requests
 6 | 
 7 | def get_html(name):
 8 | 
 9 |     url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
10 | 
11 | 
12 |     ts = math.floor(time.time() * 1000)
13 |     salt = ts + int(random.random() * 10)
14 | 
15 |     sign = hashlib.md5(("fanyideskweb" + name + str(salt) +"Nw(nmmbP%A-r6U3EUn]Aj").encode('utf-8')).hexdigest()
16 |     bv = hashlib.md5(("5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36").encode('utf-8')).hexdigest()
17 | 
18 |     data = {
19 |         'i': name,
20 |         'from': 'AUTO',
21 |         'to': 'AUTO',
22 |         'smartresult': 'dict',
23 |         'client': 'fanyideskweb',
24 |         'salt': salt,
25 |         'sign': sign,
26 |         'ts': ts,
27 |         'bv': bv,
28 |         'doctype': 'json',
29 |         'version': '2.1',
30 |         'keyfrom': 'fanyi.web',
31 |         'action': 'FY_BY_CLICKBUTTION',
32 |     }
33 | 
34 |     headers = {
35 |         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
36 |         'Referer': 'http://fanyi.youdao.com/',
37 |         #参考链接:http://fanyi.youdao.com/
38 |         #请在此处填写你的 Cookie
39 |     }
40 | 
41 | 
42 |     html = requests.post(url, headers=headers, data=data)#有需要的可以改成session写法
43 | 
44 |     print('正在执行有道翻译程序:')
45 |     print('翻译的词:{}'.format(html.json()['translateResult'][0][0]['src']))
46 |     print('翻译结果:{}'.format(html.json()['translateResult'][0][0]['tgt']))
47 | 
48 | if __name__ == "__main__":
49 |     
50 |     name = '靓仔'
51 |     
52 |     get_html(name)


--------------------------------------------------------------------------------
/破解网易登录/crawl.py:
--------------------------------------------------------------------------------
 1 | #Python3.7 
 2 | #encoding = utf-8
 3 | 
 4 | import execjs,requests,time
 5 | 
 6 | class User():#获取用户密码加密
 7 | 
 8 |     def __init__(self,user_id,user_password):
 9 | 
10 |         self.user_id = user_id
11 |         self.user_password = user_password
12 |         self.session = requests.session()
13 |         self.session.headers = {
14 |             'Referer':'https://dl.reg.163.com/webzj/v1.0.1/pub/index_dl2_new.html?cd=https%3A%2F%2Ftemp.163.com%2Fspecial%2F00804C4H%2F&cf=urs_style_2019.css%3Ft%3D20190527&MGID=1590637061742.5342&wdaId=&pkid=MODXOXd&product=163',
15 |             'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
16 |             #请在此处输入你的Cookie
17 |             #参考链接 https://www.163.com/
18 |         }
19 | 
20 |     def get_pw(self):
21 | 
22 |         with open('pw.js','r',encoding='utf-8') as f:
23 |             content = f.read()
24 |       
25 |         js_data = execjs.compile(content)#编译js
26 |         pw = js_data.call('get_pw',self.user_password)#调用get_pw函数
27 |         return pw
28 | 
29 |     def get_rtid(self):
30 | 
31 |         with open('rtid.js','r',encoding='utf-8') as f:
32 |             content = f.read()
33 |    
34 |         js_data = execjs.compile(content)#编译js
35 |         rtid = js_data.call('get_rtid')#调用get_rtid函数
36 |         return rtid
37 | 
38 |     def get_tk(self,rtid):
39 | 
40 |         url = 'https://dl.reg.163.com/dl/gt'
41 | 
42 |         params = {
43 |             'un':self.user_id,
44 |             'pkid':'MODXOXd',
45 |             'pd':'163',
46 |             'channel':'0',
47 |             'topURL':'https://www.163.com/',
48 |             'rtid':rtid,
49 |             'nocache':int(time.time()*1000),
50 |         }
51 | 
52 |         html = self.session.get(url,params = params).json()
53 |         return html['tk']
54 | 
55 |     def get_login(self,pw,rtid,tk):
56 | 
57 |         url = 'https://dl.reg.163.com/dl/l'
58 | 
59 | 
60 |         data = {
61 |             'channel':'0',
62 |             'd':'10',
63 |             'domains':"163.com",
64 |             'l':'0',
65 |             'pd':"163",
66 |             'pkid':"MODXOXd",
67 |             'pw':pw,
68 |             'pwdKeyUp':'1',
69 |             'rtid':rtid,
70 |             't':int(time.time()*1000),
71 |             'tk':tk,
72 |             'topURL':"https://www.163.com/",
73 |             'un':self.user_id,
74 |         }
75 | 
76 |         html = self.session.post(url,json = data).json()#传递JSON
77 |         return html
78 | 
79 | 
80 | if __name__ == "__main__":
81 |     
82 |     user = User('请输入你的账号','请输入你的密码')
83 |     pw = user.get_pw()#获取pw
84 |     rtid = user.get_rtid()#获取rtid
85 | 
86 |     tk = user.get_tk(rtid)#获取tk
87 | 
88 |     login = user.get_login(pw,rtid,tk)
89 |     print(login)
90 | 
91 |     
92 | 


--------------------------------------------------------------------------------
/破解网易登录/rtid.js:
--------------------------------------------------------------------------------
 1 | function t() {
 2 |     var e = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
 3 |       , t = 32
 4 |       , i = [];
 5 |     for (; t-- > 0; )
 6 |         i[t] = e.charAt(Math.random() * e.length);
 7 |     return i.join("")
 8 |   };
 9 |   
10 |   function get_rtid(){
11 |     return t()
12 |   }


--------------------------------------------------------------------------------
/豆瓣读书/入库版/book.py:
--------------------------------------------------------------------------------
 1 | #Python3.7 
 2 | #encoding = utf-8
 3 | 
 4 | from urllib import parse
 5 | import asyncio,aiohttp,os,time,requests
 6 | from bs4 import BeautifulSoup#爬虫解析库
 7 | from boook_db import Book,sess
 8 | from concurrent.futures import ThreadPoolExecutor
 9 | 
10 | tasks = []
11 | 
12 | headers = {
13 | 	'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
14 | 	'Referer':'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T',
15 | 	#参考链接 https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T
16 | }
17 | 
18 | 
19 | def get_html(url):
20 | 
21 | 	
22 | 	html = requests.get(url,headers = headers)
23 | 
24 | 	if html.status_code==200:
25 | 		
26 | 		parse_html(html.text)
27 | 	else:
28 | 		print('错误')
29 | 
30 | def parse_html(html):
31 | 
32 | 	soup =BeautifulSoup(html,'lxml')#选择解析器
33 | 	books = soup.select('li.subject-item')#选择文章
34 | 
35 | 	for book in books:
36 | 
37 | 		try:#防错机制
38 | 
39 | 			title = book.select_one('.info h2 a').text.strip().replace(' ','').replace('\n','')#选择书名并去除空格
40 | 			info = book.select_one('.subject-item .info div.pub').text.strip().replace(' ','').replace('\n','')#选择作者
41 | 			star = book.select_one('.rating_nums').text.strip().replace(' ','').replace('\n','')#选择评分
42 | 			pl = book.select_one('.pl').text.strip().replace(' ','').replace('\n','')#选择评价
43 | 			introduce = book.select_one('.info p').text.strip().replace(' ','').replace('\n','')#选择书本简介
44 | 			img = book.select_one('.nbg img')['src']#获取图片url
45 | 
46 | 			tasks.append(dowmload(title,img))#异步编程
47 | 			print(title,info,star,pl,img)
48 | 			print(introduce)
49 | 			print('-'*50)
50 | 
51 | 		#插入数据库
52 | 			book_data = Book(
53 | 				title = title,
54 | 				info = info,
55 | 				star = star,
56 | 				pl = pl,
57 | 				introduce = introduce,
58 | 			)
59 | 			sess.add(book_data)
60 | 			sess.commit()
61 | 		except Exception as e:#发生任何错误返回
62 | 			print(e)
63 | 			sess.rollback()#事务回滚
64 | 
65 | 
66 | async def dowmload(title,url):#保存封面图片
67 | 
68 | 	if not os.path.exists('./豆瓣读书/doubanImg'):#检查有没有文件夹并创建
69 | 		os.makedirs('./豆瓣读书/doubanImg')
70 | 
71 | 	async with aiohttp.ClientSession(headers = headers) as session:
72 | 		async with session.get(url) as html:		
73 | 			with open('./豆瓣读书/doubanImg/{}.jpg'.format(title),'wb')as f:
74 | 				f.write(await html.content.read())
75 | 	
76 | if __name__ == '__main__':
77 | 
78 | 	loop = asyncio.get_event_loop()
79 | 	with ThreadPoolExecutor(max_workers = 2) as t:
80 | 		for i in range(0,100,20):#翻页参数为20
81 | 			url = 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={}&type=T'.format(i)
82 | 			t.submit(get_html,url)
83 | 	loop.run_until_complete(asyncio.wait(tasks))
84 | 	loop.close()#程序关闭
85 | 
86 | 
87 | 


--------------------------------------------------------------------------------
/豆瓣读书/入库版/boook_db.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import create_engine
 2 | from sqlalchemy import Column,String,Integer,Text
 3 | from sqlalchemy.orm import sessionmaker
 4 | from sqlalchemy.ext.declarative import declarative_base
 5 | 
 6 | #基础类
 7 | Base = declarative_base()
 8 | 
 9 | #此处没有使用pymysql的驱动
10 | #请安装pip install mysql-connector-python
11 | #engine中的 mysqlconnector 为 mysql官网驱动
12 | engine = create_engine(
13 | 	'mysql+mysqlconnector://root:root@127.0.0.1:3306/test?charset=utf8',#连接本地
14 | 	echo = True
15 | )
16 | 
17 | class Book(Base):
18 | 	__tablename__ = 'book'
19 | 	id = Column('id',Integer(),primary_key = True,autoincrement = True)
20 | 	title = Column('title',String(20))
21 | 	info = Column('info',String(30))
22 | 	star = Column('star',String(10))
23 | 	pl = Column('pl',String(10))
24 | 	introduce = Column('introduce',Text())
25 | 
26 | Base.metadata.create_all(engine)
27 | 
28 | session = sessionmaker(engine)
29 | sess=session()


--------------------------------------------------------------------------------
/豆瓣读书/分类实现版/requirements.txt:
--------------------------------------------------------------------------------
 1 | beautifulsoup4==4.9.3
 2 | certifi==2021.10.8
 3 | chardet==4.0.0
 4 | charset-normalizer==2.0.7
 5 | idna==2.10
 6 | lxml==4.6.2
 7 | requests==2.25.1
 8 | soupsieve==2.2.1
 9 | urllib3==1.26.7
10 | 


--------------------------------------------------------------------------------
/豆瓣读书/分类实现版/【bs4实现】豆瓣读书爬虫.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | import requests,json,csv,os
  3 | from uuid import uuid4
  4 | from bs4 import BeautifulSoup
  5 | from urllib import parse
  6 | 
  7 | '''主域名'''
  8 | DOMAIN_URL = 'https://book.douban.com'
  9 | 
 10 | '''
 11 |     协议头
 12 |     user-agent（必填）
 13 |     Referer（有就填，没有不填）
 14 |     Cookie（有账号登录就填，没有不填）
 15 | '''
 16 | HEADERS = {
 17 |     'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
 18 |     'Referer':'https://book.douban.com/',
 19 |     'Cookie':'填写你的Cookie'
 20 | }
 21 | 
 22 | '''结果去重集合'''
 23 | RESULT_SET_DATA = set()
 24 | 
 25 | '''
 26 |     获取book的tag链接
 27 |     params:
 28 |         parse_number: int --> 爬取几个tag链接，默认全部
 29 | 
 30 |     return: List[str] --> 确定爬取几个tag链接
 31 | '''
 32 | def get_book_tag_url(split_number:int=None) -> List[str]:
 33 | 
 34 |     html = requests.get(url=DOMAIN_URL,headers=HEADERS)
 35 |     soup = BeautifulSoup(html.text,'lxml')
 36 | 
 37 |     tag_url_list_data = [ 
 38 |             DOMAIN_URL+ parse.quote(tag_url['href'])
 39 |             for tag_url in soup.select('ul.hot-tags-col5.s ul a')
 40 |         ]
 41 | 
 42 |     if split_number:
 43 |         tag_url_list_data = tag_url_list_data[:split_number]
 44 | 
 45 |     return tag_url_list_data
 46 | 
 47 | 
 48 | '''
 49 |     解析tag_url，进行翻页后，获取book的内容
 50 |     params:
 51 |         tag_url_list_data: List[str] --> book的tag链接
 52 |         parse_number: int --> 翻页参数，默认爬取3页
 53 |         write_type: bool --> 是否写入json文件
 54 |     return：List[dict] --> 爬取成功book的内容
 55 | '''
 56 | def parse_book_url_info(
 57 |     tag_url_list_data:List[str],
 58 |     parse_number:int=3,
 59 |     write_json_type:bool=True,
 60 |     write_csv_type:bool=True,
 61 |     write_image_type:bool=True
 62 | ) -> List[dict]:
 63 | 
 64 |     book_info_list_data = []
 65 | 
 66 |     for tag_url in tag_url_list_data:
 67 | 
 68 |         # 开始翻页，每20算一页
 69 |         for parse in range(0,parse_number*20+1,20):
 70 |             
 71 |             # 翻页URL
 72 |             parse_url = f'{tag_url}?start={parse}'
 73 |             
 74 |             html = requests.get(url=parse_url,headers=HEADERS)
 75 |             soup = BeautifulSoup(html.text,'lxml')
 76 | 
 77 |             # 选择书本
 78 |             books = soup.select('li.subject-item')
 79 | 
 80 |             for book in books:
 81 | 
 82 |                 # 选择书本链接
 83 |                 book_url = book.select_one('.info h2 a')['href'] 
 84 | 
 85 |                 # 选择书名
 86 |                 title = book.select_one('.info h2 a').text.strip().replace(' ','').replace('\n','')
 87 | 
 88 |                 # 选择作者
 89 |                 info = book.select_one('.info div.pub').text.strip().replace(' ','').replace('\n','')
 90 | 
 91 |                 # 选择评分
 92 |                 star = book.select_one('.rating_nums').text.strip().replace(' ','').replace('\n','')
 93 | 
 94 |                 # 选择评价
 95 |                 pl = book.select_one('.pl').text.strip().replace(' ','').replace('\n','')
 96 | 
 97 |                 # 选择书本简介
 98 |                 introduce = book.select_one('.info p').text.strip().replace(' ','').replace('\n','')
 99 | 
100 |                 # 获取图片URL
101 |                 image_url = book.select_one('.nbg img')['src']
102 | 
103 |                 book_info_result = dict(
104 |                     书本链接=book_url,
105 |                     书名=title,
106 |                     作者=info,
107 |                     评分=star,
108 |                     评价=pl,
109 |                     书本简介=introduce,
110 |                     图片链接=image_url
111 |                 )
112 | 
113 |                 '''生成结果hash值'''
114 |                 result_hash_data = hash(json.dumps(book_info_result,ensure_ascii=False))
115 | 
116 |                 if result_hash_data not in RESULT_SET_DATA:
117 | 
118 |                     '''加入去重集合'''
119 |                     RESULT_SET_DATA.add(result_hash_data)
120 | 
121 |                     if write_image_type:
122 |                         write_image_book_info(
123 |                             image_url=image_url,
124 |                             image_name=title,
125 |                             headers=HEADERS
126 |                         )
127 | 
128 |                     # 检查是否写入json文件
129 |                     if write_json_type:
130 |                         write_json_book_info(book_info_result)
131 | 
132 |                     # 检查是否写入csv文件
133 |                     if write_csv_type:
134 |                         write_csv_book_info(
135 |                             headers=[key for key,value in book_info_result.items()],
136 |                             book_info=[value for key,value in book_info_result.items()]
137 |                         )
138 | 
139 |                     print(book_info_result)
140 | 
141 |                     book_info_list_data.append(book_info_result)
142 | 
143 |     return book_info_list_data
144 | 
145 | 
146 | 
147 | '''
148 |     保存图片，生成图片映射JSON文件
149 |     params:
150 |         image_url：str --> 图片链接
151 |         image_name：str --> 图片名字
152 |         headers: dict --> 协议头
153 | '''
154 | def write_image_book_info(image_url:str,image_name:str,headers:dict):
155 | 
156 |     '''确保图片文件名不重复'''
157 |     uuid_id = uuid4()
158 | 
159 |     filename = './保存图片/图片'
160 | 
161 |     image_file_name = f'{filename}/{uuid_id}.jpg'
162 | 
163 |     image_map_file_name = f'./保存图片/image_map_data.json'
164 | 
165 |     '''如果不存在文件夹则创建'''
166 |     if not os.path.exists(filename):
167 |         os.makedirs(filename)
168 | 
169 |     html = requests.get(url=image_url,headers=headers)
170 | 
171 |     '''写入图片'''
172 |     with open(image_file_name,'wb') as f:
173 | 
174 |         f.write(html.content)
175 |     
176 |     '''保存图片映射JSON文件'''
177 |     with open(image_map_file_name,'a+',encoding='utf-8') as f:
178 | 
179 |         f.write(json.dumps(dict(image_name=image_name,uuid=str(uuid_id),image_url=image_url),ensure_ascii=False)+'\n')
180 | 
181 | 
182 | 
183 | '''
184 |     将book的内容，写入json文件
185 |     params:
186 |         book_info: dict --> 爬取成功book的内容
187 | '''
188 | def write_json_book_info(book_info:dict):
189 | 
190 |     with open('book_info.json','a+',encoding='utf-8') as f:
191 | 
192 |         '''
193 |             json.dumps() 将dict对象转成str对象，json就是str对象
194 |             ensure_ascii=False 让json显示中文编码
195 |         '''
196 |         f.write(json.dumps(book_info,ensure_ascii=False)+'\n')
197 | 
198 | 
199 | 
200 | '''
201 |     将book的内容，写入csv文件（带表头）
202 |     params:
203 |         headers：list --> CSV表头
204 |         book_info: list --> 爬取成功book的内容
205 | '''
206 | def write_csv_book_info(headers:list,book_info:list):
207 | 
208 |     '''
209 |         跨平台问题：
210 |             写入csv 因为Windows有点BUG
211 |             writerows()写入会出现空行
212 |             所以加入newline=''
213 |             没有出现这种情况则不需要
214 |     '''
215 | 
216 |     '''
217 |         检查是否创建了CSV文件
218 |         没有则生成带有表头的CSV文件
219 |     '''
220 |     if not os.path.exists('book_info.csv'): 
221 | 
222 |         with open('book_info.csv','a+',encoding='utf-8',newline='') as f:
223 | 
224 |              f_csv = csv.writer(f)
225 |              f_csv.writerow(headers)
226 | 
227 | 
228 | 
229 |     '''
230 |         逐行开始写入CSV
231 |     '''
232 |     with open('book_info.csv','a+',encoding='utf-8',newline='') as f:
233 | 
234 |         f_csv = csv.writer(f)
235 |         f_csv.writerow(book_info) #逐行插入
236 | 
237 | if __name__ == '__main__':
238 | 
239 |     book_tag_url = get_book_tag_url(1)
240 |  
241 |     book_url_info = parse_book_url_info(book_tag_url)


--------------------------------------------------------------------------------
/豆瓣读书/分类实现版/【re实现】豆瓣读书爬虫.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | import requests,json,csv,os,re
  3 | from uuid import uuid4
  4 | from urllib import parse
  5 | 
  6 | '''主域名'''
  7 | DOMAIN_URL = 'https://book.douban.com'
  8 | 
  9 | '''
 10 |     协议头
 11 |     user-agent（必填）
 12 |     Referer（有就填，没有不填）
 13 |     Cookie（有账号登录就填，没有不填）
 14 | '''
 15 | HEADERS = {
 16 |     'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
 17 |     'Referer':'https://book.douban.com/',
 18 |     'Cookie':'填写你的Cookie'
 19 | }
 20 | 
 21 | '''结果去重集合'''
 22 | RESULT_SET_DATA = set()
 23 | 
 24 | 
 25 | class ReFind():
 26 | 
 27 |     def __init__(self,text):
 28 | 
 29 |         '''去除所有空格、换行'''
 30 |         self.text = re.sub('\s+','',text)
 31 | 
 32 |     
 33 |      
 34 |     '''
 35 |         【链式调用】传入指定正则表达式，获取第一个结果文本
 36 |         params:
 37 |             compile: str --> 指定正则表达式
 38 |             re_type：RegexFlag --> 匹配模式
 39 |         return: ReFind --> 实例化对象本身，方便进行链式调用
 40 |     '''
 41 |     def add_search(self,compile:str,re_type=re.I|re.S):
 42 | 
 43 |         self.text = re.compile(compile,re_type).search(self.text).group()
 44 |         
 45 |         return self
 46 |     
 47 |     '''
 48 |         传入指定正则表达式，返回所有查询结果
 49 |         params:
 50 |             compile: str --> 指定正则表达式
 51 |             re_type：RegexFlag --> 匹配模式
 52 |         return: List[str] --> 正则匹配成功的结果
 53 |     '''
 54 |     def find_all(self,compile:str,re_type=re.I|re.S) -> List[str]:
 55 | 
 56 |         return re.compile(compile,re_type).findall(self.text)
 57 | 
 58 | 
 59 | 
 60 |     '''
 61 |         打印当前文本
 62 |         return: str --> 当前对象的文本
 63 |     '''
 64 |     def print(self) -> str:
 65 |         print(self.text)
 66 | 
 67 | 
 68 | 
 69 | '''
 70 |     获取book的tag链接
 71 |     params:
 72 |         parse_number: int --> 爬取几个tag链接，默认全部
 73 | 
 74 |     return: List[str] --> 确定爬取几个tag链接
 75 | '''
 76 | def get_book_tag_url(split_number:int=None) -> List[str]:
 77 | 
 78 |     html = requests.get(url=DOMAIN_URL,headers=HEADERS)
 79 |  
 80 |     tag_url_list_data = [ 
 81 |             DOMAIN_URL+ parse.quote(tag_url)
 82 |             for tag_url in (
 83 |                     ReFind(html.text)
 84 |                     .add_search(r'<ulclass="hot-tags-col5s".*?<!--doubanadbegin-->')
 85 |                     .find_all(r'<li><ahref="(/tag/.*?)"class="tag">.*?</a></li>')
 86 |                 )
 87 |         ]
 88 | 
 89 |     if split_number:
 90 |         tag_url_list_data = tag_url_list_data[:split_number]
 91 | 
 92 |     return tag_url_list_data
 93 | 
 94 | 
 95 | '''
 96 |     解析tag_url，进行翻页后，获取book的内容
 97 |     params:
 98 |         tag_url_list_data: List[str] --> book的tag链接
 99 |         parse_number: int --> 翻页参数，默认爬取3页
100 |         write_type: bool --> 是否写入json文件
101 |     return：List[dict] --> 爬取成功book的内容
102 | '''
103 | def parse_book_url_info(
104 |     tag_url_list_data:List[str],
105 |     parse_number:int=3,
106 |     write_json_type:bool=True,
107 |     write_csv_type:bool=True,
108 |     write_image_type:bool=True
109 | ) -> List[dict]:
110 | 
111 |     book_info_list_data = []
112 | 
113 |     for tag_url in tag_url_list_data:
114 | 
115 |         # 开始翻页，每20算一页
116 |         for parse in range(0,parse_number*20+1,20):
117 |             
118 |             # 翻页URL
119 |             parse_url = f'{tag_url}?start={parse}'
120 |             
121 |             html = requests.get(url=parse_url,headers=HEADERS)
122 | 
123 |             # 选择书本
124 |             books = (
125 |                     ReFind(html.text)
126 |                     .find_all(r'<liclass="subject-item".*?</li>')
127 |                 )
128 | 
129 |             for book in books:
130 | 
131 |                 # 选择书本链接
132 |                 book_url = (
133 |                     ReFind(book)
134 |                     .find_all(r'<h2class=""><ahref="(.*?)".*?</a></h2>')
135 |                 )[0]
136 | 
137 |                 # 选择书名
138 |                 title = (
139 |                     ReFind(book)
140 |                     .find_all(r'<h2class=""><ahref=.*?>(.*?)</a></h2>')
141 |                 )[0].strip().replace(' ','').replace('\n','')
142 | 
143 |                 # 选择作者
144 |                 info = (
145 |                     ReFind(book)
146 |                     .find_all(r'<divclass="pub">(.*?)</div>')
147 |                 )[0].strip().replace(' ','').replace('\n','')
148 | 
149 |                 # 选择评分
150 |                 star = (
151 |                     ReFind(book)
152 |                     .find_all(r'<spanclass="rating_nums">(.*?)</span>')
153 |                 )[0].strip().replace(' ','').replace('\n','')
154 | 
155 |                 # 选择评价
156 |                 pl = (
157 |                     ReFind(book)
158 |                     .find_all(r'<spanclass="pl">(.*?)</span>')
159 |                 )[0].strip().replace(' ','').replace('\n','')
160 | 
161 |                 # 选择书本简介
162 |                 introduce = (
163 |                     ReFind(book)
164 |                     .find_all(r'<p>(.*?)</p>')
165 |                 )[0].strip().replace(' ','').replace('\n','')
166 | 
167 | 
168 |                 # 获取图片URL
169 |                 image_url =(
170 |                     ReFind(book)
171 |                     .find_all(r'<img.*?src="(.*?)".*?>')
172 |                 )[0]
173 | 
174 |                 book_info_result = dict(
175 |                     书本链接=book_url,
176 |                     书名=title,
177 |                     作者=info,
178 |                     评分=star,
179 |                     评价=pl,
180 |                     书本简介=introduce,
181 |                     图片链接=image_url
182 |                 )
183 | 
184 |                 '''生成结果hash值'''
185 |                 result_hash_data = hash(json.dumps(book_info_result,ensure_ascii=False))
186 | 
187 |                 if result_hash_data not in RESULT_SET_DATA:
188 | 
189 |                     '''加入去重集合'''
190 |                     RESULT_SET_DATA.add(result_hash_data)
191 | 
192 |                     if write_image_type:
193 |                         write_image_book_info(
194 |                             image_url=image_url,
195 |                             image_name=title,
196 |                             headers=HEADERS
197 |                         )
198 | 
199 |                     # 检查是否写入json文件
200 |                     if write_json_type:
201 |                         write_json_book_info(book_info_result)
202 | 
203 |                     # 检查是否写入csv文件
204 |                     if write_csv_type:
205 |                         write_csv_book_info(
206 |                             headers=[key for key,value in book_info_result.items()],
207 |                             book_info=[value for key,value in book_info_result.items()]
208 |                         )
209 | 
210 |                     print(book_info_result)
211 | 
212 |                     book_info_list_data.append(book_info_result)
213 | 
214 |     return book_info_list_data
215 | 
216 | 
217 | '''
218 |     保存图片，生成图片映射JSON文件
219 |     params:
220 |         image_url：str --> 图片链接
221 |         image_name：str --> 图片名字
222 |         headers: dict --> 协议头
223 | '''
224 | def write_image_book_info(image_url:str,image_name:str,headers:dict):
225 | 
226 |     '''确保图片文件名不重复'''
227 |     uuid_id = uuid4()
228 | 
229 |     filename = './保存图片/图片'
230 | 
231 |     image_file_name = f'{filename}/{uuid_id}.jpg'
232 | 
233 |     image_map_file_name = f'./保存图片/image_map_data.json'
234 | 
235 |     '''如果不存在文件夹则创建'''
236 |     if not os.path.exists(filename):
237 |         os.makedirs(filename)
238 | 
239 |     html = requests.get(url=image_url,headers=headers)
240 | 
241 |     '''写入图片'''
242 |     with open(image_file_name,'wb') as f:
243 | 
244 |         f.write(html.content)
245 |     
246 |     '''保存图片映射JSON文件'''
247 |     with open(image_map_file_name,'a+',encoding='utf-8') as f:
248 | 
249 |         f.write(json.dumps(dict(image_name=image_name,uuid=str(uuid_id),image_url=image_url),ensure_ascii=False)+'\n')
250 | 
251 | 
252 | 
253 | '''
254 |     将book的内容，写入json文件
255 |     params:
256 |         book_info: dict --> 爬取成功book的内容
257 | '''
258 | def write_json_book_info(book_info:dict):
259 | 
260 |     with open('book_info.json','a+',encoding='utf-8') as f:
261 | 
262 |         '''
263 |             json.dumps() 将dict对象转成str对象，json就是str对象
264 |             ensure_ascii=False 让json显示中文编码
265 |         '''
266 |         f.write(json.dumps(book_info,ensure_ascii=False)+'\n')
267 | 
268 | 
269 | 
270 | '''
271 |     将book的内容，写入csv文件（带表头）
272 |     params:
273 |         headers：list --> CSV表头
274 |         book_info: list --> 爬取成功book的内容
275 | '''
276 | def write_csv_book_info(headers:list,book_info:list):
277 | 
278 |     '''
279 |         跨平台问题：
280 |             写入csv 因为Windows有点BUG
281 |             writerows()写入会出现空行
282 |             所以加入newline=''
283 |             没有出现这种情况则不需要
284 |     '''
285 | 
286 |     '''
287 |         检查是否创建了CSV文件
288 |         没有则生成带有表头的CSV文件
289 |     '''
290 |     if not os.path.exists('book_info.csv'): 
291 | 
292 |         with open('book_info.csv','a+',encoding='utf-8',newline='') as f:
293 | 
294 |              f_csv = csv.writer(f)
295 |              f_csv.writerow(headers)
296 | 
297 | 
298 | 
299 |     '''
300 |         逐行开始写入CSV
301 |     '''
302 |     with open('book_info.csv','a+',encoding='utf-8',newline='') as f:
303 | 
304 |         f_csv = csv.writer(f)
305 |         f_csv.writerow(book_info) #逐行插入
306 | 
307 | if __name__ == '__main__':
308 | 
309 |     book_tag_url = get_book_tag_url(1)
310 |  
311 |     book_url_info = parse_book_url_info(book_tag_url)


--------------------------------------------------------------------------------
/豆瓣读书/分类实现版/【xpath实现】豆瓣读书爬虫.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | import requests,json,csv,os
  3 | from uuid import uuid4
  4 | from lxml import etree
  5 | from urllib import parse
  6 | 
  7 | '''主域名'''
  8 | DOMAIN_URL = 'https://book.douban.com'
  9 | 
 10 | '''
 11 |     协议头
 12 |     user-agent（必填）
 13 |     Referer（有就填，没有不填）
 14 |     Cookie（有账号登录就填，没有不填）
 15 | '''
 16 | HEADERS = {
 17 |     'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
 18 |     'Referer':'https://book.douban.com/',
 19 |     'Cookie':'填写你的Cookie'
 20 | }
 21 | 
 22 | '''结果去重集合'''
 23 | RESULT_SET_DATA = set()
 24 | 
 25 | '''
 26 |     获取book的tag链接
 27 |     params:
 28 |         parse_number: int --> 爬取几个tag链接，默认全部
 29 | 
 30 |     return: List[str] --> 确定爬取几个tag链接
 31 | '''
 32 | def get_book_tag_url(split_number:int=None) -> List[str]:
 33 | 
 34 |     html = requests.get(url=DOMAIN_URL,headers=HEADERS)
 35 |     soup = etree.HTML(html.text)
 36 | 
 37 |     tag_url_list_data = [ 
 38 |             DOMAIN_URL+ parse.quote(tag_url)
 39 |             for tag_url in soup.xpath('//ul[@class="hot-tags-col5 s"]//ul//a/@href')
 40 |         ]
 41 | 
 42 |     if split_number:
 43 |         tag_url_list_data = tag_url_list_data[:split_number]
 44 | 
 45 |     return tag_url_list_data
 46 | 
 47 | 
 48 | '''
 49 |     解析tag_url，进行翻页后，获取book的内容
 50 |     params:
 51 |         tag_url_list_data: List[str] --> book的tag链接
 52 |         parse_number: int --> 翻页参数，默认爬取3页
 53 |         write_type: bool --> 是否写入json文件
 54 |     return：List[dict] --> 爬取成功book的内容
 55 | '''
 56 | def parse_book_url_info(
 57 |     tag_url_list_data:List[str],
 58 |     parse_number:int=3,
 59 |     write_json_type:bool=True,
 60 |     write_csv_type:bool=True,
 61 |     write_image_type:bool=True
 62 | ) -> List[dict]:
 63 | 
 64 |     book_info_list_data = []
 65 | 
 66 |     for tag_url in tag_url_list_data:
 67 | 
 68 |         # 开始翻页，每20算一页
 69 |         for parse in range(0,parse_number*20+1,20):
 70 |             
 71 |             # 翻页URL
 72 |             parse_url = f'{tag_url}?start={parse}'
 73 |             
 74 |             html = requests.get(url=parse_url,headers=HEADERS)
 75 |             soup = etree.HTML(html.text)
 76 | 
 77 |             # 选择书本
 78 |             books = soup.xpath('//li[@class="subject-item"]')
 79 | 
 80 |             for book in books:
 81 | 
 82 |                 # 选择书本链接
 83 |                 book_url = book.xpath('.//h2/a/@href')[0]
 84 | 
 85 |                 # 选择书名
 86 |                 title = book.xpath('.//h2/a/text()')[0].strip().replace(' ','').replace('\n','')
 87 | 
 88 |                 # 选择作者
 89 |                 info = book.xpath('.//div[@class="pub"]/text()')[0].strip().replace(' ','').replace('\n','')
 90 | 
 91 |                 # 选择评分
 92 |                 star = book.xpath('.//span[@class="rating_nums"]/text()')[0].strip().replace(' ','').replace('\n','')
 93 | 
 94 |                 # 选择评价
 95 |                 pl = book.xpath('.//span[@class="pl"]/text()')[0].strip().replace(' ','').replace('\n','')
 96 | 
 97 |                 # 选择书本简介
 98 |                 introduce = book.xpath('.//p/text()')[0].strip().replace(' ','').replace('\n','')
 99 | 
100 |                 # 获取图片URL
101 |                 image_url = book.xpath('.//img/@src')[0]
102 | 
103 |                 book_info_result = dict(
104 |                     书本链接=book_url,
105 |                     书名=title,
106 |                     作者=info,
107 |                     评分=star,
108 |                     评价=pl,
109 |                     书本简介=introduce,
110 |                     图片链接=image_url
111 |                 )
112 | 
113 |                 '''生成结果hash值'''
114 |                 result_hash_data = hash(json.dumps(book_info_result,ensure_ascii=False))
115 | 
116 |                 if result_hash_data not in RESULT_SET_DATA:
117 | 
118 |                     '''加入去重集合'''
119 |                     RESULT_SET_DATA.add(result_hash_data)
120 | 
121 |                     if write_image_type:
122 |                         write_image_book_info(
123 |                             image_url=image_url,
124 |                             image_name=title,
125 |                             headers=HEADERS
126 |                         )
127 | 
128 |                     # 检查是否写入json文件
129 |                     if write_json_type:
130 |                         write_json_book_info(book_info_result)
131 | 
132 |                     # 检查是否写入csv文件
133 |                     if write_csv_type:
134 |                         write_csv_book_info(
135 |                             headers=[key for key,value in book_info_result.items()],
136 |                             book_info=[value for key,value in book_info_result.items()]
137 |                         )
138 | 
139 |                     print(book_info_result)
140 | 
141 |                     book_info_list_data.append(book_info_result)
142 | 
143 |     return book_info_list_data
144 | 
145 | 
146 | '''
147 |     保存图片，生成图片映射JSON文件
148 |     params:
149 |         image_url：str --> 图片链接
150 |         image_name：str --> 图片名字
151 |         headers: dict --> 协议头
152 | '''
153 | def write_image_book_info(image_url:str,image_name:str,headers:dict):
154 | 
155 |     '''确保图片文件名不重复'''
156 |     uuid_id = uuid4()
157 | 
158 |     filename = './保存图片/图片'
159 | 
160 |     image_file_name = f'{filename}/{uuid_id}.jpg'
161 | 
162 |     image_map_file_name = f'./保存图片/image_map_data.json'
163 | 
164 |     '''如果不存在文件夹则创建'''
165 |     if not os.path.exists(filename):
166 |         os.makedirs(filename)
167 | 
168 |     html = requests.get(url=image_url,headers=headers)
169 | 
170 |     '''写入图片'''
171 |     with open(image_file_name,'wb') as f:
172 | 
173 |         f.write(html.content)
174 |     
175 |     '''保存图片映射JSON文件'''
176 |     with open(image_map_file_name,'a+',encoding='utf-8') as f:
177 | 
178 |         f.write(json.dumps(dict(image_name=image_name,uuid=str(uuid_id),image_url=image_url),ensure_ascii=False)+'\n')
179 | 
180 | 
181 | 
182 | '''
183 |     将book的内容，写入json文件
184 |     params:
185 |         book_info: dict --> 爬取成功book的内容
186 | '''
187 | def write_json_book_info(book_info:dict):
188 | 
189 |     with open('book_info.json','a+',encoding='utf-8') as f:
190 | 
191 |         '''
192 |             json.dumps() 将dict对象转成str对象，json就是str对象
193 |             ensure_ascii=False 让json显示中文编码
194 |         '''
195 |         f.write(json.dumps(book_info,ensure_ascii=False)+'\n')
196 | 
197 | 
198 | 
199 | '''
200 |     将book的内容，写入csv文件（带表头）
201 |     params:
202 |         headers：list --> CSV表头
203 |         book_info: list --> 爬取成功book的内容
204 | '''
205 | def write_csv_book_info(headers:list,book_info:list):
206 | 
207 |     '''
208 |         跨平台问题：
209 |             写入csv 因为Windows有点BUG
210 |             writerows()写入会出现空行
211 |             所以加入newline=''
212 |             没有出现这种情况则不需要
213 |     '''
214 | 
215 |     '''
216 |         检查是否创建了CSV文件
217 |         没有则生成带有表头的CSV文件
218 |     '''
219 |     if not os.path.exists('book_info.csv'): 
220 | 
221 |         with open('book_info.csv','a+',encoding='utf-8',newline='') as f:
222 | 
223 |              f_csv = csv.writer(f)
224 |              f_csv.writerow(headers)
225 | 
226 | 
227 | 
228 |     '''
229 |         逐行开始写入CSV
230 |     '''
231 |     with open('book_info.csv','a+',encoding='utf-8',newline='') as f:
232 | 
233 |         f_csv = csv.writer(f)
234 |         f_csv.writerow(book_info) #逐行插入
235 | 
236 | if __name__ == '__main__':
237 | 
238 |     book_tag_url = get_book_tag_url(1)
239 | 
240 |     book_url_info = parse_book_url_info(book_tag_url)


--------------------------------------------------------------------------------