├── .gitattributes ├── .gitignore ├── B站模拟扫码登录 └── demo.py ├── README.md ├── requirements.txt ├── 下载小鹅通视频 ├── 2021年12月 │ ├── 1.前置知识 │ │ ├── 1.AES-CBC解密 │ │ │ ├── CBC解密.py │ │ │ ├── after.ts │ │ │ └── before.ts │ │ ├── 2.m3u8解析 │ │ │ ├── demo.m3u8 │ │ │ └── parse.py │ │ ├── 3.HTML注入JS │ │ │ ├── after.html │ │ │ ├── before.html │ │ │ └── demo.py │ │ └── 4.ffmpeg合成ts视频 │ │ │ ├── demo.py │ │ │ ├── ffmpeg.exe │ │ │ ├── file.txt │ │ │ ├── out.mp4 │ │ │ └── 素材 │ │ │ ├── 1.ts │ │ │ ├── 10.ts │ │ │ ├── 11.ts │ │ │ ├── 12.ts │ │ │ ├── 13.ts │ │ │ ├── 14.ts │ │ │ ├── 15.ts │ │ │ ├── 2.ts │ │ │ ├── 3.ts │ │ │ ├── 4.ts │ │ │ ├── 5.ts │ │ │ ├── 6.ts │ │ │ ├── 7.ts │ │ │ ├── 8.ts │ │ │ └── 9.ts │ ├── 2.自动合并版本 │ │ ├── ffmpeg.exe │ │ ├── request_demo.py │ │ ├── requirements.txt │ │ ├── selenium启动 │ │ │ ├── chromedriver.exe │ │ │ ├── selenium_start.py │ │ │ └── 谷歌驱动下载地址.txt │ │ ├── 启动程序指令.txt │ │ └── 安装环境指令.txt │ └── 3.手动合并版本 │ │ ├── ffmpeg.exe │ │ ├── request_demo.py │ │ ├── requirements.txt │ │ ├── selenium启动 │ │ ├── chromedriver.exe │ │ ├── selenium_start.py │ │ └── 谷歌驱动下载地址.txt │ │ ├── 启动程序指令.txt │ │ └── 安装环境指令.txt └── 2022年12月 │ └── 1.自动合并版本 │ ├── N_m3u8DL-CLI_v3.0.2.exe │ ├── ffmpeg.exe │ ├── request_demo.py │ ├── requirements.txt │ ├── selenium启动 │ ├── chromedriver.exe │ ├── selenium_start.py │ └── 谷歌驱动下载地址.txt │ ├── 启动程序指令.txt │ └── 安装环境指令.txt ├── 下载荔枝微课 ├── ffmpeg.exe ├── request_demo.py ├── requirements.txt ├── selenium启动 │ ├── chromedriver.exe │ ├── selenium_start.py │ └── 谷歌驱动下载地址.txt ├── 启动程序指令.txt └── 安装环境指令.txt ├── 京东商品信息 └── crawl.py ├── 房天下 ├── crawl.py └── db.py ├── 新版QQ音乐 ├── README.md ├── crawl.py ├── db.py ├── demo.py └── get_sign.js ├── 旧版QQ音乐(仍可用) ├── README.md ├── crawl.py ├── db.py └── demo.py ├── 有道翻译 └── crawl.py ├── 构建代理池 ├── crawl.py └── ip_pool.json ├── 百度图片 └── crawl.py ├── 破解有道翻译 └── crawl.py ├── 破解网易登录 ├── crawl.py ├── pw.js └── rtid.js └── 豆瓣读书 ├── 入库版 ├── book.py └── boook_db.py └── 分类实现版 ├── requirements.txt ├── 【bs4实现】豆瓣读书爬虫.py ├── 【re实现】豆瓣读书爬虫.py └── 【xpath实现】豆瓣读书爬虫.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.js linguist-language=python -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | -------------------------------------------------------------------------------- /B站模拟扫码登录/demo.py: -------------------------------------------------------------------------------- 1 | # Python3.7 2 | # encoding=utf-8 3 | 4 | import requests,time,json,os 5 | import qrcode # 生成二维码 6 | import cv2 as cv # 读取二维码图片 7 | from concurrent.futures import ThreadPoolExecutor 8 | 9 | ''' 10 | 需要安装第三方库: 11 | pip install qrcode==7.3 12 | pip install opencv-python==4.5.3.56 13 | ''' 14 | 15 | headers = { 16 | 'referer':'https://passport.bilibili.com/login', 17 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36x-requested-with: XMLHttpRequest' 18 | } 19 | 20 | class Login(): 21 | 22 | def __init__(self): 23 | self.oauthKey = '' 24 | self.qrcodeURL = '' 25 | self.session = requests.Session() 26 | self.session.headers = headers 27 | 28 | # 获取二维码图片地址 29 | def getQRcode(self): 30 | 31 | html = self.session.get('https://passport.bilibili.com/qrcode/getLoginUrl') 32 | if html.json()['status'] == True: 33 | self.oauthKey = html.json()['data']['oauthKey'] 34 | self.qrcodeURL = html.json()['data']['url'] 35 | return True 36 | return 
False 37 | 38 | # 利用 opencv 读取图片 39 | @staticmethod 40 | def showQRCode(url): 41 | qrCode = qrcode.QRCode() 42 | qrCode.add_data(url) 43 | qrCode = qrCode.make_image() 44 | qrCode.save('qrCode.png') 45 | img = cv.imread('qrCode.png',1) 46 | cv.imshow('Login',img) 47 | cv.waitKey() 48 | 49 | # 开始登录 50 | def login(self): 51 | 52 | # 创建另一个线程,展示二维码图片 53 | thread_pool = ThreadPoolExecutor(max_workers=2) 54 | if self.getQRcode(): 55 | thread_pool.submit(self.showQRCode,self.qrcodeURL) 56 | 57 | # 不断检查二维码是否确认登录 58 | while True: 59 | time.sleep(1) 60 | data = { 61 | 'oauthKey':self.oauthKey, 62 | 'gourl':'https://www.bilibili.com/' 63 | } 64 | 65 | html = self.session.post('https://passport.bilibili.com/qrcode/getLoginInfo',headers=headers,data=data) 66 | 67 | if html.json()['data'] == -4: # 还没扫码 68 | pass 69 | elif html.json()['data'] == -2: # 二维码过期,需要重新生成 70 | self.getQRcode() 71 | thread_pool.submit(self.showQRCode,self.qrcodeURL) 72 | elif html.json()['data'] == -5: # 已经扫码,等待确认 73 | pass 74 | else: 75 | break 76 | 77 | # 解析 cookie 78 | cookieRaw = html.json()['data']['url'].split('?')[1].split('&') 79 | cookies = {} 80 | for cookie in cookieRaw: 81 | key,value = cookie.split('=') 82 | if key != 'gourl' and key != 'Expires': 83 | cookies[key] = value 84 | print(json.dumps(cookies)) 85 | os._exit(0) 86 | 87 | if __name__ == '__main__': 88 | login = Login() 89 | login.login() 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # **Python3Webcrawler** 2 | ## **[哔哩哔哩作者:-相依-](https://space.bilibili.com/343154012)**  **UPDATE 2023-02-15** 3 | 4 | --- 5 | 6 | > **精心挑选了几个爬虫,给大家在学Scrapy框架之前打基础。** 7 | >> **该项目仅限学习交流,请勿用于商业用途,如有侵权,请联系删除。** 8 | 9 | --- 10 | 11 | |**运行环境**|**项目使用版本**| 12 | |:----:|:--------:| 13 | |**python**|**3.7.9**| 14 | |**NodeJS**|**14.6.0**| 15 | 16 | --- 17 | 18 | |**程序依赖**|**安装指令**|**项目使用版本**| 19 | |:----:|:--------:|:--------:| 20 | |**lxml**|**pip install lxml**|**4.6.2**| 21 | |**aiohttp**|**pip install aiohttp**|**3.7.4**| 22 | |**requests**|**pip install requests**|**2.25.1**| 23 | |**PyExecJS**|**pip install PyExecJS**|**1.5.1**| 24 | |**sqlalchemy**|**pip install sqlalchemy**|**1.3.23**| 25 | |**beautifulsoup4**|**pip install beautifulsoup4**|**4.9.3**| 26 | |**mysqlconnector**|**pip install mysql-connector-python**|**8.0.23**| 27 | |**qrcode**|**pip install qrcode**|**7.3**| 28 | |**opencv-python**|**pip install opencv-python**|**4.5.3.56**| 29 | |**m3u8**|**pip install m3u8**|**0.9.0**| 30 | |**mitmproxy**|**pip install mitmproxy**|**5.3.0**| 31 | |**selenium**|**pip install selenium**|**3.141.0**| 32 | |**pycryptodome**|**pip install pycryptodome**|**3.10.1**| 33 | --- 34 | 35 | * ### **京东   [官网地址](https://item.jd.com)** 36 | * ### **网易   [官网地址](https://www.163.com/)** 37 | * ### **房天下  [官网地址](https://www.fang.com)** 38 | * ### **快代理  [官网地址](https://www.kuaidaili.com)** 39 | * ### **QQ音乐   [官网地址](https://y.qq.com)** 40 | * ### **百度图片 [官网地址](https://image.baidu.com)** 41 | * ### **豆瓣读书 [官网地址](https://book.douban.com)** 42 | * ### **有道翻译 [官网地址](http://fanyi.youdao.com)** 43 | * ### **哔哩哔哩 [官网地址](https://bilibili.com)** 44 | * ### **小鹅通  [官网地址](https://www.xiaoe-tech.com)** 45 | * ### **荔枝微课 [官网地址](https://m.lizhiweike.com)** -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.7.4 2 | async-timeout==3.0.1 
3 | attrs==20.3.0 4 | beautifulsoup4==4.9.3 5 | certifi==2020.12.5 6 | chardet==3.0.4 7 | idna==2.10 8 | lxml==4.6.2 9 | multidict==5.1.0 10 | mysql-connector-python==8.0.23 11 | protobuf==3.13.0 12 | PyExecJS==1.5.1 13 | requests==2.25.1 14 | six==1.15.0 15 | soupsieve==2.2 16 | SQLAlchemy==1.3.23 17 | typing-extensions==3.7.4.3 18 | urllib3==1.26.3 19 | yarl==1.6.3 20 | qrcode==7.3 21 | opencv-python==4.5.3.56 22 | selenium==3.141.0 23 | m3u8==0.9.0 24 | mitmproxy==5.3.0 25 | pycryptodome==3.10.1 -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/CBC解密.py: -------------------------------------------------------------------------------- 1 | from Crypto.Cipher import AES 2 | 3 | # 设置模式 4 | mode = AES.MODE_CBC 5 | 6 | key = b'V\x9dH\x1e:\xe6g\x10\x11l\xd7\xab\xd5\xd3\xc1\xbc' 7 | 8 | ''' 9 | 生成解密对象 10 | key:密钥 11 | mode:解密模式 12 | iv:偏移量 13 | ''' 14 | cryptos = AES.new(key=key,mode=mode,iv=b'0000000000000000') 15 | 16 | with open('before.ts','rb') as f: # 解密前 17 | with open('after.ts','wb') as f2: # 解密后 18 | f2.write(cryptos.decrypt(f.read())) 19 | -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/after.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/after.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/before.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/1.AES-CBC解密/before.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/2.m3u8解析/demo.m3u8: -------------------------------------------------------------------------------- 1 | #EXTM3U 2 | #EXT-X-VERSION:3 3 | #EXT-X-TARGETDURATION:11 4 | #EXT-X-MEDIA-SEQUENCE:0 5 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 6 | #EXTINF:2.000000, 7 | v.f230.ts?start=0&end=68063&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 8 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 9 | #EXTINF:2.000000, 10 | v.f230.ts?start=68064&end=130671&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 11 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 12 | #EXTINF:1.708333, 13 | v.f230.ts?start=130672&end=190847&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 14 | 
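CBC解密.py above hardcodes its 16-byte key, while a real playlist such as this one carries the key location in each #EXT-X-KEY line (URI= points at the key bytes, IV= gives the initialization vector). A minimal sketch of that flow, assuming the key endpoint accepts the request — in practice it requires valid course cookies, and segment.ts plus the URI value are placeholders copied from the playlist:

import requests
from Crypto.Cipher import AES

# Copy the URI=... value from an #EXT-X-KEY line; elided here because it carries per-user tokens
key_uri = 'https://app.xiaoe-tech.com/get_video_key.php?...'
key = requests.get(key_uri).content            # 16 raw bytes for AES-128
iv = bytes.fromhex('00' * 16)                  # IV=0x00000000000000000000000000000000 in this playlist

cipher = AES.new(key=key, mode=AES.MODE_CBC, iv=iv)
with open('segment.ts', 'rb') as f_in, open('segment_dec.ts', 'wb') as f_out:
    f_out.write(cipher.decrypt(f_in.read()))   # same whole-file decrypt as CBC解密.py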
#EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 15 | #EXTINF:2.000000, 16 | v.f230.ts?start=190848&end=281471&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 17 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 18 | #EXTINF:2.000000, 19 | v.f230.ts?start=281472&end=369471&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 20 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 21 | #EXTINF:2.000000, 22 | v.f230.ts?start=369472&end=457647&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 23 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 24 | #EXTINF:2.000000, 25 | v.f230.ts?start=457648&end=742095&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 26 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 27 | #EXTINF:4.291667, 28 | v.f230.ts?start=742096&end=1186719&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 29 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 30 | #EXTINF:4.000000, 31 | v.f230.ts?start=1186720&end=1413087&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 32 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 33 | #EXTINF:4.000000, 34 | v.f230.ts?start=1413088&end=1776687&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 35 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 36 
| #EXTINF:5.791667, 37 | v.f230.ts?start=1776688&end=2031631&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 38 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 39 | #EXTINF:6.000000, 40 | v.f230.ts?start=2031632&end=2294271&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 41 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 42 | #EXTINF:6.000000, 43 | v.f230.ts?start=2294272&end=2535679&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 44 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 45 | #EXTINF:10.208333, 46 | v.f230.ts?start=2535680&end=3179583&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 47 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 48 | #EXTINF:9.666667, 49 | v.f230.ts?start=3179584&end=3695279&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 50 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 51 | #EXTINF:10.000000, 52 | v.f230.ts?start=3695280&end=3994207&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 53 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 54 | #EXTINF:10.750000, 55 | v.f230.ts?start=3994208&end=4735695&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 56 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 57 | #EXTINF:9.333333, 58 | v.f230.ts?start=4735696&end=5240671&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 59 | 
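The parse.py script after this playlist demonstrates the same extraction with commented-out loops; here is a compact sketch of turning the playlist into a download list. The base URL is an assumption, since the playlist only stores relative v.f230.ts URIs and the real host comes from wherever the .m3u8 itself was fetched:

import m3u8

BASE = 'https://example.invalid/'   # hypothetical prefix for the relative segment URIs

with open('demo.m3u8', 'r', encoding='utf-8') as f:
    dict_data = m3u8.parse(f.read())

key_uri = dict_data['keys'][0]['uri']   # one get_video_key.php URL, repeated for every segment here
urls = [BASE + seg['uri'] for seg in dict_data['segments']]
print(key_uri)
print(len(urls), 'segments, first:', urls[0])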
#EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 60 | #EXTINF:9.583333, 61 | v.f230.ts?start=5240672&end=5551439&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 62 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 63 | #EXTINF:10.416667, 64 | v.f230.ts?start=5551440&end=5820671&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 65 | #EXT-X-KEY:METHOD=AES-128,URI="https://app.xiaoe-tech.com/get_video_key.php?edk=CiCHkZwwhEr4uPPu%2FDpFfxJWLmVpVMapttTR7qFVt3CbcxCO08TAChiaoOvUBCokYjRhNjFiNTgtMmVhNy00OWYxLTgwZGMtZTE0NTIyODc5YWIy&fileId=5285890798112366481&keySource=VodBuildInKMS",IV=0x00000000000000000000000000000000 66 | #EXTINF:6.416667, 67 | v.f230.ts?start=5820672&end=5890239&type=mpegts&exper=0&sign=4ed80cd04c3f8f778d5dee9a52592408&t=60fbf938&us=RF69wdt1My3L&whref=xueyuan.xiaoe-tech.com 68 | #EXT-X-ENDLIST 69 | -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/2.m3u8解析/parse.py: -------------------------------------------------------------------------------- 1 | import m3u8 2 | 3 | ''' 4 | m3u8 官方文档:https://github.com/globocom/m3u8 5 | ''' 6 | with open(r'demo.m3u8','r',encoding='utf-8') as f: 7 | 8 | # 解析 m3u8 9 | dict_data = m3u8.parse(f.read()) 10 | print(dict_data) 11 | 12 | # 获取键值 13 | # print(dict_data.keys()) 14 | 15 | # 获取 m3u8 分片地址 16 | # for data in dict_data['segments']: 17 | # print(data['uri']) 18 | # start = data['uri'].split('?')[1].split('&')[0] 19 | # end = data['uri'].split('?')[1].split('&')[1] 20 | # print(start + end) 21 | 22 | 23 | # 获取 m3u8 加密地址 24 | # for data in dict_data['keys']: 25 | # print(data['uri']) 26 | 27 | 28 | -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/3.HTML注入JS/after.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 模拟多个script节点 14 | 15 | 17 | 19 | 20 | 21 | 23 | 24 | 26 | 28 | 31 | 32 | -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/3.HTML注入JS/before.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 模拟多个script节点 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/3.HTML注入JS/demo.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | ''' 4 | BeautifulSoup修改文档树-官方文档:https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id45 5 | ''' 6 | 7 | with open('before.html','r',encoding='utf-8') as f: 8 | 9 | soup = BeautifulSoup(f.read(), 'lxml') 10 | 11 | 12 | # 创建 HTML 的 script 节点 13 | script_tag = soup.new_tag('script', type='text/javascript') 14 | script_tag.string = "alert('靓仔')" 15 | # print(script_tag) 16 | 17 | # 获取最后一个 script 
节点,向后插入 18 | print('[插入前] 最后一个节点:{}'.format(soup.select('script')[-1])) 19 | soup.select('script')[-1].insert_after(script_tag) 20 | print('[插入后] 最后一个节点:{}'.format(soup.select('script')[-1])) 21 | 22 | with open('after.html','w',encoding='utf-8') as f: 23 | f.write(soup.prettify()) # 格式化写入 24 | 25 | # print(soup) -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ''' 4 | 路径常识: 5 | \ 不能在Linux 6 | / 能够跨平台 7 | 推荐使用 / 路径 8 | ''' 9 | 10 | print('\\') 11 | print('/') 12 | 13 | # 遍历文件夹 14 | for dirpath,dirnames,files in os.walk('./素材'): 15 | 16 | # 获取有多少个文件 17 | print(files) 18 | 19 | # 将文件名排序好 20 | # list_data = [ int(data.replace('.ts','')) for data in files] 21 | # list_data.sort() 22 | # print(list_data) 23 | 24 | # 开始写入文件 25 | # for index in list_data: 26 | 27 | # # 写入 28 | # with open('file.txt','a+',encoding='utf-8') as f1: 29 | 30 | # # 读取 31 | # with open('file.txt','r',encoding='utf-8') as f2: 32 | 33 | # # 获取当前绝对路径 34 | # current_filename = os.getcwd().replace('\\','/') 35 | 36 | # # 文件名 37 | # filename = current_filename + '/素材/{}.ts'.format(index) 38 | 39 | # # 如果该文件名不在里面,就写入 40 | # if filename not in f2.read(): 41 | # f1.write("file '{}'\n".format(filename)) 42 | 43 | 44 | # 设置UTF-8编码,让命令行支持中文编码 45 | # cmd = 'ffmpeg.exe -f concat -safe 0 -i file.txt -c copy out.mp4"' 46 | # os.system('CHCP 65001') 47 | # os.system(cmd.replace('/', '\\')) 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/ffmpeg.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/ffmpeg.exe -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/file.txt: -------------------------------------------------------------------------------- 1 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/1.ts' 2 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/2.ts' 3 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/3.ts' 4 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/4.ts' 5 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/5.ts' 6 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/6.ts' 7 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/7.ts' 8 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/8.ts' 9 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/9.ts' 10 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/10.ts' 11 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/11.ts' 12 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/12.ts' 13 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/13.ts' 14 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/14.ts' 15 | file 'C:/Users/A/Desktop/mitmproxy爬取小鹅通/1.前置知识/4.ffmege合成ts视频/素材/15.ts' 16 | -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/out.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/out.mp4 -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/1.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/1.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/10.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/10.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/11.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/11.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/12.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/12.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/13.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/13.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/14.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/14.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/15.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/15.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/2.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/2.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/3.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/3.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/4.ts: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/4.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/5.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/5.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/6.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/6.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/7.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/7.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/8.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/8.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/9.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/1.前置知识/4.ffmpeg合成ts视频/素材/9.ts -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/2.自动合并版本/ffmpeg.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/2.自动合并版本/ffmpeg.exe -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/2.自动合并版本/request_demo.py: -------------------------------------------------------------------------------- 1 | # python 3.7 2 | import mitmproxy.http,json,os,m3u8,requests,shutil 3 | from bs4 import BeautifulSoup 4 | from mitmproxy import ctx 5 | from pathlib import Path 6 | from Crypto.Cipher import AES 7 | 8 | ''' 9 | 本次爬取的课程地址:https://xueyuan.xiaoe-tech.com/detail/p_5e269260ab5c5_O25BMaat/6 10 | ''' 11 | 12 | # 生成python修复文件 13 | repair_file_py = r''' 14 | import os 15 | 16 | from Crypto.Cipher import AES 17 | 18 | # 获取当前路径 19 | current_filename = os.getcwd().replace('\\','/') 20 | 21 | # 修复文件连接 22 | new_repair_file_txt = current_filename + '/修复文件/' + 'repair_file.txt' 23 | 24 | # 开始修复文件 25 | def decrypt_file(): 26 | 27 | global new_repair_file_txt 28 | 29 | before_content = None 30 | 31 | key = {} 32 | 33 | mode = AES.MODE_CBC 34 | 35 | # 获取 AES 解密对象 36 | cryptos = AES.new(key, mode) 37 | 38 | # 创建修复文件 39 | repair_filename = current_filename + '/修复文件' 40 | if not os.path.exists(repair_filename): 41 | os.makedirs(repair_filename) 42 | 43 | with 
open('not_finish_file.txt','r',encoding='utf-8') as f1: 44 | 45 | # 读取第一行 46 | line = f1.readline() 47 | 48 | # 逐行读取 49 | while line: 50 | # 获取 还没被解密的 ts 视频的路径 51 | not_finish_file_line = line.split(' ')[1].replace('\n','').replace("'",'').replace('\\','/') 52 | print(not_finish_file_line) 53 | 54 | with open(not_finish_file_line,'rb') as f: # 解密之前 55 | before_content = f.read() 56 | 57 | # 写入 修复文件 58 | new_repair_filename = repair_filename + '/' + not_finish_file_line.split('/')[-1] 59 | print(new_repair_filename) 60 | with open(new_repair_filename,'wb') as f: # 解密之后 61 | f.write(cryptos.decrypt(before_content)) 62 | 63 | new_repair_file_txt = repair_filename + '/' + 'repair_file.txt' 64 | 65 | # 确保不重复 66 | with open(new_repair_file_txt,'a+',encoding='utf-8') as f3: # 解密之后 67 | with open(new_repair_file_txt,'r',encoding='utf-8') as f4: 68 | if str(new_repair_filename) not in f4.read(): 69 | f3.write("file '%s'\n" % str(new_repair_filename)) 70 | 71 | line = f1.readline() 72 | 73 | # 使用 not_finish_file.txt 合成视频 74 | def compose_file(): 75 | 76 | cmd = "ffmpeg.exe -f concat -safe 0 -i " + new_repair_file_txt + " -c copy 1.修复视频.mp4" 77 | print(cmd) 78 | # 设置UTF-8编码 79 | os.system('CHCP 65001') 80 | os.system(cmd.replace('/','\\')) 81 | 82 | decrypt_file() 83 | compose_file() 84 | ''' 85 | 86 | cryptos = None # AES解密 87 | m3u8_data = None # 保存m3u8有多少个uri 88 | filename = None # 下载视频路径 89 | current_filename = os.getcwd().replace('\\','/') # 获取当前路径 90 | result_filename = current_filename + '/合成的视频' # 获取 ffmepg合成视频后的路径 91 | title = None # 标题 92 | finish_file_flag = False # 标记是否存在 还没被解密的 ts 视频 93 | 94 | class Counter: 95 | 96 | def __init__(self): 97 | self.Referer = 'https://xueyuan.xiaoe-tech.com/' 98 | self.Cookie = '请填写你的cookie' 99 | self.UserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36' 100 | self.headers = { 101 | 'Referer':self.Referer, 102 | 'Cookie':self.Cookie, 103 | 'UserAgent':self.UserAgent 104 | } 105 | 106 | def request(self, flow: mitmproxy.http.HTTPFlow): 107 | 108 | # 所有请求插入协议头 109 | flow.request.headers['Referer'] = self.Referer 110 | flow.request.headers['Cookie'] = self.Cookie 111 | 112 | def response(self, flow: mitmproxy.http.HTTPFlow): 113 | 114 | # 导入全局变量 115 | global cryptos,filename,m3u8_data,result_filename,repair_file_py,title,finish_file_flag 116 | 117 | # 注入 JavaScript 118 | # 启动就能点击播放器 119 | if 'detail' in flow.request.url: 120 | 121 | # 确保匹配 HTML 122 | if 'text/html' in flow.response.headers['Content-Type']: 123 | 124 | javascript_text = ''' 125 | // 视频播放速度 126 | const playbackRate = 16; 127 | 128 | function start_video(){ 129 | 130 | // 确保修改了视频播放速度 131 | while(document.querySelector('video').playbackRate != playbackRate ){ 132 | 133 | // 点击播放器 134 | document.querySelector('div.iconfont.playButton.icon-icon_play').click(); 135 | 136 | // 设置视频重头播放 137 | document.querySelector('video').currentTime = 0; 138 | 139 | // 设置视频自动播放 140 | document.querySelector('video').autoplay = true; 141 | 142 | // 设置视频播放速度 143 | document.querySelector('video').playbackRate = playbackRate; 144 | 145 | // 设置视频静音 146 | document.querySelector('video').muted = true 147 | 148 | // 开始播放 149 | document.querySelector('video').play(); 150 | } 151 | }; 152 | 153 | // 使用递归,异步等待,确保video标签会出现 154 | function waitForElementToDisplay(selector, time) { 155 | 156 | // video标签出现后,异步等待 1 秒 157 | if(document.querySelector(selector)!=null) { 158 | 159 | console.log('获取成功video'); 160 | setTimeout( 161 | ()=>{ 162 | start_video(); 
163 | },1000 164 | ); 165 | 166 | return; 167 | } 168 | else { 169 | setTimeout( ()=> { 170 | waitForElementToDisplay(selector, time); 171 | }, time); 172 | } 173 | } 174 | 175 | // 每过 1 秒检查video标签 是否出现 176 | waitForElementToDisplay('video',1000) 177 | ''' 178 | 179 | # 获取 BeautifulSoup 对象 180 | soup = BeautifulSoup(flow.response.text, 'lxml') 181 | 182 | # 生成一个script节点 183 | script_tag = soup.new_tag('script', type='text/javascript') 184 | 185 | # 往script节点写入内容 186 | script_tag.string = javascript_text 187 | 188 | # 在当前 HTML 最后一个script节点 向后插入一个节点 189 | soup.select('script')[-1].insert_after(script_tag) 190 | 191 | # 修改当前 HTML 全部内容 192 | flow.response.text = str(soup) 193 | 194 | # 设置 AES解密模式 195 | mode = AES.MODE_CBC 196 | 197 | # 获取课程标题 198 | if 'xe.goods.detail.get' in flow.request.url: 199 | 200 | # 加载 JSON 对象 201 | json_data = json.loads(flow.response.text) 202 | 203 | # 获取当前视频标题 204 | title = json_data['data']['title'].replace(' ','') 205 | 206 | # 如果没有文件夹,就创建文件夹 207 | filename = current_filename + '/下载成功的视频/{}'.format(title) 208 | if not os.path.exists(filename): 209 | os.makedirs(filename) 210 | 211 | if not os.path.exists(result_filename): 212 | os.makedirs(result_filename) 213 | 214 | # 匹配 m3u8 215 | if '.m3u8' in flow.request.url: 216 | 217 | # 加载 m3u8 对象 218 | dict_data = m3u8.parse(flow.response.text) 219 | 220 | # 获取 m3u8 全部分片链接 221 | m3u8_data = [ data['uri'] for data in dict_data['segments']] 222 | print(m3u8_data) 223 | 224 | # 获取解密参数 225 | m3u8_content = requests.get(url=dict_data['keys'][0]['uri'],headers=self.headers).content 226 | cryptos = AES.new(m3u8_content,mode) 227 | 228 | # 将密钥 写入 修复文件 229 | repair_file_py = repair_file_py.format(str(m3u8_content)) 230 | print('\n' + '-'*50) 231 | print('\n当前密钥:{}'.format(str(m3u8_content))) 232 | 233 | 234 | # 匹配密钥 235 | if 'get_video_key.php' in flow.request.url: 236 | 237 | print('\n当前密钥:{}'.format(str(flow.response.content))) 238 | 239 | # 将密钥 写入 修复文件 240 | repair_file_py = repair_file_py.format(str(flow.response.content)) 241 | cryptos = AES.new(flow.response.content, mode) 242 | 243 | # 解密 ts 文件 244 | if '.ts' in flow.request.url: 245 | 246 | print('-'*50) 247 | print('\n[当前解密对象]:{}\n'.format(cryptos)) 248 | 249 | # 拼接当前视频保存路径 250 | m3u8_ts_filename = filename + '/start={}end={}.ts'.format(flow.request.query.get('start'),flow.request.query.get('end')) 251 | print('[当前视频]:{} [保存路径]:{}\n'.format(title,m3u8_ts_filename)) 252 | 253 | # 用于合成 254 | m3u8_finish_file_filename = filename + '/finish_file.txt' 255 | 256 | # 确定最后一个分片 257 | start_data = m3u8_data[-1].split('?')[1].split('&')[0] 258 | end_data = m3u8_data[-1].split('?')[1].split('&')[1] 259 | result_data = start_data + end_data 260 | 261 | # 获取成功密钥,再解密 262 | if cryptos != None: 263 | 264 | # 保存 解密好的 ts 265 | with open(m3u8_ts_filename,'wb') as f: 266 | f.write(cryptos.decrypt(flow.response.content)) 267 | 268 | 269 | # 写入 解密成功 标记文件 270 | with open(m3u8_finish_file_filename,'a+',encoding='utf-8') as f1: 271 | with open(m3u8_finish_file_filename,'r',encoding='utf-8') as f2: 272 | 273 | # 如果文件为空,同时又存在最后一片,将不写入 274 | if result_data in m3u8_ts_filename and f2.read()=='': 275 | pass 276 | 277 | # 防止重复,确保路径没问题 278 | elif m3u8_ts_filename not in f2.read(): 279 | f1.write("file '{}'\n".format(m3u8_ts_filename)) 280 | 281 | # 如果是最后一个分片,开始合成视频 282 | if result_data in m3u8_ts_filename: 283 | 284 | # 拷贝 ffmpeg.exe 写入指定目录 285 | ffmpeg_filename = filename + '/ffmpeg.exe' 286 | shutil.copyfile('ffmpeg.exe', ffmpeg_filename) 287 | 288 | # 如果 存在 还没被解密的 ts 视频 289 | if finish_file_flag: 
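# This branch runs when some .ts responses arrived before the AES key was seen: it regenerates
# repair.py so those segments can be decrypted later. Immediately after it, the addon shells out
# to ffmpeg's concat demuxer. For reference, the command it builds has this shape (a sketch of
# the call, not executed here):
#   ffmpeg.exe -f concat -safe 0 -i "<.../finish_file.txt>" -c copy "<合成的视频/标题.mp4>"
# -f concat consumes the file '...' lines written above, -safe 0 permits the absolute paths
# inside them, and -c copy concatenates without re-encoding, which works because every decrypted
# .ts piece comes from the same stream with identical codec parameters.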
290 | 291 | # 生成修复python 292 | repair_file = filename + '/repair.py' 293 | with open(repair_file,'w',encoding='utf-8') as f: 294 | f.write(repair_file_py) 295 | 296 | # 合成视频 297 | cmd = 'ffmpeg.exe -f concat -safe 0 -i "' + m3u8_finish_file_filename + '" -c copy "' + result_filename + '/' + filename.split('/')[-1] + '.mp4"' 298 | 299 | 300 | # 读取 解密成功 标记文件 301 | with open(m3u8_finish_file_filename,'r',encoding='utf-8') as f: 302 | 303 | # 确保文件不为空 304 | if f.read()!='': 305 | 306 | mp4_filename = result_filename + '/' + filename.split('/')[-1] + '.mp4' 307 | 308 | # 如果合成的视频已经存在,先删除,再执行 309 | if os.path.exists(mp4_filename): 310 | os.remove(mp4_filename) 311 | 312 | # 设置UTF-8编码 313 | os.system('CHCP 65001') 314 | os.system(cmd.replace('/','\\')) 315 | print('[警告]:文件路径 {}'.format(mp4_filename)) 316 | print('[警告]:文件被覆盖了,由于该文件之前已存在过') 317 | else: 318 | os.system('CHCP 65001') 319 | os.system(cmd.replace('/','\\')) 320 | print('[成功]:文件路径 {}'.format(mp4_filename)) 321 | print('[成功]:合并完毕') 322 | else: 323 | print(os.path.exists(result_filename + '/' + filename.split('/')[-1] + '.mp4"')) 324 | print(result_filename + '/' + filename.split('/')[-1] + '.mp4') 325 | print('[异常]:当前视频只下载最后一片,将不会合成视频') 326 | else: 327 | 328 | # 标记是否存在 还没被解密的 ts 视频 329 | finish_file_flag = True 330 | 331 | # 保存 还没被解密的 ts 视频 332 | with open(m3u8_ts_filename,'wb') as f: 333 | f.write(flow.response.content) 334 | 335 | # 用于合成 336 | m3u8_not_finish_file__filename = filename + '/not_finish_file.txt' 337 | with open(m3u8_not_finish_file__filename,'a+',encoding='utf-8') as f: 338 | f.write("file '{}'\n".format(m3u8_ts_filename)) 339 | 340 | 341 | addons = [ 342 | Counter() 343 | ] -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/2.自动合并版本/requirements.txt: -------------------------------------------------------------------------------- 1 | asgiref==3.3.4 2 | beautifulsoup4==4.9.3 3 | blinker==1.4 4 | Brotli==1.0.9 5 | bs4==0.0.1 6 | certifi==2021.5.30 7 | cffi==1.14.6 8 | charset-normalizer==2.0.3 9 | click==7.1.2 10 | cryptography==3.2.1 11 | Flask==1.1.4 12 | h11==0.12.0 13 | h2==4.0.0 14 | hpack==4.0.0 15 | hyperframe==6.0.1 16 | idna==3.2 17 | iso8601==0.1.14 18 | itsdangerous==1.1.0 19 | Jinja2==2.11.3 20 | kaitaistruct==0.9 21 | ldap3==2.8.1 22 | m3u8==0.9.0 23 | MarkupSafe==2.0.1 24 | mitmproxy==5.3.0 25 | msgpack==1.0.2 26 | passlib==1.7.4 27 | protobuf==3.13.0 28 | publicsuffix2==2.20191221 29 | pyasn1==0.4.8 30 | pycparser==2.20 31 | pycryptodome==3.10.1 32 | pydivert==2.1.0 33 | pyOpenSSL==19.1.0 34 | pyparsing==2.4.7 35 | pyperclip==1.8.2 36 | requests==2.26.0 37 | ruamel.yaml==0.16.13 38 | ruamel.yaml.clib==0.2.6 39 | selenium==3.141.0 40 | six==1.16.0 41 | sortedcontainers==2.2.2 42 | soupsieve==2.2.1 43 | tornado==6.1 44 | typing-extensions==3.10.0.0 45 | urllib3==1.26.6 46 | urwid==2.1.2 47 | Werkzeug==1.0.1 48 | wsproto==0.15.0 49 | zstandard==0.14.1 50 | -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/2.自动合并版本/selenium启动/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/2.自动合并版本/selenium启动/chromedriver.exe -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/2.自动合并版本/selenium启动/selenium_start.py: -------------------------------------------------------------------------------- 1 | from selenium import 
webdriver 2 | 3 | PROXY='http://127.0.0.1:8080' 4 | 5 | chrome_options = webdriver.ChromeOptions() 6 | chrome_options.add_argument("--proxy-server=127.0.0.1:8080") 7 | chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) # 专业模式 8 | 9 | 10 | browser = webdriver.Chrome(executable_path=r'chromedriver.exe',options=chrome_options)#r 代表的是强制禁止转义 11 | 12 | ''' 13 | 本次爬取的课程地址:https://xueyuan.xiaoe-tech.com/detail/p_5e269260ab5c5_O25BMaat/6 14 | ''' 15 | 16 | url = 'https://xueyuan.xiaoe-tech.com/detail/p_5e269260ab5c5_O25BMaat/6' 17 | browser.get(url)#访问网站 18 | 19 | -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/2.自动合并版本/selenium启动/谷歌驱动下载地址.txt: -------------------------------------------------------------------------------- 1 | 地址一: 2 | http://chromedriver.storage.googleapis.com/index.html 3 | 4 | 地址二: 5 | https://registry.npmmirror.com/binary.html?path=chromedriver/ -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/2.自动合并版本/启动程序指令.txt: -------------------------------------------------------------------------------- 1 | 先打开一个cmd,输入mitmweb -s request_demo.py 2 | 然后再cd进入selenium启动文件夹,输入python selenium_start.py -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/2.自动合并版本/安装环境指令.txt: -------------------------------------------------------------------------------- 1 | 一次性安装 2 | pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple 3 | 4 | 分开安装 5 | pip install mitmproxy 6 | pip install selenium 7 | pip install m3u8 8 | pip install requests 9 | pip install bs4 10 | pip install pycryptodome -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/3.手动合并版本/ffmpeg.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/3.手动合并版本/ffmpeg.exe -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/3.手动合并版本/request_demo.py: -------------------------------------------------------------------------------- 1 | # python 3.7 2 | import mitmproxy.http,json,os,shutil 3 | from bs4 import BeautifulSoup 4 | from mitmproxy import ctx 5 | from pathlib import Path 6 | from Crypto.Cipher import AES 7 | 8 | ''' 9 | 本次爬取的课程地址:https://appdgjqmn6j1714.h5.xiaoeknow.com/v1/course/video/v_61ceb0f8e4b05006f9c4214e 10 | ''' 11 | 12 | # 生成python修复文件 13 | repair_file_py = r''' 14 | 15 | "此文件用于保存密钥,请不要执行代码" 16 | 17 | import os 18 | 19 | from Crypto.Cipher import AES 20 | 21 | # 获取当前路径 22 | current_filename = os.getcwd().replace('\\','/') 23 | 24 | # 修复文件连接 25 | new_repair_file_txt = current_filename + '/修复文件/' + 'repair_file.txt' 26 | 27 | # 开始修复文件 28 | def decrypt_file(): 29 | 30 | global new_repair_file_txt 31 | 32 | before_content = None 33 | 34 | key = {} 35 | 36 | mode = AES.MODE_CBC 37 | 38 | # 获取 AES 解密对象 39 | cryptos = AES.new(key, mode) 40 | 41 | # 创建修复文件 42 | repair_filename = current_filename + '/修复文件' 43 | if not os.path.exists(repair_filename): 44 | os.makedirs(repair_filename) 45 | 46 | with open('not_finish_file.txt','r',encoding='utf-8') as f1: 47 | 48 | # 读取第一行 49 | line = f1.readline() 50 | 51 | # 逐行读取 52 | while line: 53 | # 获取 还没被解密的 ts 视频的路径 54 | not_finish_file_line = line.split(' ')[1].replace('\n','').replace("'",'').replace('\\','/') 55 | print(not_finish_file_line) 56 | 57 | with 
open(not_finish_file_line,'rb') as f: # 解密之前 58 | before_content = f.read() 59 | 60 | # 写入 修复文件 61 | new_repair_filename = repair_filename + '/' + not_finish_file_line.split('/')[-1] 62 | print(new_repair_filename) 63 | with open(new_repair_filename,'wb') as f: # 解密之后 64 | f.write(cryptos.decrypt(before_content)) 65 | 66 | new_repair_file_txt = repair_filename + '/' + 'repair_file.txt' 67 | 68 | # 确保不重复 69 | with open(new_repair_file_txt,'a+',encoding='utf-8') as f3: # 解密之后 70 | with open(new_repair_file_txt,'r',encoding='utf-8') as f4: 71 | if str(new_repair_filename) not in f4.read(): 72 | f3.write("file '%s'\n" % str(new_repair_filename)) 73 | 74 | line = f1.readline() 75 | 76 | # 使用 not_finish_file.txt 合成视频 77 | def compose_file(): 78 | 79 | cmd = "ffmpeg.exe -f concat -safe 0 -i " + new_repair_file_txt + " -c copy 1.修复视频.mp4" 80 | print(cmd) 81 | # 设置UTF-8编码 82 | os.system('CHCP 65001') 83 | os.system(cmd.replace('/','\\')) 84 | 85 | decrypt_file() 86 | compose_file() 87 | ''' 88 | 89 | # 生成python合成文件 90 | merge_file_py = r''' 91 | 92 | "此文件用于合成视频" 93 | 94 | import os 95 | 96 | mp4_filename = '%s' 97 | 98 | cmd = '%s' 99 | 100 | # 如果合成的视频已经存在,先删除,再执行 101 | if os.path.exists(mp4_filename): 102 | os.remove(mp4_filename) 103 | 104 | # 设置UTF-8编码 105 | os.system('CHCP 65001') 106 | os.system(cmd.replace('/','\\')) 107 | print('[警告]:文件路径 {}'.format(mp4_filename)) 108 | print('[警告]:文件被覆盖了,由于该文件之前已存在过') 109 | else: 110 | os.system('CHCP 65001') 111 | os.system(cmd.replace('/','\\')) 112 | print('[成功]:文件路径 {}'.format(mp4_filename)) 113 | print('[成功]:合并完毕') 114 | ''' 115 | 116 | cryptos = None # AES解密 117 | filename = None # 下载视频路径 118 | current_filename = os.getcwd().replace('\\','/') # 获取当前路径 119 | result_filename = current_filename + '/合成的视频' # 获取 ffmepg合成视频后的路径 120 | title = None # 标题 121 | finish_file_flag = False # 标记是否存在 还没被解密的 ts 视频 122 | 123 | class Counter: 124 | 125 | def __init__(self): 126 | self.Referer = 'https://appdgjqmn6j1714.h5.xiaoeknow.com' 127 | self.Cookie = '请填写你的Cooie' 128 | self.UserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36' 129 | self.headers = { 130 | 'Referer':self.Referer, 131 | 'Cookie':self.Cookie, 132 | 'UserAgent':self.UserAgent 133 | } 134 | 135 | def request(self, flow: mitmproxy.http.HTTPFlow): 136 | 137 | # 所有请求插入协议头 138 | flow.request.headers['Referer'] = self.Referer 139 | flow.request.headers['Cookie'] = self.Cookie 140 | 141 | def response(self, flow: mitmproxy.http.HTTPFlow): 142 | 143 | # 导入全局变量 144 | global cryptos,filename,result_filename,repair_file_py,title,finish_file_flag,merge_file_py 145 | 146 | # 注入 JavaScript 147 | # 启动就能点击播放器 148 | if 'v_61ceb0f8e4b05006f9c4214e' in flow.request.url: 149 | 150 | # 确保匹配 HTML 151 | if 'text/html' in flow.response.headers.get('content-type'): 152 | 153 | try: 154 | print('尝试执行JS控制播放器代码') 155 | javascript_text = ''' 156 | // 视频播放速度 157 | const playbackRate = 16; 158 | 159 | function start_video(){ 160 | 161 | // 确保修改了视频播放速度 162 | while(document.querySelector('video').playbackRate != playbackRate ){ 163 | 164 | // 点击播放器 165 | document.querySelector('div.iconfont.playButton.icon-icon_play').click(); 166 | 167 | // 设置视频重头播放 168 | document.querySelector('video').currentTime = 0; 169 | 170 | // 设置视频自动播放 171 | document.querySelector('video').autoplay = true; 172 | 173 | // 设置视频播放速度 174 | document.querySelector('video').playbackRate = playbackRate; 175 | 176 | // 设置视频静音 177 | document.querySelector('video').muted = true 178 | 179 | // 
开始播放 180 | document.querySelector('video').play(); 181 | } 182 | }; 183 | 184 | // 使用递归,异步等待,确保video标签会出现 185 | function waitForElementToDisplay(selector, time) { 186 | 187 | // video标签出现后,异步等待 1 秒 188 | if(document.querySelector(selector)!=null) { 189 | 190 | console.log('获取成功video'); 191 | setTimeout( 192 | ()=>{ 193 | start_video(); 194 | },1000 195 | ); 196 | 197 | return; 198 | } 199 | else { 200 | setTimeout( ()=> { 201 | waitForElementToDisplay(selector, time); 202 | }, time); 203 | } 204 | } 205 | 206 | // 每过 1 秒检查video标签 是否出现 207 | waitForElementToDisplay('video',1000) 208 | ''' 209 | 210 | # 获取 BeautifulSoup 对象 211 | soup = BeautifulSoup(flow.response.text, 'lxml') 212 | 213 | # 生成一个script节点 214 | script_tag = soup.new_tag('script', type='text/javascript') 215 | 216 | # 往script节点写入内容 217 | script_tag.string = javascript_text 218 | 219 | # 在当前 HTML 最后一个script节点 向后插入一个节点 220 | soup.select('script')[-1].insert_after(script_tag) 221 | 222 | # 修改当前 HTML 全部内容 223 | flow.response.text = str(soup) 224 | except: 225 | pass 226 | 227 | # 设置 AES解密模式 228 | mode = AES.MODE_CBC 229 | 230 | # 获取课程标题 231 | if 'get_goods_info_business' in flow.request.url: 232 | 233 | # 加载 JSON 对象 234 | json_data = json.loads(flow.response.text) 235 | 236 | # 获取当前视频标题 237 | title = json_data['data']['goods_name'].replace(' ','') 238 | 239 | # 如果没有文件夹,就创建文件夹 240 | filename = current_filename + '/下载成功的视频/{}'.format(title) 241 | if not os.path.exists(filename): 242 | os.makedirs(filename) 243 | 244 | if not os.path.exists(result_filename): 245 | os.makedirs(result_filename) 246 | 247 | # 匹配密钥 248 | if 'get_video_key.php' in flow.request.url: 249 | 250 | print('\n当前密钥:{}'.format(str(flow.response.content))) 251 | 252 | # 将密钥 写入 修复文件 253 | repair_file_py = repair_file_py.format(str(flow.response.content)) 254 | cryptos = AES.new(flow.response.content, mode) 255 | 256 | # 解密 ts 文件 257 | if '.ts' in flow.request.url: 258 | 259 | print('-'*50) 260 | print('\n[当前解密对象]:{}\n'.format(cryptos)) 261 | 262 | # 拼接当前视频保存路径 263 | m3u8_ts_filename = filename + '/start={}end={}.ts'.format(flow.request.query.get('start'),flow.request.query.get('end')) 264 | print('[当前视频]:{} [保存路径]:{}\n'.format(title,m3u8_ts_filename)) 265 | 266 | # 用于合成 267 | m3u8_finish_file_filename = filename + '/finish_file.txt' 268 | 269 | # 确定最后一个分片 270 | start_data = flow.request.query.get('start') 271 | end_data = flow.request.query.get('end') 272 | result_data = start_data + end_data 273 | 274 | # 获取成功密钥,再解密 275 | if cryptos != None: 276 | 277 | 278 | # 保存 解密好的 ts 279 | with open(m3u8_ts_filename,'wb') as f: 280 | f.write(cryptos.decrypt(flow.response.content)) 281 | 282 | 283 | # 写入 解密成功 标记文件 284 | with open(m3u8_finish_file_filename,'a+',encoding='utf-8') as f1: 285 | with open(m3u8_finish_file_filename,'r',encoding='utf-8') as f2: 286 | 287 | # 如果文件为空,同时又存在最后一片,将不写入 288 | if result_data in m3u8_ts_filename and f2.read()=='': 289 | pass 290 | 291 | # 防止重复,确保路径没问题 292 | elif m3u8_ts_filename not in f2.read(): 293 | f1.write("file '{}'\n".format(m3u8_ts_filename)) 294 | 295 | ffmpeg_filename = filename + '/ffmpeg.exe' 296 | shutil.copyfile('ffmpeg.exe', ffmpeg_filename) 297 | 298 | # 优化版 生成python合成文件 299 | mp4_filename = result_filename + '/' + filename.split('/')[-1] + '.mp4' 300 | cmd = 'ffmpeg.exe -f concat -safe 0 -i "' + m3u8_finish_file_filename + '" -c copy "' + result_filename + '/' + filename.split('/')[-1] + '.mp4"' 301 | 302 | if mp4_filename and cmd: 303 | 304 | try: 305 | merge_file_py = merge_file_py % (str(mp4_filename),str(cmd)) 306 | except: 
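# Why this bare except/pass exists: merge_file_py is a module-level template whose two %s slots
# are consumed by the first successful substitution; every later .ts response re-runs this line,
# and applying % to the already-substituted string raises TypeError ("not all arguments converted
# during string formatting"), which is swallowed so the first generated merge.py survives.
# A sketch of a stricter alternative, substituting into a copy so the template stays reusable:
#   merge_file_content = merge_file_py % (str(mp4_filename), str(cmd))
# (then write merge_file_content, instead of merge_file_py, into merge.py below)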
307 | pass 308 | 309 | # 开始生成python合成文件 310 | merge_file = filename + '/merge.py' 311 | with open(merge_file,'w',encoding='utf-8') as f: 312 | f.write(merge_file_py) 313 | 314 | # 生成修复python文件 315 | repair_file = filename + '/repair.py' 316 | with open(repair_file,'w',encoding='utf-8') as f: 317 | f.write(repair_file_py) 318 | else: 319 | 320 | # 标记是否存在 还没被解密的 ts 视频 321 | finish_file_flag = True 322 | 323 | # 保存 还没被解密的 ts 视频 324 | with open(m3u8_ts_filename,'wb') as f: 325 | f.write(flow.response.content) 326 | 327 | # 用于合成 328 | m3u8_not_finish_file__filename = filename + '/not_finish_file.txt' 329 | with open(m3u8_not_finish_file__filename,'a+',encoding='utf-8') as f: 330 | f.write("file '{}'\n".format(m3u8_ts_filename)) 331 | 332 | 333 | addons = [ 334 | Counter() 335 | ] -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/3.手动合并版本/requirements.txt: -------------------------------------------------------------------------------- 1 | asgiref==3.3.4 2 | beautifulsoup4==4.9.3 3 | blinker==1.4 4 | Brotli==1.0.9 5 | bs4==0.0.1 6 | certifi==2021.5.30 7 | cffi==1.14.6 8 | charset-normalizer==2.0.3 9 | click==7.1.2 10 | cryptography==3.2.1 11 | Flask==1.1.4 12 | h11==0.12.0 13 | h2==4.0.0 14 | hpack==4.0.0 15 | hyperframe==6.0.1 16 | idna==3.2 17 | iso8601==0.1.14 18 | itsdangerous==1.1.0 19 | Jinja2==2.11.3 20 | kaitaistruct==0.9 21 | ldap3==2.8.1 22 | m3u8==0.9.0 23 | MarkupSafe==2.0.1 24 | mitmproxy==5.3.0 25 | msgpack==1.0.2 26 | passlib==1.7.4 27 | protobuf==3.13.0 28 | publicsuffix2==2.20191221 29 | pyasn1==0.4.8 30 | pycparser==2.20 31 | pycryptodome==3.10.1 32 | pydivert==2.1.0 33 | pyOpenSSL==19.1.0 34 | pyparsing==2.4.7 35 | pyperclip==1.8.2 36 | requests==2.26.0 37 | ruamel.yaml==0.16.13 38 | ruamel.yaml.clib==0.2.6 39 | selenium==3.141.0 40 | six==1.16.0 41 | sortedcontainers==2.2.2 42 | soupsieve==2.2.1 43 | tornado==6.1 44 | typing-extensions==3.10.0.0 45 | urllib3==1.26.6 46 | urwid==2.1.2 47 | Werkzeug==1.0.1 48 | wsproto==0.15.0 49 | zstandard==0.14.1 50 | -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/3.手动合并版本/selenium启动/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2021年12月/3.手动合并版本/selenium启动/chromedriver.exe -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/3.手动合并版本/selenium启动/selenium_start.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | PROXY='http://127.0.0.1:8080' 4 | 5 | chrome_options = webdriver.ChromeOptions() 6 | chrome_options.add_argument("--proxy-server=127.0.0.1:8080") 7 | chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) # 专业模式 8 | 9 | 10 | browser = webdriver.Chrome(executable_path=r'chromedriver.exe',options=chrome_options)#r 代表的是强制禁止转义 11 | 12 | ''' 13 | 本次爬取的课程地址:https://appdgjqmn6j1714.h5.xiaoeknow.com/v1/course/video/v_61ceb0f8e4b05006f9c4214e 14 | ''' 15 | 16 | url = 'https://appdgjqmn6j1714.h5.xiaoeknow.com/v1/course/video/v_61ceb0f8e4b05006f9c4214e' 17 | browser.get(url)#访问网站 18 | 19 | -------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/3.手动合并版本/selenium启动/谷歌驱动下载地址.txt: -------------------------------------------------------------------------------- 1 | 地址一: 2 | 
http://chromedriver.storage.googleapis.com/index.html 3 | 4 | 地址二: 5 | https://registry.npmmirror.com/binary.html?path=chromedriver/
-------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/3.手动合并版本/启动程序指令.txt: -------------------------------------------------------------------------------- 1 | 先打开一个cmd,输入mitmweb -s request_demo.py 2 | 然后再cd进入selenium启动文件夹,输入python selenium_start.py
-------------------------------------------------------------------------------- /下载小鹅通视频/2021年12月/3.手动合并版本/安装环境指令.txt: -------------------------------------------------------------------------------- 1 | 一次性安装 2 | pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple 3 | 4 | 分开安装 5 | pip install mitmproxy 6 | pip install selenium 7 | pip install m3u8 8 | pip install requests 9 | pip install bs4 10 | pip install pycryptodome
-------------------------------------------------------------------------------- /下载小鹅通视频/2022年12月/1.自动合并版本/N_m3u8DL-CLI_v3.0.2.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2022年12月/1.自动合并版本/N_m3u8DL-CLI_v3.0.2.exe
-------------------------------------------------------------------------------- /下载小鹅通视频/2022年12月/1.自动合并版本/ffmpeg.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2022年12月/1.自动合并版本/ffmpeg.exe
-------------------------------------------------------------------------------- /下载小鹅通视频/2022年12月/1.自动合并版本/request_demo.py: -------------------------------------------------------------------------------- 1 | # python 3.7.9 2 | import mitmproxy.http,json,os,m3u8,requests,base64 3 | from mitmproxy import ctx 4 | from pathlib import Path 5 | 6 | ''' 7 | 解密参考文章: https://www.52pojie.cn/thread-1689801-1-1.html 8 | m3u8下载器GitHub地址: https://github.com/nilaoda/N_m3u8DL-CLI 9 | 10 | 旧版课程: https://appjkyl58fl2930.h5.xiaoeknow.com/p/course/column/p_5c483e6305292_C1LfcA9T?type=3 11 | 新版课程: https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3 12 | 13 | 本次爬取的课程地址: https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3 14 | ''' 15 | 16 | userid = None # 用户uid 17 | filename = None # 下载视频路径 18 | current_filename = os.getcwd().replace('\\','/') # 获取当前路径 19 | ts_url = None # ts文件下载地址 20 | title = None # 标题 21 | m3u8_obj = None # m3u8对象 22 | m3u8_content = None # m3u8密钥 23 | 24 | class Counter: 25 | 26 | def __init__(self): 27 | self.Referer = 'https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3' 28 | self.Cookie = '请填写你的Cookie' 29 | self.UserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36' 30 | self.headers = { 31 | 'Referer':self.Referer, 32 | 'Cookie':self.Cookie, 33 | 'User-Agent':self.UserAgent 34 | } 35 | 36 | def request(self, flow: mitmproxy.http.HTTPFlow): 37 | 38 | # 所有请求插入协议头 39 | flow.request.headers['Referer'] = self.Referer 40 | flow.request.headers['Cookie'] = self.Cookie 41 | 42 | def response(self, flow: mitmproxy.http.HTTPFlow): 43 | 44 | # 导入全局变量 45 | global filename,title,userid,ts_url,m3u8_obj,m3u8_content 46 | 47 | 48 | # 获取课程标题 49 | if 'xe.course.business.core.info.get' in flow.request.url: 50 | 51 | # 加载 JSON 对象 52 | json_data = json.loads(flow.response.text) 53 | 54 | # 获取当前视频标题 55 | title = json_data['data']['resource_name'].replace(' ','') 56 | 57 | print(f'[当前标题] {title}') 58 | 59 | # 如果没有文件夹,就创建文件夹 60 | filename = current_filename + '/下载成功的视频/{}'.format(title) 61 | if not os.path.exists(filename): 62 | os.makedirs(filename) 63 | 64 | if not os.path.exists(current_filename+'/m3u8'): 65 | os.makedirs(current_filename+'/m3u8') 66 | 67 | if 'xe.course.business.composite_info.get' in flow.request.url: 68 | 69 | # 加载 JSON 对象 70 | json_data = json.loads(flow.response.text) 71 | 72 | # 获取userid 73 | userid = json_data['data']['user_info']['user_id'].replace(' ','') 74 | 75 | print(f'[用户ID] {userid}') 76 | 77 | # 匹配 m3u8 78 | if '.m3u8' in flow.request.url: 79 | 80 | if userid is not None and filename is not None: 81 | 82 | # 加载 m3u8 对象 83 | m3u8_obj = m3u8.loads(flow.response.text) 84 | 85 | # 添加用户userid 86 | m3u8_obj.keys[0].uri = m3u8_obj.keys[0].uri + f'&uid={userid}' 87 | 88 | # 获取m3u8密钥 URL 89 | m3u8_key_url = m3u8_obj.keys[0].uri 90 | 91 | # 获取解密参数(第一次解密) 92 | # print(m3u8_key_url) 93 | try: 94 | m3u8_content = requests.get( 95 | url=m3u8_key_url, 96 | headers=self.headers, 97 | proxies={ "http": None, "https": None} # 不走系统代理,防止clash、v2ray等代理软件拦截 98 | ).content 99 | except Exception: 100 | print('-'*25 + 'm3u8_content连接失败' + '-'*25) 101 | print(f'[m3u8链接] = {m3u8_key_url}') 102 | print(f'[协议头] = {self.headers}') 103 | print('-'*50) 104 | raise Exception('[ERROR] m3u8_content连接失败') 105 | 106 | # 基于用户userid解密(第二次解密) 107 | rsp_data = m3u8_content 108 | userid_bytes = bytes(userid.encode(encoding='utf-8')) 109 | result_list = [] 110 | for index in range(0, len(rsp_data)): 111 | result_list.append( 112 | rsp_data[index] ^ userid_bytes[index]) 113 | m3u8_content = bytes(result_list) 114 | 115 | # 最终密钥 116 | m3u8_content = base64.b64encode(m3u8_content).decode() 117 | print(f'[m3u8密钥] {m3u8_content}') 118 | 119 | else: 120 | print(f'[当前标题] {title}') 121 | print(f'[用户ID] {userid}') 122 | print('[错误] 没有用户id || 没有标题') 123 | 124 | 125 | if '.ts' in flow.request.url: 126 | 127 | video_url = flow.request.url 128 | 129 | print('[开始下载视频]------------------') 130 | # print(f'video_url: {video_url}') 131 | 132 | # 获取ts文件下载域名(前缀) 133 | start_url = video_url.split('/')[:-1] 134 | 135 | # 获取ts文件下载域名(后缀) 136 | end_url = video_url.split('/')[-1].split('?') 137 | end_url[0] = '{ts_url}' 138 | 139 | # 后缀塞入前缀 140 | start_url.append('&'.join(end_url)) 141 | 142 | # 生成 ts文件下载地址 143 | ts_url = '/'.join(start_url) 144 | 145 | # 添加 ts 链接地址 146 | for tmp_data in m3u8_obj.segments: 147 | 148 | # 插入 149 | if ts_url is not None: 150 | tmp_data.uri = ts_url.format(ts_url=tmp_data.uri) 151 | else: 152 | print('[错误] ts_url is None') 153 | 154 | m3u8_filename = f'./m3u8/{title}.m3u8' 155 | m3u8_obj.dump(m3u8_filename) 156 | 157 | # 确保m3u8文件存在 158 | if os.path.exists(m3u8_filename): 159 | 160 | if os.path.exists(f'{filename}/{title}.mp4'): 161 | print(f'[停止下载警告] 已经存在 {filename}/{title}.mp4') 162 | 163 | elif m3u8_content is None: 164 | print(f'[m3u8密钥] {m3u8_content}') 165 | print('[错误] 没有m3u8密钥') 166 | 167 | else: 168 | cmd = f'N_m3u8DL-CLI_v3.0.2.exe "{m3u8_filename}" --workDir "{filename}" --saveName "{title}" --useKeyBase64 "{m3u8_content}"' 169 | print(cmd) 170 | os.system('CHCP 65001') 171 | os.system(cmd) 172 | 173 | else: 174 | print('[错误]m3u8文件生成失败') 175 | 176 | 177 | addons = [ 178 | Counter() 179 | ] -------------------------------------------------------------------------------- /下载小鹅通视频/2022年12月/1.自动合并版本/requirements.txt: 
-------------------------------------------------------------------------------- 1 | asgiref==3.3.4 2 | beautifulsoup4==4.9.3 3 | blinker==1.4 4 | Brotli==1.0.9 5 | bs4==0.0.1 6 | certifi==2021.5.30 7 | cffi==1.14.6 8 | charset-normalizer==2.0.3 9 | click==7.1.2 10 | cryptography==3.2.1 11 | Flask==1.1.4 12 | h11==0.12.0 13 | h2==4.0.0 14 | hpack==4.0.0 15 | hyperframe==6.0.1 16 | idna==3.2 17 | iso8601==0.1.14 18 | itsdangerous==1.1.0 19 | Jinja2==2.11.3 20 | kaitaistruct==0.9 21 | ldap3==2.8.1 22 | m3u8==0.9.0 23 | MarkupSafe==2.0.1 24 | mitmproxy==5.3.0 25 | msgpack==1.0.2 26 | passlib==1.7.4 27 | protobuf==3.13.0 28 | publicsuffix2==2.20191221 29 | pyasn1==0.4.8 30 | pycparser==2.20 31 | pycryptodome==3.10.1 32 | pydivert==2.1.0 33 | pyOpenSSL==19.1.0 34 | pyparsing==2.4.7 35 | pyperclip==1.8.2 36 | requests==2.26.0 37 | ruamel.yaml==0.16.13 38 | ruamel.yaml.clib==0.2.6 39 | selenium==3.141.0 40 | six==1.16.0 41 | sortedcontainers==2.2.2 42 | soupsieve==2.2.1 43 | tornado==6.1 44 | typing-extensions==3.10.0.0 45 | urllib3==1.26.6 46 | urwid==2.1.2 47 | Werkzeug==1.0.1 48 | wsproto==0.15.0 49 | zstandard==0.14.1 50 | -------------------------------------------------------------------------------- /下载小鹅通视频/2022年12月/1.自动合并版本/selenium启动/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载小鹅通视频/2022年12月/1.自动合并版本/selenium启动/chromedriver.exe -------------------------------------------------------------------------------- /下载小鹅通视频/2022年12月/1.自动合并版本/selenium启动/selenium_start.py: -------------------------------------------------------------------------------- 1 | # python 3.7.9 2 | from selenium import webdriver 3 | 4 | PROXY='http://127.0.0.1:8080' 5 | 6 | chrome_options = webdriver.ChromeOptions() 7 | chrome_options.add_argument("--proxy-server=127.0.0.1:8080") 8 | chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) # 专业模式 9 | 10 | 11 | browser = webdriver.Chrome(executable_path=r'chromedriver.exe',options=chrome_options)#r 代表的是强制禁止转义 12 | 13 | ''' 14 | 本次爬取的课程地址: https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3 15 | ''' 16 | 17 | url = 'https://app0mupqpv04212.h5.xiaoeknow.com/p/course/column/p_6226f0d0e4b02b82585244ba?type=3' 18 | browser.get(url)#访问网站 19 | 20 | -------------------------------------------------------------------------------- /下载小鹅通视频/2022年12月/1.自动合并版本/selenium启动/谷歌驱动下载地址.txt: -------------------------------------------------------------------------------- 1 | 地址一: 2 | http://chromedriver.storage.googleapis.com/index.html 3 | 4 | 地址二: 5 | https://registry.npmmirror.com/binary.html?path=chromedriver/ -------------------------------------------------------------------------------- /下载小鹅通视频/2022年12月/1.自动合并版本/启动程序指令.txt: -------------------------------------------------------------------------------- 1 | 先打开一个cmd,输入mitmweb -s request_demo.py 2 | 然后再cd进入selenium启动文件夹,输入python selenium_start.py -------------------------------------------------------------------------------- /下载小鹅通视频/2022年12月/1.自动合并版本/安装环境指令.txt: -------------------------------------------------------------------------------- 1 | 一次性安装 2 | pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple 3 | 4 | 分开安装 5 | pip install mitmproxy 6 | pip install selenium 7 | pip install m3u8 8 | pip install requests 9 | pip install bs4 10 | pip install pycryptodome 
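-------------------------------------------------------------------------------- /下载小鹅通视频/2022年12月/1.自动合并版本/密钥解密示例.py: -------------------------------------------------------------------------------- 1 | # python 3.7.9 2 | # 补充示例(仅演示思路,文件名为示意,非定论实现):梳理 request_demo.py 中 m3u8 密钥的两次解密过程 3 | # 假设 key_bytes 为第一次请求密钥链接返回的原始字节,userid 为接口返回的用户ID(长度不短于密钥) 4 | import base64 5 | 6 | def decrypt_m3u8_key(key_bytes, userid): 7 | # 第二次解密:密钥逐字节与 userid 的 utf-8 字节异或 8 | uid = userid.encode('utf-8') 9 | plain = bytes(b ^ uid[i] for i, b in enumerate(key_bytes)) 10 | # 编码成 base64,即可作为 N_m3u8DL-CLI 的 --useKeyBase64 参数 11 | return base64.b64encode(plain).decode() 12 | 13 | if __name__ == '__main__': 14 | demo_key = bytes(16) # 假设的16字节密钥,仅作演示 15 | print(decrypt_m3u8_key(demo_key, 'u_1234567890abcdef12345678'))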
-------------------------------------------------------------------------------- /下载荔枝微课/ffmpeg.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载荔枝微课/ffmpeg.exe
-------------------------------------------------------------------------------- /下载荔枝微课/request_demo.py: -------------------------------------------------------------------------------- 1 | # python 3.7 2 | import mitmproxy.http,json,os,requests 3 | from mitmproxy import ctx 4 | from pathlib import Path 5 | 6 | ''' 7 | 本次爬取的课程地址:https://m.lizhiweike.com/channel2/1192275 8 | ''' 9 | 10 | cookie = '请填写你的Cookie' 11 | filename = None # 下载视频路径 12 | current_filename = os.getcwd().replace('\\','/') # 获取当前路径 13 | title = None # 标题 14 | 15 | class Counter: 16 | 17 | def __init__(self): 18 | self.Referer = 'https://m.lizhiweike.com/channel2/1192275' 19 | self.UserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36' 20 | self.headers = { 21 | 'Referer':self.Referer, 22 | 'Host':'m.lizhiweike.com', 23 | 'User-Agent':self.UserAgent 24 | } 25 | 26 | def request(self, flow: mitmproxy.http.HTTPFlow): 27 | 28 | # 所有请求插入协议头 29 | flow.request.headers['Referer'] = self.Referer 30 | 31 | def response(self, flow: mitmproxy.http.HTTPFlow): 32 | 33 | # 导入全局变量 34 | global filename,title,current_filename,cookie 35 | 36 | if 'lecture' in flow.request.url and 'info' in flow.request.url: 37 | 38 | # 加载 JSON 对象 39 | json_data = json.loads(flow.response.text) 40 | 41 | try: 42 | # 获取当前视频标题 43 | title = json_data['data']['share_info']['share_title'].replace(' ','') 44 | except: 45 | pass 46 | 47 | # 获取视频链接并下载 48 | if 'qcvideo' in flow.request.url: 49 | 50 | # 加载 JSON 对象 51 | json_data = json.loads(flow.response.text) 52 | 53 | # 获取视频URL 54 | video_url = json_data['data']['play_list'][0]['url'] 55 | 56 | print(f'【信息】当前视频标题:{title},视频mp4链接:{video_url}') 57 | 58 | # 如果没有文件夹,就创建文件夹 59 | filename = current_filename + '/下载成功的视频/' 60 | if not os.path.exists(filename): 61 | os.makedirs(filename) 62 | 63 | # 生成mp4存放路径 64 | mp4_filename_path = f'{filename}{title}.mp4' 65 | 66 | headers = { 67 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36', 68 | 'referer':'https://m.lizhiweike.com/channel2/1192275', 69 | 'Cookie':cookie 70 | } 71 | 72 | # 下载视频 73 | html = requests.get(url=video_url,headers=headers) 74 | with open(mp4_filename_path,'wb') as f: 75 | f.write(html.content) 76 | 77 | addons = [ 78 | Counter() 79 | ] -------------------------------------------------------------------------------- /下载荔枝微课/requirements.txt: -------------------------------------------------------------------------------- 1 | asgiref==3.3.4 2 | beautifulsoup4==4.9.3 3 | blinker==1.4 4 | Brotli==1.0.9 5 | bs4==0.0.1 6 | certifi==2021.5.30 7 | cffi==1.14.6 8 | charset-normalizer==2.0.3 9 | click==7.1.2 10 | cryptography==3.2.1 11 | Flask==1.1.4 12 | h11==0.12.0 13 | h2==4.0.0 14 | hpack==4.0.0 15 | hyperframe==6.0.1 16 | idna==3.2 17 | iso8601==0.1.14 18 | itsdangerous==1.1.0 19 | Jinja2==2.11.3 20 | kaitaistruct==0.9 21 | ldap3==2.8.1 22 | m3u8==0.9.0 23 | MarkupSafe==2.0.1 24 | mitmproxy==5.3.0 25 | msgpack==1.0.2 26 | passlib==1.7.4 27 | protobuf==3.13.0 28 | publicsuffix2==2.20191221 29 | pyasn1==0.4.8 30 | pycparser==2.20 31 | pycryptodome==3.10.1 32 | pydivert==2.1.0 33 | pyOpenSSL==19.1.0 34 | 
pyparsing==2.4.7 35 | pyperclip==1.8.2 36 | requests==2.26.0 37 | ruamel.yaml==0.16.13 38 | ruamel.yaml.clib==0.2.6 39 | selenium==3.141.0 40 | six==1.16.0 41 | sortedcontainers==2.2.2 42 | soupsieve==2.2.1 43 | tornado==6.1 44 | typing-extensions==3.10.0.0 45 | urllib3==1.26.6 46 | urwid==2.1.2 47 | Werkzeug==1.0.1 48 | wsproto==0.15.0 49 | zstandard==0.14.1 50 | -------------------------------------------------------------------------------- /下载荔枝微课/selenium启动/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mochazi/Python3Webcrawler/007f4aefe723d931812602e7d1fce8a2e831bac4/下载荔枝微课/selenium启动/chromedriver.exe -------------------------------------------------------------------------------- /下载荔枝微课/selenium启动/selenium_start.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | PROXY='http://127.0.0.1:8080' 4 | 5 | chrome_options = webdriver.ChromeOptions() 6 | chrome_options.add_argument("--proxy-server=127.0.0.1:8080") 7 | chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) # 专业模式 8 | 9 | 10 | browser = webdriver.Chrome(executable_path=r'chromedriver.exe',options=chrome_options)#r 代表的是强制禁止转义 11 | 12 | ''' 13 | 本次爬取的课程地址:https://m.lizhiweike.com/channel2/1192275 14 | ''' 15 | 16 | url = 'https://m.lizhiweike.com/channel2/1192275' 17 | browser.get(url)#访问网站 18 | 19 | -------------------------------------------------------------------------------- /下载荔枝微课/selenium启动/谷歌驱动下载地址.txt: -------------------------------------------------------------------------------- 1 | 地址一: 2 | http://chromedriver.storage.googleapis.com/index.html 3 | 4 | 地址二: 5 | https://registry.npmmirror.com/binary.html?path=chromedriver/ -------------------------------------------------------------------------------- /下载荔枝微课/启动程序指令.txt: -------------------------------------------------------------------------------- 1 | 先打开一个cmd,输入mitmweb -s request_demo.py 2 | 然后再cd进入selenium启动文件夹,输入python selenium_start.py -------------------------------------------------------------------------------- /下载荔枝微课/安装环境指令.txt: -------------------------------------------------------------------------------- 1 | 一次性安装 2 | pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple 3 | 4 | 分开安装 5 | pip install mitmproxy 6 | pip install selenium 7 | pip install m3u8 8 | pip install requests 9 | pip install bs4 10 | pip install pycryptodome -------------------------------------------------------------------------------- /京东商品信息/crawl.py: -------------------------------------------------------------------------------- 1 | #Python3.7 2 | #encoding = utf-8 3 | 4 | import requests,re,json 5 | from bs4 import BeautifulSoup 6 | from urllib import parse 7 | 8 | KEYWORD = parse.quote('python') 9 | 10 | base = 'https://item.jd.com' 11 | headers = { 12 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 13 | 'Connection':'keep-alive', 14 | #参考链接:https://search.jd.com/Search?keyword=python&enc=utf-8&wq=python 15 | } 16 | 17 | 18 | def get_index(url): 19 | #一开始的请求页面 20 | 21 | session = requests.Session() 22 | session.headers = headers 23 | html = session.get(url) 24 | html.encoding = 'GBK' 25 | soup = BeautifulSoup(html.text,'lxml') 26 | items = soup.select('li.gl-item') 27 | 28 | 29 | for item in items: 30 | inner_url = item.select('li.gl-item .gl-i-wrap .p-img a')[0].get('href') 31 | 
print(inner_url) 32 | inner_url = parse.urljoin(base,inner_url)#转成URL格式 33 | 34 | item_id = get_id(inner_url) 35 | 36 | #评论数 37 | comm_num = get_comm_num(inner_url) 38 | #评论接口示例:https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv6501&productId=11993134&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1(get_comm 内部会按 item_id 重新拼接,无需覆盖 inner_url) 39 | 40 | #获取评论 41 | if comm_num>0: 42 | get_comm(inner_url,comm_num,item_id) 43 | 44 | 45 | 46 | 47 | def get_comm(url,comm_num,item_id): 48 | 49 | headers = { 50 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36' 51 | } 52 | good_comments = '' #存放结果 53 | #获取评论 54 | 55 | pages = comm_num//10 56 | if pages>99: 57 | pages = 99 58 | 59 | for page in range(0,pages): 60 | comment_url = 'https://sclub.jd.com/comment/productPageComments.action?'\ 61 | 'callback=fetchJSON_comment98vv4&productId={}&score=0'\ 62 | '&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1'.format(item_id,page) 63 | 64 | json_decoder = requests.get(comment_url,headers=headers).text 65 | try: 66 | if json_decoder: 67 | start = json_decoder.find('{"productAttr":null,') 68 | 69 | end = json_decoder.find(',"afterDays":0}]}')+len(',"afterDays":0}]}') 70 | 71 | content = json.loads(json_decoder[start:end]) 72 | 73 | comments = content['comments'] 74 | 75 | for c in comments: 76 | comm = c['content'] 77 | good_comments+="{}|".format(comm) 78 | 79 | print(good_comments) 80 | except Exception as e: 81 | print(e) 82 | 83 | print(item_id,good_comments) 84 | 85 | def get_shop_info(url):#获取商品信息 86 | shop_data = {} 87 | html = requests.get(url,headers = headers) 88 | soup = BeautifulSoup(html.text,'lxml') 89 | try: 90 | shop_name = soup.select('div.mt h3 a') 91 | except Exception as e: 92 | raise e 93 | 94 | def get_index_lists(html):#获取索引列表 95 | html.encoding = 'utf8' 96 | soup = BeautifulSoup(html.text,'lxml') 97 | lis = soup.find_all('li',attrs = {"class":"gl-item"}) 98 | for li in lis: 99 | number = li.find('div',attrs = {"class":"p-commit"}).strong 100 | print(number) 101 | 102 | def get_comm_num(url):#获取评论数量 103 | 104 | item_id = get_id(url) 105 | comm_url = 'https://club.jd.com/comment/productCommentSummaries.action?'\ 106 | 'referenceIds={}&callback=jQuery3096445'.format(item_id) 107 | comment = requests.get(comm_url,headers = headers).text 108 | start = comment.find('{"CommentsCount":')#起始 109 | end = comment.find('"PoorRateStyle":0}]}')+len('"PoorRateStyle":0}]}')#结尾 110 | try: 111 | content = json.loads(comment[start:end])['CommentsCount']#取出json 112 | except: 113 | return 0 114 | comm_num = content[0]['CommentCount'] 115 | return comm_num 116 | 117 | 118 | def get_id(url):#匹配id 119 | id = re.compile(r'\d+') 120 | res = id.findall(url) 121 | return res[0] 122 | 123 | 124 | if __name__ == '__main__': 125 | 126 | for i in range(1,30,2): 127 | url = 'https://search.jd.com/Search?'\ 128 | 'keyword={}&page={}'.format(KEYWORD,i) 129 | get_index(url) 130 | 131 | 132 | 
-------------------------------------------------------------------------------- /房天下/crawl.py: -------------------------------------------------------------------------------- 1 | #Python3.7 2 | #encoding = utf-8 3 | 4 | import requests, re 5 | from lxml import etree 6 | from urllib import parse 7 | from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor 8 | from db import sess, House 9 | 10 | headers = { 11 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 12 | 'referer':'https://fs.zu.fang.com/house-a0617/i32/', 13 | #参考链接 https://zu.fang.com/house-a01 14 | #请填写你的Cookie 15 | } 16 | 17 | session = requests.session() #保持会话状态,不必重复请求 18 | session.headers = headers 19 | 20 | 21 | #获取str中的数字 22 | def get_number(text): 23 | number = re.compile(r'\d+') 24 | return number.findall(text)[0] 25 | 26 | 27 | #获取页面的page数目 28 | def get_pages(html): 29 | soup = etree.HTML(html.text) 30 | pages = soup.xpath("//div[@class='fanye']/span/text()") 31 | number = get_number(pages[0]) 32 | if number: 33 | return int(number) 34 | return None 35 | 36 | 37 | def get_house_data(url, *args): 38 | headers = { 39 | 'Connection': 'keep-alive', #长连接 40 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 41 | 'Referer': 'https://fs.zu.fang.com/house-a0617/i33/', 42 | #参考链接 https://zu.fang.com/house-a01 43 | #请填写你的Cookie 44 | } 45 | 46 | loca_url = re.compile("(.*?)") #获取跳转链接 47 | xiangqing_url = re.compile('location.href="(.*?)"') 48 | 49 | session = requests.session() #长连接 保持会话 50 | session.headers = headers 51 | 52 | url = 'http://search.fang.com/captcha-854085290c4833ba19/redirect?h=' + url 53 | 54 | html = session.get(url) 55 | 56 | one_url = xiangqing_url.findall(html.text)[-1] #第一次跳转 57 | html = session.get(one_url) 58 | 59 | two_url = xiangqing_url.findall(html.text)[-1] #第二次跳转 60 | html = session.get(two_url) 61 | 62 | soup = etree.HTML(html.text) 63 | xiangqing = soup.xpath('//div[@class="fyms_con floatl gray3"]/text()') 64 | xiangqing = '|'.join(xiangqing) 65 | print('block:{}\t标题:{}\t租金:{}\t详情:{}'.format(args[0], args[2], args[1],xiangqing)) 66 | 67 | s = sess() 68 | try: 69 | house = House(block=args[0], 70 | title=args[2], 71 | rent=args[1], 72 | data=xiangqing) 73 | 74 | s.add(house) 75 | s.commit() 76 | print('commit') 77 | except Exception as e: 78 | print('rollback', e) 79 | s.rollback() 80 | 81 | 82 | #获取页面信息 83 | def get_data_next(url): 84 | html = session.get(url) 85 | soup = etree.HTML(html.text) 86 | dls = soup.xpath("//div[@class='houseList']/dl") 87 | block = soup.xpath("//span[@class='selestfinds']/a/text()") 88 | rfss = soup.xpath("//input[@id='baidid']/@value")[0] 89 | for dl in dls: 90 | try: 91 | title = dl.xpath('dd/p/a/text()')[0] 92 | rent = dl.xpath("dd/div/p/span[@class='price']/text()")[0] 93 | href = parse.urljoin('https://zu.fang.com', 94 | dl.xpath('dd/p/a/@href')[0]) #拼接链接 95 | get_house_data(href, block, rent, title) 96 | except IndexError as e: 97 | print('dl error', e) 98 | 99 | 100 | #获取页面 101 | def get_data(html): 102 | pages = get_pages(html) 103 | if not pages: 104 | pages = 1 105 | urls = [ 106 | 'https://zu.fang.com/house-a01/i3%d/' % i for i in range(1, pages + 1) 107 | ] 108 | 109 | with ProcessPoolExecutor(max_workers=2) as t: 110 | 111 | for url in urls: 112 | t.submit(get_data_next, url) 113 | 114 | 115 | #进入首页 116 | def get_index(url): 117 | html = session.get(url, headers=headers) 118 | if html.status_code == 200: 119 | get_data(html) 120 | else: 121 | print('请求页面{}出错'.format(url)) 122 | 123 | 124 | def main(): 125 | urls = ['https://zu.fang.com/house-a0{}/'.format(i) for i in range(1, 17)] 126 | with ProcessPoolExecutor(max_workers=2) as p: 127 | for url in urls: 128 | p.submit(get_index, url) 129 | 130 | 131 | if __name__ == '__main__': 132 | main() 133 | session.close() 134 | -------------------------------------------------------------------------------- /房天下/db.py: 
-------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | from sqlalchemy import Column,Integer,String,Text 3 | from sqlalchemy.orm import sessionmaker,scoped_session 4 | from sqlalchemy.ext.declarative import declarative_base 5 | 6 | BASE = declarative_base()#创建基类 7 | 8 | #此处没有使用pymysql的驱动 9 | #请安装pip install mysql-connector-python 10 | #engine中的 mysqlconnector 为 mysql官网驱动 11 | engine = create_engine( 12 | "mysql+mysqlconnector://root:root@127.0.0.1:3306/test?charset=utf8",#确定编码格式 13 | max_overflow = 500,#超过连接池大小外最多可以创建的连接 14 | pool_size = 100,#连接池大小 15 | echo = False,#调试信息展示 16 | ) 17 | 18 | class House(BASE):#继承基类 19 | __tablename__ = 'house' #表名字 20 | id = Column(Integer,primary_key = True,autoincrement = True) 21 | block = Column(String(125)) 22 | title = Column(String(125)) 23 | rent = Column(String(125)) 24 | data = Column(Text()) 25 | 26 | BASE.metadata.create_all(engine)#通过基类创建表 27 | Session = sessionmaker(engine) 28 | sess = scoped_session(Session) 29 | 30 | 
-------------------------------------------------------------------------------- /新版QQ音乐/README.md: -------------------------------------------------------------------------------- 1 | **注意事项** 2 | - [QQ音乐爬虫原理视频](https://www.bilibili.com/video/BV1pk4y1m7TG) 3 | - `execjs`依赖于`NodeJS`,请务必提前安装(本项目的开发环境为`NodeJS v14.6.0`)。 4 | - `cd`到当前文件夹,执行`python demo.py`即可 5 | - `demo.py`为没有入库版,只爬取一个分类,没开多进程,方便大家理解 6 | - 请务必注意`demo.py` 的`filename`和`with open`的项目路径问题 7 | - `get_singer_mid(index)` 方法决定分类爬取 
-------------------------------------------------------------------------------- /新版QQ音乐/crawl.py: -------------------------------------------------------------------------------- 1 | #Python3.7 2 | #encoding = utf-8 3 | 4 | import execjs,requests,math,os,threading 5 | from urllib import parse 6 | from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor 7 | from db import SQLsession,Song 8 | 9 | lock = threading.Lock() 10 | 11 | headers = { 12 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 13 | 'Referer':'https://y.qq.com/portal/singer_list.html', 14 | #参考链接 https://y.qq.com/portal/singer_list.html#page=1&index=1& 15 | } 16 | 17 | session = SQLsession() 18 | 19 | def get_sign(data): 20 | 21 | with open('./新版QQ音乐/get_sign.js','r',encoding='utf-8') as f: 22 | text = f.read() 23 | 24 | js_data = execjs.compile(text) 25 | sign = js_data.call('get_sign',data) 26 | return sign 27 | 28 | 29 | def myProcess(): 30 | #把歌手按照首字母分为27类 31 | with ProcessPoolExecutor(max_workers = 2) as p:#进程池最多2个进程,共27个分类任务 32 | for i in range(1,28):#28 33 | p.submit(get_singer_mid,i) 34 | 35 | 36 | def get_singer_mid(index): 37 | #index = 1-----27 38 | #打开歌手列表页面,找出singerList,找出所有歌手的数目,除以80,构造后续页面获取page歌手 39 | #找出mid, 用于歌手详情页 40 | data = '{"comm":{"ct":24,"cv":0},"singerList":'\ 41 | '{"module":"Music.SingerListServer","method":"get_singer_list","param":'\ 42 | '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":0,"cur_page":1}}}'%(str(index)) 43 | sign = get_sign(data) 44 | 45 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'\ 46 | '&sign={}'\ 47 | '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\ 48 | '&notice=0&platform=yqq.json&needNewCode=0'\ 49 | '&data={}'.format(sign,parse.quote(data)) 50 | 51 | html = requests.get(url,headers = headers).json() 52 | 53 | total = html['singerList']['data']['total']#多少个歌手 54 | 55 | pages = int(math.floor(int(total)/80))#向下取整 56 | 57 | thread_number = max(pages,1)#防止pages为0时max_workers报错 58 | Thread = ThreadPoolExecutor(max_workers = thread_number) 59 | 60 | sin = 0 61 | #分页迭代每一个字母下的所有页面歌手 62 | for page in range(1,pages+2): 63 | 64 | data = '{"comm":{"ct":24,"cv":0},"singerList":'\ 65 | '{"module":"Music.SingerListServer","method":"get_singer_list","param":'\ 66 | '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":%s,"cur_page":%s}}}'%(str(index),str(sin),str(page)) 67 | sign = get_sign(data) 68 | 69 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'\ 70 | '&sign={}'\ 71 | '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\ 72 | '&notice=0&platform=yqq.json&needNewCode=0'\ 73 | '&data={}'.format(sign,parse.quote(data)) 74 | 75 | html = requests.get(url,headers = headers).json() 76 | 77 | sings = html['singerList']['data']['singerlist'] 78 | 79 | for sing in sings: 80 | 81 | singer_name = sing['singer_name'] #获取歌手名字 82 | mid = sing['singer_mid'] #获取歌手mid 83 | 84 | Thread.submit(get_singer_data,mid = mid, 85 | singer_name = singer_name,) 86 | sin+=80 87 | 88 | #获取歌手信息 89 | def get_singer_data(mid,singer_name): 90 | #获取歌手mid,进入歌手详情页,也就是每一个歌手歌曲所在页面 91 | #找出歌手的歌曲信息页 92 | 93 | data = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'\ 94 | '{"order":1,"singerMid":"%s","begin":0,"num":10}'\ 95 | ',"module":"musichall.song_list_server"}}'%(str(mid)) 96 | 97 | sign = get_sign(data) 98 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'\ 99 | '&g_tk=5381&sign={}&loginUin=0'\ 100 | '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\ 101 | '&data={}'.format(sign,parse.quote(data)) 102 | 103 | html = requests.get(url,headers = headers).json() 104 | 105 | songs_num = html['singerSongList']['data']['totalNum']#获取歌曲总数 106 | 107 | 108 | for number in range(0,songs_num,100): 109 | 110 | data = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'\ 111 | '{"order":1,"singerMid":"%s","begin":%s,"num":%s}'\ 112 | ',"module":"musichall.song_list_server"}}'%(str(mid),str(number),str(songs_num)) 113 | 114 | sign = get_sign(data) 115 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'\ 116 | '&g_tk=5381&sign={}&loginUin=0'\ 117 | '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\ 118 | '&data={}'.format(sign,parse.quote(data)) 119 | 120 | html = requests.get(url,headers = headers).json() 121 | 122 | datas = html['singerSongList']['data']['songList'] 123 | 124 | for d in datas: 125 | sing_name = d['songInfo']['title'] 126 | song_mid = d['songInfo']['mid'] 127 | try: 128 | lock.acquire()#锁上 129 | 130 | session.add(Song(song_name = sing_name, 131 | song_singer = singer_name, 132 | song_mid = song_mid)) 133 | session.commit() 134 | 135 | lock.release()#解锁 136 | print('commit') 137 | except: 138 | session.rollback();lock.release()#异常时同样释放锁,避免死锁 139 | print('rollback') 140 | 141 | 142 | print('歌手名字:{}\t歌曲名字:{}\t歌曲ID:{}'.format(singer_name,sing_name,song_mid)) 143 | download(song_mid,sing_name,singer_name) 144 | 145 | 146 | def download(song_mid,sing_name,singer_name): 147 | 148 | qq_number = '1641202711'#请修改你的QQ号 149 | data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch"'\ 150 | ',"param":{"guid":"4803422090","calltype":0,"userip":""}},'\ 151 | '"req_0":{"module":"vkey.GetVkeyServer","method":"CgiGetVkey",'\ 152 | '"param":{"guid":"4803422090","songmid":["%s"],"songtype":[0],'\
'"uin":"%s","loginflag":1,"platform":"20"}},"comm":{"uin":%s,"format":"json","ct":24,"cv":0}}'%(str(song_mid),str(qq_number),str(qq_number)) 154 | 155 | sign = get_sign(data) 156 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getplaysongvkey27494207511290925'\ 157 | '&g_tk=1291538537&sign={}&loginUin={}'\ 158 | '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0'\ 159 | '&platform=yqq.json&needNewCode=0&data={}'.format(sign,qq_number,parse.quote(data)) 160 | 161 | html = requests.get(url,headers = headers).json() 162 | 163 | try: 164 | purl = html['req_0']['data']['midurlinfo'][0]['purl'] 165 | 166 | 167 | url = 'https://dl.stream.qqmusic.qq.com/{}'.format(purl) 168 | 169 | html = requests.get(url,headers = headers,verify=False) 170 | 171 | html.encoding = 'utf-8' 172 | 173 | sing_file_name = '{} -- {}'.format(sing_name,singer_name) 174 | 175 | filename = './新版QQ音乐/歌曲' 176 | 177 | if html.status_code != 403: 178 | if not os.path.exists(filename): 179 | os.makedirs(filename) 180 | 181 | with open('./新版QQ音乐/歌曲/{}.m4a'.format(sing_file_name),'wb') as f: 182 | print('\n正在下载{}歌曲.....\n'.format(sing_file_name)) 183 | f.write(html.content) 184 | 185 | except: 186 | print('查询权限失败,或没有查到对应的歌曲') 187 | 188 | 189 | 190 | 191 | 192 | if __name__ == "__main__": 193 | myProcess() 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /新版QQ音乐/db.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column,Integer,String,create_engine 2 | from sqlalchemy.orm import sessionmaker,scoped_session 3 | from sqlalchemy.ext.declarative import declarative_base 4 | 5 | #此处没有使用pymysql的驱动 6 | #请安装pip install mysql-connector-python 7 | #engine中的 mysqlconnector 为 mysql官网驱动 8 | engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/test?charset=utf8', 9 | max_overflow = 500,#超过连接池大小外最多可以创建的链接 10 | pool_size = 100,#连接池大小 11 | echo = False,#调试信息展示 12 | ) 13 | Base = declarative_base() 14 | 15 | class Song(Base): 16 | __tablename__ = 'song' 17 | song_id = Column(Integer,primary_key = True,autoincrement = True) 18 | song_name = Column(String(64)) 19 | song_ablum = Column(String(64)) 20 | song_mid = Column(String(50)) 21 | song_singer = Column(String(50)) 22 | Base.metadata.create_all(engine) 23 | 24 | DBsession = sessionmaker(bind = engine) 25 | 26 | SQLsession = scoped_session(DBsession) 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /新版QQ音乐/demo.py: -------------------------------------------------------------------------------- 1 | #Python3.7 2 | #encoding = utf-8 3 | 4 | import execjs,requests,math,os,threading 5 | from urllib import parse 6 | from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor 7 | from db import SQLsession,Song 8 | 9 | # lock = threading.Lock() 10 | 11 | headers = { 12 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 13 | 'Referer':'https://y.qq.com/portal/singer_list.html', 14 | #参考链接 https://y.qq.com/portal/singer_list.html#page=1&index=1& 15 | } 16 | 17 | # session = SQLsession() 18 | 19 | def get_sign(data): 20 | 21 | with open('./get_sign.js','r',encoding='utf-8') as f: 22 | text = f.read() 23 | 24 | js_data = execjs.compile(text) 25 | sign = js_data.call('get_sign',data) 26 | return sign 27 | 28 | 29 | def myProcess(): 30 | #把歌手按照首字母分为27类 31 | with ProcessPoolExecutor(max_workers = 2) as p:#创建27个进程 32 
| for i in range(1,28):#28 33 | p.submit(get_singer_mid,i) 34 | 35 | 36 | def get_singer_mid(index): 37 | #index = 1-----27 38 | #打开歌手列表页面,找出singerList,找出所有歌手的数目,除于80,构造后续页面获取page歌手 39 | #找出mid, 用于歌手详情页 40 | data = '{"comm":{"ct":24,"cv":0},"singerList":'\ 41 | '{"module":"Music.SingerListServer","method":"get_singer_list","param":'\ 42 | '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":0,"cur_page":1}}}'%(str(index)) 43 | sign = get_sign(data) 44 | 45 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'\ 46 | '&sign={}'\ 47 | '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\ 48 | '¬ice=0&platform=yqq.json&needNewCode=0'\ 49 | '&data={}'.format(sign,parse.quote(data)) 50 | 51 | html = requests.get(url,headers = headers).json() 52 | 53 | total = html['singerList']['data']['total']#多少个歌手 54 | 55 | pages = int(math.floor(int(total)/80))#向下取整 56 | 57 | thread_number = pages 58 | Thread = ThreadPoolExecutor(max_workers = thread_number) 59 | 60 | sin = 0 61 | #分页迭代每一个字母下的所有页面歌手 62 | for page in range(1,pages+2): 63 | 64 | data = '{"comm":{"ct":24,"cv":0},"singerList":'\ 65 | '{"module":"Music.SingerListServer","method":"get_singer_list","param":'\ 66 | '{"area":-100,"sex":-100,"genre":-100,"index":%s,"sin":%s,"cur_page":%s}}}'%(str(index),str(sin),str(page)) 67 | sign = get_sign(data) 68 | 69 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI6720748185279282&g_tk=5381'\ 70 | '&sign={}'\ 71 | '&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\ 72 | '¬ice=0&platform=yqq.json&needNewCode=0'\ 73 | '&data={}'.format(sign,parse.quote(data)) 74 | 75 | html = requests.get(url,headers = headers).json() 76 | 77 | sings = html['singerList']['data']['singerlist'] 78 | 79 | for sing in sings: 80 | 81 | singer_name = sing['singer_name'] #获取歌手名字 82 | mid = sing['singer_mid'] #获取歌手mid 83 | 84 | Thread.submit(get_singer_data,mid = mid, 85 | singer_name = singer_name,) 86 | sin+=80 87 | 88 | #获取歌手信息 89 | def get_singer_data(mid,singer_name): 90 | #获取歌手mid,进入歌手详情页,也就是每一个歌手歌曲所在页面 91 | #找出歌手的歌曲信息页 92 | 93 | data = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'\ 94 | '{"order":1,"singerMid":"%s","begin":0,"num":10}'\ 95 | ',"module":"musichall.song_list_server"}}'%(str(mid)) 96 | 97 | sign = get_sign(data) 98 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'\ 99 | '&g_tk=5381&sign={}&loginUin=0'\ 100 | '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq.json&needNewCode=0'\ 101 | '&data={}'.format(sign,parse.quote(data)) 102 | 103 | html = requests.get(url,headers = headers).json() 104 | 105 | songs_num = html['singerSongList']['data']['totalNum']#获取歌曲总数 106 | 107 | 108 | for number in range(0,songs_num,100): 109 | 110 | data = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList","param":'\ 111 | '{"order":1,"singerMid":"%s","begin":%s,"num":%s}'\ 112 | ',"module":"musichall.song_list_server"}}'%(str(mid),str(number),str(songs_num)) 113 | 114 | sign = get_sign(data) 115 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getSingerSong4707786209273719'\ 116 | '&g_tk=5381&sign={}&loginUin=0'\ 117 | '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq.json&needNewCode=0'\ 118 | '&data={}'.format(sign,parse.quote(data)) 119 | 120 | html = requests.get(url,headers = headers).json() 121 | 122 | datas = html['singerSongList']['data']['songList'] 123 | 124 | for d in datas: 125 | sing_name = d['songInfo']['title'] 126 | 
song_mid = d['songInfo']['mid'] 127 | # try: 128 | # lock.acquire()#锁上 129 | # 130 | # session.add(Song(song_name = sing_name, 131 | # song_singer = singer_name, 132 | # song_mid = song_mid)) 133 | # session.commit() 134 | # 135 | # lock.release()#解锁 136 | # print('commit') 137 | # except: 138 | # session.rollback() 139 | # print('rollbeak') 140 | 141 | 142 | print('歌手名字:{}\t歌曲名字:{}\t歌曲ID:{}'.format(singer_name,sing_name,song_mid)) 143 | download(song_mid,sing_name,singer_name) 144 | 145 | 146 | def download(song_mid,sing_name,singer_name): 147 | 148 | qq_number = '1641202711'#请修改你的QQ号 149 | data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch"'\ 150 | ',"param":{"guid":"4803422090","calltype":0,"userip":""}},'\ 151 | '"req_0":{"module":"vkey.GetVkeyServer","method":"CgiGetVkey",'\ 152 | '"param":{"guid":"4803422090","songmid":["%s"],"songtype":[0],'\ 153 | '"uin":"%s","loginflag":1,"platform":"20"}},"comm":{"uin":%s,"format":"json","ct":24,"cv":0}}'%(str(song_mid),str(qq_number),str(qq_number)) 154 | 155 | sign = get_sign(data) 156 | url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getplaysongvkey27494207511290925'\ 157 | '&g_tk=1291538537&sign={}&loginUin={}'\ 158 | '&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0'\ 159 | '&platform=yqq.json&needNewCode=0&data={}'.format(sign,qq_number,parse.quote(data)) 160 | 161 | html = requests.get(url,headers = headers).json() 162 | 163 | try: 164 | purl = html['req_0']['data']['midurlinfo'][0]['purl'] 165 | 166 | 167 | url = 'https://dl.stream.qqmusic.qq.com/{}'.format(purl) 168 | 169 | html = requests.get(url,headers = headers,verify=False) 170 | 171 | html.encoding = 'utf-8' 172 | 173 | sing_file_name = '{} -- {}'.format(sing_name,singer_name) 174 | 175 | filename = './歌曲' 176 | 177 | if html.status_code != 403: 178 | if not os.path.exists(filename): 179 | os.makedirs(filename) 180 | 181 | with open('./歌曲/{}.m4a'.format(sing_file_name),'wb') as f: 182 | print('\n正在下载{}歌曲.....\n'.format(sing_file_name)) 183 | f.write(html.content) 184 | 185 | except: 186 | print('查询权限失败,或没有查到对应的歌曲') 187 | 188 | 189 | 190 | 191 | 192 | if __name__ == "__main__": 193 | # myProcess() 194 | get_singer_mid(1) 195 | 196 | 197 | -------------------------------------------------------------------------------- /新版QQ音乐/get_sign.js: -------------------------------------------------------------------------------- 1 | 2 | this.window = this; 3 | var sign = null; 4 | 5 | !function(n, t) { 6 | "object" == typeof exports && "undefined" != typeof module ? module.exports = t() : "function" == typeof define && define.amd ? 
define(t) : (n = n || self).getSecuritySign = t() 7 | } (this, 8 | function() { 9 | "use strict"; 10 | var n = function() { 11 | if ("undefined" != typeof self) return self; 12 | if ("undefined" != typeof window) return window; 13 | if ("undefined" != typeof global) return global; 14 | throw new Error("unable to locate global object") 15 | } (); 16 | n.__sign_hash_20200305 = function(n) { 17 | function l(n, t) { 18 | var o = (65535 & n) + (65535 & t); 19 | return (n >> 16) + (t >> 16) + (o >> 16) << 16 | 65535 & o 20 | } 21 | function r(n, t, o, e, u, p) { 22 | return l((i = l(l(t, n), l(e, p))) << (r = u) | i >>> 32 - r, o); 23 | var i, r 24 | } 25 | function g(n, t, o, e, u, p, i) { 26 | return r(t & o | ~t & e, n, t, u, p, i) 27 | } 28 | function a(n, t, o, e, u, p, i) { 29 | return r(t & e | o & ~e, n, t, u, p, i) 30 | } 31 | function s(n, t, o, e, u, p, i) { 32 | return r(t ^ o ^ e, n, t, u, p, i) 33 | } 34 | function v(n, t, o, e, u, p, i) { 35 | return r(o ^ (t | ~e), n, t, u, p, i) 36 | } 37 | function t(n) { 38 | return function(n) { 39 | var t, o = ""; 40 | for (t = 0; t < 32 * n.length; t += 8) o += String.fromCharCode(n[t >> 5] >>> t % 32 & 255); 41 | return o 42 | } (function(n, t) { 43 | n[t >> 5] |= 128 << t % 32, 44 | n[14 + (t + 64 >>> 9 << 4)] = t; 45 | var o, e, u, p, i, r = 1732584193, 46 | f = -271733879, 47 | h = -1732584194, 48 | c = 271733878; 49 | for (o = 0; o < n.length; o += 16) r = g(e = r, u = f, p = h, i = c, n[o], 7, -680876936), 50 | c = g(c, r, f, h, n[o + 1], 12, -389564586), 51 | h = g(h, c, r, f, n[o + 2], 17, 606105819), 52 | f = g(f, h, c, r, n[o + 3], 22, -1044525330), 53 | r = g(r, f, h, c, n[o + 4], 7, -176418897), 54 | c = g(c, r, f, h, n[o + 5], 12, 1200080426), 55 | h = g(h, c, r, f, n[o + 6], 17, -1473231341), 56 | f = g(f, h, c, r, n[o + 7], 22, -45705983), 57 | r = g(r, f, h, c, n[o + 8], 7, 1770035416), 58 | c = g(c, r, f, h, n[o + 9], 12, -1958414417), 59 | h = g(h, c, r, f, n[o + 10], 17, -42063), 60 | f = g(f, h, c, r, n[o + 11], 22, -1990404162), 61 | r = g(r, f, h, c, n[o + 12], 7, 1804603682), 62 | c = g(c, r, f, h, n[o + 13], 12, -40341101), 63 | h = g(h, c, r, f, n[o + 14], 17, -1502002290), 64 | r = a(r, f = g(f, h, c, r, n[o + 15], 22, 1236535329), h, c, n[o + 1], 5, -165796510), 65 | c = a(c, r, f, h, n[o + 6], 9, -1069501632), 66 | h = a(h, c, r, f, n[o + 11], 14, 643717713), 67 | f = a(f, h, c, r, n[o], 20, -373897302), 68 | r = a(r, f, h, c, n[o + 5], 5, -701558691), 69 | c = a(c, r, f, h, n[o + 10], 9, 38016083), 70 | h = a(h, c, r, f, n[o + 15], 14, -660478335), 71 | f = a(f, h, c, r, n[o + 4], 20, -405537848), 72 | r = a(r, f, h, c, n[o + 9], 5, 568446438), 73 | c = a(c, r, f, h, n[o + 14], 9, -1019803690), 74 | h = a(h, c, r, f, n[o + 3], 14, -187363961), 75 | f = a(f, h, c, r, n[o + 8], 20, 1163531501), 76 | r = a(r, f, h, c, n[o + 13], 5, -1444681467), 77 | c = a(c, r, f, h, n[o + 2], 9, -51403784), 78 | h = a(h, c, r, f, n[o + 7], 14, 1735328473), 79 | r = s(r, f = a(f, h, c, r, n[o + 12], 20, -1926607734), h, c, n[o + 5], 4, -378558), 80 | c = s(c, r, f, h, n[o + 8], 11, -2022574463), 81 | h = s(h, c, r, f, n[o + 11], 16, 1839030562), 82 | f = s(f, h, c, r, n[o + 14], 23, -35309556), 83 | r = s(r, f, h, c, n[o + 1], 4, -1530992060), 84 | c = s(c, r, f, h, n[o + 4], 11, 1272893353), 85 | h = s(h, c, r, f, n[o + 7], 16, -155497632), 86 | f = s(f, h, c, r, n[o + 10], 23, -1094730640), 87 | r = s(r, f, h, c, n[o + 13], 4, 681279174), 88 | c = s(c, r, f, h, n[o], 11, -358537222), 89 | h = s(h, c, r, f, n[o + 3], 16, 
-722521979), 90 | f = s(f, h, c, r, n[o + 6], 23, 76029189), 91 | r = s(r, f, h, c, n[o + 9], 4, -640364487), 92 | c = s(c, r, f, h, n[o + 12], 11, -421815835), 93 | h = s(h, c, r, f, n[o + 15], 16, 530742520), 94 | r = v(r, f = s(f, h, c, r, n[o + 2], 23, -995338651), h, c, n[o], 6, -198630844), 95 | c = v(c, r, f, h, n[o + 7], 10, 1126891415), 96 | h = v(h, c, r, f, n[o + 14], 15, -1416354905), 97 | f = v(f, h, c, r, n[o + 5], 21, -57434055), 98 | r = v(r, f, h, c, n[o + 12], 6, 1700485571), 99 | c = v(c, r, f, h, n[o + 3], 10, -1894986606), 100 | h = v(h, c, r, f, n[o + 10], 15, -1051523), 101 | f = v(f, h, c, r, n[o + 1], 21, -2054922799), 102 | r = v(r, f, h, c, n[o + 8], 6, 1873313359), 103 | c = v(c, r, f, h, n[o + 15], 10, -30611744), 104 | h = v(h, c, r, f, n[o + 6], 15, -1560198380), 105 | f = v(f, h, c, r, n[o + 13], 21, 1309151649), 106 | r = v(r, f, h, c, n[o + 4], 6, -145523070), 107 | c = v(c, r, f, h, n[o + 11], 10, -1120210379), 108 | h = v(h, c, r, f, n[o + 2], 15, 718787259), 109 | f = v(f, h, c, r, n[o + 9], 21, -343485551), 110 | r = l(r, e), 111 | f = l(f, u), 112 | h = l(h, p), 113 | c = l(c, i); 114 | return [r, f, h, c] 115 | } (function(n) { 116 | var t, o = []; 117 | for (o[(n.length >> 2) - 1] = void 0, t = 0; t < o.length; t += 1) o[t] = 0; 118 | for (t = 0; t < 8 * n.length; t += 8) o[t >> 5] |= (255 & n.charCodeAt(t / 8)) << t % 32; 119 | return o 120 | } (n), 8 * n.length)) 121 | } 122 | function o(n) { 123 | return t(unescape(encodeURIComponent(n))) 124 | } 125 | return function(n) { 126 | var t, o, e = "0123456789abcdef", 127 | u = ""; 128 | for (o = 0; o < n.length; o += 1) t = n.charCodeAt(o), 129 | u += e.charAt(t >>> 4 & 15) + e.charAt(15 & t); 130 | return u 131 | } (o(n)) 132 | }, 133 | function r(f, h, c, l, g) { 134 | g = g || [[this], [{}]]; 135 | for (var t = [], o = null, n = [function() { 136 | return ! 0 137 | }, 138 | function() {}, 139 | function() { 140 | g.length = c[h++] 141 | }, 142 | function() { 143 | g.push(c[h++]) 144 | }, 145 | function() { 146 | g.pop() 147 | }, 148 | function() { 149 | var n = c[h++], 150 | t = g[g.length - 2 - n]; 151 | g[g.length - 2 - n] = g.pop(), 152 | g.push(t) 153 | }, 154 | function() { 155 | g.push(g[g.length - 1]) 156 | }, 157 | function() { 158 | g.push([g.pop(), g.pop()].reverse()) 159 | }, 160 | function() { 161 | g.push([l, g.pop()]) 162 | }, 163 | function() { 164 | g.push([g.pop()]) 165 | }, 166 | function() { 167 | var n = g.pop(); 168 | g.push(n[0][n[1]]) 169 | }, 170 | function() { 171 | g.push(g[g.pop()[0]][0]) 172 | }, 173 | function() { 174 | var n = g[g.length - 2]; 175 | n[0][n[1]] = g[g.length - 1] 176 | }, 177 | function() { 178 | g[g[g.length - 2][0]][0] = g[g.length - 1] 179 | }, 180 | function() { 181 | var n = g.pop(), 182 | t = g.pop(); 183 | g.push([t[0][t[1]], n]) 184 | }, 185 | function() { 186 | var n = g.pop(); 187 | g.push([g[g.pop()][0], n]) 188 | }, 189 | function() { 190 | var n = g.pop(); 191 | g.push(delete n[0][n[1]]) 192 | }, 193 | function() { 194 | var n = []; 195 | for (var t in g.pop()) n.push(t); 196 | g.push(n) 197 | }, 198 | function() { 199 | g[g.length - 1].length ? 
g.push(g[g.length - 1].shift(), !0) : g.push(void 0, !1) 200 | }, 201 | function() { 202 | var n = g[g.length - 2], 203 | t = Object.getOwnPropertyDescriptor(n[0], n[1]) || { 204 | configurable: !0, 205 | enumerable: !0 206 | }; 207 | t.get = g[g.length - 1], 208 | Object.defineProperty(n[0], n[1], t) 209 | }, 210 | function() { 211 | var n = g[g.length - 2], 212 | t = Object.getOwnPropertyDescriptor(n[0], n[1]) || { 213 | configurable: !0, 214 | enumerable: !0 215 | }; 216 | t.set = g[g.length - 1], 217 | Object.defineProperty(n[0], n[1], t) 218 | }, 219 | function() { 220 | h = c[h++] 221 | }, 222 | function() { 223 | var n = c[h++]; 224 | g[g.length - 1] && (h = n) 225 | }, 226 | function() { 227 | throw g[g.length - 1] 228 | }, 229 | function() { 230 | var n = c[h++], 231 | t = n ? g.slice( - n) : []; 232 | g.length -= n, 233 | g.push(g.pop().apply(l, t)) 234 | }, 235 | function() { 236 | var n = c[h++], 237 | t = n ? g.slice( - n) : []; 238 | g.length -= n; 239 | var o = g.pop(); 240 | g.push(o[0][o[1]].apply(o[0], t)) 241 | }, 242 | function() { 243 | var n = c[h++], 244 | t = n ? g.slice( - n) : []; 245 | g.length -= n, 246 | t.unshift(null), 247 | g.push(new(Function.prototype.bind.apply(g.pop(), t))) 248 | }, 249 | function() { 250 | var n = c[h++], 251 | t = n ? g.slice( - n) : []; 252 | g.length -= n, 253 | t.unshift(null); 254 | var o = g.pop(); 255 | g.push(new(Function.prototype.bind.apply(o[0][o[1]], t))) 256 | }, 257 | function() { 258 | g.push(!g.pop()) 259 | }, 260 | function() { 261 | g.push(~g.pop()) 262 | }, 263 | function() { 264 | g.push(typeof g.pop()) 265 | }, 266 | function() { 267 | g[g.length - 2] = g[g.length - 2] == g.pop() 268 | }, 269 | function() { 270 | g[g.length - 2] = g[g.length - 2] === g.pop() 271 | }, 272 | function() { 273 | g[g.length - 2] = g[g.length - 2] > g.pop() 274 | }, 275 | function() { 276 | g[g.length - 2] = g[g.length - 2] >= g.pop() 277 | }, 278 | function() { 279 | g[g.length - 2] = g[g.length - 2] << g.pop() 280 | }, 281 | function() { 282 | g[g.length - 2] = g[g.length - 2] >> g.pop() 283 | }, 284 | function() { 285 | g[g.length - 2] = g[g.length - 2] >>> g.pop() 286 | }, 287 | function() { 288 | g[g.length - 2] = g[g.length - 2] + g.pop() 289 | }, 290 | function() { 291 | g[g.length - 2] = g[g.length - 2] - g.pop() 292 | }, 293 | function() { 294 | g[g.length - 2] = g[g.length - 2] * g.pop() 295 | }, 296 | function() { 297 | g[g.length - 2] = g[g.length - 2] / g.pop() 298 | }, 299 | function() { 300 | g[g.length - 2] = g[g.length - 2] % g.pop() 301 | }, 302 | function() { 303 | g[g.length - 2] = g[g.length - 2] | g.pop() 304 | }, 305 | function() { 306 | g[g.length - 2] = g[g.length - 2] & g.pop() 307 | }, 308 | function() { 309 | g[g.length - 2] = g[g.length - 2] ^ g.pop() 310 | }, 311 | function() { 312 | g[g.length - 2] = g[g.length - 2] in g.pop() 313 | }, 314 | function() { 315 | g[g.length - 2] = g[g.length - 2] instanceof g.pop() 316 | }, 317 | function() { 318 | g[g[g.length - 1][0]] = void 0 === g[g[g.length - 1][0]] ? 
[] : g[g[g.length - 1][0]] 319 | }, 320 | function() { 321 | for (var e = c[h++], u = [], n = c[h++], t = c[h++], p = [], o = 0; o < n; o++) u[c[h++]] = g[c[h++]]; 322 | for (var i = 0; i < t; i++) p[i] = c[h++]; 323 | g.push(function n() { 324 | var t = u.slice(0); 325 | t[0] = [this], 326 | t[1] = [arguments], 327 | t[2] = [n]; 328 | for (var o = 0; o < p.length && o < arguments.length; o++) 0 < p[o] && (t[p[o]] = [arguments[o]]); 329 | return r(f, e, c, l, t) 330 | }) 331 | }, 332 | function() { 333 | t.push([c[h++], g.length, c[h++]]) 334 | }, 335 | function() { 336 | t.pop() 337 | }, 338 | function() { 339 | return !! o 340 | }, 341 | function() { 342 | o = null 343 | }, 344 | function() { 345 | g[g.length - 1] += String.fromCharCode(c[h++]) 346 | }, 347 | function() { 348 | g.push("") 349 | }, 350 | function() { 351 | g.push(void 0) 352 | }, 353 | function() { 354 | g.push(null) 355 | }, 356 | function() { 357 | g.push(!0) 358 | }, 359 | function() { 360 | g.push(!1) 361 | }, 362 | function() { 363 | g.length -= c[h++] 364 | }, 365 | function() { 366 | g[g.length - 1] = c[h++] 367 | }, 368 | function() { 369 | var n = g.pop(), 370 | t = g[g.length - 1]; 371 | t[0][t[1]] = g[n[0]][0] 372 | }, 373 | function() { 374 | var n = g.pop(), 375 | t = g[g.length - 1]; 376 | t[0][t[1]] = n[0][n[1]] 377 | }, 378 | function() { 379 | var n = g.pop(), 380 | t = g[g.length - 1]; 381 | g[t[0]][0] = g[n[0]][0] 382 | }, 383 | function() { 384 | var n = g.pop(), 385 | t = g[g.length - 1]; 386 | g[t[0]][0] = n[0][n[1]] 387 | }, 388 | function() { 389 | g[g.length - 2] = g[g.length - 2] < g.pop() 390 | }, 391 | function() { 392 | g[g.length - 2] = g[g.length - 2] <= g.pop() 393 | }];;) try { 394 | for (; ! n[c[h++]]();); 395 | if (o) throw o; 396 | return g.pop() 397 | } catch(n) { 398 | var e = t.pop(); 399 | if (void 0 === e) throw n; 400 | o = n, 401 | h = e[0], 402 | g.length = e[1], 403 | e[2] && (g[e[2]][0] = o) 404 | } 405 | } (120731, 0, [21, 34, 50, 100, 57, 50, 102, 50, 98, 99, 101, 52, 54, 97, 52, 99, 55, 56, 52, 49, 57, 54, 57, 49, 56, 98, 102, 100, 100, 48, 48, 55, 55, 102, 2, 10, 3, 2, 9, 48, 61, 3, 9, 48, 61, 4, 9, 48, 61, 5, 9, 48, 61, 6, 9, 48, 61, 7, 9, 48, 61, 8, 9, 48, 61, 9, 9, 48, 4, 21, 427, 54, 2, 15, 3, 2, 9, 48, 61, 3, 9, 48, 61, 4, 9, 48, 61, 5, 9, 48, 61, 6, 9, 48, 61, 7, 9, 48, 61, 8, 9, 48, 61, 9, 9, 48, 61, 10, 9, 48, 61, 11, 9, 48, 61, 12, 9, 48, 61, 13, 9, 48, 61, 14, 9, 48, 61, 10, 9, 55, 54, 97, 54, 98, 54, 99, 54, 100, 54, 101, 54, 102, 54, 103, 54, 104, 54, 105, 54, 106, 54, 107, 54, 108, 54, 109, 54, 110, 54, 111, 54, 112, 54, 113, 54, 114, 54, 115, 54, 116, 54, 117, 54, 118, 54, 119, 54, 120, 54, 121, 54, 122, 54, 48, 54, 49, 54, 50, 54, 51, 54, 52, 54, 53, 54, 54, 54, 55, 54, 56, 54, 57, 13, 4, 61, 11, 9, 55, 54, 77, 54, 97, 54, 116, 54, 104, 8, 55, 54, 102, 54, 108, 54, 111, 54, 111, 54, 114, 14, 55, 54, 77, 54, 97, 54, 116, 54, 104, 8, 55, 54, 114, 54, 97, 54, 110, 54, 100, 54, 111, 54, 109, 14, 25, 0, 3, 4, 9, 11, 3, 3, 9, 11, 39, 3, 1, 38, 40, 3, 3, 9, 11, 38, 25, 1, 13, 4, 61, 12, 9, 55, 13, 4, 61, 13, 9, 3, 0, 13, 4, 4, 3, 13, 9, 11, 3, 11, 9, 11, 66, 22, 306, 4, 21, 422, 24, 4, 3, 14, 9, 55, 54, 77, 54, 97, 54, 116, 54, 104, 8, 55, 54, 102, 54, 108, 54, 111, 54, 111, 54, 114, 14, 55, 54, 77, 54, 97, 54, 116, 54, 104, 8, 55, 54, 114, 54, 97, 54, 110, 54, 100, 54, 111, 54, 109, 14, 25, 0, 3, 10, 9, 55, 54, 108, 54, 101, 54, 110, 54, 103, 54, 116, 54, 104, 15, 10, 40, 25, 1, 13, 4, 61, 12, 9, 6, 11, 3, 10, 9, 3, 14, 9, 11, 15, 10, 38, 13, 4, 61, 13, 9, 6, 11, 
6, 5, 1, 5, 0, 3, 1, 38, 13, 4, 61, 0, 5, 0, 43, 4, 21, 291, 61, 3, 12, 9, 11, 0, 3, 9, 9, 49, 72, 0, 2, 3, 4, 13, 4, 61, 8, 9, 21, 721, 3, 2, 8, 3, 2, 9, 48, 61, 3, 9, 48, 61, 4, 9, 48, 61, 5, 9, 48, 61, 6, 9, 48, 61, 7, 9, 48, 4, 55, 54, 115, 54, 101, 54, 108, 54, 102, 8, 10, 30, 55, 54, 117, 54, 110, 54, 100, 54, 101, 54, 102, 54, 105, 54, 110, 54, 101, 54, 100, 32, 28, 22, 510, 4, 21, 523, 22, 4, 55, 54, 115, 54, 101, 54, 108, 54, 102, 8, 10, 0, 55, 54, 119, 54, 105, 54, 110, 54, 100, 54, 111, 54, 119, 8, 10, 30, 55, 54, 117, 54, 110, 54, 100, 54, 101, 54, 102, 54, 105, 54, 110, 54, 101, 54, 100, 32, 28, 22, 566, 4, 21, 583, 3, 4, 55, 54, 119, 54, 105, 54, 110, 54, 100, 54, 111, 54, 119, 8, 10, 0, 55, 54, 103, 54, 108, 54, 111, 54, 98, 54, 97, 54, 108, 8, 10, 30, 55, 54, 117, 54, 110, 54, 100, 54, 101, 54, 102, 54, 105, 54, 110, 54, 101, 54, 100, 32, 28, 22, 626, 4, 21, 643, 25, 4, 55, 54, 103, 54, 108, 54, 111, 54, 98, 54, 97, 54, 108, 8, 10, 0, 55, 54, 69, 54, 114, 54, 114, 54, 111, 54, 114, 8, 55, 54, 117, 54, 110, 54, 97, 54, 98, 54, 108, 54, 101, 54, 32, 54, 116, 54, 111, 54, 32, 54, 108, 54, 111, 54, 99, 54, 97, 54, 116, 54, 101, 54, 32, 54, 103, 54, 108, 54, 111, 54, 98, 54, 97, 54, 108, 54, 32, 54, 111, 54, 98, 54, 106, 54, 101, 54, 99, 54, 116, 27, 1, 23, 56, 0, 49, 444, 0, 0, 24, 0, 13, 4, 61, 8, 9, 55, 54, 95, 54, 95, 54, 103, 54, 101, 54, 116, 54, 83, 54, 101, 54, 99, 54, 117, 54, 114, 54, 105, 54, 116, 54, 121, 54, 83, 54, 105, 54, 103, 54, 110, 15, 21, 1126, 49, 2, 14, 3, 2, 9, 48, 61, 3, 9, 48, 61, 4, 9, 48, 61, 5, 9, 48, 61, 6, 9, 48, 61, 7, 9, 48, 61, 8, 9, 48, 61, 9, 9, 48, 61, 10, 9, 48, 61, 11, 9, 48, 61, 9, 9, 55, 54, 108, 54, 111, 54, 99, 54, 97, 54, 116, 54, 105, 54, 111, 54, 110, 8, 10, 30, 55, 54, 117, 54, 110, 54, 100, 54, 101, 54, 102, 54, 105, 54, 110, 54, 101, 54, 100, 32, 28, 22, 862, 21, 932, 21, 4, 55, 54, 108, 54, 111, 54, 99, 54, 97, 54, 116, 54, 105, 54, 111, 54, 110, 8, 55, 54, 104, 54, 111, 54, 115, 54, 116, 14, 55, 54, 105, 54, 110, 54, 100, 54, 101, 54, 120, 54, 79, 54, 102, 14, 55, 54, 121, 54, 46, 54, 113, 54, 113, 54, 46, 54, 99, 54, 111, 54, 109, 25, 1, 3, 0, 3, 1, 39, 32, 22, 963, 4, 55, 54, 67, 54, 74, 54, 66, 54, 80, 54, 65, 54, 67, 54, 114, 54, 82, 54, 117, 54, 78, 54, 121, 54, 55, 21, 974, 50, 4, 3, 12, 9, 11, 3, 8, 3, 10, 24, 2, 13, 4, 61, 10, 9, 3, 13, 9, 55, 54, 95, 54, 95, 54, 115, 54, 105, 54, 103, 54, 110, 54, 95, 54, 104, 54, 97, 54, 115, 54, 104, 54, 95, 54, 50, 54, 48, 54, 50, 54, 48, 54, 48, 54, 51, 54, 48, 54, 53, 15, 10, 22, 1030, 21, 1087, 22, 4, 3, 13, 9, 55, 54, 95, 54, 95, 54, 115, 54, 105, 54, 103, 54, 110, 54, 95, 54, 104, 54, 97, 54, 115, 54, 104, 54, 95, 54, 50, 54, 48, 54, 50, 54, 48, 54, 48, 54, 51, 54, 48, 54, 53, 15, 3, 9, 9, 11, 3, 3, 9, 11, 38, 25, 1, 13, 4, 61, 11, 9, 3, 12, 9, 11, 3, 10, 3, 53, 3, 37, 39, 24, 2, 13, 4, 4, 55, 54, 122, 54, 122, 54, 97, 3, 11, 9, 11, 38, 3, 10, 9, 11, 38, 0, 49, 771, 2, 1, 12, 9, 13, 8, 3, 12, 4, 4, 56, 0], n); 406 | var t = n.__getSecuritySign; 407 | sign = t; 408 | return t; 409 | }); 410 | 411 | function get_sign(data){ 412 | return sign(data) 413 | }; 414 | -------------------------------------------------------------------------------- /旧版QQ音乐(仍可用)/README.md: -------------------------------------------------------------------------------- 1 | **注意事项** 2 | - `cd`到当前文件夹,执行`python demo.py`即可 3 | - `demo.py`为没有入库版,只爬取一个分类,没开多进程,方便大家理解 4 | - 请务必`demo.py` 的`filename`和`with open`项目路径问题 5 | - `get_singer_mid(index)` 方法决定分类爬取 
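-------------------------------------------------------------------------------- /旧版QQ音乐(仍可用)/分页示例.py: -------------------------------------------------------------------------------- 1 | #Python3.7 2 | # 补充示例(仅演示思路,文件名为示意,非定论实现):crawl.py 中歌手列表的分页参数计算 3 | # 接口每页最多返回80位歌手,sin 为起始偏移量,cur_page 从1开始 4 | import math 5 | 6 | def page_params(total, per_page=80): 7 | pages = int(math.floor(total / per_page)) # 与 crawl.py 相同的向下取整 8 | sin = 0 9 | for cur_page in range(1, pages + 2): # 多迭代一页,覆盖不足80人的尾页 10 | yield sin, cur_page 11 | sin += per_page 12 | 13 | if __name__ == '__main__': 14 | for sin, cur_page in page_params(199): # 假设某分类共199位歌手 -> sin=0,80,160 15 | print('sin={}, cur_page={}'.format(sin, cur_page))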
-------------------------------------------------------------------------------- /旧版QQ音乐(仍可用)/crawl.py: -------------------------------------------------------------------------------- 1 | #Python3.7 2 | #encoding = utf-8 3 | 4 | import requests,os,json,math,threading 5 | from urllib import parse 6 | from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor 7 | from db import SQLsession,Song 8 | 9 | headers = { 10 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 11 | 'referer':'https://y.qq.com/portal/singer_list.html', 12 | #参考链接 https://y.qq.com/portal/singer_list.html#page=1&index=1& 13 | } 14 | 15 | lock = threading.Lock() 16 | session = SQLsession() 17 | 18 | def myProcess(): 19 | #把歌手按照首字母分为27类 20 | with ProcessPoolExecutor(max_workers = 2) as p:#创建27个进程 21 | for i in range(1,28):#28 22 | p.submit(get_singer_mid,i) 23 | 24 | def get_singer_mid(index): 25 | #index = 1-----27 26 | #打开歌手列表页面,找出singerList,找出所有歌手的数目,除于80,构造后续页面获取page歌手 27 | #找出mid, 用于歌手详情页 28 | 29 | data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer"'\ 30 | ',"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,'\ 31 | '"index":%s,"sin":0,"cur_page":1}}}'%(str(index)) 32 | 33 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\ 34 | '&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\ 35 | 'Charset=utf-8¬ice=0&platform=yqq.json&needNewCode=0'\ 36 | '&data={}'.format(parse.quote(data)) 37 | 38 | html = requests.get(url).json() 39 | total = html['singerList']['data']['total']#多少个歌手 40 | pages = int(math.floor(int(total)/80)) 41 | thread_number = pages 42 | 43 | Thread = ThreadPoolExecutor(max_workers = thread_number) 44 | 45 | sin = 0 46 | #分页迭代每一个字母下的所有页面歌手 47 | for page in range(1,pages+2): 48 | data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer",'\ 49 | '"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,"'\ 50 | 'index":%s,"sin":%d,"cur_page":%s}}}'%(str(index),sin,str(page)) 51 | 52 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\ 53 | '&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\ 54 | 'Charset=utf-8¬ice=0&platform=yqq.json&needNewCode=0'\ 55 | '&data={}'.format(parse.quote(data)) 56 | 57 | html = requests.get(url,headers = headers).json() 58 | 59 | sings = html['singerList']['data']['singerlist'] 60 | 61 | for sing in sings: 62 | 63 | singer_name = sing['singer_name'] 64 | mid = sing['singer_mid'] 65 | 66 | Thread.submit(get_singer_data,mid = mid, 67 | singer_name = singer_name,) 68 | sin+=80 69 | 70 | 71 | 72 | #获取歌手信息 73 | def get_singer_data(mid,singer_name): 74 | #获取歌手mid,进入歌手详情页,也就是每一个歌手歌曲所在页面 75 | #找出歌手的歌曲信息页 76 | 77 | params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",'\ 78 | '"param":{"order":1,"singerMid":"%s","begin":0,"num":10},'\ 79 | '"module":"musichall.song_list_server"}}'%str(mid) 80 | 81 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&'\ 82 | 'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\ 83 | '¬ice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params)) 84 | 85 | html = requests.session() 86 | content = html.get(url,headers = headers).json() 87 | 88 | songs_num = content['singerSongList']['data']['totalNum'] 89 | 90 | 91 | for a in range(0,songs_num,100): 92 | 93 | params = 
'{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",' \ 94 | '"param":{"order":1,"singerMid":"%s","begin":%s,"num":%s},' \ 95 | '"module":"musichall.song_list_server"}}' % (str(mid), int(a),100)#每页最多取100首,与翻页步长一致 96 | 97 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&' \ 98 | 'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8' \ 99 | '&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params)) 100 | 101 | html = requests.session() 102 | content = html.get(url, headers=headers).json() 103 | 104 | datas = content['singerSongList']['data']['songList'] 105 | 106 | for d in datas: 107 | sing_name = d['songInfo']['title'] 108 | songmid = d['songInfo']['mid'] 109 | try: 110 | lock.acquire()#锁上 111 | session.add(Song(song_name = sing_name, 112 | song_singer = singer_name, 113 | song_mid = songmid)) 114 | session.commit() 115 | lock.release()#解锁 116 | print('commit') 117 | except: 118 | session.rollback() 119 | print('rollback') 120 | 121 | print('歌手名字:{}\t歌曲名字:{}\t歌曲ID:{}'.format(singer_name,sing_name,songmid)) 122 | download(songmid,sing_name,singer_name) 123 | 124 | def download(songmid,sing_name,singer_name): 125 | headers = { 126 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 127 | 'Referer':'https://y.qq.com/n/yqq/singer/000aHmbL2aPXWH.html', 128 | } 129 | 130 | 131 | data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch",'\ 132 | '"param":{"guid":"5746584900","calltype":0,"userip":""}},"req_0":{"module":"vkey.GetVkeyServer",'\ 133 | '"method":"CgiGetVkey","param":{"guid":"5746584900","songmid":["%s"],"songtype":[0],'\ 134 | '"uin":"3262637034","loginflag":1,"platform":"20"}},"comm":{"uin":3262637034,"format":"json","ct":24,"cv":0}}'%str(songmid) 135 | 136 | 137 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getplaysongvkey17693804549459324'\ 138 | '&g_tk=5381&loginUin=3262637034&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\ 139 | '&notice=0&platform=yqq.json&needNewCode=0&data={}'.format(parse.quote(data)) 140 | 141 | html = requests.get(url,headers = headers) 142 | 143 | try: 144 | purl = html.json()['req_0']['data']['midurlinfo'][0]['purl'] 145 | 146 | url = 'http://dl.stream.qqmusic.qq.com/{}'.format(purl) 147 | 148 | html = requests.get(url,headers = headers) 149 | html.encoding = 'utf-8' 150 | 151 | sing_file_name = '{} -- {}'.format(sing_name,singer_name) 152 | 153 | filename = './旧版QQ音乐(仍可用)/歌曲' 154 | 155 | if not os.path.exists(filename): 156 | os.makedirs(filename) 157 | 158 | with open('./旧版QQ音乐(仍可用)/歌曲/{}.m4a'.format(sing_file_name),'wb') as f: 159 | print('\n正在下载{}歌曲.....\n'.format(sing_file_name)) 160 | f.write(html.content) 161 | 162 | except: 163 | print('查询权限失败,或没有查到对应的歌曲') 164 | 165 | 166 | 167 | if __name__ == '__main__': 168 | myProcess() -------------------------------------------------------------------------------- /旧版QQ音乐(仍可用)/db.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column,Integer,String,create_engine 2 | from sqlalchemy.orm import sessionmaker,scoped_session 3 | from sqlalchemy.ext.declarative import declarative_base 4 | 5 | #此处没有使用pymysql的驱动 6 | #请安装pip install mysql-connector-python 7 | #engine中的 mysqlconnector 为 mysql官网驱动 8 | engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/test?charset=utf8', 9 | max_overflow = 500,#超过连接池大小外最多可以创建的链接 10 | pool_size = 100,#连接池大小 11 | 
echo = False,#调试信息展示 12 | ) 13 | Base = declarative_base() 14 | 15 | class Song(Base): 16 | __tablename__ = 'song' 17 | song_id = Column(Integer,primary_key = True,autoincrement = True) 18 | song_name = Column(String(64)) 19 | song_album = Column(String(64)) 20 | song_mid = Column(String(50)) 21 | song_singer = Column(String(50)) 22 | Base.metadata.create_all(engine) 23 | 24 | DBsession = sessionmaker(bind = engine) 25 | 26 | SQLsession = scoped_session(DBsession) 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /旧版QQ音乐(仍可用)/demo.py: -------------------------------------------------------------------------------- 1 | #Python3.7 2 | #encoding = utf-8 3 | 4 | import requests,os,json,math,threading 5 | from urllib import parse 6 | from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor 7 | from db import SQLsession,Song 8 | 9 | headers = { 10 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 11 | 'referer':'https://y.qq.com/portal/singer_list.html', 12 | #参考链接 https://y.qq.com/portal/singer_list.html#page=1&index=1& 13 | } 14 | 15 | lock = threading.Lock() 16 | session = SQLsession() 17 | 18 | def myProcess(): 19 | #把歌手按照首字母分为27类 20 | with ProcessPoolExecutor(max_workers = 2) as p:#进程池大小为2,依次提交27个分类任务 21 | for i in range(1,28):#index取1~27 22 | p.submit(get_singer_mid,i) 23 | 24 | def get_singer_mid(index): 25 | #index = 1-----27 26 | #打开歌手列表页面,找出singerList,找出所有歌手的数目,除以80,构造后续页面获取page歌手 27 | #找出mid, 用于歌手详情页 28 | 29 | data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer"'\ 30 | ',"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,'\ 31 | '"index":%s,"sin":0,"cur_page":1}}}'%(str(index)) 32 | 33 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\ 34 | '&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\ 35 | 'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\ 36 | '&data={}'.format(parse.quote(data)) 37 | 38 | html = requests.get(url).json() 39 | total = html['singerList']['data']['total']#多少个歌手 40 | pages = int(math.floor(int(total)/80)) 41 | thread_number = pages 42 | 43 | Thread = ThreadPoolExecutor(max_workers = thread_number) 44 | 45 | sin = 0 46 | #分页迭代每一个字母下的所有页面歌手 47 | for page in range(1,pages+2): 48 | data = '{"comm":{"ct":24,"cv":0},"singerList":{"module":"Music.SingerListServer",'\ 49 | '"method":"get_singer_list","param":{"area":-100,"sex":-100,"genre":-100,"'\ 50 | 'index":%s,"sin":%d,"cur_page":%s}}}'%(str(index),sin,str(page)) 51 | 52 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI0432880619182503'\ 53 | '&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&out'\ 54 | 'Charset=utf-8&notice=0&platform=yqq.json&needNewCode=0'\ 55 | '&data={}'.format(parse.quote(data)) 56 | 57 | html = requests.get(url,headers = headers).json() 58 | 59 | sings = html['singerList']['data']['singerlist'] 60 | 61 | for sing in sings: 62 | 63 | singer_name = sing['singer_name'] 64 | mid = sing['singer_mid'] 65 | 66 | Thread.submit(get_singer_data,mid = mid, 67 | singer_name = singer_name,) 68 | sin+=80 69 | 70 | 71 | 72 | #获取歌手信息 73 | def get_singer_data(mid,singer_name): 74 | #获取歌手mid,进入歌手详情页,也就是每一个歌手歌曲所在页面 75 | #找出歌手的歌曲信息页 76 | 77 | params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",'\ 78 | '"param":{"order":1,"singerMid":"%s","begin":0,"num":10},'\ 79 | '"module":"musichall.song_list_server"}}'%str(mid) 80 | 81 | url = 
'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&'\ 82 | 'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\ 83 | '&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params)) 84 | 85 | html = requests.session() 86 | content = html.get(url,headers = headers).json() 87 | 88 | songs_num = content['singerSongList']['data']['totalNum'] 89 | 90 | 91 | for a in range(0,songs_num,100): 92 | 93 | params = '{"comm":{"ct":24,"cv":0},"singerSongList":{"method":"GetSingerSongList",' \ 94 | '"param":{"order":1,"singerMid":"%s","begin":%s,"num":%s},' \ 95 | '"module":"musichall.song_list_server"}}' % (str(mid), int(a),100)#每页最多取100首,与翻页步长一致 96 | 97 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong9513357793133783&' \ 98 | 'g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8' \ 99 | '&notice=0&platform=yqq.json&needNewCode=0*&data={}'.format(parse.quote(params)) 100 | 101 | html = requests.session() 102 | content = html.get(url, headers=headers).json() 103 | 104 | datas = content['singerSongList']['data']['songList'] 105 | 106 | for d in datas: 107 | sing_name = d['songInfo']['title'] 108 | songmid = d['songInfo']['mid'] 109 | try: 110 | lock.acquire()#锁上 111 | session.add(Song(song_name = sing_name, 112 | song_singer = singer_name, 113 | song_mid = songmid)) 114 | session.commit() 115 | lock.release()#解锁 116 | print('commit') 117 | except: 118 | session.rollback() 119 | print('rollback') 120 | 121 | print('歌手名字:{}\t歌曲名字:{}\t歌曲ID:{}'.format(singer_name,sing_name,songmid)) 122 | download(songmid,sing_name,singer_name) 123 | 124 | def download(songmid,sing_name,singer_name): 125 | headers = { 126 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 127 | 'Referer':'https://y.qq.com/n/yqq/singer/000aHmbL2aPXWH.html', 128 | } 129 | 130 | 131 | data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch",'\ 132 | '"param":{"guid":"5746584900","calltype":0,"userip":""}},"req_0":{"module":"vkey.GetVkeyServer",'\ 133 | '"method":"CgiGetVkey","param":{"guid":"5746584900","songmid":["%s"],"songtype":[0],'\ 134 | '"uin":"3262637034","loginflag":1,"platform":"20"}},"comm":{"uin":3262637034,"format":"json","ct":24,"cv":0}}'%str(songmid) 135 | 136 | 137 | url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getplaysongvkey17693804549459324'\ 138 | '&g_tk=5381&loginUin=3262637034&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8'\ 139 | '&notice=0&platform=yqq.json&needNewCode=0&data={}'.format(parse.quote(data)) 140 | 141 | html = requests.get(url,headers = headers) 142 | 143 | try: 144 | purl = html.json()['req_0']['data']['midurlinfo'][0]['purl'] 145 | 146 | url = 'http://dl.stream.qqmusic.qq.com/{}'.format(purl) 147 | 148 | html = requests.get(url,headers = headers) 149 | html.encoding = 'utf-8' 150 | 151 | sing_file_name = '{} -- {}'.format(sing_name,singer_name) 152 | 153 | filename = './歌曲' 154 | 155 | if not os.path.exists(filename): 156 | os.makedirs(filename) 157 | 158 | with open('./歌曲/{}.m4a'.format(sing_file_name),'wb') as f: 159 | print('\n正在下载{}歌曲.....\n'.format(sing_file_name)) 160 | f.write(html.content) 161 | 162 | except: 163 | print('查询权限失败,或没有查到对应的歌曲') 164 | 165 | 166 | 167 | if __name__ == '__main__': 168 | # myProcess() 169 | get_singer_mid(1) -------------------------------------------------------------------------------- /有道翻译/crawl.py: -------------------------------------------------------------------------------- 1 | 
#Python3.7 2 | #encoding = utf-8 3 | 4 | import time, math,random,hashlib 5 | import requests 6 | 7 | def get_html(name): 8 | 9 | url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule' 10 | 11 | 12 | ts = math.floor(time.time() * 1000) 13 | salt = ts + int(random.random() * 10) 14 | 15 | sign = hashlib.md5(("fanyideskweb" + name + str(salt) +"Nw(nmmbP%A-r6U3EUn]Aj").encode('utf-8')).hexdigest() 16 | bv = hashlib.md5(("5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36").encode('utf-8')).hexdigest() 17 | 18 | data = { 19 | 'i': name, 20 | 'from': 'AUTO', 21 | 'to': 'AUTO', 22 | 'smartresult': 'dict', 23 | 'client': 'fanyideskweb', 24 | 'salt': salt, 25 | 'sign': sign, 26 | 'ts': ts, 27 | 'bv': bv, 28 | 'doctype': 'json', 29 | 'version': '2.1', 30 | 'keyfrom': 'fanyi.web', 31 | 'action': 'FY_BY_CLICKBUTTION', 32 | } 33 | 34 | headers = { 35 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 36 | 'Referer': 'http://fanyi.youdao.com/', 37 | #参考链接:http://fanyi.youdao.com/ 38 | #请在此处填写你的 Cookie 39 | } 40 | 41 | 42 | html = requests.post(url, headers=headers, data=data)#有需要的可以改成session写法 43 | # print(html.json()) 44 | print('正在执行有道翻译程序:') 45 | print('翻译的词:{}'.format(html.json()['translateResult'][0][0]['src'])) 46 | print('翻译结果:{}'.format(html.json()['translateResult'][0][0]['tgt'])) 47 | 48 | if __name__ == "__main__": 49 | 50 | name = '靓仔' 51 | 52 | get_html(name) -------------------------------------------------------------------------------- /构建代理池/crawl.py: -------------------------------------------------------------------------------- 1 | #Python3.7 2 | #encoding = utf-8 3 | 4 | import requests,time,json 5 | from bs4 import BeautifulSoup 6 | 7 | headers ={ 8 | 'Referer':'https://www.kuaidaili.com/free/inha/1/', 9 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 10 | #参考链接:https://www.kuaidaili.com/free/inha/1/ 11 | } 12 | 13 | 14 | def get_ip(url):#访问网站 15 | html = requests.get(url,headers = headers) 16 | if html.status_code==200: 17 | time.sleep(2) 18 | print('[INFO]正在爬取...') 19 | parse_html(html.text) 20 | else: 21 | print("[ERROR]错误",url) 22 | 23 | def parse_html(html):#获取ip信息 24 | soup = BeautifulSoup(html,'lxml') 25 | ips = soup.select('.table tbody tr') 26 | for line in ips: 27 | ip = line.select_one('td').text 28 | port = line.select('td')[1].text 29 | print('[INFO]获取IP:{} Port:{}'.format(ip,port)) 30 | 31 | address = 'http://{}:{}'.format(ip,port)#构造ip访问 32 | proxies = { 33 | 'http':address, 34 | 'https':address, 35 | } 36 | verify_ip(proxies) 37 | 38 | def verify_ip(proxies):#验证ip能否被用 39 | 40 | try: 41 | html = requests.get('http://www.baidu.com',proxies = proxies,timeout = 3)#连接测试 42 | print('[SUCC]可用代理:{}'.format(proxies)) 43 | write_json(proxies) 44 | except: 45 | print("[ERROR]代理超时不可用:{}".format(proxies)) 46 | 47 | 48 | def write_json(row):#写入文本 49 | 50 | with open('ip_pool.json','a+',encoding='utf-8') as f: 51 | json.dump(row,f) 52 | f.write('\n') 53 | 54 | 55 | def read_json():#读取文件 56 | 57 | with open('ip_pool.json','r',encoding='utf-8') as f: 58 | 59 | for i in f.readlines(): 60 | content = json.loads(i.strip()) 61 | print(content) 62 | 63 | 64 | if __name__ == '__main__': 65 | 66 | for i in range(15,25): 67 | url = 'https://www.kuaidaili.com/free/inha/{}/'.format(i) 68 | get_ip(url) 69 | 70 | print('目前验证成功的IP') 71 | read_json() 
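Once a few proxies have passed `verify_ip`, the pool in `ip_pool.json` can be consumed by later crawls. A minimal sketch of reading the pool back and attaching a random entry to a request (it assumes the one-JSON-object-per-line format that `write_json` above produces):

    import json,random,requests

    def load_pool(path='ip_pool.json'):
        # one {"http": ..., "https": ...} proxies dict per line
        with open(path,'r',encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]

    pool = load_pool()
    proxies = random.choice(pool)  # pick a random validated proxy per request
    html = requests.get('http://www.baidu.com',proxies=proxies,timeout=3)
    print(html.status_code,proxies)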
-------------------------------------------------------------------------------- /构建代理池/ip_pool.json: -------------------------------------------------------------------------------- 1 | {"http": "http://183.164.239.153:9999", "https": "http://183.164.239.153:9999"} 2 | {"http": "http://49.235.69.138:8118", "https": "http://49.235.69.138:8118"} 3 | {"http": "http://111.38.91.99:8060", "https": "http://111.38.91.99:8060"} 4 | {"http": "http://47.107.160.99:8118", "https": "http://47.107.160.99:8118"} 5 | -------------------------------------------------------------------------------- /百度图片/crawl.py: -------------------------------------------------------------------------------- 1 | #Python3.7 2 | #encoding = utf-8 3 | 4 | import requests,json,re,os,traceback,datetime,aiohttp,asyncio 5 | from uuid import uuid4 6 | from urllib import parse 7 | from concurrent.futures import ThreadPoolExecutor 8 | 9 | headers = { 10 | 'Accept':'text/plain, */*; q=0.01', 11 | 'Accept-Encoding':'gzip, deflate, br', 12 | 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8', 13 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36', 14 | 'Referer':'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&pv=&ic=0&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&sid=&word=%E5%88%9D%E9%9F%B3%E6%9C%AA%E6%9D%A5', 15 | #参考链接:https://image.baidu.com/ 16 | #请在此处填写你的 Cookie 17 | } 18 | 19 | tasks = [] 20 | 21 | def get_html(url): 22 | 23 | try: 24 | html = requests.get(url,headers=headers) 25 | json_data = html.text.replace('\\','')#去除JSON数据多余的\ 26 | json_data = json.loads(json_data) 27 | parse_json(json_data) 28 | 29 | except json.decoder.JSONDecodeError: 30 | 31 | #去除"fromPageTitle"键值的双引号异常 32 | fromPageTitle = r'"fromPageTitle":"(.*?)",' 33 | json_data = replace_data(fromPageTitle,json_data) 34 | 35 | #去除"fromPageTitleEnc"键值的双引号异常 36 | fromPageTitle = r'"fromPageTitleEnc":"(.*?)",' 37 | json_data = replace_data(fromPageTitle,json_data) 38 | 39 | json_data = json.loads(json_data) 40 | write_error(url,flag='已经成功处理') 41 | parse_json(json_data) 42 | 43 | except Exception: 44 | write_error(url,flag='未能成功处理') 45 | 46 | #解析JSON获取图片URL 47 | def parse_json(json_data): 48 | list_data = json_data['data'] 49 | for data in list_data[:-1]: 50 | image_name = data["fromPageTitleEnc"] 51 | for image_data in data["replaceUrl"]: 52 | image_url = image_data['ObjURL'] 53 | tasks.append(download(image_url,image_name)) 54 | 55 | #下载图片 56 | async def download(image_url,image_name): 57 | 58 | black_image = b'GIF89a\x04\x00\x08\x00\x91\x02\x00\xff\xff\xff\x00\x00\x00\xff\xff\xff\x00\x00\x00!\xf9\x04\x01\x00\x00\x02\x00,\x00\x00\x00\x00\x04\x00\x08\x00\x00\x02\x05\x94\x8f\xa9\x8b\x05\x00;' 59 | 60 | filename = './百度图片/下载好的图片' 61 | if not os.path.exists(filename): 62 | os.makedirs(filename) 63 | 64 | print("[INFO]{} 正在下载图片:{}".format(datetime.datetime.now(),image_name)) 65 | 66 | async with aiohttp.ClientSession(headers = headers) as session: 67 | async with session.get(image_url) as html: 68 | 69 | uuid_id = uuid4() 70 | image_file_name = '{}/{}.jpg'.format(filename,uuid_id) 71 | 72 | #筛选掉异常的黑色图片、查询不到的图片(查询不到时返回HTML错误页) 73 | if black_image not in await html.read() and b'<html>' not in await html.read(): 74 | 75 | with open(image_file_name,'wb') as f: 76 | f.write(await html.read()) 77 | 78 | with open('./百度图片/图片映射表.json','a+',encoding='utf-8') as f: 79 | json_data = json.dumps(dict(image_name = 
image_name,id=str(uuid_id)),ensure_ascii=False) 80 | f.write(json_data + '\n') 81 | 82 | #用正则删除双引号异常 83 | def replace_data(re_compile,json_data): 84 | re_data = re.compile(re_compile) 85 | for i in re_data.findall(json_data): 86 | data = i.replace('"','').replace("\\'",'') 87 | json_data = json_data.replace(i,data) 88 | return json_data 89 | 90 | #写入异常 91 | def write_error(url,flag=None): 92 | 93 | with open('./百度图片/错误日志.txt','a+',encoding='utf-8') as f: 94 | f.write('JSON异常是否处理成功:{}\n'.format(flag)) 95 | f.write('异常时间:{}\n'.format(datetime.datetime.now())) 96 | f.write('异常URL:{}\n'.format(url)) 97 | f.write(traceback.format_exc() + '\n') 98 | 99 | if __name__ == "__main__": 100 | 101 | loop = asyncio.get_event_loop()#创建异步编程 102 | name = parse.quote('初音未来') 103 | 104 | with ThreadPoolExecutor(max_workers = 2) as t: 105 | #翻页30 106 | for i in range(30,120,30): 107 | url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592'\ 108 | '&is=&fp=result&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest='\ 109 | '&copyright=&word={}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1'\ 110 | '&fr=&expermode=&force=&pn={}&rn=30'.format(name,name,i) 111 | t.submit(get_html,url) 112 | 113 | loop.run_until_complete(asyncio.wait(tasks)) 114 | loop.close()#程序关闭 -------------------------------------------------------------------------------- /破解有道翻译/crawl.py: -------------------------------------------------------------------------------- 1 | #Python3.7 2 | #encoding = utf-8 3 | 4 | import time, math,random,hashlib 5 | import requests 6 | 7 | def get_html(name): 8 | 9 | url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule' 10 | 11 | 12 | ts = math.floor(time.time() * 1000) 13 | salt = ts + int(random.random() * 10) 14 | 15 | sign = hashlib.md5(("fanyideskweb" + name + str(salt) +"Nw(nmmbP%A-r6U3EUn]Aj").encode('utf-8')).hexdigest() 16 | bv = hashlib.md5(("5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36").encode('utf-8')).hexdigest() 17 | 18 | data = { 19 | 'i': name, 20 | 'from': 'AUTO', 21 | 'to': 'AUTO', 22 | 'smartresult': 'dict', 23 | 'client': 'fanyideskweb', 24 | 'salt': salt, 25 | 'sign': sign, 26 | 'ts': ts, 27 | 'bv': bv, 28 | 'doctype': 'json', 29 | 'version': '2.1', 30 | 'keyfrom': 'fanyi.web', 31 | 'action': 'FY_BY_CLICKBUTTION', 32 | } 33 | 34 | headers = { 35 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 36 | 'Referer': 'http://fanyi.youdao.com/', 37 | #参考链接:http://fanyi.youdao.com/ 38 | #请在此处填写你的 Cookie 39 | } 40 | 41 | 42 | html = requests.post(url, headers=headers, data=data)#有需要的可以改成session写法 43 | 44 | print('正在执行有道翻译程序:') 45 | print('翻译的词:{}'.format(html.json()['translateResult'][0][0]['src'])) 46 | print('翻译结果:{}'.format(html.json()['translateResult'][0][0]['tgt'])) 47 | 48 | if __name__ == "__main__": 49 | 50 | name = '靓仔' 51 | 52 | get_html(name) -------------------------------------------------------------------------------- /破解网易登录/crawl.py: -------------------------------------------------------------------------------- 1 | #Python3.7 2 | #encoding = utf-8 3 | 4 | import execjs,requests,time 5 | 6 | class User():#获取用户密码加密 7 | 8 | def __init__(self,user_id,user_password): 9 | 10 | self.user_id = user_id 11 | self.user_password = user_password 12 | self.session = requests.session() 13 | self.session.headers = { 14 | 
'Referer':'https://dl.reg.163.com/webzj/v1.0.1/pub/index_dl2_new.html?cd=https%3A%2F%2Ftemp.163.com%2Fspecial%2F00804C4H%2F&cf=urs_style_2019.css%3Ft%3D20190527&MGID=1590637061742.5342&wdaId=&pkid=MODXOXd&product=163', 15 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 16 | #请在此处输入你的Cookie 17 | #参考链接 https://www.163.com/ 18 | } 19 | 20 | def get_pw(self): 21 | 22 | with open('pw.js','r',encoding='utf-8') as f: 23 | content = f.read() 24 | 25 | js_data = execjs.compile(content)#编译js 26 | pw = js_data.call('get_pw',self.user_password)#调用get_pw函数 27 | return pw 28 | 29 | def get_rtid(self): 30 | 31 | with open('rtid.js','r',encoding='utf-8') as f: 32 | content = f.read() 33 | 34 | js_data = execjs.compile(content)#编译js 35 | rtid = js_data.call('get_rtid')#调用get_rtid函数 36 | return rtid 37 | 38 | def get_tk(self,rtid): 39 | 40 | url = 'https://dl.reg.163.com/dl/gt' 41 | 42 | params = { 43 | 'un':self.user_id, 44 | 'pkid':'MODXOXd', 45 | 'pd':'163', 46 | 'channel':'0', 47 | 'topURL':'https://www.163.com/', 48 | 'rtid':rtid, 49 | 'nocache':int(time.time()*1000), 50 | } 51 | 52 | html = self.session.get(url,params = params).json() 53 | return html['tk'] 54 | 55 | def get_login(self,pw,rtid,tk): 56 | 57 | url = 'https://dl.reg.163.com/dl/l' 58 | 59 | 60 | data = { 61 | 'channel':'0', 62 | 'd':'10', 63 | 'domains':"163.com", 64 | 'l':'0', 65 | 'pd':"163", 66 | 'pkid':"MODXOXd", 67 | 'pw':pw, 68 | 'pwdKeyUp':'1', 69 | 'rtid':rtid, 70 | 't':int(time.time()*1000), 71 | 'tk':tk, 72 | 'topURL':"https://www.163.com/", 73 | 'un':self.user_id, 74 | } 75 | 76 | html = self.session.post(url,json = data).json()#传递JSON 77 | return html 78 | 79 | 80 | if __name__ == "__main__": 81 | 82 | user = User('请输入你的账号','请输入你的密码') 83 | pw = user.get_pw()#获取pw 84 | rtid = user.get_rtid()#获取rtid 85 | 86 | tk = user.get_tk(rtid)#获取tk 87 | 88 | login = user.get_login(pw,rtid,tk) 89 | print(login) 90 | 91 | 92 | -------------------------------------------------------------------------------- /破解网易登录/rtid.js: -------------------------------------------------------------------------------- 1 | function t() { 2 | var e = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 3 | , t = 32 4 | , i = []; 5 | for (; t-- > 0; ) 6 | i[t] = e.charAt(Math.random() * e.length); 7 | return i.join("") 8 | }; 9 | 10 | function get_rtid(){ 11 | return t() 12 | } -------------------------------------------------------------------------------- /豆瓣读书/入库版/book.py: -------------------------------------------------------------------------------- 1 | #Python3.7 2 | #encoding = utf-8 3 | 4 | from urllib import parse 5 | import asyncio,aiohttp,os,time,requests 6 | from bs4 import BeautifulSoup#爬虫解析库 7 | from boook_db import Book,sess 8 | from concurrent.futures import ThreadPoolExecutor 9 | 10 | tasks = [] 11 | 12 | headers = { 13 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36', 14 | 'Referer':'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T', 15 | #参考链接 https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T 16 | } 17 | 18 | 19 | def get_html(url): 20 | 21 | 22 | html = requests.get(url,headers = headers) 23 | 24 | if html.status_code==200: 25 | 26 | parse_html(html.text) 27 | else: 28 | print('错误') 29 | 30 | def parse_html(html): 31 | 32 | soup =BeautifulSoup(html,'lxml')#选择解析器 33 | books = soup.select('li.subject-item')#选择文章 34 | 35 | for book in books: 
36 | 37 | try:#防错机制 38 | 39 | title = book.select_one('.info h2 a').text.strip().replace(' ','').replace('\n','')#选择书名并去除空格 40 | info = book.select_one('.subject-item .info div.pub').text.strip().replace(' ','').replace('\n','')#选择作者 41 | star = book.select_one('.rating_nums').text.strip().replace(' ','').replace('\n','')#选择评分 42 | pl = book.select_one('.pl').text.strip().replace(' ','').replace('\n','')#选择评价 43 | introduce = book.select_one('.info p').text.strip().replace(' ','').replace('\n','')#选择书本简介 44 | img = book.select_one('.nbg img')['src']#获取图片url 45 | 46 | tasks.append(download(title,img))#异步编程 47 | print(title,info,star,pl,img) 48 | print(introduce) 49 | print('-'*50) 50 | 51 | #插入数据库 52 | book_data = Book( 53 | title = title, 54 | info = info, 55 | star = star, 56 | pl = pl, 57 | introduce = introduce, 58 | ) 59 | sess.add(book_data) 60 | sess.commit() 61 | except Exception as e:#发生任何错误返回 62 | print(e) 63 | sess.rollback()#事务回滚 64 | 65 | 66 | async def download(title,url):#保存封面图片 67 | 68 | if not os.path.exists('./豆瓣读书/doubanImg'):#检查有没有文件夹并创建 69 | os.makedirs('./豆瓣读书/doubanImg') 70 | 71 | async with aiohttp.ClientSession(headers = headers) as session: 72 | async with session.get(url) as html: 73 | with open('./豆瓣读书/doubanImg/{}.jpg'.format(title),'wb') as f: 74 | f.write(await html.content.read()) 75 | 76 | if __name__ == '__main__': 77 | 78 | loop = asyncio.get_event_loop() 79 | with ThreadPoolExecutor(max_workers = 2) as t: 80 | for i in range(0,100,20):#翻页参数为20 81 | url = 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={}&type=T'.format(i) 82 | t.submit(get_html,url) 83 | loop.run_until_complete(asyncio.wait(tasks)) 84 | loop.close()#程序关闭 85 | 86 | 87 | -------------------------------------------------------------------------------- /豆瓣读书/入库版/boook_db.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | from sqlalchemy import Column,String,Integer,Text 3 | from sqlalchemy.orm import sessionmaker 4 | from sqlalchemy.ext.declarative import declarative_base 5 | 6 | #基础类 7 | Base = declarative_base() 8 | 9 | #此处没有使用pymysql的驱动 10 | #请安装pip install mysql-connector-python 11 | #engine中的 mysqlconnector 为 mysql官网驱动 12 | engine = create_engine( 13 | 'mysql+mysqlconnector://root:root@127.0.0.1:3306/test?charset=utf8',#连接本地 14 | echo = True 15 | ) 16 | 17 | class Book(Base): 18 | __tablename__ = 'book' 19 | id = Column('id',Integer(),primary_key = True,autoincrement = True) 20 | title = Column('title',String(20)) 21 | info = Column('info',String(30)) 22 | star = Column('star',String(10)) 23 | pl = Column('pl',String(10)) 24 | introduce = Column('introduce',Text()) 25 | 26 | Base.metadata.create_all(engine) 27 | 28 | session = sessionmaker(engine) 29 | sess = session() -------------------------------------------------------------------------------- /豆瓣读书/分类实现版/requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.9.3 2 | certifi==2021.10.8 3 | chardet==4.0.0 4 | charset-normalizer==2.0.7 5 | idna==2.10 6 | lxml==4.6.2 7 | requests==2.25.1 8 | soupsieve==2.2.1 9 | urllib3==1.26.7 10 | -------------------------------------------------------------------------------- /豆瓣读书/分类实现版/【bs4实现】豆瓣读书爬虫.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import requests,json,csv,os 3 | from uuid import uuid4 4 | from bs4 import BeautifulSoup 5 | from urllib import parse 6 | 7 | '''主域名''' 8 | 
DOMAIN_URL = 'https://book.douban.com' 9 | 10 | ''' 11 | 协议头 12 | user-agent(必填) 13 | Referer(有就填,没有不填) 14 | Cookie(有账号登录就填,没有不填) 15 | ''' 16 | HEADERS = { 17 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36', 18 | 'Referer':'https://book.douban.com/', 19 | 'Cookie':'填写你的Cookie' 20 | } 21 | 22 | '''结果去重集合''' 23 | RESULT_SET_DATA = set() 24 | 25 | ''' 26 | 获取book的tag链接 27 | params: 28 | parse_number: int --> 爬取几个tag链接,默认全部 29 | 30 | return: List[str] --> 确定爬取几个tag链接 31 | ''' 32 | def get_book_tag_url(split_number:int=None) -> List[str]: 33 | 34 | html = requests.get(url=DOMAIN_URL,headers=HEADERS) 35 | soup = BeautifulSoup(html.text,'lxml') 36 | 37 | tag_url_list_data = [ 38 | DOMAIN_URL+ parse.quote(tag_url['href']) 39 | for tag_url in soup.select('ul.hot-tags-col5.s ul a') 40 | ] 41 | 42 | if split_number: 43 | tag_url_list_data = tag_url_list_data[:split_number] 44 | 45 | return tag_url_list_data 46 | 47 | 48 | ''' 49 | 解析tag_url,进行翻页后,获取book的内容 50 | params: 51 | tag_url_list_data: List[str] --> book的tag链接 52 | parse_number: int --> 翻页参数,默认爬取3页 53 | write_type: bool --> 是否写入json文件 54 | return:List[dict] --> 爬取成功book的内容 55 | ''' 56 | def parse_book_url_info( 57 | tag_url_list_data:List[str], 58 | parse_number:int=3, 59 | write_json_type:bool=True, 60 | write_csv_type:bool=True, 61 | write_image_type:bool=True 62 | ) -> List[dict]: 63 | 64 | book_info_list_data = [] 65 | 66 | for tag_url in tag_url_list_data: 67 | 68 | # 开始翻页,每20算一页 69 | for parse in range(0,parse_number*20+1,20): 70 | 71 | # 翻页URL 72 | parse_url = f'{tag_url}?start={parse}' 73 | 74 | html = requests.get(url=parse_url,headers=HEADERS) 75 | soup = BeautifulSoup(html.text,'lxml') 76 | 77 | # 选择书本 78 | books = soup.select('li.subject-item') 79 | 80 | for book in books: 81 | 82 | # 选择书本链接 83 | book_url = book.select_one('.info h2 a')['href'] 84 | 85 | # 选择书名 86 | title = book.select_one('.info h2 a').text.strip().replace(' ','').replace('\n','') 87 | 88 | # 选择作者 89 | info = book.select_one('.info div.pub').text.strip().replace(' ','').replace('\n','') 90 | 91 | # 选择评分 92 | star = book.select_one('.rating_nums').text.strip().replace(' ','').replace('\n','') 93 | 94 | # 选择评价 95 | pl = book.select_one('.pl').text.strip().replace(' ','').replace('\n','') 96 | 97 | # 选择书本简介 98 | introduce = book.select_one('.info p').text.strip().replace(' ','').replace('\n','') 99 | 100 | # 获取图片URL 101 | image_url = book.select_one('.nbg img')['src'] 102 | 103 | book_info_result = dict( 104 | 书本链接=book_url, 105 | 书名=title, 106 | 作者=info, 107 | 评分=star, 108 | 评价=pl, 109 | 书本简介=introduce, 110 | 图片链接=image_url 111 | ) 112 | 113 | '''生成结果hash值''' 114 | result_hash_data = hash(json.dumps(book_info_result,ensure_ascii=False)) 115 | 116 | if result_hash_data not in RESULT_SET_DATA: 117 | 118 | '''加入去重集合''' 119 | RESULT_SET_DATA.add(result_hash_data) 120 | 121 | if write_image_type: 122 | write_image_book_info( 123 | image_url=image_url, 124 | image_name=title, 125 | headers=HEADERS 126 | ) 127 | 128 | # 检查是否写入json文件 129 | if write_json_type: 130 | write_json_book_info(book_info_result) 131 | 132 | # 检查是否写入csv文件 133 | if write_csv_type: 134 | write_csv_book_info( 135 | headers=[key for key,value in book_info_result.items()], 136 | book_info=[value for key,value in book_info_result.items()] 137 | ) 138 | 139 | print(book_info_result) 140 | 141 | book_info_list_data.append(book_info_result) 142 | 143 | return book_info_list_data 144 | 145 | 146 | 147 | ''' 148 | 
保存图片,生成图片映射JSON文件 149 | params: 150 | image_url:str --> 图片链接 151 | image_name:str --> 图片名字 152 | headers: dict --> 协议头 153 | ''' 154 | def write_image_book_info(image_url:str,image_name:str,headers:dict): 155 | 156 | '''确保图片文件名不重复''' 157 | uuid_id = uuid4() 158 | 159 | filename = './保存图片/图片' 160 | 161 | image_file_name = f'{filename}/{uuid_id}.jpg' 162 | 163 | image_map_file_name = f'./保存图片/image_map_data.json' 164 | 165 | '''如果不存在文件夹则创建''' 166 | if not os.path.exists(filename): 167 | os.makedirs(filename) 168 | 169 | html = requests.get(url=image_url,headers=headers) 170 | 171 | '''写入图片''' 172 | with open(image_file_name,'wb') as f: 173 | 174 | f.write(html.content) 175 | 176 | '''保存图片映射JSON文件''' 177 | with open(image_map_file_name,'a+',encoding='utf-8') as f: 178 | 179 | f.write(json.dumps(dict(image_name=image_name,uuid=str(uuid_id),image_url=image_url),ensure_ascii=False)+'\n') 180 | 181 | 182 | 183 | ''' 184 | 将book的内容,写入json文件 185 | params: 186 | book_info: dict --> 爬取成功book的内容 187 | ''' 188 | def write_json_book_info(book_info:dict): 189 | 190 | with open('book_info.json','a+',encoding='utf-8') as f: 191 | 192 | ''' 193 | json.dumps() 将dict对象转成str对象,json就是str对象 194 | ensure_ascii=False 让json显示中文编码 195 | ''' 196 | f.write(json.dumps(book_info,ensure_ascii=False)+'\n') 197 | 198 | 199 | 200 | ''' 201 | 将book的内容,写入csv文件(带表头) 202 | params: 203 | headers:list --> CSV表头 204 | book_info: list --> 爬取成功book的内容 205 | ''' 206 | def write_csv_book_info(headers:list,book_info:list): 207 | 208 | ''' 209 | 跨平台问题: 210 | 写入csv 因为Windows有点BUG 211 | writerows()写入会出现空行 212 | 所以加入newline='' 213 | 没有出现这种情况则不需要 214 | ''' 215 | 216 | ''' 217 | 检查是否创建了CSV文件 218 | 没有则生成带有表头的CSV文件 219 | ''' 220 | if not os.path.exists('book_info.csv'): 221 | 222 | with open('book_info.csv','a+',encoding='utf-8',newline='') as f: 223 | 224 | f_csv = csv.writer(f) 225 | f_csv.writerow(headers) 226 | 227 | 228 | 229 | ''' 230 | 逐行开始写入CSV 231 | ''' 232 | with open('book_info.csv','a+',encoding='utf-8',newline='') as f: 233 | 234 | f_csv = csv.writer(f) 235 | f_csv.writerow(book_info) #逐行插入 236 | 237 | if __name__ == '__main__': 238 | 239 | book_tag_url = get_book_tag_url(1) 240 | 241 | book_url_info = parse_book_url_info(book_tag_url) -------------------------------------------------------------------------------- /豆瓣读书/分类实现版/【re实现】豆瓣读书爬虫.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import requests,json,csv,os,re 3 | from uuid import uuid4 4 | from urllib import parse 5 | 6 | '''主域名''' 7 | DOMAIN_URL = 'https://book.douban.com' 8 | 9 | ''' 10 | 协议头 11 | user-agent(必填) 12 | Referer(有就填,没有不填) 13 | Cookie(有账号登录就填,没有不填) 14 | ''' 15 | HEADERS = { 16 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36', 17 | 'Referer':'https://book.douban.com/', 18 | 'Cookie':'填写你的Cookie' 19 | } 20 | 21 | '''结果去重集合''' 22 | RESULT_SET_DATA = set() 23 | 24 | 25 | class ReFind(): 26 | 27 | def __init__(self,text): 28 | 29 | '''去除所有空格、换行''' 30 | self.text = re.sub('\s+','',text) 31 | 32 | 33 | 34 | ''' 35 | 【链式调用】传入指定正则表达式,获取第一个结果文本 36 | params: 37 | compile: str --> 指定正则表达式 38 | re_type:RegexFlag --> 匹配模式 39 | return: ReFind --> 实例化对象本身,方便进行链式调用 40 | ''' 41 | def add_search(self,compile:str,re_type=re.I|re.S): 42 | 43 | self.text = re.compile(compile,re_type).search(self.text).group() 44 | 45 | return self 46 | 47 | ''' 48 | 传入指定正则表达式,返回所有查询结果 49 | params: 50 | compile: str --> 指定正则表达式 51 | 
re_type:RegexFlag --> 匹配模式 52 | return: List[str] --> 正则匹配成功的结果 53 | ''' 54 | def find_all(self,compile:str,re_type=re.I|re.S) -> List[str]: 55 | 56 | return re.compile(compile,re_type).findall(self.text) 57 | 58 | 59 | 60 | ''' 61 | 打印当前文本 62 | return: str --> 当前对象的文本 63 | ''' 64 | def print(self) -> str: 65 | print(self.text) 66 | 67 | 68 | 69 | ''' 70 | 获取book的tag链接 71 | params: 72 | split_number: int --> 爬取几个tag链接,默认全部 73 | 74 | return: List[str] --> 确定爬取几个tag链接 75 | ''' 76 | def get_book_tag_url(split_number:int=None) -> List[str]: 77 | 78 | html = requests.get(url=DOMAIN_URL,headers=HEADERS) 79 | 80 | tag_url_list_data = [ 81 | DOMAIN_URL+ parse.quote(tag_url) 82 | for tag_url in ( 83 | ReFind(html.text) 84 | .add_search(r'<ulclass="hot-tags-col5s">.*?</ul>') 85 | .find_all(r'<ahref="(.*?)">') 86 | ) 87 | ] 88 | if split_number: 89 | tag_url_list_data = tag_url_list_data[:split_number] 90 | 91 | return tag_url_list_data 92 | 93 | 94 | ''' 95 | 解析tag_url,进行翻页后,获取book的内容 96 | params: 97 | tag_url_list_data: List[str] --> book的tag链接 98 | parse_number: int --> 翻页参数,默认爬取3页 99 | write_json_type等: bool --> 是否写入json/csv/图片文件 100 | return:List[dict] --> 爬取成功book的内容 101 | ''' 102 | def parse_book_url_info( 103 | tag_url_list_data:List[str], 104 | parse_number:int=3, 105 | write_json_type:bool=True, 106 | write_csv_type:bool=True, 107 | write_image_type:bool=True 108 | ) -> List[dict]: 109 | 110 | book_info_list_data = [] 111 | 112 | for tag_url in tag_url_list_data: 113 | 114 | # 开始翻页,每20算一页 115 | for parse in range(0,parse_number*20+1,20): 116 | 117 | # 翻页URL 118 | parse_url = f'{tag_url}?start={parse}' 119 | 120 | html = requests.get(url=parse_url,headers=HEADERS) 121 | 122 | # 选择书本(正则均针对ReFind去除空白后的HTML文本) 123 | books = ( 124 | ReFind(html.text) 125 | .find_all(r'<liclass="subject-item">(.*?)</li>') 126 | ) 127 | 128 | for book in books: 129 | 130 | # 选择书本链接 131 | book_url = ( 132 | ReFind(book) 133 | .find_all(r'<h2.*?<ahref="(.*?)"') 134 | )[0] 135 | 136 | # 选择书名 137 | title = ( 138 | ReFind(book) 139 | .find_all(r'<h2.*?<a.*?>(.*?)</a>') 140 | )[0].strip().replace(' ','').replace('\n','') 141 | 142 | # 选择作者 143 | info = ( 144 | ReFind(book) 145 | .find_all(r'<divclass="pub">(.*?)</div>') 146 | )[0].strip().replace(' ','').replace('\n','') 147 | 148 | # 选择评分 149 | star = ( 150 | ReFind(book) 151 | .find_all(r'<spanclass="rating_nums">(.*?)</span>') 152 | )[0].strip().replace(' ','').replace('\n','') 153 | 154 | # 选择评价 155 | pl = ( 156 | ReFind(book) 157 | .find_all(r'<spanclass="pl">(.*?)</span>') 158 | )[0].strip().replace(' ','').replace('\n','') 159 | 160 | 161 | # 选择书本简介 162 | introduce = ( 163 | ReFind(book) 164 | .find_all(r'<p>(.*?)</p>') 165 | )[0].strip().replace(' ','').replace('\n','') 166 | 167 | 168 | # 获取图片URL 169 | image_url = ( 170 | ReFind(book) 171 | .find_all(r'<img.*?src="(.*?)"') 172 | )[0] 173 | 174 | book_info_result = dict( 175 | 书本链接=book_url, 176 | 书名=title, 177 | 作者=info, 178 | 评分=star, 179 | 评价=pl, 180 | 书本简介=introduce, 181 | 图片链接=image_url 182 | ) 183 | 184 | '''生成结果hash值''' 185 | result_hash_data = hash(json.dumps(book_info_result,ensure_ascii=False)) 186 | 187 | if result_hash_data not in RESULT_SET_DATA: 188 | 189 | '''加入去重集合''' 190 | RESULT_SET_DATA.add(result_hash_data) 191 | 192 | if write_image_type: 193 | write_image_book_info( 194 | image_url=image_url, 195 | image_name=title, 196 | headers=HEADERS 197 | ) 198 | 199 | # 检查是否写入json文件 200 | if write_json_type: 201 | write_json_book_info(book_info_result) 202 | 203 | # 检查是否写入csv文件 204 | if write_csv_type: 205 | write_csv_book_info( 206 | headers=[key for key,value in book_info_result.items()], 207 | book_info=[value for key,value in book_info_result.items()] 208 | ) 209 | 210 | print(book_info_result) 211 | 212 | book_info_list_data.append(book_info_result) 213 | 214 | return book_info_list_data 215 | 216 | 217 | ''' 218 | 保存图片,生成图片映射JSON文件 219 | params: 220 | image_url:str --> 图片链接 221 | image_name:str --> 图片名字 222 | headers: dict --> 协议头 223 | ''' 224 | def write_image_book_info(image_url:str,image_name:str,headers:dict): 225 | 226 | '''确保图片文件名不重复''' 227 | uuid_id = uuid4() 228 | 229 | filename = './保存图片/图片' 230 | 231 | image_file_name = f'{filename}/{uuid_id}.jpg' 232 | 233 | image_map_file_name = f'./保存图片/image_map_data.json' 234 | 235 | '''如果不存在文件夹则创建''' 236 | if not os.path.exists(filename): 237 | os.makedirs(filename) 238 | 239 | html = requests.get(url=image_url,headers=headers) 240 | 241 | '''写入图片''' 242 | with open(image_file_name,'wb') as f: 243 | 244 | f.write(html.content) 245 | 246 | '''保存图片映射JSON文件''' 247 | with open(image_map_file_name,'a+',encoding='utf-8') as f: 248 | 249 | f.write(json.dumps(dict(image_name=image_name,uuid=str(uuid_id),image_url=image_url),ensure_ascii=False)+'\n') 250 | 251 | 252 | 253 | ''' 254 | 将book的内容,写入json文件 255 | params: 256 | book_info: dict --> 爬取成功book的内容 257 | ''' 258 | def write_json_book_info(book_info:dict): 259 | 260 | with open('book_info.json','a+',encoding='utf-8') as f: 261 | 262 | ''' 263 | json.dumps() 将dict对象转成str对象,json就是str对象 264 | ensure_ascii=False 让json显示中文编码 265 | ''' 266 | f.write(json.dumps(book_info,ensure_ascii=False)+'\n') 267 | 268 | 269 | 270 | ''' 271 | 将book的内容,写入csv文件(带表头) 272 | params: 273 | headers:list --> CSV表头 274 | book_info: list --> 爬取成功book的内容 275 | ''' 276 | def write_csv_book_info(headers:list,book_info:list): 277 | 278 | ''' 279 | 跨平台问题: 280 | 写入csv 因为Windows有点BUG 281 | writerows()写入会出现空行 282 | 所以加入newline='' 283 | 没有出现这种情况则不需要 284 | ''' 285 | 286 | ''' 287 | 检查是否创建了CSV文件 288 | 没有则生成带有表头的CSV文件 289 | ''' 290 | if not os.path.exists('book_info.csv'): 291 | 292 | with open('book_info.csv','a+',encoding='utf-8',newline='') as f: 293 | 294 | f_csv = csv.writer(f) 295 | f_csv.writerow(headers) 296 | 297 | 298 | 299 | ''' 300 | 逐行开始写入CSV 301 | ''' 302 | with open('book_info.csv','a+',encoding='utf-8',newline='') as f: 303 | 304 | f_csv = csv.writer(f) 305 | f_csv.writerow(book_info) #逐行插入 306 | 307 | if __name__ == '__main__': 308 | 309 | book_tag_url = get_book_tag_url(1) 310 | 311 | book_url_info = parse_book_url_info(book_tag_url) -------------------------------------------------------------------------------- /豆瓣读书/分类实现版/【xpath实现】豆瓣读书爬虫.py: 
-------------------------------------------------------------------------------- 1 | from typing import List 2 | import requests,json,csv,os 3 | from uuid import uuid4 4 | from lxml import etree 5 | from urllib import parse 6 | 7 | '''主域名''' 8 | DOMAIN_URL = 'https://book.douban.com' 9 | 10 | ''' 11 | 协议头 12 | user-agent(必填) 13 | Referer(有就填,没有不填) 14 | Cookie(有账号登录就填,没有不填) 15 | ''' 16 | HEADERS = { 17 | 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36', 18 | 'Referer':'https://book.douban.com/', 19 | 'Cookie':'填写你的Cookie' 20 | } 21 | 22 | '''结果去重集合''' 23 | RESULT_SET_DATA = set() 24 | 25 | ''' 26 | 获取book的tag链接 27 | params: 28 | parse_number: int --> 爬取几个tag链接,默认全部 29 | 30 | return: List[str] --> 确定爬取几个tag链接 31 | ''' 32 | def get_book_tag_url(split_number:int=None) -> List[str]: 33 | 34 | html = requests.get(url=DOMAIN_URL,headers=HEADERS) 35 | soup = etree.HTML(html.text) 36 | 37 | tag_url_list_data = [ 38 | DOMAIN_URL+ parse.quote(tag_url) 39 | for tag_url in soup.xpath('//ul[@class="hot-tags-col5 s"]//ul//a/@href') 40 | ] 41 | 42 | if split_number: 43 | tag_url_list_data = tag_url_list_data[:split_number] 44 | 45 | return tag_url_list_data 46 | 47 | 48 | ''' 49 | 解析tag_url,进行翻页后,获取book的内容 50 | params: 51 | tag_url_list_data: List[str] --> book的tag链接 52 | parse_number: int --> 翻页参数,默认爬取3页 53 | write_type: bool --> 是否写入json文件 54 | return:List[dict] --> 爬取成功book的内容 55 | ''' 56 | def parse_book_url_info( 57 | tag_url_list_data:List[str], 58 | parse_number:int=3, 59 | write_json_type:bool=True, 60 | write_csv_type:bool=True, 61 | write_image_type:bool=True 62 | ) -> List[dict]: 63 | 64 | book_info_list_data = [] 65 | 66 | for tag_url in tag_url_list_data: 67 | 68 | # 开始翻页,每20算一页 69 | for parse in range(0,parse_number*20+1,20): 70 | 71 | # 翻页URL 72 | parse_url = f'{tag_url}?start={parse}' 73 | 74 | html = requests.get(url=parse_url,headers=HEADERS) 75 | soup = etree.HTML(html.text) 76 | 77 | # 选择书本 78 | books = soup.xpath('//li[@class="subject-item"]') 79 | 80 | for book in books: 81 | 82 | # 选择书本链接 83 | book_url = book.xpath('.//h2/a/@href')[0] 84 | 85 | # 选择书名 86 | title = book.xpath('.//h2/a/text()')[0].strip().replace(' ','').replace('\n','') 87 | 88 | # 选择作者 89 | info = book.xpath('.//div[@class="pub"]/text()')[0].strip().replace(' ','').replace('\n','') 90 | 91 | # 选择评分 92 | star = book.xpath('.//span[@class="rating_nums"]/text()')[0].strip().replace(' ','').replace('\n','') 93 | 94 | # 选择评价 95 | pl = book.xpath('.//span[@class="pl"]/text()')[0].strip().replace(' ','').replace('\n','') 96 | 97 | # 选择书本简介 98 | introduce = book.xpath('.//p/text()')[0].strip().replace(' ','').replace('\n','') 99 | 100 | # 获取图片URL 101 | image_url = book.xpath('.//img/@src')[0] 102 | 103 | book_info_result = dict( 104 | 书本链接=book_url, 105 | 书名=title, 106 | 作者=info, 107 | 评分=star, 108 | 评价=pl, 109 | 书本简介=introduce, 110 | 图片链接=image_url 111 | ) 112 | 113 | '''生成结果hash值''' 114 | result_hash_data = hash(json.dumps(book_info_result,ensure_ascii=False)) 115 | 116 | if result_hash_data not in RESULT_SET_DATA: 117 | 118 | '''加入去重集合''' 119 | RESULT_SET_DATA.add(result_hash_data) 120 | 121 | if write_image_type: 122 | write_image_book_info( 123 | image_url=image_url, 124 | image_name=title, 125 | headers=HEADERS 126 | ) 127 | 128 | # 检查是否写入json文件 129 | if write_json_type: 130 | write_json_book_info(book_info_result) 131 | 132 | # 检查是否写入csv文件 133 | if write_csv_type: 134 | write_csv_book_info( 135 | headers=[key for key,value in 
book_info_result.items()], 136 | book_info=[value for key,value in book_info_result.items()] 137 | ) 138 | 139 | print(book_info_result) 140 | 141 | book_info_list_data.append(book_info_result) 142 | 143 | return book_info_list_data 144 | 145 | 146 | ''' 147 | 保存图片,生成图片映射JSON文件 148 | params: 149 | image_url:str --> 图片链接 150 | image_name:str --> 图片名字 151 | headers: dict --> 协议头 152 | ''' 153 | def write_image_book_info(image_url:str,image_name:str,headers:dict): 154 | 155 | '''确保图片文件名不重复''' 156 | uuid_id = uuid4() 157 | 158 | filename = './保存图片/图片' 159 | 160 | image_file_name = f'{filename}/{uuid_id}.jpg' 161 | 162 | image_map_file_name = f'./保存图片/image_map_data.json' 163 | 164 | '''如果不存在文件夹则创建''' 165 | if not os.path.exists(filename): 166 | os.makedirs(filename) 167 | 168 | html = requests.get(url=image_url,headers=headers) 169 | 170 | '''写入图片''' 171 | with open(image_file_name,'wb') as f: 172 | 173 | f.write(html.content) 174 | 175 | '''保存图片映射JSON文件''' 176 | with open(image_map_file_name,'a+',encoding='utf-8') as f: 177 | 178 | f.write(json.dumps(dict(image_name=image_name,uuid=str(uuid_id),image_url=image_url),ensure_ascii=False)+'\n') 179 | 180 | 181 | 182 | ''' 183 | 将book的内容,写入json文件 184 | params: 185 | book_info: dict --> 爬取成功book的内容 186 | ''' 187 | def write_json_book_info(book_info:dict): 188 | 189 | with open('book_info.json','a+',encoding='utf-8') as f: 190 | 191 | ''' 192 | json.dumps() 将dict对象转成str对象,json就是str对象 193 | ensure_ascii=False 让json显示中文编码 194 | ''' 195 | f.write(json.dumps(book_info,ensure_ascii=False)+'\n') 196 | 197 | 198 | 199 | ''' 200 | 将book的内容,写入csv文件(带表头) 201 | params: 202 | headers:list --> CSV表头 203 | book_info: list --> 爬取成功book的内容 204 | ''' 205 | def write_csv_book_info(headers:list,book_info:list): 206 | 207 | ''' 208 | 跨平台问题: 209 | 写入csv 因为Windows有点BUG 210 | writerows()写入会出现空行 211 | 所以加入newline='' 212 | 没有出现这种情况则不需要 213 | ''' 214 | 215 | ''' 216 | 检查是否创建了CSV文件 217 | 没有则生成带有表头的CSV文件 218 | ''' 219 | if not os.path.exists('book_info.csv'): 220 | 221 | with open('book_info.csv','a+',encoding='utf-8',newline='') as f: 222 | 223 | f_csv = csv.writer(f) 224 | f_csv.writerow(headers) 225 | 226 | 227 | 228 | ''' 229 | 逐行开始写入CSV 230 | ''' 231 | with open('book_info.csv','a+',encoding='utf-8',newline='') as f: 232 | 233 | f_csv = csv.writer(f) 234 | f_csv.writerow(book_info) #逐行插入 235 | 236 | if __name__ == '__main__': 237 | 238 | book_tag_url = get_book_tag_url(1) 239 | 240 | book_url_info = parse_book_url_info(book_tag_url) --------------------------------------------------------------------------------
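The three 分类实现版 scripts differ only in the extraction layer (bs4 selectors, regexes over whitespace-stripped HTML, xpath expressions); the paging, hash-based dedup, and the three write_* sinks are identical. A quick offline way to sanity-check the xpath layer above is to run it against a static fragment; a minimal sketch (the HTML snippet is made up for illustration and only mirrors the structure the crawler expects):

    from lxml import etree

    # made-up fragment shaped like one douban list item
    snippet = '''
    <li class="subject-item">
      <div class="info">
        <h2><a href="https://book.douban.com/subject/1/">Example Title</a></h2>
        <div class="pub">Author / Publisher / 2020</div>
        <span class="rating_nums">9.0</span>
        <span class="pl">(12345)</span>
        <p>One-line introduction.</p>
      </div>
    </li>'''

    book = etree.HTML(snippet).xpath('//li[@class="subject-item"]')[0]
    print(book.xpath('.//h2/a/@href')[0])                         # book link
    print(book.xpath('.//h2/a/text()')[0].strip())                # title
    print(book.xpath('.//div[@class="pub"]/text()')[0].strip())   # author line
    print(book.xpath('.//span[@class="rating_nums"]/text()')[0])  # rating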