├── 12306.py ├── Netease ├── Netease.py └── music_list.txt ├── README.md ├── baiduwenku.py ├── baiduwenku_pro_1.py ├── baiwan ├── app.js ├── baiwan.py ├── file.txt ├── index.html └── question.txt ├── bilibili ├── README.md ├── bilibili.py └── xml2ass.py ├── biqukan.py ├── cartoon ├── cartoon │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-34.pyc │ │ ├── items.cpython-34.pyc │ │ ├── pipelines.cpython-34.pyc │ │ └── settings.cpython-34.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-34.pyc │ │ └── comic_spider.cpython-34.pyc │ │ └── comic_spider.py └── scrapy.cfg ├── daili.py ├── dingdong ├── README.md └── jd.py ├── douyin.py ├── douyin_pro.py ├── douyin_pro_2.py ├── downloader.py ├── financical.py ├── geetest.py ├── hero.py ├── one_hour_spider ├── biqukan.py ├── unsplash.py └── vidoe_downloader.py ├── shuaia.py └── video_downloader ├── Images ├── bg.png └── qrcode.png ├── MyQR ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-34.pyc │ ├── __init__.cpython-35.pyc │ ├── myqr.cpython-34.pyc │ ├── myqr.cpython-35.pyc │ └── terminal.cpython-35.pyc ├── mylibs │ ├── ECC.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── ECC.cpython-34.pyc │ │ ├── ECC.cpython-35.pyc │ │ ├── __init__.cpython-34.pyc │ │ ├── __init__.cpython-35.pyc │ │ ├── constant.cpython-34.pyc │ │ ├── constant.cpython-35.pyc │ │ ├── data.cpython-34.pyc │ │ ├── data.cpython-35.pyc │ │ ├── draw.cpython-34.pyc │ │ ├── draw.cpython-35.pyc │ │ ├── matrix.cpython-34.pyc │ │ ├── matrix.cpython-35.pyc │ │ ├── structure.cpython-34.pyc │ │ ├── structure.cpython-35.pyc │ │ ├── theqrmodule.cpython-34.pyc │ │ └── theqrmodule.cpython-35.pyc │ ├── constant.py │ ├── data.py │ ├── draw.py │ ├── matrix.py │ ├── structure.py │ └── theqrmodule.py ├── myqr.py └── terminal.py ├── requirements.txt └── video_downloader.py /12306.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: liuyw 4 | """ 5 | from splinter.browser import Browser 6 | from time import sleep 7 | import traceback 8 | import time, sys 9 | 10 | class huoche(object): 11 | driver_name = '' 12 | executable_path = '' 13 | #用户名,密码 14 | username = u"xxx" 15 | passwd = u"xxx" 16 | # cookies值得自己去找, 下面两个分别是沈阳, 哈尔滨 17 | starts = u"%u6C88%u9633%2CSYT" 18 | ends = u"%u54C8%u5C14%u6EE8%2CHBB" 19 | 20 | # 时间格式2018-01-19 21 | dtime = u"2018-01-19" 22 | # 车次,选择第几趟,0则从上之下依次点击 23 | order = 0 24 | ###乘客名 25 | users = [u"xxx",u"xxx"] 26 | ##席位 27 | xb = u"二等座" 28 | pz = u"成人票" 29 | 30 | """网址""" 31 | ticket_url = "https://kyfw.12306.cn/otn/leftTicket/init" 32 | login_url = "https://kyfw.12306.cn/otn/login/init" 33 | initmy_url = "https://kyfw.12306.cn/otn/index/initMy12306" 34 | buy = "https://kyfw.12306.cn/otn/confirmPassenger/initDc" 35 | 36 | def __init__(self): 37 | self.driver_name = 'chrome' 38 | self.executable_path = 'D:/chromedriver' 39 | 40 | def login(self): 41 | self.driver.visit(self.login_url) 42 | self.driver.fill("loginUserDTO.user_name", self.username) 43 | # sleep(1) 44 | self.driver.fill("userDTO.password", self.passwd) 45 | print(u"等待验证码,自行输入...") 46 | while True: 47 | if self.driver.url != self.initmy_url: 48 | sleep(1) 49 | else: 50 | break 51 | 52 | def start(self): 53 | self.driver = Browser(driver_name=self.driver_name,executable_path=self.executable_path) 54 | self.driver.driver.set_window_size(1400, 1000) 55 | self.login() 56 | # sleep(1) 57 | self.driver.visit(self.ticket_url) 58 | 
try: 59 | print(u"购票页面开始...") 60 | # sleep(1) 61 | # 加载查询信息 62 | self.driver.cookies.add({"_jc_save_fromStation": self.starts}) 63 | self.driver.cookies.add({"_jc_save_toStation": self.ends}) 64 | self.driver.cookies.add({"_jc_save_fromDate": self.dtime}) 65 | 66 | self.driver.reload() 67 | 68 | count = 0 69 | if self.order != 0: 70 | while self.driver.url == self.ticket_url: 71 | self.driver.find_by_text(u"查询").click() 72 | count += 1 73 | print(u"循环点击查询... 第 %s 次" % count) 74 | # sleep(1) 75 | try: 76 | self.driver.find_by_text(u"预订")[self.order - 1].click() 77 | except Exception as e: 78 | print(e) 79 | print(u"还没开始预订") 80 | continue 81 | else: 82 | while self.driver.url == self.ticket_url: 83 | self.driver.find_by_text(u"查询").click() 84 | count += 1 85 | print(u"循环点击查询... 第 %s 次" % count) 86 | # sleep(0.8) 87 | try: 88 | for i in self.driver.find_by_text(u"预订"): 89 | i.click() 90 | sleep(1) 91 | except Exception as e: 92 | print(e) 93 | print(u"还没开始预订 %s" % count) 94 | continue 95 | print(u"开始预订...") 96 | # sleep(3) 97 | # self.driver.reload() 98 | sleep(1) 99 | print(u'开始选择用户...') 100 | for user in self.users: 101 | self.driver.find_by_text(user).last.click() 102 | 103 | print(u"提交订单...") 104 | sleep(1) 105 | self.driver.find_by_text(self.pz).click() 106 | self.driver.find_by_id('').select(self.pz) 107 | # sleep(1) 108 | self.driver.find_by_text(self.xb).click() 109 | sleep(1) 110 | self.driver.find_by_id('submitOrder_id').click() 111 | print(u"开始选座...") 112 | self.driver.find_by_id('1D').last.click() 113 | self.driver.find_by_id('1F').last.click() 114 | 115 | sleep(1.5) 116 | print(u"确认选座...") 117 | self.driver.find_by_id('qr_submit_id').click() 118 | 119 | except Exception as e: 120 | print(e) 121 | 122 | if __name__ == '__main__': 123 | huoche = huoche() 124 | huoche.start() -------------------------------------------------------------------------------- /Netease/Netease.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import requests, hashlib, sys, click, re, base64, binascii, json, os 3 | from Crypto.Cipher import AES 4 | from http import cookiejar 5 | 6 | """ 7 | Website:http://cuijiahua.com 8 | Author:Jack Cui 9 | Refer:https://github.com/darknessomi/musicbox 10 | """ 11 | 12 | class Encrypyed(): 13 | """ 14 | 解密算法 15 | """ 16 | def __init__(self): 17 | self.modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7' 18 | self.nonce = '0CoJUm6Qyw8W8jud' 19 | self.pub_key = '010001' 20 | 21 | # 登录加密算法, 基于https://github.com/stkevintan/nw_musicbox脚本实现 22 | def encrypted_request(self, text): 23 | text = json.dumps(text) 24 | sec_key = self.create_secret_key(16) 25 | enc_text = self.aes_encrypt(self.aes_encrypt(text, self.nonce), sec_key.decode('utf-8')) 26 | enc_sec_key = self.rsa_encrpt(sec_key, self.pub_key, self.modulus) 27 | data = {'params': enc_text, 'encSecKey': enc_sec_key} 28 | return data 29 | 30 | def aes_encrypt(self, text, secKey): 31 | pad = 16 - len(text) % 16 32 | text = text + chr(pad) * pad 33 | encryptor = AES.new(secKey.encode('utf-8'), AES.MODE_CBC, b'0102030405060708') 34 | ciphertext = encryptor.encrypt(text.encode('utf-8')) 35 | ciphertext = base64.b64encode(ciphertext).decode('utf-8') 36 | return ciphertext 37 | 38 | def rsa_encrpt(self, text, pubKey, modulus): 39 | text = text[::-1] 
40 | rs = pow(int(binascii.hexlify(text), 16), int(pubKey, 16), int(modulus, 16)) 41 | return format(rs, 'x').zfill(256) 42 | 43 | def create_secret_key(self, size): 44 | return binascii.hexlify(os.urandom(size))[:16] 45 | 46 | 47 | class Song(): 48 | """ 49 | 歌曲对象,用于存储歌曲的信息 50 | """ 51 | def __init__(self, song_id, song_name, song_num, song_url=None): 52 | self.song_id = song_id 53 | self.song_name = song_name 54 | self.song_num = song_num 55 | self.song_url = '' if song_url is None else song_url 56 | 57 | class Crawler(): 58 | """ 59 | 网易云爬取API 60 | """ 61 | def __init__(self, timeout=60, cookie_path='.'): 62 | self.headers = { 63 | 'Accept': '*/*', 64 | 'Accept-Encoding': 'gzip,deflate,sdch', 65 | 'Accept-Language': 'zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4', 66 | 'Connection': 'keep-alive', 67 | 'Content-Type': 'application/x-www-form-urlencoded', 68 | 'Host': 'music.163.com', 69 | 'Referer': 'http://music.163.com/search/', 70 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' 71 | } 72 | self.session = requests.Session() 73 | self.session.headers.update(self.headers) 74 | self.session.cookies = cookiejar.LWPCookieJar(cookie_path) 75 | self.download_session = requests.Session() 76 | self.timeout = timeout 77 | self.ep = Encrypyed() 78 | 79 | def post_request(self, url, params): 80 | """ 81 | Post请求 82 | :return: 字典 83 | """ 84 | 85 | data = self.ep.encrypted_request(params) 86 | resp = self.session.post(url, data=data, timeout=self.timeout) 87 | result = resp.json() 88 | if result['code'] != 200: 89 | click.echo('post_request error') 90 | else: 91 | return result 92 | 93 | def search(self, search_content, search_type, limit=9): 94 | """ 95 | 搜索API 96 | :params search_content: 搜索内容 97 | :params search_type: 搜索类型 98 | :params limit: 返回结果数量 99 | :return: 字典. 100 | """ 101 | 102 | url = 'http://music.163.com/weapi/cloudsearch/get/web?csrf_token=' 103 | params = {'s': search_content, 'type': search_type, 'offset': 0, 'sub': 'false', 'limit': limit} 104 | result = self.post_request(url, params) 105 | return result 106 | 107 | def search_song(self, song_name, song_num, quiet=True, limit=9): 108 | """ 109 | 根据音乐名搜索 110 | :params song_name: 音乐名 111 | :params song_num: 下载的歌曲数 112 | :params quiet: 自动选择匹配最优结果 113 | :params limit: 返回结果数量 114 | :return: Song独享 115 | """ 116 | 117 | result = self.search(song_name, search_type=1, limit=limit) 118 | 119 | if result['result']['songCount'] <= 0: 120 | click.echo('Song {} not existed.'.format(song_name)) 121 | else: 122 | songs = result['result']['songs'] 123 | if quiet: 124 | song_id, song_name = songs[0]['id'], songs[0]['name'] 125 | song = Song(song_id=song_id, song_name=song_name, song_num=song_num) 126 | return song 127 | 128 | def get_song_url(self, song_id, bit_rate=320000): 129 | """ 130 | 获得歌曲的下载地址 131 | :params song_id: 音乐ID. 
132 | :params bit_rate: {'MD 128k': 128000, 'HD 320k': 320000} 133 | :return: 歌曲下载地址 134 | """ 135 | 136 | url = 'http://music.163.com/weapi/song/enhance/player/url?csrf_token=' 137 | csrf = '' 138 | params = {'ids': [song_id], 'br': bit_rate, 'csrf_token': csrf} 139 | result = self.post_request(url, params) 140 | # 歌曲下载地址 141 | song_url = result['data'][0]['url'] 142 | 143 | # 歌曲不存在 144 | if song_url is None: 145 | click.echo('Song {} is not available due to copyright issue.'.format(song_id)) 146 | else: 147 | return song_url 148 | 149 | def get_song_by_url(self, song_url, song_name, song_num, folder): 150 | """ 151 | 下载歌曲到本地 152 | :params song_url: 歌曲下载地址 153 | :params song_name: 歌曲名字 154 | :params song_num: 下载的歌曲数 155 | :params folder: 保存路径 156 | """ 157 | if not os.path.exists(folder): 158 | os.makedirs(folder) 159 | fpath = os.path.join(folder, str(song_num) + '_' + song_name + '.mp3') 160 | if sys.platform == 'win32' or sys.platform == 'cygwin': 161 | valid_name = re.sub(r'[<>:"/\\|?*]', '', song_name) 162 | if valid_name != song_name: 163 | click.echo('{} will be saved as: {}.mp3'.format(song_name, valid_name)) 164 | fpath = os.path.join(folder, str(song_num) + '_' + valid_name + '.mp3') 165 | 166 | if not os.path.exists(fpath): 167 | resp = self.download_session.get(song_url, timeout=self.timeout, stream=True) 168 | length = int(resp.headers.get('content-length')) 169 | label = 'Downloading {} {}kb'.format(song_name, int(length/1024)) 170 | 171 | with click.progressbar(length=length, label=label) as progressbar: 172 | with open(fpath, 'wb') as song_file: 173 | for chunk in resp.iter_content(chunk_size=1024): 174 | if chunk: 175 | song_file.write(chunk) 176 | progressbar.update(1024) 177 | 178 | 179 | class Netease(): 180 | """ 181 | 网易云音乐下载 182 | """ 183 | def __init__(self, timeout, folder, quiet, cookie_path): 184 | self.crawler = Crawler(timeout, cookie_path) 185 | self.folder = '.' 
if folder is None else folder 186 | self.quiet = quiet 187 | 188 | def download_song_by_search(self, song_name, song_num): 189 | """ 190 | 根据歌曲名进行搜索 191 | :params song_name: 歌曲名字 192 | :params song_num: 下载的歌曲数 193 | """ 194 | 195 | try: 196 | song = self.crawler.search_song(song_name, song_num, self.quiet) 197 | except: 198 | click.echo('download_song_by_serach error') 199 | # 如果找到了音乐, 则下载 200 | if song != None: 201 | self.download_song_by_id(song.song_id, song.song_name, song.song_num, self.folder) 202 | 203 | def download_song_by_id(self, song_id, song_name, song_num, folder='.'): 204 | """ 205 | 通过歌曲的ID下载 206 | :params song_id: 歌曲ID 207 | :params song_name: 歌曲名 208 | :params song_num: 下载的歌曲数 209 | :params folder: 保存地址 210 | """ 211 | try: 212 | url = self.crawler.get_song_url(song_id) 213 | # 去掉非法字符 214 | song_name = song_name.replace('/', '') 215 | song_name = song_name.replace('.', '') 216 | self.crawler.get_song_by_url(url, song_name, song_num, folder) 217 | 218 | except: 219 | click.echo('download_song_by_id error') 220 | 221 | 222 | if __name__ == '__main__': 223 | timeout = 60 224 | output = 'Musics' 225 | quiet = True 226 | cookie_path = 'Cookie' 227 | netease = Netease(timeout, output, quiet, cookie_path) 228 | music_list_name = 'music_list.txt' 229 | # 如果music列表存在, 那么开始下载 230 | if os.path.exists(music_list_name): 231 | with open(music_list_name, 'r') as f: 232 | music_list = list(map(lambda x: x.strip(), f.readlines())) 233 | for song_num, song_name in enumerate(music_list): 234 | netease.download_song_by_search(song_name,song_num + 1) 235 | else: 236 | click.echo('music_list.txt not exist.') -------------------------------------------------------------------------------- /Netease/music_list.txt: -------------------------------------------------------------------------------- 1 | 風見鶏 2 | 外婆的话【不才】 3 | We Don't Talk Anymore 4 | 【电吉他】《青鸟》 5 | 小棋童 6 | 千本桜(古筝版) 7 | 妄为 8 | 借我 9 | 你到底有没有爱过我 10 | 七月上 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python Spider 2 | 3 | * 贵有恒,何必三更起五更睡;最无益,只怕一日暴十寒。 4 | * Python3爬虫实战:实战源码+博客讲解 5 | * [个人网站](http://cuijiahua.com "悬停显示") 6 | * [CSDN博客](http://blog.csdn.net/c406495762 "悬停显示") 7 | * [CSDN爬虫专栏](http://blog.csdn.net/column/details/15321.html "悬停显示")
8 | * 学习交流群【328127489】Coder
9 | 10 | ## 声明 11 | 12 | * 代码、教程均为Jack Cui本人原创,且仅限于学习交流,请勿用于任何商业用途! 13 | 14 | ### 文章首发声明 15 | 16 | * 文章在自己的个人网站首发,其他平台文章均属转发,如想获得最新更新进展,欢迎关注我的个人网站:http://cuijiahua.com/ 17 | 18 | ## 目录 19 | 20 | * [爬虫小工具](#爬虫小工具) 21 | * [文件下载小助手](https://github.com/Jack-Cherish/python-spider/blob/master/downloader.py "悬停显示") 22 | * [爬虫实战](#爬虫实战) 23 | * [笔趣看小说下载](https://github.com/Jack-Cherish/python-spider/blob/master/biqukan.py "悬停显示") 24 | * [VIP视频下载](https://github.com/Jack-Cherish/python-spider/tree/master/video_downloader "悬停显示") 25 | * [百度文库文章下载_rev1](https://github.com/Jack-Cherish/python-spider/blob/master/baiduwenku.py "悬停显示") 26 | * [百度文库文章下载_rev2](https://github.com/Jack-Cherish/python-spider/blob/master/baiduwenku_pro_1.py "悬停显示") 27 | * [《帅啊》网帅哥图片下载](https://github.com/Jack-Cherish/python-spider/blob/master/shuaia.py "悬停显示") 28 | * [构建代理IP池](https://github.com/Jack-Cherish/python-spider/blob/master/daili.py "悬停显示") 29 | * [《火影忍者》漫画下载](https://github.com/Jack-Cherish/python-spider/tree/master/cartoon "悬停显示") 30 | * [财务报表下载小助手](https://github.com/Jack-Cherish/python-spider/blob/master/financical.py "悬停显示") 31 | * [一小时入门网络爬虫](https://github.com/Jack-Cherish/python-spider/tree/master/one_hour_spider "悬停显示") 32 | * [抖音App视频下载_rev1](https://github.com/Jack-Cherish/python-spider/blob/master/douyin.py "悬停显示") 33 | * [抖音App视频下载_rev2](https://github.com/Jack-Cherish/python-spider/blob/master/douyin_pro.py "悬停显示") 34 | * [抖音App视频下载_rev3](https://github.com/Jack-Cherish/python-spider/blob/master/douyin_pro_2.py "悬停显示") 35 | * [GEETEST验证码破解](https://github.com/Jack-Cherish/python-spider/blob/master/geetest.py "悬停显示") 36 | * [12306抢票小助手](https://github.com/Jack-Cherish/python-spider/blob/master/12306.py "悬停显示") 37 | * [百万英雄答题辅助系统](https://github.com/Jack-Cherish/python-spider/tree/master/baiwan "悬停显示") 38 | * [网易云音乐批量下载](https://github.com/Jack-Cherish/python-spider/tree/master/Netease "悬停显示") 39 | * [B站视频和弹幕批量下载](https://github.com/Jack-Cherish/python-spider/tree/master/bilibili "悬停显示") 40 | * [京东商品晒单图下载](https://github.com/Jack-Cherish/python-spider/tree/master/dingdong "悬停显示") 41 | * [其它](#其它) 42 | 43 | ## 爬虫小工具 44 | 45 | * downloader.py:文件下载小助手 46 | 47 | 一个可以用于下载图片、视频、文件的小工具,有下载进度显示功能。稍加修改即可添加到自己的爬虫中。 48 | 49 | 动态示意图: 50 | 51 | ![image](https://raw.githubusercontent.com/Jack-Cherish/Pictures/master/9.gif) 52 | 53 | ## 爬虫实战 54 | 55 | * biqukan.py:《笔趣看》盗版小说网站,爬取小说工具 56 | 57 | 第三方依赖库安装: 58 | 59 | pip3 install beautifulsoup4 60 | 61 | 使用方法: 62 | 63 | python biqukan.py 64 | 65 | * video_downloader:爱奇艺等主流视频网站的VIP视频破解助手(暂只支持PC和手机在线观看VIP视频!) 
66 | 67 | 感谢Python3二维码生成器作者:https://github.com/sylnsfar/qrcode 68 | 69 | 编译好的软件下载连接:https://pan.baidu.com/s/1bqSTNJL 密码:p8bs 70 | 71 | 解压密码:`cuijiahua.com` 72 | 73 | 无需Python3环境,在Windows下,解压即用![软件使用方法](http://blog.csdn.net/c406495762/article/details/71334633 "悬停显示") 74 | 75 | 源码可查看`video_downloader`,运行源码需要搭建Python3环境,并安装相应第三方依赖库: 76 | 77 | 在`video_downloader`文件夹下,安装第三方依赖库: 78 | 79 | pip3 install -r requirements.txt 80 | 81 | 使用方法: 82 | 83 | python movie_downloader.py 84 | 85 | 运行环境: 86 | 87 | Windows, Python3 88 | 89 | Linux, Python3 90 | 91 | Mac, Python3 92 | 93 | * baiduwenku.py: 百度文库word文章爬取 94 | 95 | 原理说明:http://blog.csdn.net/c406495762/article/details/72331737 96 | 97 | 代码不完善,没有进行打包,不具通用性,纯属娱乐,以后有时间会完善。 98 | 99 | * shuaia.py: 爬取《帅啊》网,帅哥图片 100 | 101 | 《帅啊》网URL:http://www.shuaia.net/index.html 102 | 103 | 原理说明:http://blog.csdn.net/c406495762/article/details/72597755 104 | 105 | 第三方依赖库安装: 106 | 107 | pip3 install requests beautifulsoup4 108 | 109 | * daili.py: 构建代理IP池 110 | 111 | 原理说明:http://blog.csdn.net/c406495762/article/details/72793480 112 | 113 | 114 | * carton: 使用Scrapy爬取《火影忍者》漫画 115 | 116 | 代码可以爬取整个《火影忍者》漫画所有章节的内容,保存到本地。更改地址,可以爬取其他漫画。保存地址可以在settings.py中修改。 117 | 118 | 动漫网站:http://comic.kukudm.com/ 119 | 120 | 原理说明:http://blog.csdn.net/c406495762/article/details/72858983 121 | 122 | * hero.py: 《王者荣耀》推荐出装查询小助手 123 | 124 | 网页爬取已经会了,想过爬取手机APP里的内容吗? 125 | 126 | 原理说明:http://blog.csdn.net/c406495762/article/details/76850843 127 | 128 | * financical.py: 财务报表下载小助手 129 | 130 | 爬取的数据存入数据库会吗?《跟股神巴菲特学习炒股之财务报表入库(MySQL)》也许能给你一些思路。 131 | 132 | 原理说明:http://blog.csdn.net/c406495762/article/details/77801899 133 | 134 | 动态示意图: 135 | 136 | ![image](https://raw.githubusercontent.com/Jack-Cherish/Pictures/master/10.gif) 137 | 138 | * one_hour_spider:一小时入门Python3网络爬虫。 139 | 140 | 原理说明: 141 | 142 | * 知乎:https://zhuanlan.zhihu.com/p/29809609 143 | * CSDN:http://blog.csdn.net/c406495762/article/details/78123502 144 | 145 | 本次实战内容有: 146 | 147 | * 网络小说下载(静态网站)-biqukan 148 | * 优美壁纸下载(动态网站)-unsplash 149 | * 爱奇艺VIP视频下载 150 | 151 | * douyin.py:抖音App视频下载 152 | 153 | 抖音App的视频下载,就是普通的App爬取。 154 | 155 | 原理说明: 156 | 157 | * 个人网站:http://cuijiahua.com/blog/2018/03/spider-5.html 158 | 159 | * douyin_pro:抖音App视频下载(升级版) 160 | 161 | 抖音App的视频下载,添加视频解析网站,支持无水印视频下载,使用第三方平台解析。 162 | 163 | 原理说明: 164 | 165 | * 个人网站:http://cuijiahua.com/blog/2018/03/spider-5.html 166 | 167 | * douyin_pro_2:抖音App视频下载(升级版2) 168 | 169 | 抖音App的视频下载,添加视频解析网站,支持无水印视频下载,通过url解析,无需第三方平台。 170 | 171 | 原理说明: 172 | 173 | * 个人网站:http://cuijiahua.com/blog/2018/03/spider-5.html 174 | 175 | 动态示意图: 176 | 177 | ![image](https://github.com/Jack-Cherish/Pictures/blob/master/14.gif) 178 | 179 | * geetest.py:GEETEST验证码破解 180 | 181 | 爬虫最大的敌人之一是什么?没错,验证码!Geetest作为提供验证码服务的行家,市场占有率还是蛮高的。遇到Geetest提供的滑动验证码怎么破?授人予鱼不如授人予渔,接下来就为大家呈现本教程的精彩内容。 182 | 183 | 原理说明: 184 | 185 | * 个人网站:http://www.cuijiahua.com/blog/2017/11/spider_2_geetest.html 186 | 187 | 动态示意图: 188 | 189 | ![image](https://github.com/Jack-Cherish/Pictures/blob/master/spider_2_1.gif) 190 | 191 | * 12306.py:用Python抢火车票简单代码 192 | 193 | 可以自己慢慢丰富,蛮简单,有爬虫基础很好操作,没有原理说明。 194 | 195 | * baiwan:百万英雄辅助答题 196 | 197 | 效果图: 198 | 199 | ![image](https://github.com/Jack-Cherish/Pictures/blob/master/11.gif) 200 | 201 | 原理说明: 202 | 203 | * 个人网站:http://cuijiahua.com/blog/2018/01/spider_3.html 204 | 205 | 功能介绍: 206 | 207 | 服务器端,使用Python(baiwan.py)通过抓包获得的接口获取答题数据,解析之后通过百度知道搜索接口匹配答案,将最终匹配的结果写入文件(file.txt)。 208 | 209 | 手机抓包不会的朋友,可以看下我的早期[手机APP抓包教程](http://blog.csdn.net/c406495762/article/details/76850843 "悬停显示")。 210 | 211 | 
Node.js(app.js)每隔1s读取一次file.txt文件,并将读取结果通过socket.io推送给客户端(index.html)。 212 | 213 | 亲测答题延时在3s左右。 214 | 215 | 声明:没做过后端和前端,花了一天时间,现学现卖弄好的,javascript也是现看现用,百度的程序,调试调试而已。可能有很多用法比较low的地方,用法不对,请勿见怪,有大牛感兴趣,可以自行完善。 216 | 217 | * Netease:根据歌单下载网易云音乐 218 | 219 | 效果图: 220 | 221 | ![image](https://github.com/Jack-Cherish/Pictures/blob/master/13.gif) 222 | 223 | 原理说明: 224 | 225 | 暂无 226 | 227 | 功能介绍: 228 | 229 | 根据music_list.txt文件里的歌单的信息下载网易云音乐,将自己喜欢的音乐进行批量下载。 230 | 231 | * bilibili:B站视频和弹幕批量下载 232 | 233 | 原理说明: 234 | 235 | 暂无 236 | 237 | 使用说明: 238 | 239 | python bilibili.py -d 猫 -k 猫 -p 10 240 | 241 | 三个参数: 242 | -d 保存视频的文件夹名 243 | -k B站搜索的关键字 244 | -p 下载搜索结果前多少页 245 | 246 | * jingdong:京东商品晒单图下载 247 | 248 | 效果图: 249 | 250 | ![image](https://github.com/Jack-Cherish/Pictures/blob/master/jd.gif) 251 | 252 | 原理说明: 253 | 254 | 暂无 255 | 256 | 使用说明: 257 | 258 | python jd.py -k 芒果 259 | 260 | 三个参数: 261 | -d 保存图片的路径,默认为fd.py文件所在文件夹 262 | -k 搜索关键词 263 | -n 下载商品的晒单图个数,即n个商店的晒单图 264 | 265 | ## 其它 266 | 267 | * 欢迎 Pull requests,感谢贡献。 268 | -------------------------------------------------------------------------------- /baiduwenku.py: -------------------------------------------------------------------------------- 1 | # -*- coding:UTF-8 -*- 2 | from selenium import webdriver 3 | from bs4 import BeautifulSoup 4 | import re 5 | import time 6 | 7 | if __name__ == '__main__': 8 | 9 | options = webdriver.ChromeOptions() 10 | options.add_argument('user-agent="Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19"') 11 | driver = webdriver.Chrome('J:\迅雷下载\chromedriver.exe', chrome_options=options) 12 | driver.get('https://wenku.baidu.com/view/aa31a84bcf84b9d528ea7a2c.html') 13 | 14 | html = driver.page_source 15 | bf1 = BeautifulSoup(html, 'lxml') 16 | result = bf1.find_all(class_='rtcspage') 17 | bf2 = BeautifulSoup(str(result[0]), 'lxml') 18 | title = bf2.div.div.h1.string 19 | pagenum = bf2.find_all(class_='size') 20 | pagenum = BeautifulSoup(str(pagenum), 'lxml').span.string 21 | pagepattern = re.compile('页数:(\d+)页') 22 | num = int(pagepattern.findall(pagenum)[0]) 23 | print('文章标题:%s' % title) 24 | print('文章页数:%d' % num) 25 | 26 | 27 | while True: 28 | num = num / 5.0 29 | html = driver.page_source 30 | bf1 = BeautifulSoup(html, 'lxml') 31 | result = bf1.find_all(class_='rtcspage') 32 | for each_result in result: 33 | bf2 = BeautifulSoup(str(each_result), 'lxml') 34 | texts = bf2.find_all('p') 35 | for each_text in texts: 36 | main_body = BeautifulSoup(str(each_text), 'lxml') 37 | for each in main_body.find_all(True): 38 | if each.name == 'span': 39 | print(each.string.replace('\xa0',''),end='') 40 | elif each.name == 'br': 41 | print('') 42 | print('\n') 43 | if num > 1: 44 | page = driver.find_elements_by_xpath("//div[@class='page']") 45 | driver.execute_script('arguments[0].scrollIntoView();', page[-1]) #拖动到可见的元素去 46 | nextpage = driver.find_element_by_xpath("//a[@data-fun='next']") 47 | nextpage.click() 48 | time.sleep(3) 49 | else: 50 | break -------------------------------------------------------------------------------- /baiduwenku_pro_1.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import json 4 | import os 5 | 6 | session = requests.session() 7 | 8 | 9 | def fetch_url(url): 10 | return session.get(url).content.decode('gbk') 11 | 12 | 13 | def get_doc_id(url): 14 | return re.findall('view/(.*).html', url)[0] 15 | 16 | 17 | def parse_type(content): 
18 | return re.findall(r"docType.*?\:.*?\'(.*?)\'\,", content)[0] 19 | 20 | 21 | def parse_title(content): 22 | return re.findall(r"title.*?\:.*?\'(.*?)\'\,", content)[0] 23 | 24 | 25 | def parse_doc(content): 26 | result = '' 27 | url_list = re.findall('(https.*?0.json.*?)\\\\x22}', content) 28 | url_list = [addr.replace("\\\\\\/", "/") for addr in url_list] 29 | for url in url_list[:-5]: 30 | content = fetch_url(url) 31 | y = 0 32 | txtlists = re.findall('"c":"(.*?)".*?"y":(.*?),', content) 33 | for item in txtlists: 34 | if not y == item[1]: 35 | y = item[1] 36 | n = '\n' 37 | else: 38 | n = '' 39 | result += n 40 | result += item[0].encode('utf-8').decode('unicode_escape', 'ignore') 41 | return result 42 | 43 | 44 | def parse_txt(doc_id): 45 | content_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=' + doc_id 46 | content = fetch_url(content_url) 47 | md5 = re.findall('"md5sum":"(.*?)"', content)[0] 48 | pn = re.findall('"totalPageNum":"(.*?)"', content)[0] 49 | rsign = re.findall('"rsign":"(.*?)"', content)[0] 50 | content_url = 'https://wkretype.bdimg.com/retype/text/' + doc_id + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign 51 | content = json.loads(fetch_url(content_url)) 52 | result = '' 53 | for item in content: 54 | for i in item['parags']: 55 | result += i['c'].replace('\\r', '\r').replace('\\n', '\n') 56 | return result 57 | 58 | 59 | def parse_other(doc_id): 60 | content_url = "https://wenku.baidu.com/browse/getbcsurl?doc_id=" + doc_id + "&pn=1&rn=99999&type=ppt" 61 | content = fetch_url(content_url) 62 | url_list = re.findall('{"zoom":"(.*?)","page"', content) 63 | url_list = [item.replace("\\", '') for item in url_list] 64 | if not os.path.exists(doc_id): 65 | os.mkdir(doc_id) 66 | for index, url in enumerate(url_list): 67 | content = session.get(url).content 68 | path = os.path.join(doc_id, str(index) + '.jpg') 69 | with open(path, 'wb') as f: 70 | f.write(content) 71 | print("图片保存在" + doc_id + "文件夹") 72 | 73 | 74 | def save_file(filename, content): 75 | with open(filename, 'w', encoding='utf8') as f: 76 | f.write(content) 77 | print('已保存为:' + filename) 78 | 79 | 80 | # test_txt_url = 'https://wenku.baidu.com/view/cbb4af8b783e0912a3162a89.html?from=search' 81 | # test_ppt_url = 'https://wenku.baidu.com/view/2b7046e3f78a6529657d5376.html?from=search' 82 | # test_pdf_url = 'https://wenku.baidu.com/view/dd6e15c1227916888586d795.html?from=search' 83 | # test_xls_url = 'https://wenku.baidu.com/view/eb4a5bb7312b3169a551a481.html?from=search' 84 | def main(): 85 | url = input('请输入要下载的文库URL地址') 86 | content = fetch_url(url) 87 | doc_id = get_doc_id(url) 88 | type = parse_type(content) 89 | title = parse_title(content) 90 | if type == 'doc': 91 | result = parse_doc(content) 92 | save_file(title + '.txt', result) 93 | elif type == 'txt': 94 | result = parse_txt(doc_id) 95 | save_file(title + '.txt', result) 96 | else: 97 | parse_other(doc_id) 98 | 99 | 100 | if __name__ == "__main__": 101 | main() 102 | -------------------------------------------------------------------------------- /baiwan/app.js: -------------------------------------------------------------------------------- 1 | var http = require('http'); 2 | var fs = require('fs'); 3 | var schedule = require("node-schedule"); 4 | var message = {}; 5 | var count = 0; 6 | var server = http.createServer(function (req,res){ 7 | fs.readFile('./index.html',function(error,data){ 8 | res.writeHead(200,{'Content-Type':'text/html'}); 9 | res.end(data,'utf-8'); 10 | }); 11 | }).listen(80); 12 | 
console.log('Server running!'); 13 | var lineReader = require('line-reader'); 14 | function messageGet(){ 15 | lineReader.eachLine('file.txt', function(line, last) { 16 | count++; 17 | var name = 'line' + count; 18 | console.log(name); 19 | console.log(line); 20 | message[name] = line; 21 | }); 22 | if(count == 25){ 23 | count = 0; 24 | } 25 | else{ 26 | for(var i = count+1; i <= 25; i++){ 27 | var name = 'line' + i; 28 | message[name] = 'f'; 29 | } 30 | count = 0; 31 | } 32 | } 33 | var io = require('socket.io').listen(server); 34 | var rule = new schedule.RecurrenceRule(); 35 | var times = []; 36 | for(var i=1; i<1800; i++){ 37 | times.push(i); 38 | } 39 | rule.second = times; 40 | schedule.scheduleJob(rule, function(){ 41 | messageGet(); 42 | }); 43 | io.sockets.on('connection',function(socket){ 44 | // console.log('User connected' + count + 'user(s) present'); 45 | socket.emit('users',message); 46 | socket.broadcast.emit('users',message); 47 | 48 | socket.on('disconnect',function(){ 49 | console.log('User disconnected'); 50 | //socket.broadcast.emit('users',message); 51 | }); 52 | }); 53 | -------------------------------------------------------------------------------- /baiwan/baiwan.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import requests 3 | from lxml import etree 4 | from bs4 import BeautifulSoup 5 | import urllib 6 | import time, re, types, os 7 | 8 | 9 | """ 10 | 代码写的匆忙,本来想再重构下,完善好注释再发,但是比较忙,想想算了,所以自行完善吧!写法很不规范,勿见怪。 11 | 12 | 作者: Jack Cui 13 | Website:http://cuijiahua.com 14 | 注: 本软件仅用于学习交流,请勿用于任何商业用途! 15 | """ 16 | 17 | class BaiWan(): 18 | def __init__(self): 19 | # 百度知道搜索接口 20 | self.baidu = 'http://zhidao.baidu.com/search?' 21 | # 百万英雄及接口,每个人的接口都不一样,里面包含的手机信息,因此不公布,请自行抓包,有疑问欢迎留言:http://cuijiahua.com/liuyan.html 22 | self.api = 'https://api-spe-ttl.ixigua.com/xxxxxxx={}'.format(int(time.time()*1000)) 23 | 24 | # 获取答案并解析问题 25 | def get_question(self): 26 | to = True 27 | while to: 28 | list_dir = os.listdir('./') 29 | if 'question.txt' not in list_dir: 30 | fw = open('question.txt', 'w') 31 | fw.write('百万英雄尚未出题请稍后!') 32 | fw.close() 33 | go = True 34 | while go: 35 | req = requests.get(self.api, verify=False) 36 | req.encoding = 'utf-8' 37 | html = req.text 38 | 39 | print(html) 40 | if '*' in html: 41 | question_start = html.index('*') 42 | try: 43 | 44 | question_end = html.index('?') 45 | except: 46 | question_end = html.index('?') 47 | question = html[question_start:question_end][2:] 48 | if question != None: 49 | fr = open('question.txt', 'r') 50 | text = fr.readline() 51 | fr.close() 52 | if text != question: 53 | print(question) 54 | go = False 55 | with open('question.txt', 'w') as f: 56 | f.write(question) 57 | else: 58 | time.sleep(1) 59 | else: 60 | to = False 61 | else: 62 | to = False 63 | 64 | temp = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9\+\-\*/]', html[question_end+1:]) 65 | b_index = [] 66 | print(temp) 67 | 68 | for index, each in enumerate(temp): 69 | if each == 'B': 70 | b_index.append(index) 71 | elif each == 'P' and (len(temp) - index) <= 3 : 72 | b_index.append(index) 73 | break 74 | 75 | if len(b_index) == 4: 76 | a = ''.join(temp[b_index[0] + 1:b_index[1]]) 77 | b = ''.join(temp[b_index[1] + 1:b_index[2]]) 78 | c = ''.join(temp[b_index[2] + 1:b_index[3]]) 79 | alternative_answers = [a,b,c] 80 | 81 | if '下列' in question: 82 | question = a + ' ' + b + ' ' + c + ' ' + question.replace('下列', '') 83 | elif '以下' in question: 84 | question = a + ' ' + b + ' ' + c + ' ' + 
question.replace('以下', '') 85 | else: 86 | alternative_answers = [] 87 | # 根据问题和备选答案搜索答案 88 | self.search(question, alternative_answers) 89 | time.sleep(1) 90 | 91 | def search(self, question, alternative_answers): 92 | print(question) 93 | print(alternative_answers) 94 | infos = {"word":question} 95 | # 调用百度接口 96 | url = self.baidu + 'lm=0&rn=10&pn=0&fr=search&ie=gbk&' + urllib.parse.urlencode(infos, encoding='GB2312') 97 | print(url) 98 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36', 99 | } 100 | sess = requests.Session() 101 | req = sess.get(url = url, headers=headers, verify=False) 102 | req.encoding = 'gbk' 103 | # print(req.text) 104 | bf = BeautifulSoup(req.text, 'lxml') 105 | answers = bf.find_all('dd',class_='dd answer') 106 | for answer in answers: 107 | print(answer.text) 108 | 109 | # 推荐答案 110 | recommend = '' 111 | if alternative_answers != []: 112 | best = [] 113 | print('\n') 114 | for answer in answers: 115 | # print(answer.text) 116 | for each_answer in alternative_answers: 117 | if each_answer in answer.text: 118 | best.append(each_answer) 119 | print(each_answer,end=' ') 120 | # print(answer.text) 121 | print('\n') 122 | break 123 | statistics = {} 124 | for each in best: 125 | if each not in statistics.keys(): 126 | statistics[each] = 1 127 | else: 128 | statistics[each] += 1 129 | errors = ['没有', '不是', '不对', '不正确','错误','不包括','不包含','不在','错'] 130 | error_list = list(map(lambda x: x in question, errors)) 131 | print(error_list) 132 | if sum(error_list) >= 1: 133 | for each_answer in alternative_answers: 134 | if each_answer not in statistics.items(): 135 | recommend = each_answer 136 | print('推荐答案:', recommend) 137 | break 138 | elif statistics != {}: 139 | recommend = sorted(statistics.items(), key=lambda e:e[1], reverse=True)[0][0] 140 | print('推荐答案:', recommend) 141 | 142 | # 写入文件 143 | with open('file.txt', 'w') as f: 144 | f.write('问题:' + question) 145 | f.write('\n') 146 | f.write('*' * 50) 147 | f.write('\n') 148 | if alternative_answers != []: 149 | f.write('选项:') 150 | for i in range(len(alternative_answers)): 151 | f.write(alternative_answers[i]) 152 | f.write(' ') 153 | f.write('\n') 154 | f.write('*' * 50) 155 | f.write('\n') 156 | f.write('参考答案:\n') 157 | for answer in answers: 158 | f.write(answer.text) 159 | f.write('\n') 160 | f.write('*' * 50) 161 | f.write('\n') 162 | if recommend != '': 163 | f.write('最终答案请自行斟酌!\t') 164 | f.write('推荐答案:' + sorted(statistics.items(), key=lambda e:e[1], reverse=True)[0][0]) 165 | 166 | 167 | if __name__ == '__main__': 168 | bw = BaiWan() 169 | bw.get_question() -------------------------------------------------------------------------------- /baiwan/file.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sukersuker/python--spider/0e743c1503212cff9d2a800b6c5df344d81362b7/baiwan/file.txt -------------------------------------------------------------------------------- /baiwan/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Jack Cui答题辅助系统 7 | 8 | 9 |

百万英雄答题辅助系统(index.html 页面正文标题。原页面第 10~34 行的标签内容与第 38~216 行的页面脚本在抓取时已丢失,此处从略。)
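由于原 index.html 的页面标签与脚本在上面已经丢失,这里补充一个极简的客户端示意,帮助理解 README 中描述的推送流程:页面通过 socket.io 连接 app.js 启动的服务,监听 'users' 事件,把 line1~line25 中的有效行拼接后显示出来。以下写法仅是根据 app.js 的 emit 逻辑推测的示例草稿,元素 id(answer)与展示方式均为假设,并非原页面实现:

<div id="answer"></div>
<script src="/socket.io/socket.io.js"></script>
<script>
  // 连接 app.js 中 require('socket.io').listen(server) 启动的服务
  var socket = io.connect();
  // app.js 在连接建立后以 'users' 事件推送 {line1: ..., ..., line25: ...}
  socket.on('users', function (message) {
    var lines = [];
    for (var i = 1; i <= 25; i++) {
      var text = message['line' + i];
      // app.js 用 'f' 填充 file.txt 中不存在的行,这里跳过
      if (text && text !== 'f') {
        lines.push(text);
      }
    }
    document.getElementById('answer').innerText = lines.join('\n');
  });
</script>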
35 | 36 | 37 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /baiwan/question.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sukersuker/python--spider/0e743c1503212cff9d2a800b6c5df344d81362b7/baiwan/question.txt -------------------------------------------------------------------------------- /bilibili/README.md: -------------------------------------------------------------------------------- 1 | ## 功能 2 | 3 | 下载B站视频和弹幕,将xml原生弹幕转换为ass弹幕文件,支持plotplayer等播放器的弹幕播放。 4 | 5 | ## 作者 6 | 7 | * Website: [http://cuijiahua.com](http://cuijiahua.com "悬停显示") 8 | * Author: Jack Cui 9 | * Date: 2018.6.12 10 | 11 | ## 使用说明 12 | 13 | python bilibili.py -d 猫 -k 猫 -p 10 14 | 15 | 三个参数: 16 | -d 保存视频的文件夹名 17 | -k B站搜索的关键字 18 | -p 下载搜索结果前多少页 19 | -------------------------------------------------------------------------------- /bilibili/bilibili.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | # Website: http://cuijiahua.com 3 | # Author: Jack Cui 4 | # Date: 2018.6.9 5 | 6 | import requests, json, re, sys, os, urllib, argparse, time 7 | from urllib.request import urlretrieve 8 | from contextlib import closing 9 | from urllib import parse 10 | import xml2ass 11 | 12 | class BiliBili: 13 | def __init__(self, dirname, keyword): 14 | self.dn_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36', 15 | 'Accept': '*/*', 16 | 'Accept-Encoding': 'gzip, deflate, br', 17 | 'Accept-Language': 'zh-CN,zh;q=0.9', 18 | 'Referer': 'https://search.bilibili.com/all?keyword=%s' % parse.quote(keyword)} 19 | 20 | self.search_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36', 21 | 'Accept-Language': 'zh-CN,zh;q=0.9', 22 | 'Accept-Encoding': 'gzip, deflate, br', 23 | 'Accept': 'application/json, text/plain, */*'} 24 | 25 | self.video_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36', 26 | 'Accept-Language': 'zh-CN,zh;q=0.9', 27 | 'Accept-Encoding': 'gzip, deflate, br', 28 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'} 29 | 30 | self.danmu_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36', 31 | 'Accept': '*/*', 32 | 'Accept-Encoding': 'gzip, deflate, br', 33 | 'Accept-Language': 'zh-CN,zh;q=0.9'} 34 | 35 | self.sess = requests.Session() 36 | 37 | self.dir = dirname 38 | 39 | def video_downloader(self, video_url, video_name): 40 | """ 41 | 视频下载 42 | Parameters: 43 | video_url: 带水印的视频地址 44 | video_name: 视频名 45 | Returns: 46 | 无 47 | """ 48 | size = 0 49 | with closing(self.sess.get(video_url, headers=self.dn_headers, stream=True, verify=False)) as response: 50 | chunk_size = 1024 51 | content_size = int(response.headers['content-length']) 52 | if response.status_code == 200: 53 | sys.stdout.write(' [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024)) 54 | video_name = os.path.join(self.dir, video_name) 55 | with open(video_name, 'wb') as file: 56 | for data in response.iter_content(chunk_size = chunk_size): 57 | file.write(data) 58 | size += len(data) 59 | file.flush() 60 | 61 | sys.stdout.write(' [下载进度]:%.2f%%' % float(size / content_size * 100) + 
'\r') 62 | # sys.stdout.flush() 63 | if size / content_size == 1: 64 | print('\n') 65 | else: 66 | print('链接异常') 67 | 68 | def search_video(self, search_url): 69 | """ 70 | 搜索接口 71 | Parameters: 72 | search_url: 带水印的视频地址 73 | Returns: 74 | titles:视频名列表 75 | arcurls: 视频播放地址列表 76 | """ 77 | req = self.sess.get(url=search_url, headers=self.search_headers, verify=False) 78 | html = json.loads(req.text) 79 | videos = html["data"]['result'] 80 | titles = [] 81 | arcurls = [] 82 | for video in videos: 83 | titles.append(video['title'].replace('','').replace('','')) 84 | arcurls.append(video['arcurl']) 85 | return titles, arcurls 86 | 87 | def get_download_url(self, arcurl): 88 | """ 89 | 获取视频下载地址 90 | Parameters: 91 | arcurl: 视频播放地址 92 | oid:弹幕地址参数 93 | Returns: 94 | download_url:视频下载地址 95 | """ 96 | req = self.sess.get(url=arcurl, headers=self.video_headers, verify=False) 97 | pattern = '.__playinfo__=(.*)