├── README.md ├── ximalaya_search.py └── ximalaya_vip.py /README.md: -------------------------------------------------------------------------------- 1 | # XimalayaFM 2 | 3 | python爬取喜马拉雅音频 4 | 5 | ## TODO 6 | 7 | * 写一个UI界面 8 | * 提供多种爬取选项 9 | 10 | ## 2019-10-12 19:10 11 | 12 | [CSDN](https://blog.csdn.net/weixin_42050513/article/details/101224552)上有人评论说`xm-sign`规则改了,于是去看了看,发现实际只改了一个字母,整体流程任可以看下方正文 13 | 14 | 把`hashlib.md5("ximalaya-{}".format(servertime).encode()`中的`ximalaya`替换成`himalaya`即可 15 | 16 | 改动点如下图 17 | 18 |  19 | 20 |  21 | 22 | 改动不大,可能是发现了有人在大量爬取,先小地方改动,随后可能会有较大的规则改动,教程写出来的目的是学习、测试,切勿过度爬取! 23 | 24 | ## 2020-02-02 25 | 26 | 完成VIP音频下载功能 27 | 28 | 使用方法: 29 | 30 | * 首先你已经开通了喜马会员 31 | * 该音频属于会员或者付费可听 32 | * 运行程序,选择`VIP`选项,然后输入音频集的albumID,以及你的token,点击运行即可 33 | 34 | token在下图这里复制 35 | 36 |  37 | 38 | 就是`1&_token=xxx`,这一部分,不用加`;` 39 | 40 | 代码主要是通过`Scapy`实现的功能,对,`不是Scrapy`,Scapy具有模拟发送数据包、监听解析数据包、互联网协议解析、数据挖掘等多种用处 41 | 42 | 然后发现scapy-http这个模块,二者配合使用后,可以解析抓到的包的url等参数 43 | 44 | ### 安装工具 45 | ``` bash 46 | pip3 install scapy 47 | 48 | pip3 install scapy-http 49 | ``` 50 | 51 | 还要安装winpcap软件,为监控网卡提供接口,[下载地址](https://www.winpcap.org/install/default.htm) 52 | 53 | 注意替换代码中的iface:[iface 参数为你要监听的网卡的名称,参考这里](https://blog.csdn.net/luanpeng825485697/article/details/78379154) 54 | 55 | ## Stargazers over time 56 | 57 | [](https://starchart.cc/joelYing/XimalayaFM) 58 | -------------------------------------------------------------------------------- /ximalaya_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding:utf-8 -*- 3 | # author:joel 19-9-22 4 | 5 | import hashlib 6 | import json 7 | import os 8 | import re 9 | import time 10 | import random 11 | import requests 12 | 13 | """ 14 | 注意运行前请修改 make_dir() 中的下载路径!不要过度爬取,仅供测试学习! 15 | """ 16 | 17 | 18 | class XiMa(object): 19 | def __init__(self): 20 | self.base_url = 'https://www.ximalaya.com' 21 | self.base_api = 'https://www.ximalaya.com/revision/play/album?albumId={}&pageNum={}&sort=0&pageSize=30' 22 | self.time_api = 'https://www.ximalaya.com/revision/time' 23 | self.header = { 24 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0' 25 | } 26 | self.s = requests.session() 27 | 28 | def get_time(self): 29 | """ 30 | 获取服务器时间戳 31 | :return: 32 | """ 33 | r = self.s.get(self.time_api, headers=self.header) 34 | return r.text 35 | 36 | def get_sign(self): 37 | """ 38 | 获取sign: md5(ximalaya-服务器时间戳)(100以内随机数)服务器时间戳(100以内随机数)现在时间戳 39 | :return: xm_sign 40 | """ 41 | nowtime = str(round(time.time() * 1000)) 42 | servertime = self.get_time() 43 | sign = str(hashlib.md5("himalaya-{}".format(servertime).encode()).hexdigest()) + "({})".format( 44 | str(round(random.random() * 100))) + servertime + "({})".format(str(round(random.random() * 100))) + nowtime 45 | self.header["xm-sign"] = sign 46 | # print(sign) 47 | # return sign 48 | 49 | def index_choose(self): 50 | c_num = input(u'请输入对应操作的选项:\n' 51 | u'1、下载整部有声书\n' 52 | u'2、下载单个音源\n' 53 | u'3、返回\n') 54 | if c_num == '1': 55 | xm_id = input(u'请输入要获取喜马拉雅节目的ID:') 56 | xima.get_fm(xm_id) 57 | self.index_choose() 58 | elif c_num == '2': 59 | xm_id = input(u'请输入要获取的音源:') 60 | print(xm_id) 61 | self.index_choose() 62 | elif c_num == '3': 63 | print('结束') 64 | else: 65 | pass 66 | 67 | @staticmethod 68 | def make_dir(xm_fm_id): 69 | # 保存路径,请自行修改,这里是以有声书ID作为文件夹的路径 70 | fm_path = 'F:\\{}\\'.format(xm_fm_id) 71 | f = os.path.exists(fm_path) 72 | if not f: 73 | os.makedirs(fm_path) 74 | print('make file success...') 75 | else: 76 | print('file already exists...') 77 | return fm_path 78 | 79 | def get_fm(self, xm_fm_id): 80 | # 根据有声书ID构造url 81 | fm_url = self.base_url + '/youshengshu/{}'.format(xm_fm_id) 82 | print(fm_url) 83 | r_fm_url = self.s.get(fm_url, headers=self.header) 84 | fm_title = re.findall('
XiMaFM下载器
")) 121 | self.choose_label.setText(_translate("XiMaDownloader", "请选择需要下载的音频类型:
")) 122 | self.free_fm.setText(_translate("XiMaDownloader", "免费有声书")) 123 | self.vip_fm.setText(_translate("XiMaDownloader", "VIP/付费有声书")) 124 | self.single_fm.setText(_translate("XiMaDownloader", "单个音频")) 125 | self.id_label.setText(_translate("XiMaDownloader", "请输入需要下载的音频ID:
")) 126 | self.token_label.setText(_translate("XiMaDownloader", "请输入你的会员token:
")) 127 | self.d_type_box.setTitle(_translate("XiMaDownloader", "下载类型")) 128 | self.path_label.setText(_translate("XiMaDownloader", "请输入保存文件的路径:
")) 129 | self.d_config_box.setTitle(_translate("XiMaDownloader", "下载配置")) 130 | self.choose_file_button.setText(_translate("XiMaDownloader", "选择文件夹")) 131 | self.run_button.setText(_translate("XiMaDownloader", "运行")) 132 | self.cancel_button.setText(_translate("XiMaDownloader", "取消")) 133 | 134 | 135 | class XiMaControl(QMainWindow, Ui_XiMaDownloader): 136 | def __init__(self): 137 | super(XiMaControl, self).__init__() 138 | self.setupUi(self) 139 | self.choose_file_button.clicked.connect(self.open_folder) 140 | self.free_fm.clicked.connect(self.free_check_box) 141 | self.vip_fm.clicked.connect(self.vip_check_box) 142 | self.single_fm.clicked.connect(self.single_check_box) 143 | self.run_button.clicked.connect(self.run) 144 | self.info = 0 145 | self.ximamain = XiMaMain() 146 | 147 | def open_folder(self): 148 | # 选取文件 149 | foldername = QFileDialog.getExistingDirectory(self, "选择文件夹", "F:/") 150 | foldername = str(foldername).replace('/', '\\') 151 | # print(foldername) 152 | self.path_input_line.setText(foldername) 153 | 154 | def free_check_box(self): 155 | if self.free_fm.isChecked() and self.vip_fm.isChecked() is False and self.single_fm.isChecked() is False: 156 | self.info = 1 157 | 158 | def vip_check_box(self): 159 | if self.vip_fm.isChecked() and self.free_fm.isChecked() is False and self.single_fm.isChecked() is False: 160 | self.info = 2 161 | 162 | def single_check_box(self): 163 | if self.single_fm.isChecked() and self.vip_fm.isChecked() is False and self.free_fm.isChecked() is False: 164 | self.info = 3 165 | 166 | def run(self): 167 | try: 168 | xm_id = self.id_input_line.text() 169 | folder_path = self.path_input_line.text() 170 | token = self.token_input_line.text() 171 | # message = str(self.info) + xm_id + folder_path + token 172 | # print_text(message) 173 | if self.info == 1: 174 | self.ximamain.get_free_fm(xm_id, folder_path) 175 | elif self.info == 2: 176 | self.ximamain.get_pay_fm(xm_id, folder_path, token) 177 | elif self.info == 3: 178 | print_text(xm_id) 179 | else: 180 | pass 181 | except Exception as e: 182 | print(e) 183 | 184 | 185 | class XiMa: 186 | def __init__(self): 187 | self.base_url = 'https://www.ximalaya.com' 188 | # 有声书 189 | self.yss_api = 'https://www.ximalaya.com/youshengshu/{}/{}' 190 | # 需要带上sign访问的api,适用于免费的音频的播放源 191 | self.free_sign_api = 'https://www.ximalaya.com/revision/play/album?albumId={}&pageNum={}&sort=0&pageSize=30' 192 | # 获取单个免费音频api (trackId) 193 | self.free_track_api = 'http://mobile.ximalaya.com/mobile/redirect/free/play/{}/2' 194 | # 时间戳api 195 | self.time_api = 'https://www.ximalaya.com/revision/time' 196 | # 获取节目总音源个数与节目名 197 | self.album_api = 'https://www.ximalaya.com/revision/album?albumId={}' 198 | # 获取指定albumID的每一页音频的ID等track信息 199 | self.album_tracks_api = 'https://www.ximalaya.com/revision/album/v1/getTracksList?albumId={}&pageNum={}' 200 | # APP抓包得到,可用于获取付费节目总音源个数与节目名,获取音集所有音频ID,通过改变pageSize的大小,(albumId, pageSize) 201 | # 2020-02-29 最新测试pageSize最大为1000,所以针对章节大的有声书修改规则 202 | self.pay_size_api = 'http://180.153.255.6/mobile-album/album/page/ts-1569206246849?ac=WIFI&albumId={}' \ 203 | '&device=android&isAsc=true&isQueryInvitationBrand=true&isVideoAsc=true&pageId=1' \ 204 | '&pageSize={}' 205 | self.header = { 206 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0' 207 | } 208 | self.s = requests.session() 209 | 210 | def get_time(self): 211 | """ 212 | 获取服务器时间戳 213 | """ 214 | r = self.s.get(self.time_api, headers=self.header) 215 | r_time = r.text 216 | return r_time 217 | 218 | def get_sign(self): 219 | """ 220 | 获取sign: md5(ximalaya-服务器时间戳)(100以内随机数)服务器时间戳(100以内随机数)现在时间戳 221 | """ 222 | nowtime = str(round(time.time() * 1000)) 223 | servertime = self.get_time() 224 | sign = str(hashlib.md5("himalaya-{}".format(servertime).encode()).hexdigest()) + "({})".format( 225 | str(round(random.random() * 100))) + servertime + "({})".format(str(round(random.random() * 100))) + nowtime 226 | self.header["xm-sign"] = sign 227 | 228 | def make_dir(self, xm_fm_id, path): 229 | """ 230 | 保存路径,请自行修改,这里是以有声书ID作为文件夹的路径 231 | """ 232 | fm_path = path + '\\' + xm_fm_id 233 | if str(path).endswith('\\'): 234 | fm_path = path + xm_fm_id 235 | f = os.path.exists(fm_path) 236 | if not f: 237 | os.makedirs(fm_path) 238 | print_text('make file success...') 239 | else: 240 | print_text('file already exists...') 241 | return fm_path 242 | 243 | def get_fm(self, xm_fm_id, path): 244 | """ 245 | 根据albumID解析 免费 fm信息 246 | """ 247 | # 根据有声书ID构造url 248 | r_fm_url = self.s.get(self.album_api.format(xm_fm_id), headers=self.header) 249 | r_fm_json = json.loads(r_fm_url.text) 250 | fm_title = r_fm_json['data']['mainInfo']['albumTitle'] 251 | fm_count = r_fm_json['data']['tracksInfo']['trackTotalCount'] 252 | fm_page_size = r_fm_json['data']['tracksInfo']['pageSize'] 253 | print_text('书名:' + fm_title) 254 | # 新建有声书ID的文件夹 255 | fm_path = self.make_dir(xm_fm_id, path) 256 | # 取最大页数,向上取整 257 | max_page = math.ceil(fm_count/fm_page_size) 258 | return fm_count, fm_path, max_page 259 | 260 | def get_free_sign(self, xm_fm_id, page): 261 | """ 262 | 下载免费的音频的播放源信息 263 | :param xm_fm_id: 264 | :param page: 265 | :return: response 266 | """ 267 | self.get_sign() 268 | response = self.s.get(self.free_sign_api.format(xm_fm_id, page), headers=self.header) 269 | return response 270 | 271 | def get_pay_album(self, xm_fm_id, page_num): 272 | """ 273 | 获取付费的音频的播放源信息 274 | :param xm_fm_id: 275 | :param max_page: 276 | :return: response 277 | """ 278 | response = self.s.get(self.album_tracks_api.format(xm_fm_id, page_num), headers=self.header) 279 | return response 280 | 281 | def save_fm2local(self, title, src, path): 282 | """ 283 | 保存音频到本地 284 | :param title: 285 | :param src: 286 | :param path: 287 | """ 288 | r_audio_src = requests.get(src, headers=self.header) 289 | m4a_path = path + '\\' + title + '.m4a' 290 | if not os.path.exists(m4a_path): 291 | with open(m4a_path, 'wb') as f: 292 | f.write(r_audio_src.content) 293 | print_text(title + '保存完毕...') 294 | else: 295 | print_text(title + '.m4a 已存在') 296 | 297 | 298 | class XiMaMain: 299 | def __init__(self): 300 | self.xmd = XiMa() 301 | 302 | def get_free_fm(self, xm_fm_id, path): 303 | fm_count, fm_path, max_page = self.xmd.get_fm(xm_fm_id, path) 304 | if max_page: 305 | for page in range(1, int(max_page) + 1): 306 | print_text(str('第' + str(page) + '页')) 307 | r = self.xmd.get_free_sign(xm_fm_id, page) 308 | r_json = json.loads(r.text) 309 | for audio in r_json['data']['tracksAudioPlay']: 310 | audio_title = str(audio['trackName']).replace(' ', '') 311 | audio_src = audio['src'] 312 | self.xmd.save_fm2local(audio_title, audio_src, fm_path) 313 | # 每爬取1页,30个音频,休眠3秒 314 | time.sleep(3) 315 | else: 316 | print_text('no max_page') 317 | 318 | def get_pay_fm(self, xm_fm_id, path, token): 319 | fm_count, fm_path, max_page = self.xmd.get_fm(xm_fm_id, path) 320 | if max_page: 321 | # 这里应该是 fm_count 322 | for p in range(1, int(max_page) + 1): 323 | r = self.xmd.get_pay_album(xm_fm_id, p) 324 | r_json = json.loads(r.text) 325 | tracks = r_json['data']['tracks'] 326 | for i, track in enumerate(tracks): 327 | audio_id = track['trackId'] 328 | audio_title = str(track['title']).replace(' ', '') 329 | audio_url = self.xmd.base_url + track['url'] 330 | print_text(str(audio_title + '' + audio_url)) 331 | real_url = self.auto_click(audio_url, token) 332 | self.xmd.save_fm2local(audio_title, real_url, fm_path) 333 | # 每爬取1页,30个音频,休眠1~3秒 334 | time.sleep(random.randint(1, 3)) 335 | else: 336 | print_text('no max_page') 337 | 338 | def auto_click(self, url, token): 339 | """ 340 | 参数url为对应的VIP音频的播放页面,selenium访问页面后,带上cookie(1&_token)模拟登陆再次访问,前提你已经是会员 341 | 等待页面加载完成,通过selenium+Chromedriver的无头浏览器模拟点击音频播放按钮 342 | scapy开始抓点击后音频真实地址的数据包,退出browser,解析包 343 | 注意click与抓包的顺序,先点击再抓包 344 | """ 345 | chrome_options = Options() 346 | chrome_options.add_argument('--headless') 347 | chrome_options.add_argument('--disable-gpu') 348 | browser = webdriver.Chrome(chrome_options=chrome_options) 349 | browser.get(url) 350 | browser.add_cookie({ 351 | # 此处xxx.com前,需要带点,注意domain也是cookie必须的 352 | 'domain': '.ximalaya.com', 353 | 'name': '1&_token', 354 | 'value': token, 355 | }) 356 | browser.get(url) 357 | time.sleep(4) 358 | print_text('开始抓包') 359 | # selenium 点击播放按钮 360 | browser.find_element_by_css_selector(".play-btn.fR_").click() 361 | # 下面的iface是电脑网卡的名称 count是捕获报文的数目 362 | pkts = sniff(filter="tcp and port 80", iface="Qualcomm Atheros AR956x Wireless Network Adapter", count=5) 363 | browser.quit() 364 | for pkt in pkts: 365 | if TCP in pkt and pkt.haslayer(http.HTTPRequest): 366 | http_header = pkt[http.HTTPRequest].fields 367 | req_url = 'http://' + bytes.decode(http_header['Host']) + bytes.decode(http_header['Path']) 368 | return req_url 369 | 370 | 371 | def print_text(msg): 372 | control.output_text.append(msg) 373 | print(msg) 374 | QApplication.processEvents() 375 | 376 | 377 | if __name__ == "__main__": 378 | app = QApplication(sys.argv) 379 | control = XiMaControl() 380 | control.show() 381 | sys.exit(app.exec_()) 382 | 383 | --------------------------------------------------------------------------------