├── .gitignore ├── .idea ├── .gitignore ├── Python_Spider_All.iml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── README.md ├── 书旗小说 ├── shuqi_novel_search.py └── 书旗小说详情加密 ├── 优酷eid_uid_videoid作者ID转换 └── 转换.py ├── 哔哩哔哩下载 ├── bilibili_download.py ├── bilibili_download_base.py ├── bilibili_ocr.py ├── test_option.py ├── 人工核验验证码图片的场景.html ├── 使用说明 ├── 视频下架的场景.html ├── 记录哔哩哔哩问题.txt └── 需要人机识别的场景.html ├── 哔哩哔哩主页采集 └── bilibili_user_getall.py ├── 喜马拉雅FM ├── .idea │ ├── .gitignore │ ├── inspectionProfiles │ │ ├── Project_Default.xml │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ ├── vcs.xml │ └── 喜马拉雅FM.iml ├── 使用教程 └── 喜马拉雅spider.py ├── 微信公众号短链接转长链接 └── short_to_long.py ├── 百度翻译spider ├── .idea │ ├── .gitignore │ ├── inspectionProfiles │ │ ├── Project_Default.xml │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ ├── workspace.xml │ └── 百度翻译spider.iml ├── requirement.txt ├── sign加密参数破解.py ├── 使用教程 └── 百度翻译spider.py ├── 秒拍视频 ├── 使用教程 └── 秒拍spider.py ├── 网易云爬虫 ├── requirment.txt ├── 使用说明 ├── 加密解密代码.py └── 网易云spider.py ├── 虾米音乐spider ├── .idea │ ├── .gitignore │ ├── inspectionProfiles │ │ ├── Project_Default.xml │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ ├── vcs.xml │ └── 虾米音乐spider.iml ├── requirment.txt ├── xiami_audio_spider.py ├── xiami_test_secret_parms.py └── 使用说明 └── 起点中文网详情字体加密破解 ├── my_font_content.py ├── qidian_novel_info_spider.py ├── 字体文件解析.py └── 字体解密记录 /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | /喜马拉雅FM/.idea/ 3 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.idea/Python_Spider_All.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 15 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python_Spider_All 2 | 每完成一个项目存储一个 欢迎添加 3 | 
######################################################################## 4 | 注意!!! 5 | config 本地配置文件 使用时请省略,主要包含 代理和翻页参数 6 | ######################################################################## 7 | 8 | # 书旗小说 9 | 10 | # 优酷eid_uid_videoid作者ID转换 11 | 12 | # 哔哩哔哩视频下载 13 | 14 | # 哔哩哔哩主页采集 15 | 个人主页公开视频信息 16 | # 虾米音乐 17 | 18 | # 网易云 19 | 20 | # 喜马拉雅FM 21 | 2023 06 修复 22 | 23 | # 微信公众号短链接转长链接 24 | 简单的解析页面进行拼接 25 | 26 | # 百度翻译 27 | 28 | # 秒拍 29 | 老版本已经失效,可借鉴 30 | 31 | # 书旗小说 详情 -------------------------------------------------------------------------------- /书旗小说/shuqi_novel_search.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Chance favors the prepared mind. 3 | # author : pyl owo, 4 | # time : 2020/9/21 5 | import datetime 6 | import json 7 | import random 8 | import re 9 | import time 10 | from hashlib import md5 11 | 12 | from fake_useragent import UserAgent 13 | import requests 14 | # 获取代理 15 | def get_proxy(): 16 | pass 17 | 18 | # 统一请求函数 19 | def unify_requests(method="GET",url="",headers={},proxies={},data={},verify=False,cookies={}): 20 | if method=="GET": 21 | response = requests.get(url, headers=headers,proxies=proxies,data=data,cookies=cookies,timeout=5) 22 | return response 23 | else: 24 | response = requests.post(url, headers=headers,proxies=proxies,data=data,verify=verify,cookies=cookies,timeout=5) 25 | return response 26 | 27 | # 书旗小说 28 | class SFQingNovel: 29 | def __init__(self, use_proxy=True): 30 | self.proxy = get_proxy() if use_proxy else None 31 | """:cvar 32 | 有三点反爬, 33 | 1,禁用右键 34 | 2,sign 35 | 3,headers里的 authorization 属性 36 | """ 37 | 38 | ############################################################ 39 | # Md5 加密函数 32 返回32位的加密结果 40 | def md5_use(self, text: str) -> str: 41 | result = md5(bytes(text, encoding="utf-8")).hexdigest() 42 | # print(result) 43 | return result 44 | 45 | # 获取加密 sign timestamp 46 | def shuqi_jiami(self, book_id: str, time_stamp: str = str(int(time.time())), use_pwd='37e81a9d8f02596e1b895d07c171d5c9', 47 | user_id="8000000"): 48 | """ function i(t, n, e) { 49 | var o = Object.keys(t).filter(function(t) { 50 | return !Array.isArray(n) || -1 !== n.indexOf(t) 51 | }).sort().map(function(n) { 52 | return t[n] 53 | }).join("") + (e || n); 54 | return a()(o) 55 | }""" 56 | """""" 57 | # 改写规则就是简单的拼接 艹 58 | info = self.md5_use(book_id + time_stamp + user_id + use_pwd) 59 | # 打印 sign 时间戳 以及 书籍ID 60 | # print(info,book_id,time_stamp) 61 | return info 62 | 63 | # 获得 authorization 的值 (在请求里面 需要re) 64 | def shuqi_get_header_token(self, book_id: str): 65 | # response = requests.get("https://t.shuqi.com/cover/{}".format(book_id)) 66 | response = unify_requests(url="https://t.shuqi.com/cover/{}".format(book_id), proxies=self.proxy) 67 | # print(response.text) 68 | token = re.findall(r'"token":"(.*?)"', response.text) 69 | token = token[0] if token else "" 70 | if token: 71 | # print(token) 72 | return token 73 | else: 74 | return "" 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | #####################################3 以上👆 加密 83 | 84 | # 获得响应 85 | def get_response(self, novel_url, time_stamp: str = str(int(time.time())), user_id: str = "8000000", **kwargs): 86 | if kwargs.get('qin_quan_id_int'): 87 | bookId = str(kwargs.get('qin_quan_id_int')) 88 | elif novel_url: 89 | bookId = str(novel_url).split('?')[0].split('/')[-1] 90 | else: 91 | return {} 92 | # print(bookId) 93 | token = self.shuqi_get_header_token(bookId) 94 | if token: 95 | pass 96 | else: 97 | print("获取token authorization 失败") 98 | 
return False 99 | headers = { 100 | 'User-Agent': UserAgent().random, 101 | "Proxy-Tunnel": str(random.randint(1, 10000)), 102 | 'authority': 'ocean.shuqireader.com', 103 | 'accept': 'application/json, text/plain, */*', 104 | # 'authorization': 'Bearer eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiI4MDAwMDAwIiwidXRkaWQiOiIiLCJpbWVpIjoiIiwic24iOiIiLCJleHAiOjE2MDA4NDgyNTYsInVzZXJJZCI6IjgwMDAwMDAiLCJpYXQiOjE2MDA4MzAyNTYsIm9haWQiOiIiLCJwbGF0Zm9ybSI6IjAifQ.tjgtZMMoMWCoA7Z-z1M55d7MUEFy4GjruQoeoyAOnSWYy1glqk-YkEbOHfX6oSH_3T-bhF0NKz6-4If4gSKz1A', 105 | 'authorization': "Bearer " + self.shuqi_get_header_token(bookId), 106 | 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Mobile Safari/537.36', 107 | 'content-type': 'application/x-www-form-urlencoded', 108 | 'origin': 'https://t.shuqi.com', 109 | 'sec-fetch-site': 'cross-site', 110 | 'sec-fetch-mode': 'cors', 111 | 'sec-fetch-dest': 'empty', 112 | 'referer': 'https://t.shuqi.com/cover/7027302', 113 | 'accept-language': 'zh-CN,zh;q=0.9', 114 | } 115 | 116 | data = { 117 | 'user_id': '%s' % (user_id), 118 | 'bookId': '%s' % (bookId), 119 | 'timestamp': '%s' % (time_stamp), 120 | 'sign': '%s' % (self.shuqi_jiami(bookId, time_stamp, user_id=user_id)), 121 | 'platform': '0' 122 | } 123 | # print(headers,data) 124 | # response = requests.post('https://ocean.shuqireader.com/webapi/bcspub/openapi/book/info', headers=headers, 125 | # data=data) 126 | response = unify_requests(url="https://ocean.shuqireader.com/webapi/bcspub/openapi/book/info", method="POST", headers=headers, data=data, proxies=self.proxy) 127 | # print(json.loads(response.text)) 128 | return response 129 | 130 | # 获取小说所有详细信息 131 | def get_novel_info(self, novel_url, **kwargs): 132 | search_result = self.parse_novel_info(self.get_response(novel_url, **kwargs), novel_url, **kwargs) 133 | return search_result 134 | 135 | 136 | # 获取评论数 137 | def get_comment(self, novel_url, **kwargs): 138 | if kwargs.get('qin_quan_id_int'): 139 | bookId = str(kwargs.get('qin_quan_id_int')) 140 | elif novel_url: 141 | bookId = str(novel_url).split('?')[0].split('/')[-1] 142 | else: 143 | return {} 144 | token = self.shuqi_get_header_token(bookId) 145 | if token: 146 | pass 147 | else: 148 | print("获取token authorization 失败") 149 | return False 150 | headers = { 151 | 'User-Agent': UserAgent().random, 152 | "Proxy-Tunnel": str(random.randint(1, 10000)), 153 | 'authority': 'ocean.shuqireader.com', 154 | 'accept': 'application/json, text/plain, */*', 155 | 'authorization': "Bearer " + self.shuqi_get_header_token(bookId), 156 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36', 157 | 'origin': 'https://t.shuqi.com', 158 | 'sec-fetch-site': 'cross-site', 159 | 'sec-fetch-mode': 'cors', 160 | 'sec-fetch-dest': 'empty', 161 | 'referer': 'https://t.shuqi.com/', 162 | 'accept-language': 'zh-CN,zh;q=0.9', 163 | # 'Cookie': 'XSRF-TOKEN=1f1a10da-49bc-44eb-a39a-8fc19e44f8a0' 164 | } 165 | 166 | info_base_url = "https://ocean.shuqireader.com/webapi/comment/novel/i.php?do=sp_get&bookId={}&fetch=merge&sqUid=8000000&source=store&size=3&page=1&score=yes&authorId=8000000" # 129676 数字id 167 | 168 | if kwargs.get('qin_quan_id_int'): 169 | respose_info = unify_requests(url=info_base_url.format('kwargs.get("qin_quan_id_int")'), 170 | headers=headers, proxies=self.proxy) 171 | elif novel_url: 172 | # print(info_base_url.format((str(novel_url).split('?')[0].split('/')[-1]))) 173 | respose_info = 
unify_requests(url=info_base_url.format((str(novel_url).split('?')[0].split('/')[-1])), 174 | headers=headers, proxies=self.proxy) 175 | else: 176 | return {} 177 | return respose_info 178 | 179 | # 搜索视频响应解析 180 | def parse_novel_info(self, respose_info, novel_url='', **kwargs) -> dict: 181 | try: 182 | print(novel_url) 183 | response_dict = json.loads(respose_info.text).get('data', {}) 184 | comment_dict = json.loads(self.get_comment(novel_url, **kwargs).text) 185 | except Exception as e: 186 | print(e) 187 | return {} 188 | else: 189 | # info_book_dict = info_dict.get('book', {}) 190 | novel_dict = dict() 191 | # ''.join(response_data.xpath('')) 192 | # response_dict.get('', '') 193 | novel_dict['all_recommend_str'] = None # 总推荐数 str 194 | novel_dict['month_recommend_str'] = None # 月推荐数 str 195 | novel_dict['week_recommend_str'] = None # 周推荐数 str 196 | novel_dict['all_read_int'] = None # 总阅读数 int 197 | novel_dict['month_read_int'] = None # 月阅读数 int 198 | novel_dict['week_read_int'] = None # 周阅读数 int 199 | novel_dict['all_words_number_int'] = int(float(response_dict.get('wordCount', '')) * 10000) if response_dict.get('wordCount', '') else None # 总字数 200 | book_status = response_dict.get('state', '') 201 | if book_status == "1": 202 | book_status_str = "连载" 203 | elif book_status == "2": 204 | book_status_str = "完结" 205 | else: 206 | book_status_str = "暂无" 207 | novel_dict['book_status_str'] = book_status_str # 书籍状态 (连载,完结,暂无)bookCP 208 | novel_dict['book_property_str'] = None # 书籍属性 (免费,会员,限免) 209 | novel_dict['author_type_str'] = None # 作者类型 (金牌,签约,独立 默认无) 210 | novel_dict['book_lable_str'] = "|".join([i.get('tagName') for i in response_dict.get('tag', [])]) # 书籍标签 (用|分割的字符串 ''科幻|现实|励志'') 211 | novel_dict['book_type_str'] = None # 书籍分类 (玄幻 ,科幻,言情...)按搜索结果来多个按|分割 212 | novel_dict['book_update_time'] = datetime.datetime.strftime(datetime.datetime.fromtimestamp(response_dict.get('lastChapter', {}).get('updateTime')), "%Y-%m-%d") # 书籍更新日期 年-月-日 213 | novel_dict['book_zong_zhang_jie_int'] = None # 书籍总的章节 完结的,未完结就填目前的总章节 214 | novel_dict['book_zui_xin_zhang_jie_name_str'] = response_dict.get('lastChapter', {}).get('updateTime') # 最新章节名称 215 | novel_dict['book_introduce_text'] = response_dict.get('desc', '') # 书籍简介 text 216 | novel_dict['book_cover_image_str'] = response_dict.get('imgUrl', '') # 书籍封面 URL imgUrl 217 | novel_dict['book_detail_url_str'] = novel_url # 书籍详情URL 218 | novel_dict['book_detail_id_int'] = response_dict.get('bookId', '') # 书籍简介 text # 书籍详情ID 数字形式 bookId 219 | novel_dict['book_detail_id_str'] = str(response_dict.get('bookId', '')) # 书籍详情ID 字符形式 220 | novel_dict['book_zhan_dian_str'] = None # 书籍站点 (男生,女生,暂无) 221 | novel_dict['book_publish_str'] = '书旗小说' # 出版社 默认侵权平台' 222 | novel_dict['book_commeds_int'] = comment_dict.get('info', {}).get('total') # 书籍评论数 223 | novel_dict['author_grade_float'] = None # 作者评分 224 | novel_dict['author_id_str'] = str(response_dict.get('authorId', '')) # 作者ID 字符形式 ## 新增 authorId 225 | novel_dict['author_page_url_str'] = None # 作者主页链接 userId 226 | novel_dict['author_book_number_int'] = None # 作者书籍总数 227 | novel_dict['author_likes_int'] = None # 作者获赞总数 228 | novel_dict['author_all_words_number_str'] = None # 作者累计创作字数 229 | novel_dict['author_produce_days_str'] = None # 作者累计创作天数 230 | novel_dict['author_fens_number_int'] = None # 作者粉丝数 231 | novel_dict['author_head_image_url_str'] = response_dict.get('authorIcon', '') # 作者头像URL authorIcon 232 | return novel_dict 233 | 234 | 235 | # 统一的调用 search_novels 236 | search_novel_info = 
SFQingNovel(use_proxy=False).get_novel_info 237 | if __name__ == "__main__": 238 | result = search_novel_info('https://t.shuqi.com/cover/7329628') 239 | print(result) -------------------------------------------------------------------------------- /书旗小说/书旗小说详情加密: -------------------------------------------------------------------------------- 1 | 直接执行 没加代理, 2 | 加代理 3 | use_proxy = True (代理函数 自己添加) 4 | 5 | 具体能获得的值 已经列好列,主要还是# 上面的加密的地方,参数加密 执行 函数 shuqi_jiami 6 | data = { 7 | 'user_id': '%s' % (user_id), 8 | 'bookId': '%s' % (bookId), 9 | 'timestamp': '%s' % (time_stamp), 10 | 'sign': '%s' % (self.shuqi_jiami(bookId, time_stamp, user_id=user_id)), 11 | 'platform': '0' 12 | } 13 | 14 | 后记,真的烦 唯独没有测试md5 我以为不会那么简单的,淦!!! -------------------------------------------------------------------------------- /优酷eid_uid_videoid作者ID转换/转换.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 享受雷霆感受雨露 3 | # author xyy,time:2022/6/9 4 | #!/usr/bin/env python 5 | # -*- coding: utf-8 -*- 6 | 7 | import base64 8 | 9 | # 作者主页字符ID转数字ID 10 | def uid2eid(uid): 11 | """ 用户数字ID 加密为 字符串ID""" 12 | return encrypt(uid, type_str=1) 13 | 14 | # 视频字符ID转数字ID 15 | def uid2vid(uid): 16 | """ 视频数字ID 加密为 字符串ID """ 17 | return encrypt(uid, type_str=2) 18 | 19 | # 字符串转int 20 | def eid2uid(eid,type_str:int): 21 | """ 字符串ID 解密为 数字ID """ 22 | uid = "" 23 | if type_str==1: 24 | uid = int(str_to_int(eid[1:]))/4 25 | elif type_str==2: 26 | uid = int(str_to_int(eid[4:]))/4 27 | 28 | return str(int(uid)) 29 | 30 | # 字符串形式转ID的方法 bs64 31 | def str_to_int(eid:str): 32 | uid = base64.b64decode(eid).decode() 33 | return uid 34 | 35 | # 数字转换字符串形式的 36 | def encrypt(int_id:int, type_str:int): 37 | num = str(int(int_id)*4) 38 | eid = base64.b64encode(num.encode()).decode() 39 | if type_str==1: 40 | return "U"+eid 41 | elif type_str==2: 42 | return "id_X"+eid 43 | 44 | if __name__ == '__main__': 45 | # 1/作者主页 2/视频 46 | print(eid2uid("UODExNjMwNTc1Ng==",1)) 47 | # print(uid2eid("1596252942")) 48 | -------------------------------------------------------------------------------- /哔哩哔哩下载/bilibili_download.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 享受雷霆感受雨露 3 | # author xyy,time:2020/9/02 4 | import random 5 | from you_get.common import * 6 | import requests 7 | from Task_Compar_Config import Config_Of_Compar as config 8 | from Task_Compar_Config import proxies 9 | from fake_useragent import UserAgent 10 | from task_tool_unit import match1 11 | from tort_download_unit.bilibili_tort_download.bilibili_ocr import _get_toke_and_img 12 | from tort_download_unit.bilibili_tort_download.bilibili_download_base import download_urls # 给出下载链接 下载 13 | 14 | stream_types = [ 15 | {'id': 'flv_p60', 'quality': 116, 'audio_quality': 30280, 16 | 'container': 'FLV', 'video_resolution': '1080p', 'desc': '高清 1080P60'}, 17 | {'id': 'hdflv2', 'quality': 112, 'audio_quality': 30280, 18 | 'container': 'FLV', 'video_resolution': '1080p', 'desc': '高清 1080P+'}, 19 | {'id': 'flv', 'quality': 80, 'audio_quality': 30280, 20 | 'container': 'FLV', 'video_resolution': '1080p', 'desc': '高清 1080P'}, 21 | {'id': 'flv720_p60', 'quality': 74, 'audio_quality': 30280, 22 | 'container': 'FLV', 'video_resolution': '720p', 'desc': '高清 720P60'}, 23 | {'id': 'flv720', 'quality': 64, 'audio_quality': 30280, 24 | 'container': 'FLV', 'video_resolution': '720p', 'desc': '高清 720P'}, 25 | {'id': 'hdmp4', 'quality': 48, 'audio_quality': 30280, 26 | 'container': 'MP4', 
'video_resolution': '720p', 'desc': '高清 720P (MP4)'}, 27 | {'id': 'flv480', 'quality': 32, 'audio_quality': 30280, 28 | 'container': 'FLV', 'video_resolution': '480p', 'desc': '清晰 480P'}, 29 | {'id': 'flv360', 'quality': 16, 'audio_quality': 30216, 30 | 'container': 'FLV', 'video_resolution': '360p', 'desc': '流畅 360P'}, 31 | # 'quality': 15? 32 | {'id': 'mp4', 'quality': 0}, 33 | 34 | {'id': 'jpg', 'quality': 0}, 35 | ] 36 | dry_run = False 37 | json_output = False 38 | force = False 39 | skip_existing_file_size_check = False 40 | player = None 41 | extractor_proxy = None 42 | cookies = None 43 | output_filename = None 44 | auto_rename = False 45 | insecure = False 46 | import ssl 47 | import socket 48 | import logging 49 | from urllib import request, error 50 | 51 | def urlopen_with_retry(*args, **kwargs): 52 | retry_time = 3 53 | for i in range(retry_time): 54 | try: 55 | if insecure: 56 | # ignore ssl errors 57 | ctx = ssl.create_default_context() 58 | ctx.check_hostname = False 59 | ctx.verify_mode = ssl.CERT_NONE 60 | return request.urlopen(*args, context=ctx, **kwargs) 61 | else: 62 | return request.urlopen(*args, **kwargs) 63 | except socket.timeout as e: 64 | logging.debug('request attempt %s timeout' % str(i + 1)) 65 | if i + 1 == retry_time: 66 | raise e 67 | # try to tackle youku CDN fails 68 | except error.HTTPError as http_error: 69 | logging.debug('HTTP Error with code{}'.format(http_error.code)) 70 | if i + 1 == retry_time: 71 | raise http_error 72 | # 通过url下载哔哩哔哩文件 73 | def bilibili_download_urls(bili_url, title, ext='mp4',proxies=proxies,output_dir=config["system_path"]+"/"+config["tort_path"])->"bool": 74 | 75 | try: 76 | urls, size = bilibili_down_load(bili_url, proxy=proxies) 77 | if isinstance(urls,list): # 正常情况 78 | if urls: # ocr识别成功 79 | headers = bilibili_headers(referer=bili_url) 80 | download_urls(urls, title, ext, size, headers=headers, 81 | output_dir=output_dir, 82 | merge=True, 83 | av=True 84 | ) 85 | return True 86 | else: 87 | return False 88 | if isinstance(urls,int): # 错误的情况 89 | return urls # 错误情况返回错误代码 90 | 91 | except Exception as e: 92 | print(e) 93 | return False 94 | # 哔哩哔哩的头 headers 95 | def bilibili_headers(referer=None, cookie=None): 96 | # a reasonable UA 97 | # ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36' 98 | ua = '{}'.format(UserAgent().random) 99 | # print(ua) 100 | headers = {'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.5', 'User-Agent': ua} 101 | if referer is not None: 102 | headers.update({'Referer': referer}) 103 | if cookie is not None: 104 | headers.update({'Cookie': cookie}) 105 | return headers 106 | 107 | # 哔哩哔哩下载地址 108 | def bilibili_down_load(bili_url,proxy=proxies): 109 | # 仿照 you-get 110 | stream_qualities = {s['quality']: s for s in stream_types} 111 | headers = { 112 | "Proxy-Tunnel": str(random.randint(1, 10000)), 113 | 'authority': 'www.bilibili.com', 114 | 'cache-control': 'max-age=0', 115 | 'upgrade-insecure-requests': '1', 116 | # 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36', 117 | "user-agent": UserAgent().random, 118 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 119 | 'sec-fetch-site': 'same-site', 120 | 'sec-fetch-mode': 'navigate', 121 | 'sec-fetch-user': '?1', 122 | 'sec-fetch-dest': 'document', 123 | 'referer': 
'https://search.bilibili.com/all?keyword=beatbox&from_source=nav_search&spm_id_from=333.851.b_696e7465726e6174696f6e616c486561646572.9&order=totalrank&duration=0&tids_1=3&tids_2=193', 124 | 'accept-language': 'zh-CN,zh;q=0.9', 125 | # '$cookie': 'CURRENT_FNVAL=16; _uuid=39019883-BF03-8583-5980-65F1AB32A8B437048infoc; buvid3=A1AF6CF2-8DE1-41D4-82FA-331AAF700F4953938infoc; rpdid=|(u)~lJ|l|lJ0J\'ul))Y)m)uu; LIVE_BUVID=AUTO4115905671332477; sid=lubz9xqt; DedeUserID=101681207; DedeUserID__ckMd5=dfc9ce597d1ee703; SESSDATA=4e33cf62%2C1609722422%2Ca87cf*71; bili_jct=96dcdd930c28d4d499acbf1c31b4ebb7; Hm_lvt_8a6e55dbd2870f0f5bc9194cddf32a02=1594256989; PVID=1; bsource=search_baidu; finger=351232418; blackside_state=1', 126 | } 127 | 128 | if proxy: 129 | response = requests.get(bili_url, headers=headers) 130 | 131 | else: 132 | response = requests.get(bili_url, headers=headers,proxies=proxy) 133 | 134 | # print(response.text) 135 | playinfo_text_ = match1(response.text, r'__playinfo__=(.*?) 视频去哪了呢?_哔哩哔哩 (゜-゜)つロ 干杯~-bilibili
啊叻?视频不见了?
返回首页
(the rest of this saved sample page's markup did not survive extraction)
--------------------------------------------------------------------------------
/哔哩哔哩下载/记录哔哩哔哩问题.txt:
--------------------------------------------------------------------------------
1 | 1. Problem 1: bilibili's human-verification (captcha) endpoint
2 | https://sec.biliapi.net/th/captcha/get
3 | Its response contains a base64 image and a token; request again carrying both values and the check passes.
4 | Image recognition is done through the Baidu OCR text-recognition API.
5 | 2. It is not always the captcha endpoint; the video may also have been taken down.
6 | The human-verification step signals its outcome with a return number:
7 | -888 means taken down (currently detected only by checking whether the page contains a few telltale strings)
8 | captcha pages are detected via the English marker "bilibili security" (the page's Chinese text has bad encoding)
9 | take-down pages are detected via the string "视频无法播放"
10 | Other kinds of unplayable pages may appear later.
11 | 3. Return-value convention for the captcha-recognition step:
12 | a step that simply succeeds returns 0;
13 | a download returns True, which marks the task as downloaded;
14 | any intermediate step that completes returns int 0 and triggers no further action (e.g. after the captcha check passes).
--------------------------------------------------------------------------------
/哔哩哔哩下载/需要人机识别的场景.html:
--------------------------------------------------------------------------------
(saved sample of bilibili's "error 412" risk-control page; only the visible text survived extraction)
出错啦! - bilibili.com
错误号:412
由于触发哔哩哔哩安全风控策略,该次访问请求被拒绝。
The request was rejected because of the bilibili security control policy.
155 | 156 | 279 | 280 | 281 | 282 | 283 | 284 | -------------------------------------------------------------------------------- /哔哩哔哩主页采集/bilibili_user_getall.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 享受雷霆感受雨露 3 | # author xyy,time:2023/5/22 4 | 5 | import datetime 6 | import time 7 | import requests 8 | 9 | from hashlib import md5 10 | from urllib.parse import quote 11 | from retrying import retry 12 | 13 | # 代理 14 | def get_proxy(): 15 | return {} 16 | 17 | # Md5 加密函数 32 返回32位的加密结果 18 | def md5_use(text: str) -> str: 19 | result = md5(bytes(text, encoding="utf-8")).hexdigest() 20 | return result 21 | 22 | # 通过时间字符形式 返回时长格式 23 | def unify_duration_format(duar_str_or_s: str): 24 | """ 25 | 01:11 -> 71,'00:01:11' 26 | 00:01:11 -> 71,'00:01:11' 27 | :param duar_str: '01:11' or '00:01:11' 28 | :return: 71, '00:01:11' 29 | """ 30 | error = 0, '' 31 | 32 | def hms(m: int, s: int, h=0): 33 | if s >= 60: 34 | m += int(s / 60) 35 | s = s % 60 # 36 | if m >= 60: 37 | h += int(m / 60) 38 | m = m % 60 39 | return h * 60 * 60 + m * 60 + s, str(h).zfill(2) + ':' + str(m).zfill(2) + ':' + str(s).zfill(2) 40 | 41 | try: 42 | s = int(duar_str_or_s) 43 | except: 44 | pass 45 | else: 46 | return hms(m=s % 3600 // 60, s=s % 60, h=s // 3600) 47 | try: 48 | if duar_str_or_s: 49 | duar_list = duar_str_or_s.split(':') 50 | if len(duar_list) == 2: 51 | return hms(m=int(duar_list[0]), s=int(duar_list[1])) 52 | elif len(duar_list) == 3: 53 | return hms(m=int(duar_list[1]), s=int(duar_list[2]), h=int(duar_list[0])) 54 | else: 55 | return error 56 | else: 57 | return error 58 | except Exception as e: 59 | return error 60 | 61 | # 哔哩哔哩加密 62 | def bilibili_jiami(keyword,mid,pn): 63 | wts = int(time.time()) 64 | key = "keyword={keyword}&mid={mid}&order=pubdate&order_avoided=true&platform=web&pn={pn}&ps=30&tid=0&web_location=1550101&wts={wts}".format(keyword=keyword, 65 | mid=mid, pn=pn, 66 | wts=wts ) 67 | # salt = "72136226c6a73669787ee4fd02a74c27" # 老版本的盐 68 | salt = "5a73a9f6609390773b53586cce514c2e" # 2023 0609 新 69 | w_rid = md5_use(key+salt) 70 | return w_rid,wts 71 | 72 | # 解析ifno 73 | def analysis_parms(info_json): 74 | lis = info_json.get("data",{}).get("list",{}).get("vlist",[]) 75 | now_count = int(info_json.get("data",{}).get("page",{}).get("pn"))*int(info_json.get("data",{}).get("page",{}).get("ps")) 76 | all_count = int(info_json.get("data",{}).get("page",{}).get("count")) 77 | has_more = True if now_count<=all_count else False 78 | lis_dic_ifno = [] 79 | for each in lis: 80 | dic_info = dict() 81 | dic_info["play_num"] = each.get("play","") 82 | dic_info["like_num"] = each.get("photo","") 83 | dic_info["vid"] = each.get("aid","") 84 | dic_info["comment_num"] = each.get("comment","") 85 | dic_info["url"] = "https://www.bilibili.com/video/{}".format(each.get("bvid","")) 86 | dic_info["title"] = each.get("title","") 87 | duration, duration_str = unify_duration_format(each.get("length","")) 88 | dic_info["duration"] = duration_str 89 | dic_info["cover"] = each.get("pic","") 90 | dic_info["uid"] = each.get("mid","") 91 | dic_info["author_name"] = each.get("author","") 92 | dic_info["author_url"] = "https://space.bilibili.com/{}".format(each.get("mid","")) 93 | dic_info["pubtime"] = each.get("created","") 94 | if dic_info["pubtime"]: 95 | dic_info["pubtime"] = datetime.datetime.fromtimestamp(int(str(dic_info["pubtime"])[:10])).strftime("%Y-%m-%d %H:%M:%S") 96 | dic_info["photoUrl"] = each.get("pic","") # 这个是默认的播放的地址 完整版的 97 | 
lis_dic_ifno.append(dic_info) 98 | return lis_dic_ifno,has_more 99 | # 通过链接获取对应的信息 100 | @retry(stop_max_attempt_number=9, wait_fixed=20) 101 | def get_parms(userId="",pcursor=1,keyword=""): 102 | 103 | headers = { 104 | 'authority': 'api.bilibili.com', 105 | 'cache-control': 'max-age=0', 106 | 'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"', 107 | 'sec-ch-ua-mobile': '?0', 108 | 'upgrade-insecure-requests': '1', 109 | 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36', 110 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 111 | 'sec-fetch-site': 'none', 112 | 'sec-fetch-mode': 'navigate', 113 | 'sec-fetch-user': '?1', 114 | 'sec-fetch-dest': 'document', 115 | 'accept-language': 'zh-CN,zh;q=0.9', 116 | } 117 | 118 | w_rid,wts = bilibili_jiami(quote(keyword),str(userId),str(pcursor)) 119 | params = { 120 | 'mid': str(userId), 121 | 'ps': '30', 122 | 'tid': '0', 123 | 'pn': str(pcursor), 124 | 'keyword': keyword, 125 | 'order': 'pubdate', 126 | 'platform': 'web', 127 | 'web_location': '1550101', 128 | 'order_avoided': 'true', 129 | 'w_rid': w_rid, 130 | 'wts': wts, 131 | } 132 | cookies = { 133 | 'bsource': 'search_baidu', 134 | 'innersign': '1', 135 | 136 | 137 | } 138 | 139 | response = requests.get('https://api.bilibili.com/x/space/wbi/arc/search', headers=headers,cookies=cookies, params=params,proxies=get_proxy(),timeout=10) 140 | 141 | 142 | return response.json() 143 | 144 | 145 | # 主要的执行的函数 146 | def run(userId="",pcursor=1,max_list_page=1,last_list=None,keyword=""): 147 | """ 148 | userId 用户ID 149 | pcursor 起始页 150 | max_list_page 截止页 151 | keyword 搜索关键词 默认空 152 | """ 153 | # last_list = [] 154 | if last_list is None: 155 | last_list = [] 156 | try: 157 | ever_page_info = get_parms(userId=userId,pcursor=pcursor,keyword=keyword) 158 | lis_dic_ifno,has_more = analysis_parms(ever_page_info) 159 | last_list.extend(lis_dic_ifno) 160 | # print(pcursor,has_more) 161 | if pcursor 2 | 3 | 15 | -------------------------------------------------------------------------------- /喜马拉雅FM/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /喜马拉雅FM/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /喜马拉雅FM/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /喜马拉雅FM/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /喜马拉雅FM/.idea/喜马拉雅FM.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /喜马拉雅FM/使用教程: -------------------------------------------------------------------------------- 1 | 调用 喜马拉雅spider 2 | search_songs 传值 id -------------------------------------------------------------------------------- 
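A minimal usage sketch for the tutorial above (a hypothetical caller script, assuming it sits next to 喜马拉雅spider.py below; search_songs is the module-level shortcut exported at the bottom of that file, and page_num is an optional keyword that overrides the config-driven page range):

# Hypothetical caller for the Ximalaya spider below.
from 喜马拉雅spider import search_songs

# An empty proxy dict sends requests directly; page_num selects a
# single results page instead of the config start/end range.
tracks = search_songs(song_name="道德经", proxy={}, page_num=1)
for track in tracks:
    print(track["audio2_songName"], track["audio2_duration_strsec"], track["audio2_url"])
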
/喜马拉雅FM/喜马拉雅spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 享受雷霆感受雨露 3 | # author xyy,time:2022/7/10 4 | 5 | import datetime 6 | import time 7 | import hashlib 8 | import random 9 | import json 10 | import requests, pprint 11 | 12 | from urllib.parse import quote 13 | from hashlib import md5 14 | 15 | 16 | config = { 17 | "xi_ma_la_ya": {"start": 1, "end": 2, "pagesize": 30, "start_page": 1}, 18 | } 19 | 20 | # 关于 User_Agent_Pc 21 | class UserAgent_Base(): 22 | random = "" 23 | 24 | # Md5 加密函数 32 返回32位的加密结果 25 | def md5_use(text: str) -> str: 26 | result = md5(bytes(text, encoding="utf-8")).hexdigest() 27 | return result 28 | 29 | # 通过时间获得一个固定格式的 时长格式 30 | def get_duration_str(seconds: float, like: str = "%02d:%02d:%02d"): 31 | """ 32 | 71 -> 01:11 33 | """ 34 | m, s = divmod(float(seconds), 60) 35 | h, m = divmod(m, 60) 36 | # print(like % (h, m, s)) 37 | return like % (h, m, s) 38 | 39 | 40 | # 爬取喜马拉雅的音乐的类 41 | class XiMaLaYa(object): 42 | 43 | def __init__(self): 44 | self.headers = { 45 | 'authority': 'www.ximalaya.com', 46 | "user-agent": UserAgent_Base().random, 47 | "Proxy-Tunnel": str(random.randint(1, 10000)), 48 | 'content-type': 'application/x-www-form-urlencoded;charset=UTF-8', 49 | 'accept': '*/*', 50 | 'sec-fetch-site': 'same-origin', 51 | 'sec-fetch-mode': 'cors', 52 | 'sec-fetch-dest': 'empty', 53 | 'referer': 'https://www.ximalaya.com/search/sound/%E6%88%91%E4%BB%AC%E4%B8%8D%E4%B8%80%E6%A0%B7/p1', 54 | 'accept-language': 'zh-CN,zh;q=0.9', 55 | 'cookie': '' 56 | } 57 | 58 | def getServerTime(self): 59 | """ 60 | 获取喜马拉雅服务器的时间戳 61 | :return: 62 | """ 63 | # 这个地址就是返回服务器时间戳的接口 64 | serverTimeUrl = "https://www.ximalaya.com/revision/time" 65 | response = requests.get(serverTimeUrl,headers = self.headers) 66 | return response.text 67 | 68 | def getSign(self,serverTime): 69 | """ 70 | 生成 xm-sign 71 | 规则是 md5(ximalaya-服务器时间戳)(100以内随机数)服务器时间戳(100以内随机数)现在时间戳 72 | :param serverTime: 73 | :return: 74 | """ 75 | nowTime = str(round(time.time()*1000)) 76 | 77 | sign = str(hashlib.md5("himalaya-{}".format(serverTime).encode()).hexdigest()) + "({})".format(str(round(random.random()*100))) + serverTime + "({})".format(str(round(random.random()*100))) + nowTime 78 | # 将xm-sign添加到请求头中 79 | self.headers["xm-sign"] = sign 80 | # return sign 81 | 82 | # 统一请求响应函数 83 | def unify_requests(self,method="GET",url="",headers={},proxies={},data={}): 84 | if method=="GET": 85 | response = requests.get(url, headers=headers,proxies=proxies,data=data,timeout=5) 86 | return response 87 | 88 | 89 | # 解析搜索的结果的函数 90 | def parms_search_songs(self,result): 91 | result = result.text 92 | info_dic = json.loads(result) 93 | result_list = [] 94 | # 95 | if "data" in info_dic and "track" in info_dic["data"] and "docs" in info_dic["data"]["track"] and info_dic["data"]["track"]["docs"]: 96 | for each in info_dic["data"]["track"]["docs"]: 97 | if int(each["duration"])<350: 98 | dic_ = {} 99 | dic_["audio2_albumName"] = each["albumTitle"] 100 | dic_["audio2_artistName"] = each["nickname"] 101 | dic_["audio2_songName"] = each["title"] 102 | dic_["audio2_songId"] = each["id"] 103 | dic_["audio2_songtime"] = datetime.datetime.fromtimestamp(int(each["createdAt"]/1000)).strftime("%Y-%m-%d %H:%M:%S") # 时间 104 | dic_["audio2_platform"] = "喜马拉雅" 105 | dic_["audio2_duration_intsec"] = int(each["duration"]) # 音乐时长 2021 02 25 新加功能 秒 106 | dic_["audio2_duration_strsec"] = get_duration_str(seconds=each["duration"]) # 音乐时长 2021 02 25 新加功能 格式化 107 | 
dic_["audio2_albumid"] = each["albumId"] 108 | dic_["audio2_url"] = "https://www.ximalaya.com{trackUrl}".format(trackUrl=each["trackUrl"]) 109 | dic_["audio2_url_hash"] = md5_use(text=dic_["audio2_url"]) 110 | result_list.append(dic_) 111 | return result_list 112 | 113 | # 查找歌曲 114 | def search_songs(self, song_name='在希望的田野上', proxy={}, num=0,**kwargs): 115 | self.headers["referer"]="https://www.ximalaya.com/search/sound/{}/p1".format(quote(song_name)) 116 | result_list = [] 117 | _start = config["xi_ma_la_ya"]["start"] 118 | _end = config["xi_ma_la_ya"]["end"] 119 | 120 | if kwargs.get("page_num"): 121 | if config["xi_ma_la_ya"]["start_page"]==0: 122 | _start = kwargs.get("page_num")-1 123 | _end = kwargs.get("page_num") 124 | elif config["xi_ma_la_ya"]["start_page"]==1: 125 | _start = kwargs.get("page_num") 126 | _end = kwargs.get("page_num") + 1 127 | 128 | for page in range(_start,_end): 129 | url = "https://www.ximalaya.com/revision/search/main?kw={song_name}&page={page}&spellchecker=true&condition=relation&rows=20&device=iPhone&core=track&fq=&paidFilter=false".format(song_name=quote(song_name),page=page) 130 | 131 | if proxy: 132 | self.getSign(self.getServerTime()) 133 | 134 | result = self.unify_requests(url=url, headers=self.headers,proxies=proxy) 135 | else: 136 | self.getSign(self.getServerTime()) 137 | 138 | result = self.unify_requests(url=url,headers=self.headers) 139 | for each in self.parms_search_songs(result): 140 | result_list.append(each) 141 | return result_list 142 | def get_single(self): 143 | pass 144 | 145 | search_songs = XiMaLaYa().search_songs 146 | if __name__ == '__main__': 147 | 148 | proxies = { 149 | 150 | } 151 | each = { 152 | "page_num": 1, 153 | "search_key_words": "道德经", 154 | 155 | } 156 | 157 | # print(wy.search_songs(song_name="丑八怪",proxy=proxies)) 158 | info = search_songs(song_name=each["search_key_words"],proxy=proxies, **each) 159 | print(len(info)) 160 | print(info) 161 | 162 | -------------------------------------------------------------------------------- /微信公众号短链接转长链接/short_to_long.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 享受雷霆感受雨露 3 | # author xujian,time:2023/3/21 4 | 5 | import requests 6 | import re 7 | 8 | from retrying import retry 9 | 10 | # 获取代理 11 | def get_proxy(): 12 | return {} 13 | 14 | @retry(stop_max_attempt_number=6, wait_fixed=1000) 15 | def short_to_long_get_res(url): 16 | 17 | cookies = { 18 | } 19 | 20 | headers = { 21 | 'authority': 'mp.weixin.qq.com', 22 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 23 | 'accept-language': 'zh-CN,zh;q=0.9', 24 | 'cache-control': 'no-cache', 25 | 'pragma': 'no-cache', 26 | 'sec-ch-ua': '"Google Chrome";v="113", "Chromium";v="113", "Not-A.Brand";v="24"', 27 | 'sec-ch-ua-mobile': '?0', 28 | 'sec-ch-ua-platform': '"macOS"', 29 | 'sec-fetch-dest': 'document', 30 | 'sec-fetch-mode': 'navigate', 31 | 'sec-fetch-site': 'none', 32 | 'sec-fetch-user': '?1', 33 | 'upgrade-insecure-requests': '1', 34 | 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36', 35 | } 36 | 37 | response = requests.get(url, cookies=cookies, headers=headers, 38 | proxies=get_proxy(), timeout=5) 39 | return response 40 | 41 | 42 | # 获得响应之后进行解析 43 | def parse_html_res(res): 44 | long_url_index = "".join(re.findall(r'_g\.msg_link =(.*);',res.text)) 45 | biz = 
"".join(re.findall(r'__biz=(.*?)&',long_url_index)) 46 | mid = "".join(re.findall(r'mid=(.*?)&',long_url_index)) 47 | idx = "".join(re.findall(r'idx=(.*?)&',long_url_index)) 48 | sn = "".join(re.findall(r'sn=(.*?)&',long_url_index)) 49 | long_url = "https://mp.weixin.qq.com/s?__biz={biz}&mid={mid}&idx={idx}&sn={sn}".format(biz=biz,mid=mid,idx=idx,sn=sn) 50 | return long_url 51 | 52 | # 主调度函数 53 | def run(url): 54 | res = short_to_long_get_res(url) 55 | long_url = parse_html_res(res) 56 | dic_ = {} 57 | dic_["short_url"] = url 58 | dic_["long_url"] = long_url 59 | dic_["platform_name"] = "微信" 60 | if long_url: 61 | return dic_ 62 | else: 63 | return {} 64 | 65 | 66 | get_long_url = run 67 | if __name__ == '__main__': 68 | url = "https://mp.weixin.qq.com/s/i2pcIC3zXrgsTXH79OB_kg" 69 | 70 | info = run(url) 71 | print(info) 72 | -------------------------------------------------------------------------------- /百度翻译spider/.idea/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xuenew/Python_Spider_All/221e20bf84dfe067763adc964c51b40e47f38eed/百度翻译spider/.idea/.gitignore -------------------------------------------------------------------------------- /百度翻译spider/.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 15 | -------------------------------------------------------------------------------- /百度翻译spider/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /百度翻译spider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /百度翻译spider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /百度翻译spider/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 10 | 11 | 16 | 17 | 18 | 19 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 1593160631146 63 | 68 | 69 | 70 | 71 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /百度翻译spider/.idea/百度翻译spider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /百度翻译spider/requirement.txt: -------------------------------------------------------------------------------- 1 | certifi==2020.6.20 2 | chardet==3.0.4 3 | idna==2.9 4 | PyExecJS==1.5.1 5 | requests==2.24.0 6 | six==1.15.0 7 | urllib3==1.25.9 8 | -------------------------------------------------------------------------------- /百度翻译spider/sign加密参数破解.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 享受雷霆感受雨露 3 | # author xyy,time:2020/5/29 4 | 5 | 6 | import execjs 7 | 8 | # 先编译、后调用 9 | # # 将js文件中的内容读取出来编译即可调用里面的方法了 10 | js_compile = execjs.compile( 11 | r""" 12 | function a(r) { 13 | if 
(Array.isArray(r)) { 14 | for (var o = 0, t = Array(r.length); o < r.length; o++) 15 | t[o] = r[o]; 16 | return t 17 | } 18 | return Array.from(r) 19 | } 20 | function n(r, o) { 21 | for (var t = 0; t < o.length - 2; t += 3) { 22 | var a = o.charAt(t + 2); 23 | a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a), 24 | a = "+" === o.charAt(t + 1) ? r >>> a : r << a, 25 | r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a 26 | } 27 | return r 28 | } 29 | var xx = function e(r) { 30 | var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g); 31 | if (null === o) { 32 | var t = r.length; 33 | t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10)) 34 | } else { 35 | for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++) 36 | "" !== e[C] && f.push.apply(f, a(e[C].split(""))), 37 | C !== h - 1 && f.push(o[C]); 38 | var g = f.length; 39 | g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join("")) 40 | } 41 | var u = void 0 42 | , l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107); 43 | u = null !== i ? i : (i = "320305.131321201" || "") || ""; 44 | for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) { 45 | var A = r.charCodeAt(v); 46 | 128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)), 47 | S[c++] = A >> 18 | 240, 48 | S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224, 49 | S[c++] = A >> 6 & 63 | 128), 50 | S[c++] = 63 & A | 128) 51 | } 52 | for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++) 53 | p += S[b], 54 | p = n(p, F); 55 | return p = n(p, D), 56 | p ^= s, 57 | 0 > p && (p = (2147483647 & p) + 2147483648), 58 | p %= 1e6, 59 | p.toString() + "." 
+ (p ^ m) 60 | } 61 | var i = null; 62 | """ 63 | ) 64 | 65 | print(js_compile.call('xx', "hi")) 66 | -------------------------------------------------------------------------------- /百度翻译spider/使用教程: -------------------------------------------------------------------------------- 1 | 解决问题 2 | sign 加密 3 | 文件安排 4 | **sign加密参数破解 直接给参数(被翻译单词),返回sign加密结果 5 | 百度翻译spider 简单的请求接口, 6 | 注意事项 7 | 爬取过程 注意IP的封锁,可能对应的token什么的按照本地环境生成,自己curl 修改参数sign 8 | 9 | 10 | 翻译参照表 ,修改对应的 from 和 to 11 | langList: { 12 | 'zh': '中文','jp': '日语','jpka': '日语假名','th': '泰语','fra': '法语','en': '英语','spa': '西班牙语','kor': '韩语', 13 | 'tr': '土耳其语','vie': '越南语','ms': '马来语','de': '德语','ru': '俄语','ir': '伊朗语','ara': '阿拉伯语','est': '爱沙尼亚语','be': '白俄罗斯语','bul': '保加利亚语','hi': '印地语','is': '冰岛语', 14 | 'pl': '波兰语','fa': '波斯语','dan': '丹麦语','tl': '菲律宾语','fin': '芬兰语','nl': '荷兰语','ca': '加泰罗尼亚语','cs': '捷克语','hr': '克罗地亚语','lv': '拉脱维亚语','lt': '立陶宛语','rom': '罗马尼亚语','af': '南非语','no': '挪威语','pt_BR': '巴西语','pt': '葡萄牙语','swe': '瑞典语','sr': '塞尔维亚语', 15 | 'eo': '世界语','sk': '斯洛伐克语','slo': '斯洛文尼亚语','sw': '斯瓦希里语','uk': '乌克兰语','iw': '希伯来语','el': '希腊语','hu': '匈牙利语','hy': '亚美尼亚语','it': '意大利语','id': '印尼语','sq': '阿尔巴尼亚语','am': '阿姆哈拉语','as': '阿萨姆语','az': '阿塞拜疆语','eu': '巴斯克语','bn': '孟加拉语','bs': '波斯尼亚语', 16 | 'gl': '加利西亚语','ka': '格鲁吉亚语','gu': '古吉拉特语','ha': '豪萨语','ig': '伊博语','iu': '因纽特语','ga': '爱尔兰语','zu': '祖鲁语','kn': '卡纳达语','kk': '哈萨克语','ky': '吉尔吉斯语', 17 | 'lb': '卢森堡语','mk': '马其顿语','mt': '马耳他语','mi': '毛利语','mr': '马拉提语','ne': '尼泊尔语','or': '奥利亚语','pa': '旁遮普语','qu': '凯楚亚语','tn': '塞茨瓦纳语','si': '僧加罗语', 18 | 'ta': '泰米尔语','tt': '塔塔尔语','te': '泰卢固语','ur': '乌尔都语','uz': '乌兹别克语','cy': '威尔士语', 19 | 'yo': '约鲁巴语','yue': '粤语','wyw': '文言文','cht': '中文繁体' 20 | } -------------------------------------------------------------------------------- /百度翻译spider/百度翻译spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 享受雷霆感受雨露 3 | # author xyy,time:2020/5/29 4 | import json 5 | 6 | import requests 7 | import execjs 8 | 9 | def js_dd(r:str): 10 | js_compile = execjs.compile( 11 | r""" 12 | function a(r) { 13 | if (Array.isArray(r)) { 14 | for (var o = 0, t = Array(r.length); o < r.length; o++) 15 | t[o] = r[o]; 16 | return t 17 | } 18 | return Array.from(r) 19 | } 20 | function n(r, o) { 21 | for (var t = 0; t < o.length - 2; t += 3) { 22 | var a = o.charAt(t + 2); 23 | a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a), 24 | a = "+" === o.charAt(t + 1) ? r >>> a : r << a, 25 | r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a 26 | } 27 | return r 28 | } 29 | var xx = function e(r) { 30 | var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g); 31 | if (null === o) { 32 | var t = r.length; 33 | t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10)) 34 | } else { 35 | for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++) 36 | "" !== e[C] && f.push.apply(f, a(e[C].split(""))), 37 | C !== h - 1 && f.push(o[C]); 38 | var g = f.length; 39 | g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join("")) 40 | } 41 | var u = void 0 42 | , l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107); 43 | u = null !== i ? i : (i = "320305.131321201" || "") || ""; 44 | for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) { 45 | var A = r.charCodeAt(v); 46 | 128 > A ? S[c++] = A : (2048 > A ? 
S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)), 47 | S[c++] = A >> 18 | 240, 48 | S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224, 49 | S[c++] = A >> 6 & 63 | 128), 50 | S[c++] = 63 & A | 128) 51 | } 52 | for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++) 53 | p += S[b], 54 | p = n(p, F); 55 | return p = n(p, D), 56 | p ^= s, 57 | 0 > p && (p = (2147483647 & p) + 2147483648), 58 | p %= 1e6, 59 | p.toString() + "." + (p ^ m) 60 | } 61 | var i = null; 62 | """ 63 | ) 64 | info = js_compile.call('xx', r) 65 | return info 66 | 67 | def run(key): 68 | sign = js_dd(key) 69 | url = "https://fanyi.baidu.com/v2transapi?from=en&to=zh" 70 | 71 | payload = "from=en&to=zh&query={}&transtype=realtime&simple_means_flag=3&sign={}&token=886aa9b1d94bd35f736c15b865355987&domain=common".format(key,sign) 72 | headers = { 73 | 'authority': 'fanyi.baidu.com', 74 | 'accept': '*/*', 75 | 'x-requested-with': 'XMLHttpRequest', 76 | 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36', 77 | 'content-type': 'application/x-www-form-urlencoded; charset=UTF-8', 78 | 'origin': 'https://fanyi.baidu.com', 79 | 'sec-fetch-site': 'same-origin', 80 | 'sec-fetch-mode': 'cors', 81 | 'sec-fetch-dest': 'empty', 82 | 'referer': 'https://fanyi.baidu.com/translate?aldtype=16047&query=&keyfrom=baidu&smartresult=dict&lang=auto2zh', 83 | 'accept-language': 'zh-CN,zh;q=0.9', 84 | 'cookie': 'BIDUPSID=4440053665E634D40225BDB0D03E06FB; PSTM=1587886677; BAIDUID=4440053665E634D478DDC8C16E9C7B12:FG=1; BDUSS=y1qektIM1ZwM2JGS1ZOcWR2TDd5cXNOUUw5UzBXZUpBemZxaFJzcnU0N2l6TXhlRVFBQUFBJCQAAAAAAAAAAAEAAABJ60F6x63RptLk0fQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOI~pV7iP6VeY; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; BDSFRCVID=tjPOJeCAa7berx3uzpK9uyZdfmKK0gOTH6qcdz7sDYu7og_VfHNgEG0Pox8g0KubKALAogKK0mOTHUuF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tJutoIPMtKv0jbTg-tP_-4_tbh_X5-RLfb5PLp7F5lONHt3uXj5NMRQL5UrQBbbv-2o-ahkM5h7xOKQSM-5pMJFUXp7ULpOAQeQghf5N3KJmDPP9bT3v5Dun3J3r2-biWbRL2MbdbDnP_IoG2Mn8M4bb3qOpBtQmJeTxoUJ25DnJhhCGe6KMD5cbDNKfqbT32CnKW5rtKRTffjrnhPF35-IrXP6-3MoKJKr-QtPbb4c2OITG2q6fb4uUyN3MWh37Jj620PF5-hbZfD3vbpn4bxkNbPoxJpOyMnbMopvaKqvN8hjvbURvD-ug3-7P-x5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3-oJqCLabKPw3f; H_PS_PSSID=; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; delPer=0; PSINO=2; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1590665640,1590739020,1590740573; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1590740573; __yjsv5_shitong=1.0_7_311eefede6fd974d4f71d2b54434efb086cc_300_1590740573647_221.223.193.45_cdfd764c; yjs_js_security_passport=c5fb1118c64b458b25325a4dbb9225810a243137_1590740578_js; BAIDUID=1EFD4CDE99B11CB1B31310352198C496:FG=1' 85 | } 86 | 87 | response = requests.request("POST", url, headers=headers, data = payload) 88 | 89 | print(response.text) 90 | 91 | if __name__ == '__main__': 92 | run(key="hi") 93 | 
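Since both files above embed the same JS, a pure-Python port is a useful cross-check. The sketch below assumes the fixed gtk "320305.131321201" hardcoded in that JS (a live session may serve a different gtk/token) and BMP-only input; the original JS takes a separate truncation path for surrogate pairs such as emoji:

# Pure-Python port of the xx()/n() pair from sign加密参数破解.py, so the
# sign can be computed without PyExecJS. Sketch only: gtk is assumed
# fixed at "320305.131321201", as hardcoded in the JS above.
MASK32 = 0xFFFFFFFF

def _mix(r, rule):
    # Port of n(r, o): each triplet is (combine-op, shift-direction, amount).
    for t in range(0, len(rule) - 2, 3):
        a = rule[t + 2]
        a = ord(a) - 87 if a >= "a" else int(a)
        # '+' in the middle slot means unsigned right shift, else left shift.
        a = (r & MASK32) >> a if rule[t + 1] == "+" else (r << a) & MASK32
        # '+' in the first slot means 32-bit add, else xor.
        r = (r + a) & MASK32 if rule[t] == "+" else (r ^ a) & MASK32
    return r

def baidu_sign(query, gtk="320305.131321201"):
    # Long queries collapse to the first 10 + middle 10 + last 10 characters.
    if len(query) > 30:
        mid = len(query) // 2
        query = query[:10] + query[mid - 5:mid + 5] + query[-10:]
    m, s = (int(part) for part in gtk.split("."))
    p = m
    for byte in query.encode("utf-8"):
        p = _mix((p + byte) & MASK32, "+-a^+6")
    p = _mix(p, "+-3^+b+-f")
    p = (p ^ s) & MASK32
    p %= 1000000
    return "{}.{}".format(p, p ^ m)

if __name__ == "__main__":
    # Should print the same value as js_compile.call('xx', "hi") above.
    print(baidu_sign("hi"))
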
-------------------------------------------------------------------------------- /秒拍视频/使用教程: -------------------------------------------------------------------------------- 1 | 秒拍 异或响应加密 2 | -------------------------------------------------------------------------------- /秒拍视频/秒拍spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 享受雷霆感受雨露 3 | # author xyy,time:2021/1/18 4 | 5 | import datetime 6 | import json 7 | import random 8 | import time 9 | import uuid 10 | import requests 11 | 12 | from hashlib import md5 13 | from urllib import parse 14 | from urllib.parse import quote 15 | 16 | proxies = {} 17 | # 通过时间字符形式 返回时长格式 18 | def unify_duration_format(duar_str_or_s: str): 19 | """ 20 | 01:11 -> 71,'00:01:11' 21 | 00:01:11 -> 71,'00:01:11' 22 | :param duar_str: '01:11' or '00:01:11' 23 | :return: 71, '00:01:11' 24 | """ 25 | error = 0, '' 26 | 27 | def hms(m: int, s: int, h=0): 28 | if s > 60: 29 | m += 1 30 | if m > 60: 31 | h += 1 32 | return h * 60 * 60 + m * 60 + s, str(h).zfill(2) + ':' + str(m).zfill(2) + ':' + str(s).zfill(2) 33 | try: 34 | s = int(duar_str_or_s) 35 | except: 36 | pass 37 | else: 38 | return hms(m=s % 3600//60, s=s % 60, h=s//3600) 39 | try: 40 | if duar_str_or_s: 41 | duar_list = duar_str_or_s.split(':') 42 | if len(duar_list) == 2: 43 | return hms(m=int(duar_list[0]), s=int(duar_list[1])) 44 | elif len(duar_list) == 3: 45 | return hms(m=int(duar_list[1]), s=int(duar_list[2]), h=int(duar_list[0])) 46 | else: 47 | return error 48 | else: 49 | return error 50 | except Exception as e: 51 | return error 52 | 53 | # Md5 加密函数 32 返回32位的加密结果 54 | def md5_use(text:str)->str: 55 | result = md5(bytes(text, encoding="utf-8")).hexdigest() 56 | # print(result) 57 | return result 58 | 59 | # 获取代理 60 | # 获得代理函数 61 | def get_proxy(): 62 | return proxies 63 | 64 | config = { 65 | # 秒拍 66 | "video_search_offset": {"start": 1, "end": 2, "pagesize": 20, "start_page": 0}, 67 | 68 | } 69 | 70 | class MiaopaiVideo(): 71 | # 时间戳 72 | current_ts = str(int(time.time())) 73 | # 伪造UUID,也叫做GUID(C#) 74 | fake_uuid = str(uuid.uuid1()) 75 | # APP版本 76 | app_version = '7.2.60' 77 | 78 | # 搜索接口 key_words 搜索的关键词 默认3 79 | APISearch = "https://b-api.ins.miaopai.com/1/search/media.json?count=20&page={page}&key={key_words}" 80 | 81 | def __init__(self, use_proxy=True): 82 | self.proxy = get_proxy() if use_proxy else None 83 | 84 | ################################### 85 | def get_cpAbid(self): 86 | s1 = random.randint(1,19) 87 | s2 = random.randint(1,29) 88 | if random.randint(0,1): 89 | return '1-102,{}-100,2-1,{}-101,5-200,2-201'.format(s1,s2) 90 | else: 91 | return '1-102,{}-100,2-1,{}-101'.format(s1,s2) 92 | 93 | # md5 加密 94 | def get_md5(self, source): 95 | if isinstance(source, str): 96 | source = source.encode('utf-8') 97 | return md5(source).hexdigest() 98 | 99 | ################################### 100 | 101 | # 秒拍解密响应 102 | def _decode_resp_content(self,resp_content): 103 | """解密请求响应的数据 104 | :param resp_content: 请求响应的content""" 105 | 106 | def bytes_to_int(data, offset): 107 | result = 0 108 | for i in range(4): 109 | result |= (data[offset + i] & 0xff) << (8 * 1) 110 | return result 111 | 112 | def reverse_bytes(i): 113 | return ((i >> 24) & 0xFF) | ((i >> 8) & 0xFF00) | ((i << 8) & 0xFF0000) | (i << 24) 114 | 115 | if len(resp_content) <= 8: 116 | return '' 117 | dword0 = bytes_to_int(resp_content, 0) 118 | dword1 = bytes_to_int(resp_content, 4) 119 | x = 0 120 | if (dword0 ^ dword1) == -1936999725: 121 | x = 
reverse_bytes(dword1 ^ bytes_to_int(resp_content, 8)) 122 | buffer_size = len(resp_content) - 12 - x 123 | if buffer_size <= 0: 124 | return '' 125 | else: 126 | buffer = bytearray() 127 | for index in range(buffer_size): 128 | buffer.append((resp_content[8 + index] ^ resp_content[12 + index]) & 0xff) 129 | return buffer.decode('utf8') 130 | 131 | # 获取响应 132 | def get_response(self,key_words:str="梁家辉",page:int=3,**kwargs): 133 | # url = "https://b-api.ins.miaopai.com/1/search/media.json?count=20&page=3&key=%E6%A2%81%E5%AE%B6%E8%BE%89" 134 | # url = "https://b-api.ins.miaopai.com/1/search/media.json?count=20&page=3&key={}".format(quote(key_words)) 135 | url = self.APISearch.format(key_words=quote(key_words),page=page) 136 | # timestamp = int(datetime.datetime.now().timestamp()) 137 | 138 | payload = {} 139 | # headers = { 140 | # 'cp-uniqueId': '8ac4508c-ca93-30ac-b310-61d9b4ea91a2', 141 | # 'cp-os': 'android', 142 | # 'cp_kid': '0', 143 | # 'cp-ver': '7.2.78', 144 | # 'cp-uuid': '8ac4508c-ca93-30ac-b310-61d9b4ea91a2', 145 | # 'cp-abid': '1-10,2-1', 146 | # 'cp-channel': 'xiaomi_market', 147 | # 'cp-time': '1600245983', 148 | # 'cp-sver': '9', 149 | # # 'cp-sign': 'fd3a76b879d6182925add2c5182071de', 150 | # 'cp-vend': 'miaopai', 151 | # 'cp-appid': '424', 152 | # 'Host': 'b-api.ins.miaopai.com', 153 | # 'User-Agent': 'okhttp/3.3.1', 154 | # 'Cookie': 'acw_tc=7b39758516002460160502434e5c514791eb6d8c44782e71955cd0f42e2fad' 155 | # } 156 | headers = { 157 | "Accept-Encoding": "gzip", 158 | 'User-Agent': 'okhttp/3.3.1', 159 | 'Connection': 'Keep-Alive', 160 | "Host": 'b-api.ins.miaopai.com', 161 | 'cp_ver': '7.2.60', 162 | 'cp_appid': '424', 163 | 'cp_sver': '5.1.1', 164 | 'cp_channel': 'xiaomi_market', 165 | 'cp_os': 'android', 166 | 'cp_vend': 'miaopai', 167 | } 168 | 169 | # cp_uuid = uuid.uuid1().__str__() 170 | headers['cp_sign'] = self.get_cp_sign(url) 171 | # print(headers) 172 | headers['cp_time'] = str(self.current_ts) 173 | headers['cp_uuid'] = self.fake_uuid 174 | headers['cp_abid'] = self.get_cpAbid() 175 | headers['Cache-Control'] = 'no-cache' 176 | 177 | response = requests.get(url, headers=headers, data = payload,verify=False,proxies=self.proxy) 178 | 179 | return response.content 180 | 181 | # 获取cp_sign参数值 182 | def get_cp_sign(self,target_url: str): 183 | sign_raw_str = 'url=' + parse.urlparse(target_url).path + \ 184 | 'unique_id=' + self.fake_uuid + \ 185 | 'version=' + self.app_version + \ 186 | 'timestamp=' + self.current_ts + \ 187 | '4O230P1eeOixfktCk2B0K8d0PcjyPoBC' 188 | return md5((sign_raw_str.encode(encoding='utf-8'))).hexdigest() 189 | 190 | # 解析秒拍 191 | def get_parse(self,respose_text): 192 | # print(respose_text.replace("]}00","]}").replace("]}0","]}")) 193 | task_list = [] # 解析的结果集 194 | dic_info = json.loads(respose_text.replace("]}00","]}").replace("]}0","]}")) 195 | # dic_info = json.loads(respose_text.replace("]}0","]}")) 196 | # print(dic_info) 197 | if "result" in dic_info and dic_info["result"]: 198 | for each in dic_info["result"]: 199 | video_dict = {} 200 | video_dict["video2_title"] = each["description"] 201 | video_dict["video2_id"] = each["smid"] 202 | video_dict["video2_url"] = "http://n.miaopai.com/media/{}.html".format(each["smid"]) 203 | video_dict["video2_author"] = each["user"]["nick"] 204 | video_dict["video2_url_hash"] = md5_use(video_dict.get("video2_url")) 205 | video_dict["video2_platform"] = "秒拍视频" 206 | duration_str_temp = each.get('meta_data', [])[0].get('upload', {}).get('length', '') if each.get('meta_data', []) else '' 207 | 
duration, duration_str = unify_duration_format(duration_str_temp) 208 | video_dict["video2_duration"] = duration # 时长(秒数) 209 | video_dict["video2_duration_str"] = duration_str # 时长(字符串) 210 | task_list.append(video_dict) 211 | return task_list 212 | # 获取video的返回值 213 | def search_video(self,search_key: str,**kwargs): 214 | _start = config["video_search_offset"]["start"] 215 | _end = config["video_search_offset"]["end"] 216 | task_list = [] 217 | if kwargs.get("page_num"): 218 | if config["video_search_offset"]["start_page"] == 0: 219 | _start = int(kwargs.get("page_num")) - 1 220 | _end = int(kwargs.get("page_num")) 221 | elif config["video_search_offset"]["start_page"] == 1: 222 | _start = int(kwargs.get("page_num")) 223 | _end = int(kwargs.get("page_num")) + 1 224 | 225 | for page in range(_start, _end): 226 | respose_text = self._decode_resp_content(self.get_response(key_words=search_key,page=page)) 227 | print(respose_text) 228 | for each in self.get_parse(respose_text): 229 | task_list.append(each) 230 | 231 | return task_list 232 | 233 | 234 | search_songs = MiaopaiVideo(use_proxy=True).search_video 235 | 236 | if __name__ == '__main__': 237 | kwags = { 238 | } 239 | info = search_songs(search_key="周杰伦", **kwags) # 1109 没数据就对了 240 | print(info) 241 | -------------------------------------------------------------------------------- /网易云爬虫/requirment.txt: -------------------------------------------------------------------------------- 1 | certifi==2020.6.20 2 | chardet==3.0.4 3 | crypto==1.4.1 4 | fake-useragent==0.1.11 5 | idna==2.10 6 | lxml==4.5.1 7 | Naked==0.1.31 8 | pycryptodome==3.9.8 9 | PyExecJS==1.5.1 10 | PyYAML==5.3.1 11 | requests==2.24.0 12 | shellescape==3.8.1 13 | six==1.15.0 14 | urllib3==1.25.9 15 | -------------------------------------------------------------------------------- /网易云爬虫/使用说明: -------------------------------------------------------------------------------- 1 | 文件说明: 2 | 网易云spider.py 提供评论接口解析 歌曲歌词接口解析 搜索接口解析 3 | 加密解密代码.py 提供 js函数里b函数的解析(并不针对网易云解密) 具体看本人博客 4 | 注意事项: 5 | 测试cookie两天未封(是否需要cookie池) 6 | IP加代理 -------------------------------------------------------------------------------- /网易云爬虫/加密解密代码.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 享受雷霆感受雨露 3 | # author xyy,time:2020/6/28 4 | from Crypto.Cipher import AES 5 | import base64 6 | 7 | # 加密 8 | def py_aes_first(text): 9 | BS = AES.block_size 10 | pad = lambda s: s + (BS - len(s) % BS) * chr(BS - len(s) % BS).encode('utf-8') 11 | unpad = lambda s : s[0:-s[-1]] 12 | 13 | key = b'0CoJUm6Qyw8W8jud' 14 | text = text.encode("utf-8") 15 | IV = b'0102030405060708' 16 | 17 | cipher = AES.new(key, mode=AES.MODE_CBC, IV=IV) 18 | # cipher2 = AES.new(key, mode=AES.MODE_CBC, IV=IV) # 加密和解密,cipher对象只能用一次 19 | 20 | # print(text) 21 | encrypted = pad(text) 22 | # print(encrypted) 23 | encrypted = cipher.encrypt(encrypted) 24 | # print(encrypted) 25 | encrypted = base64.b64encode(encrypted).decode("utf-8") 26 | print("第一次加密结果",encrypted) 27 | return encrypted 28 | def py_aes_second(text): 29 | BS = AES.block_size 30 | pad = lambda s: s + (BS - len(s) % BS) * chr(BS - len(s) % BS).encode('utf-8') 31 | unpad = lambda s : s[0:-s[-1]] 32 | 33 | key = b'TXhkKroQJSgrKrnN' 34 | text = text.encode("utf-8") 35 | IV = b'0102030405060708' 36 | 37 | cipher = AES.new(key, mode=AES.MODE_CBC, IV=IV) 38 | # cipher2 = AES.new(key, mode=AES.MODE_CBC, IV=IV) # 加密和解密,cipher对象只能用一次 39 | 40 | # print(text) 41 | encrypted = pad(text) 42 | # print(encrypted) 43 | 
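# The pad()/unpad() lambdas above implement PKCS#7: pad appends N copies of the
# byte N so the plaintext length becomes a multiple of AES.block_size, and unpad
# strips them again after decryption.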
encrypted = cipher.encrypt(encrypted) 44 | # print(encrypted) 45 | encrypted = base64.b64encode(encrypted).decode("utf-8") 46 | print("第二次加密结果",encrypted) 47 | 48 | key = b'0CoJUm6Qyw8W8jud' 49 | text = encrypted 50 | IV = b'0102030405060708' 51 | 52 | cipher = AES.new(key, mode=AES.MODE_CBC, IV=IV) 53 | 54 | return encrypted 55 | 56 | 57 | # 解密 58 | def py_aes_third(encrypted): 59 | key = b'TXhkKroQJSgrKrnN' 60 | IV = b'0102030405060708' 61 | unpad = lambda s: s[0:-s[-1]] 62 | cipher2 = AES.new(key, mode=AES.MODE_CBC, IV=IV) 63 | 64 | decrypted = base64.b64decode(encrypted) 65 | # print(decrypted) 66 | decrypted = cipher2.decrypt(decrypted) 67 | # print(decrypted) # will be 'to be encrypted' 68 | decrypted = unpad(decrypted) 69 | print("第一次解密结果",str(decrypted,encoding='utf-8')) 70 | key = b'0CoJUm6Qyw8W8jud' 71 | IV = b'0102030405060708' 72 | 73 | cipher = AES.new(key, mode=AES.MODE_CBC, IV=IV) 74 | 75 | decrypted = base64.b64decode(decrypted) 76 | # print(decrypted) 77 | decrypted = cipher.decrypt(decrypted) 78 | # print(decrypted) # will be 'to be encrypted' 79 | decrypted = unpad(decrypted) 80 | print("第二次解密结果",str(decrypted,encoding='utf-8')) 81 | 82 | 83 | def jiami_(text): 84 | info = py_aes_second(py_aes_first(text)) 85 | print(info) 86 | return info 87 | 88 | def jiemi_(text): 89 | jiemi_info = py_aes_third(text) 90 | print(jiemi_info) 91 | return jiemi_info 92 | 93 | 94 | 95 | if __name__ == '__main__': 96 | jiami_('sdfadfaf') 97 | jiemi_('zi8Y6TYtv3TQ3vPiamaUE+arzvFjlbFzNGdkSTxKRrw=zi8Y6TYtv3TQ3vPiamaUE+arzvFjlbFzNGdkSTxKRrw=') 98 | # py_aes_first('{"s":"在田野","csrf_token":""}') 99 | # py_aes_second(py_aes_first('{"s":"在田野","csrf_token":""}')) 100 | # xx = py_aes_second(py_aes_first('{"hlpretag":"","hlposttag":"","s":"在意义","type":"1","offset":"90","total":"false","limit":"30","csrf_token":”"}')) 101 | # py_aes_second(py_aes_first('{"hlpretag":"","hlposttag":"","s":"在田野","type":"1","offset":"90","total":"false","limit":"30","csrf_token":”"}')) 102 | 103 | 104 | 105 | 106 | 107 | # 解密测试 108 | # xx = 'RidsQl08PTom8lbreQjS0wrkPfv02Ib1P+7WYmAYUmHz3V3KhauA0kodLg+VIPLXEn393pGiP6j7E9soFzuH09jq/XFIcjEMKCIZb3npxxc=' 109 | # py_aes_third(xx) 110 | 111 | # 加密测试 112 | # py_aes_second(py_aes_first('{"hlpretag":"","hlposttag":"","#/discover":"","s":"在一起","type":"1","offset":"30","total":"false","limit":"30","csrf_token":"”}')) 113 | 114 | -------------------------------------------------------------------------------- /网易云爬虫/网易云spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 享受雷霆感受雨露 3 | # author xyy,time:2020/6/28 4 | 5 | from fake_useragent import UserAgent 6 | from Crypto.Cipher import AES 7 | 8 | import base64 9 | import requests, pprint,json 10 | 11 | 12 | class WangYiYun(): 13 | 14 | def __init__(self): 15 | self.params = "" 16 | self._i = "l6Brr86UeZ6C3Bsw" # 默认使用此字符串 17 | # 使用默认_i 配套的encSecKey 18 | self.encSecKey = "7ca9b5ba8b13044f47ed74c388df912ac84758122acbedc64111f2ac83232b01d3ce16f7195a39c7e064b4c0240b5c1d52624dc13c22ec820d76dfe32db43e496aeacced5be3ca9108c78a85bb389f1edf8d8c9fced02024ba9490401b4ce062cc50764d0a24294e07bb229271391b5a3640e924ee1ed15435dc6e288f1fa873" 19 | self.headers = { 20 | 'authority': 'music.163.com', 21 | 'user-agent': UserAgent().random, 22 | 'content-type': 'application/x-www-form-urlencoded', 23 | 'accept': '*/*', 24 | 'origin': 'https://music.163.com', 25 | 'sec-fetch-site': 'same-origin', 26 | 'sec-fetch-mode': 'cors', 27 | 'sec-fetch-dest': 'empty', 28 | 'referer': 
'https://music.163.com/song?id=1426301364', 29 | 'accept-language': 'zh-CN,zh;q=0.9', 30 | 'cookie': '_iuqxldmzr_=32; _ntes_nnid=5f8ee04e745645d13d3f711c76769afe,1593048942478; _ntes_nuid=5f8ee04e745645d13d3f711c76769afe; WM_TID=XqvK2%2FtWaSBEUBRBEEN7XejGE%2FL0h6Vq; WM_NI=iN6dugAs39cIm2K2R9ox28GszTm5oRjcvJCcyIuaI1dccEVSjaHEwhc8FuERfkh3s%2FFP0zniMA5P4vqS4H3TJKdQofPqezDPP4IR5ApTjuqeNIJNZkCvHMSY6TtEkCZUS3k%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eeb2e57dbababf88b879a8b08fa2d84f869f9fbaaa50a3f599a5d650939b8dadd52af0fea7c3b92aab92fa85f86d83adfddae243afee85d3d133ada8fed9c679ba8ca3d6ee5aaabdbaabc269bb97bb82cc3ba8bdada6d559aabf88a6f664a1e88a96c85aa6b5a8d4f2258690009bed638f9ffbb1b77eb38dfca9b2608a95acb2ee6e94afab9bc75c94ec87b3b84bb48ca696f46f8e9786afd96181aa88aed253f68cbca6ea499a8b9dd4ea37e2a3; JSESSIONID-WYYY=tI8MIKMCRBuyCYnUJMCyUTlp%2Fufv5xIfCquvp7PJ4%2BuXod%5CXH%5CB0icDZw8TNlwHUHOW%2B2t%2BCuXyC4VZ%5C19OrzaDE%5Ck0F0dAZQh7KcVxUoHKpqUdiVzPu8NxCK9cJRG%5C%5CPTvtqxjFerd1%2BBa4%2F%5C8PESa4pvvRaQF6jljjsibX%5CrcPsH0I%3A1593347447142', 31 | } 32 | 33 | # 搜索歌曲接口 34 | API_Serch_Songs = 'https://music.163.com/weapi/cloudsearch/get/web?csrf_token=' 35 | # 歌曲评论 36 | API_Comments_Song = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_{}?csrf_token=' # 音乐ID可替换 37 | # 歌曲歌词 38 | API_Lyric_Songs = 'https://music.163.com/weapi/song/lyric?csrf_token=' 39 | 40 | # crypt_js_complex python 复写cryptjs 41 | def crypt_js_complex(self,text): 42 | BS = AES.block_size 43 | pad = lambda s: s + (BS - len(s) % BS) * chr(BS - len(s) % BS).encode('utf-8') 44 | unpad = lambda s: s[0:-s[-1]] 45 | 46 | key = bytes(self._i, encoding="utf-8") 47 | text = text.encode("utf-8") 48 | IV = b'0102030405060708' 49 | 50 | cipher = AES.new(key, mode=AES.MODE_CBC, IV=IV) 51 | # cipher2 = AES.new(key, mode=AES.MODE_CBC, IV=IV) # 加密和解密,cipher对象只能用一次 52 | 53 | # print(text) 54 | encrypted = pad(text) 55 | # print(encrypted) 56 | encrypted = cipher.encrypt(encrypted) 57 | # print(encrypted) 58 | encrypted = base64.b64encode(encrypted).decode("utf-8") 59 | # print("第二次加密结果", encrypted) 60 | 61 | return encrypted 62 | 63 | # crypt_js_complex 的基础 64 | def crypt_js_complex_base(self,text): 65 | BS = AES.block_size 66 | pad = lambda s: s + (BS - len(s) % BS) * chr(BS - len(s) % BS).encode('utf-8') 67 | unpad = lambda s: s[0:-s[-1]] 68 | 69 | key = b'0CoJUm6Qyw8W8jud' 70 | text = text.encode("utf-8") 71 | IV = b'0102030405060708' 72 | 73 | cipher = AES.new(key, mode=AES.MODE_CBC, IV=IV) 74 | # cipher2 = AES.new(key, mode=AES.MODE_CBC, IV=IV) # 加密和解密,cipher对象只能用一次 75 | 76 | # print(text) 77 | encrypted = pad(text) 78 | # print(encrypted) 79 | encrypted = cipher.encrypt(encrypted) 80 | # print(encrypted) 81 | encrypted = base64.b64encode(encrypted).decode("utf-8") 82 | # print("第一次加密结果", encrypted) 83 | return encrypted 84 | 85 | # 获得parms参数值 86 | def get_params(self,text): 87 | return self.crypt_js_complex( 88 | self.crypt_js_complex_base(text),) 89 | 90 | # 搜索歌曲接口 91 | def serch_songs(self,name,offset=0): 92 | """ 93 | 94 | :param name:str 95 | :param offset:int 偏移量 默认第一页 例如 0 30 60 90 96 | :return 接口数据 97 | """ 98 | text = '{"hlpretag":"","hlposttag":"","#/discover":"","s":"%s","type":"1","offset":"%s","total":"false","limit":"30","csrf_token":""}'%(name,offset*30) 99 | # payload = 'params={params}&encSecKey={encSecKey}'.format(params=self.get_params(text),encSecKey=self.encSecKey) 100 | print(text) 101 | params = ( 102 | ('csrf_token', ''), 103 | ) 104 | 105 | data = { 106 | 'params': self.get_params(text), 107 | 'encSecKey': self.encSecKey 108 | } 
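# 'params' in the dict above is the request JSON after two AES-CBC passes: first
# with the fixed key 0CoJUm6Qyw8W8jud (crypt_js_complex_base), then with self._i
# (crypt_js_complex). Because self._i is pinned to "l6Brr86UeZ6C3Bsw" rather than
# a random 16-char key, the matching RSA-encrypted encSecKey can be reused as a constant.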
109 | print(data) 110 | response = requests.post(self.API_Serch_Songs, headers=self.headers, params=params, 111 | data=data) 112 | self._dispose(json.loads(response.text)) 113 | 114 | # 歌曲评论抓取 115 | def comment_song(self,songid:str,offset:int=0): 116 | """" 117 | :param songid:str 歌曲ID 118 | :param offset:int 翻页 默认第一页 0 20 40 119 | :return 接口数据 120 | """ 121 | text = '{"rid":"R_SO_4_%s","offset":"%s","total":"true","limit":"20","csrf_token":""}'%(songid,offset*20) 122 | 123 | 124 | params = ( 125 | ('csrf_token', ''), 126 | ) 127 | 128 | data = { 129 | 'params': self.get_params(text), 130 | 'encSecKey': self.encSecKey 131 | } 132 | response = requests.post(self.API_Comments_Song.format(songid), headers=self.headers, 133 | params=params, data=data) 134 | self._dispose(json.loads(response.text)) 135 | # 歌词爬取 136 | def lyric_song(self,songid:str): 137 | """ 138 | :param songid str 歌曲ID 139 | :return 接口数据 140 | """ 141 | # 歌词接口加密参数原型 142 | text = '{"id":"%s","lv":-1,"tv":-1,"csrf_token":""}'%(songid) 143 | 144 | params = ( 145 | ('csrf_token', ''), 146 | ) 147 | 148 | data = { 149 | 'params': self.get_params(text), 150 | 'encSecKey': self.encSecKey 151 | } 152 | 153 | response = requests.post(self.API_Lyric_Songs, headers=self.headers, params=params, data=data) 154 | self._dispose(json.loads(response.text)) 155 | 156 | # 处理爬虫获取到的数据,这里我就输出值 157 | def _dispose(self, data): 158 | pprint.pprint(data) 159 | return data 160 | 161 | # 主函数 测试 162 | def wangyi_main(self): 163 | # 搜索接口 164 | self.serch_songs("旧账",0) 165 | #歌曲评论接口 166 | # self.comment_song("25639331",0) 167 | # 歌词接口 168 | # self.lyric_song("1351615757") # 旧账 169 | pass 170 | if __name__ == '__main__': 171 | wangyi = WangYiYun() 172 | wangyi.wangyi_main() 173 | 174 | -------------------------------------------------------------------------------- /虾米音乐spider/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /虾米音乐spider/.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 15 | -------------------------------------------------------------------------------- /虾米音乐spider/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /虾米音乐spider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /虾米音乐spider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /虾米音乐spider/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /虾米音乐spider/.idea/虾米音乐spider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 
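A minimal sketch of the request signing used throughout the Xiami spider below: every API call carries _s = md5(token + "_xmMain_" + api + "_" + _q), where token is the xm_sg_tk cookie value with its trailing timestamp segment removed. The helper name here is illustrative, not part of the repo:

from hashlib import md5

def xiami_sign(xm_sg_tk_cookie: str, api: str, _q: str = "") -> str:
    # keep only the hash part of the cookie, e.g. "7f2d...deb8_1593..." -> "7f2d...deb8"
    token = xm_sg_tk_cookie.split("_")[0]
    return md5(f"{token}_xmMain_{api}_{_q}".encode("utf-8")).hexdigest()

# e.g. xiami_sign(cookie_val, "/api/search/searchSongs", '{"key":"七里香","pagingVO":{"page":1,"pageSize":30}}')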
-------------------------------------------------------------------------------- /虾米音乐spider/requirment.txt: -------------------------------------------------------------------------------- 1 | bcrypt==3.1.7 2 | certifi==2020.6.20 3 | cffi==1.14.0 4 | chardet==3.0.4 5 | cryptography==2.9.2 6 | fabric==2.5.0 7 | fake==0.8 8 | fake-useragent==0.1.11 9 | idna==2.9 10 | invoke==1.4.1 11 | paramiko==2.7.1 12 | pycparser==2.20 13 | PyNaCl==1.4.0 14 | requests==2.24.0 15 | six==1.15.0 16 | urllib3==1.25.9 17 | -------------------------------------------------------------------------------- /虾米音乐spider/xiami_audio_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 享受雷霆感受雨露 3 | # author xyy,time:2020/6/30 4 | import json 5 | import random 6 | from xtools import md5_use 7 | import requests, pprint 8 | from fake_useragent import UserAgent 9 | from hashlib import md5 10 | from retrying import retry 11 | class XiaMi: 12 | ua = UserAgent() 13 | DOMAIN = "https://www.xiami.com" 14 | 15 | # 各个API接口地址 16 | # 每日音乐推荐 17 | APIDailySongs = "/api/recommend/getDailySongs" 18 | # 排行榜音乐 19 | APIBillboardDetail = "/api/billboard/getBillboardDetail" 20 | # 所有排行榜 21 | APIBillboardALL = "/api/billboard/getBillboards" 22 | # 歌曲详情信息 23 | APISongDetails = "/api/song/getPlayInfo" 24 | # 搜索音乐接口 25 | APISearch = "/api/search/searchSongs" 26 | # 歌曲单独一首详情 27 | APISingleSongInfo = "/api/song/initialize" 28 | def __init__(self): 29 | self.session = requests.Session() 30 | self.headers = { 31 | "user-agent": self.ua.random, 32 | "Proxy-Tunnel": str(random.randint(1, 10000)) 33 | } 34 | self.session.get(self.DOMAIN) 35 | 36 | def _get_api_url(self, api): 37 | return self.DOMAIN + api 38 | 39 | # 获取每日推荐的30首歌曲 40 | def get_daily_songs(self): 41 | url = self._get_api_url(self.APIDailySongs) 42 | params = { 43 | "_s": self._get_params__s(self.APIDailySongs) 44 | } 45 | result = self.session.get(url=url, params=params).json() 46 | self._dispose(result) 47 | 48 | # 获取虾米音乐的音乐排行榜 49 | def get_billboard_song(self, billboard_id: int = 0): 50 | ''' 51 | :param billboard_id: 各类型的排行榜 52 | :return: 排行榜音乐数据 53 | ''' 54 | if not hasattr(self, "billboard_dict"): 55 | self._get_billboard_dict_map() 56 | 57 | assert hasattr(self, "billboard_dict"), "billboard_dict获取失败" 58 | pprint.pprint(self.billboard_dict) 59 | if billboard_id == 0: 60 | billboard_id = input("输入对应ID,获取排行榜信息") 61 | assert billboard_id in self.billboard_dict, "billboard_id错误" 62 | 63 | url = self._get_api_url(self.APIBillboardDetail) 64 | _q = '{\"billboardId\":\"%s\"}' % billboard_id 65 | params = { 66 | "_q": _q, 67 | "_s": self._get_params__s(self.APIBillboardDetail, _q) 68 | } 69 | result = self.session.get(url=url, params=params).json() 70 | self._dispose(result) 71 | 72 | # 生成一个排行榜对应的字典映射 73 | def _get_billboard_dict_map(self): 74 | billboard_dict = {} 75 | billboards_info = self.get_billboard_all() 76 | try: 77 | if billboards_info["code"] == "SUCCESS": 78 | xiamiBillboards_list = billboards_info["result"]["data"]["xiamiBillboards"] 79 | for xiamiBillboards in xiamiBillboards_list: 80 | for xiamiBillboard in xiamiBillboards: 81 | id = xiamiBillboard["billboardId"] 82 | name = xiamiBillboard["name"] 83 | billboard_dict[id] = name 84 | self.billboard_dict = billboard_dict 85 | except Exception: 86 | pass 87 | 88 | # 获取所有的排行榜信息 89 | def get_billboard_all(self): 90 | url = self._get_api_url(self.APIBillboardALL) 91 | params = { 92 | "_s": self._get_params__s(self.APIBillboardALL) 93 | } 94 | result = 
self.session.get(url=url, params=params).json()
95 | return self._dispose(result)
96 | 
97 | # 获取歌曲详情信息
98 | def get_song_details(self, *song_ids) -> dict:
99 | '''
100 | :param song_ids: 歌曲的id,可以为多个
101 | :return: 歌曲的详情信息
102 | '''
103 | assert len(song_ids) != 0, "参数不能为空"
104 | 
105 | for song_id in song_ids:
106 | if not isinstance(song_id, int):
107 | raise Exception("每个参数必须为整型")
108 | 
109 | url = self._get_api_url(self.APISongDetails)
110 | _q = "{\"songIds\":%s}" % list(song_ids)
111 | params = {
112 | "_q": _q,
113 | "_s": self._get_params__s(self.APISongDetails, _q)
114 | }
115 | result = self.session.get(url=url, params=params).json()
116 | return self._dispose(result)
117 | 
118 | # 获取虾米单独一首歌曲详情
119 | def get_song_single_info(self, *song_id_str) -> dict:
120 | '''
121 | :param song_id_str: 歌曲的字符形式ID(该接口一次只处理一个)
122 | :return: 歌曲的详情信息
123 | '''
124 | 
125 | 
126 | url = self._get_api_url(self.APISingleSongInfo)
127 | _q = "{\"songId\":\"%s\"}" % (song_id_str)
128 | params = {
129 | "_q": _q,
130 | "_s": self._get_params__s(self.APISingleSongInfo, _q)
131 | }
132 | result = self.session.get(url=url, params=params).json()
133 | return self._dispose(result)
134 | 
135 | # 获取歌曲的下载地址
136 | def get_song_download_url(self, *song_ids):
137 | download_url_dict = {}
138 | song_details = self.get_song_details(*song_ids)
139 | songPlayInfos = song_details["result"]["data"]["songPlayInfos"]
140 | for songPlayInfo in songPlayInfos:
141 | song_download_url = songPlayInfo["playInfos"][0]["listenFile"] or songPlayInfo["playInfos"][1]["listenFile"]
142 | song_id = songPlayInfo["songId"]
143 | download_url_dict[song_id] = song_download_url
144 | 
145 | print("歌曲下载地址为:", download_url_dict)
146 | 
147 | # 处理爬虫获取到的数据,这里我就输出值
148 | def _dispose(self, data):
149 | # pprint.pprint(data)
150 | return data
151 | 
152 | # 获取加密字符串_s
153 | def _get_params__s(self, api: str, _q: str = "") -> str:
154 | '''
155 | :param api: URL的地址
156 | :param _q: 需要加密的参数
157 | :return: 加密字符串
158 | '''
159 | xm_sg_tk = self._get_xm_sg_tk()
160 | data = xm_sg_tk + "_xmMain_" + api + "_" + _q
161 | return md5(bytes(data, encoding="utf-8")).hexdigest()
162 | 
163 | # 获取xm_sg_tk的值,用于对数据加密的参数
164 | def _get_xm_sg_tk(self) -> str:
165 | xm_sg_tk = self.session.cookies.get("xm_sg_tk", None)
166 | assert xm_sg_tk is not None, "xm_sg_tk获取失败"
167 | return xm_sg_tk.split("_")[0]
168 | 
169 | # 获取虾米搜索结果
170 | def _get_xm_serch(self,song_name='在希望的田野上',page=2):
171 | url = self._get_api_url(self.APISearch)
172 | _q = '{"key":"%s","pagingVO":{"page":%s,"pageSize":30}}'%(song_name,page)
173 | params = {
174 | "_q": _q,
175 | "_s": self._get_params__s(self.APISearch, _q)
176 | }
177 | # 测试
178 | # print(self._get_params__s(self.APISearch, _q)) # 打印 _s
179 | result = self.session.get(url=url, params=params).json()
180 | return result
181 | 
182 | # 对搜索结果进行解析的函数
183 | def parms_search_songs(self,info):
184 | result_list =[]
185 | if "code" in info and info["code"] == "SUCCESS" and "result" in info and info["result"] and info["result"]["data"] and info["result"]["data"]["songs"]:
186 | for each in info["result"]["data"]["songs"]:
187 | 
188 | if "S_OFF" not in each["bizTags"]: #
189 | # print("S_OFF not in 在{}".format(each["bizTags"]))
190 | dic_ = {}
191 | dic_["audio2_albumName"] = each["albumName"]
192 | dic_["audio2_artistName"] = each["singers"]
193 | dic_["audio2_songName"] = each["songName"]
194 | dic_["audio2_songId"] = each["songId"]
195 | dic_["audio2_platform"] = "虾米音乐"
196 | dic_["audio2_songStringId"] = each['songStringId'] # 字符形式的ID
197 | dic_["audio2_url"] = 
"https://www.xiami.com/song/{}".format(dic_["audio2_songStringId"]) 198 | dic_["audio2_url_hash"] = md5_use(text=dic_["audio2_url"]) 199 | 200 | result_list.append(dic_) 201 | # else: 202 | # print("S_OFF 在{}".format(each["bizTags"])) 203 | return result_list 204 | 205 | # 虾米 容易尝试失败 单独的一次请求 206 | @retry(stop_max_attempt_number=5,wait_fixed=600) 207 | def get_response_single(self,url,params,proxy={},num=0): 208 | if proxy: 209 | result = self.session.get(url=url, headers=self.headers,params=params,proxies=proxy).json() 210 | elif not proxy: 211 | result = self.session.get(url=url, headers=self.headers,params=params).json() 212 | if "rgv587_flag" in result: 213 | # print("虾米音乐未获取成功 重新尝试") 214 | if num < 5: 215 | self.session = requests.Session() 216 | self.session.get(self.DOMAIN) 217 | return self.get_response_single(url,params,proxy=proxy, num=num + 1) 218 | else: 219 | print(" 单个页面请求尝试过多") 220 | return [] 221 | return result 222 | # 获取虾米搜索结果 223 | # @retry(stop_max_attempt_number=5,wait_fixed=600) 224 | def search_songs(self,song_name='在希望的田野上',proxy={},num=0): 225 | result_list = [] 226 | for page in range(config["xiami_search_offset"]["start"],config["xiami_search_offset"]["end"]): 227 | 228 | url = self._get_api_url(self.APISearch) 229 | _q = '{"key":"%s","pagingVO":{"page":%s,"pageSize":30}}'%(song_name,page) 230 | params = { 231 | "_q": _q, 232 | "_s": self._get_params__s(self.APISearch, _q) 233 | } 234 | # 测试 235 | # print(self._get_params__s(self.APISearch, _q)) # 打印 _s 236 | if proxy: 237 | result = self.get_response_single(url=url,params=params,proxy=proxy) 238 | else: 239 | result = self.get_response_single(url=url,params=params) 240 | # print(reget_song_single_infosult) 241 | # self._dispose(result) 242 | # if "rgv587_flag" in result: 243 | # print("虾米音乐未获取成功 重新尝试") 244 | # if num<5: 245 | # self.session = requests.Session() 246 | # self.session.get(self.DOMAIN) 247 | # return self.search_songs(song_name=song_name,proxy=proxy,num=num+1) 248 | # else: 249 | # print("尝试过多") 250 | # return [] 251 | for each in self.parms_search_songs(result): 252 | result_list.append(each) 253 | 254 | # print(result_list) 255 | return result_list 256 | def test(self): 257 | # self.get_daily_songs() 258 | # self._get_xm_sg_tk() 259 | # self.get_billboard_song(332) 260 | # self.get_billboard_all() 261 | # self.get_song_details(1813243760) 262 | # self.get_song_details(1806922983) # 测试 走在田野的路上 263 | # self.get_song_download_url(1813243760) 264 | self._get_xm_serch() 265 | pass 266 | def back_search_parms(self,name): 267 | url = self._get_api_url(self.APISearch) 268 | detail_info_list = [] 269 | for page in range(config["xiami_search_offset"]["start"],config["xiami_search_offset"]["end"]): 270 | detail_info_dic = {} 271 | _q = '{"key":"%s","pagingVO":{"page":%s,"pageSize":30}}' % (name, page) 272 | params = { 273 | "_q": _q, 274 | "_s": self._get_params__s(self.APISearch, _q) 275 | } 276 | detail_info_dic['params'] = params 277 | detail_info_dic['headers'] = self.headers 278 | detail_info_dic['requir_way'] = "GET" 279 | detail_info_dic['url'] = self._get_api_url(self.APISearch) 280 | detail_info_list.append(detail_info_dic) 281 | # print(detail_info_list) 282 | return detail_info_list 283 | 284 | def get_play_info_db(self, *song_id_str): 285 | info_dict = self.get_song_single_info(*song_id_str) 286 | song_info = info_dict.get('result', {}).get('data', {}).get('songDetail', {}) 287 | play_info_db = dict() 288 | play_info_db['stats_view'] = song_info.get('playCount') 289 | play_info_db['stats_share'] = 
song_info.get('shareCount')
290 | play_info_db['stats_like'] = song_info.get('favCount')
291 | play_info_db['stats_comment'] = info_dict.get('result', {}).get('data', {}).get('songExt', {}).get('commentCount')
292 | return play_info_db
293 | if __name__ == '__main__':
294 | xm = XiaMi()
295 | xm.back_search_parms(name='路在何方')
296 | # xm.test()
297 | proxy = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
298 | "host": config["proxyHost"],
299 | "port": config["proxyPort"],
300 | "user": config["proxyUser"],
301 | "pass": config["proxyPass"],
302 | }
303 | proxies = {
304 | "http": proxy,
305 | "https": proxy,
306 | }
307 | # xm.search_songs(song_name='七里香',proxy=proxies)
308 | # print(xm.get_song_details(379345))
309 | # print(xm.get_song_single_info("nnkRGy619bc").get('result', {}).get('data', {}).get('songDetail', {}).get('playCount'))
310 | # # playCount shareCount(fenxaing) favCount(xihuan)
311 | # print(xm.get_song_single_info("nnkRGy619bc").get('result', {}).get('data', {}).get('songExt', {}).get('commentCount'))
312 | # # commentCount
313 | # print(json.dumps(xm.get_song_single_info("nnkRGy619bc")))
314 | print(xm.get_play_info_db("xLDghmbd8d0")) -------------------------------------------------------------------------------- /虾米音乐spider/xiami_test_secret_parms.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*-
2 | # 享受雷霆感受雨露
3 | # author xyy,time:2020/6/25
4 | 
5 | from hashlib import md5
6 | 
7 | # 获取加密字符串_s
8 | def _get_params__s(api,_q) -> str:
9 | '''
10 | :param api: URL的地址 /api/search/searchSongs
11 | :param _q: 需要加密的参数 {"key":"在希望的田野上","pagingVO":{"page":2,"pageSize":30}}
12 | :param xm_sg_tk cookie 中 xm_sg_tk 去掉时间戳后的值 7f2df3233537f81aae848dc4f47bdeb8
13 | :return: 加密字符串
14 | '''
15 | xm_sg_tk = '7f2df3233537f81aae848dc4f47bdeb8' #
16 | data = xm_sg_tk + "_xmMain_" + api + "_" + _q
17 | # data = '7f2df3233537f81aae848dc4f47bdeb8_xmMain_/api/search/searchSongs_{"key":"在希望的田野上","pagingVO":{"page":2,"pageSize":30}}'
18 | # data = 'e2853d0e0c49aab4a44dce64fd26b4ba_xmMain_/api/search/searchSongs_{"key":"在希望的田野上","pagingVO":{"page":1,"pageSize":30}}'
19 | return md5(bytes(data, encoding="utf-8")).hexdigest()
20 | 
21 | 
22 | print(_get_params__s('/api/search/searchSongs', '{"key":"在希望的田野上","pagingVO":{"page":2,"pageSize":30}}')) -------------------------------------------------------------------------------- /虾米音乐spider/使用说明: -------------------------------------------------------------------------------- 1 | 功能实现: 2 | 实现部分接口的解析获取数据 3 | 4 | 主要脚本: 5 | xiami_audio_spider.py xiami这个类包含所有的函数API 6 | xiami_test_secret_parms.py 测试单独api 和 _q生成的加密参数 7 | 8 | 注意事项: 9 | 爬取过程中 自己添加代理,频繁访问会滑块验证,其余接口都是一个路子,接口api 以及接口参数 -------------------------------------------------------------------------------- /起点中文网详情字体加密破解/qidian_novel_info_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*-
2 | # Chance favors the prepared mind. 
3 | # author : pyl owo, 4 | # time : 2020/9/21 5 | import json 6 | import random 7 | import re 8 | 9 | from fake_useragent import UserAgent 10 | from fontTools.ttLib import TTFont 11 | from lxml import etree 12 | import requests 13 | from my_font_content import XYYTTFont 14 | 15 | # 获取代理 16 | def get_proxy(): 17 | pass 18 | 19 | # 统一请求函数 20 | def unify_requests(method="GET",url="",headers={},proxies={},data={},verify=False,cookies={}): 21 | if method=="GET": 22 | response = requests.get(url, headers=headers,proxies=proxies,data=data,cookies=cookies,timeout=5) 23 | return response 24 | else: 25 | response = requests.post(url, headers=headers,proxies=proxies,data=data,verify=verify,cookies=cookies,timeout=5) 26 | return response 27 | class SFQingNovel: 28 | def __init__(self, use_proxy=True): 29 | self.proxy = get_proxy() if use_proxy else None 30 | self.headers = { 31 | 'User-Agent': UserAgent().random, 32 | "Proxy-Tunnel": str(random.randint(1, 10000)), 33 | 'authority': 'book.qidian.com', 34 | 'cache-control': 'max-age=0', 35 | 'upgrade-insecure-requests': '1', 36 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36', 37 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 38 | 'sec-fetch-site': 'same-site', 39 | 'sec-fetch-mode': 'navigate', 40 | 'sec-fetch-user': '?1', 41 | 'sec-fetch-dest': 'document', 42 | 'referer': 'https://www.qidian.com/', 43 | 'accept-language': 'zh-CN,zh;q=0.9', 44 | # 'Cookie': 'newstatisticUUID=1600686041_884209914; _csrfToken=nXOEpjuFF7PUkwPJoOkBd7dTo2BV5jSkPu3suGGs' 45 | } 46 | # self.novel_url_pre = "https://t.shuqi.com/cover/" 47 | 48 | # 获取小说所有详细信息 49 | def get_novel_info(self, novel_url, **kwargs): 50 | respose = unify_requests(url=novel_url, headers=self.headers, proxies=self.proxy) 51 | search_result = self.parse_novel_info(respose, novel_url, **kwargs) 52 | return search_result 53 | 54 | def get_id(self, novel_url, **kwargs): 55 | return novel_url.split('/')[-1] 56 | 57 | def get_info(self, info_response, **kwargs): 58 | number_dict = { 59 | '.notdef': "薛忆阳", 60 | 'period': '.', 61 | 'zero': '0', 62 | 'one': '1', 63 | 'two': '2', 64 | 'three': '3', 65 | 'four': '4', 66 | 'five': '5', 67 | 'six': '6', 68 | 'seven': '7', 69 | 'eight': '8', 70 | 'nine': '9', 71 | } 72 | response = info_response 73 | 74 | # 拿到下载字体的网址 75 | # @font-face.*?src: url.*?src: url(.*?) format('woff'), 76 | content = re.search(re.compile(r"@font-face.*?src: url.*?src: url(.*?)format.*?,", re.S), response.text) 77 | # ('https://qidian.gtimg.com/qd_anti_spider/yMxThZoL.woff') 78 | font_url = content.groups()[0].strip("( | )").strip("'") 79 | # print(font_url) 80 | 81 | font_content = unify_requests(url=font_url, headers={ 82 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0'}, proxies=self.proxy).content 83 | 84 | # with open('qidian_lx.woff', 'wb') as f: 85 | # f.write(font_content) 86 | # 87 | # font1 = TTFont('qidian_lx.woff') 88 | # font1.saveXML('qidian_lx.xml') 89 | 90 | # 源码中提取十进制数据: 91 | 92 | data = re.findall(re.compile( 93 | r'
r'<em><span class="\w+">(.*?)</span></em>.*?<em><span class="\w+">(.*?)</span></em>.*?<em><span class="\w+">(.*?)</span></em>.*?<em><span class="\w+">(.*?)</span></em>',
94 | re.S), response.text)[0]  # NOTE: the literal HTML tags inside this pattern were lost in transcription; reconstructed assuming each encrypted number sits in an <em><span class="...">&#...;</span></em> block
95 | 
96 | four_list = []
97 | # 遍历这四组数据
98 | for d in data:
99 | # print(d) # 拿到元组中的一个 𘜸𘜹𘜹𘜶𘜽𘜽
100 | one_list = []
101 | d = d.split(';') # 去除分号
102 | # 遍历每组数据
103 | for x in d:
104 | res = x.replace('&#', '')
105 | if res: # 跳过空片段(split 会在末尾产生空字符串)
106 | # 将res转成十进制整数
107 | a = int(res) # 先转化成int类型
108 | one_list.append(a)
109 | 
110 | four_list.append(one_list)
111 | map_dict = XYYTTFont(font_content).getBestCmap()
112 | # print(map_dict)
113 | result_list = []
114 | # 遍历含有四组数据的列表
115 | for one in four_list:
116 | two_string = ""
117 | # 遍历每一组数据
118 | for a in one:
119 | # print("a",a)
120 | if a in map_dict:
121 | number = map_dict[a] # 找到对应的键
122 | number = number_dict[number] # 通过键找到对应的值
123 | # print(number)
124 | two_string += number
125 | 
126 | result_list.append(two_string)
127 | return result_list
128 | def get_int_num(self, numstr):
129 | if '.' in numstr:
130 | return int(numstr.replace('.','')) * 100  # e.g. '339.18' -> 3391800 (assumes two decimal places)
131 | else:
132 | return int(numstr)
133 | # 解析小说详情响应
134 | def parse_novel_info(self, respose_info, novel_url='', **kwargs) -> dict:
135 | try:
136 | # print(novel_url)
137 | response_data = etree.HTML(respose_info.text)
138 | info_list = self.get_info(respose_info, **kwargs)
139 | except Exception as e:
140 | print(e)
141 | return {}
142 | else:
143 | # info_book_dict = info_dict.get('book', {})
144 | novel_dict = dict()
145 | novel_dict['all_recommend_str'] = self.get_int_num(info_list[2]) # 总推荐数 str book_interact
146 | novel_dict['month_recommend_str'] = None # 月推荐数 str
147 | novel_dict['week_recommend_str'] = self.get_int_num(info_list[3]) # 周推荐数 str
148 | novel_dict['all_read_int'] = None # 总阅读数 int
149 | novel_dict['month_read_int'] = None # 月阅读数 int
150 | novel_dict['week_read_int'] = None # 周阅读数 int
151 | novel_dict['all_words_number_int'] = self.get_int_num(info_list[0]) # 总字数
152 | novel_dict['book_status_str'] = response_data.xpath('//p[@class="tag"]/span/text()')[0] # 书籍状态 (连载,完结,暂无)bookCP
153 | novel_dict['book_property_str'] = response_data.xpath('//p[@class="tag"]/span/text()')[1] # 书籍属性 (免费,会员,限免)
154 | novel_dict['author_type_str'] = "".join(response_data.xpath('//div[@class="author-photo"]/span/text()')) # 作者类型 (金牌,签约,独立 默认无)
155 | novel_dict['book_type_str'] = '|'.join(response_data.xpath('//p[@class="tag"]/a/text()')) # 书籍分类 (玄幻 ,科幻,言情...)按搜索结果来多个按|分割
156 | novel_dict['book_update_time'] = ''.join(response_data.xpath('//li[@class="update"]/div/p[@class="cf"]/em/text()')) # 书籍更新日期 年-月-日
157 | novel_dict['book_zong_zhang_jie_int'] = '' # 书籍总的章节 完结的,未完结就填目前的总章节
158 | novel_dict['book_zui_xin_zhang_jie_name_str'] = ''.join(response_data.xpath('//li[@class="update"]/div/p[@class="cf"]/a/text()')) # 最新章节名称
159 | novel_dict['book_introduce_text'] = ''.join(response_data.xpath('//div[@class="book-intro"]/p//text()')).replace(' ', '').replace('\u3000', '').replace('\r', '').replace('\n', '').replace('\t', '') # 书籍简介 text
160 | novel_dict['book_lable_str'] = '|'.join(response_data.xpath('//p[@class="tag"]/a/text()')) # 书籍标签 (用|分割的字符串 ''科幻|现实|励志'')
161 | novel_dict['book_cover_image_str'] = "https:" + "".join(response_data.xpath('//div[@class="book-information cf"]/div[@class="book-img"]/a/img/@src')).replace('\n', '') # 书籍封面 URL
162 | novel_dict['book_detail_url_str'] = novel_url # 书籍详情URL
163 | novel_dict['book_detail_id_int'] = None # 书籍详情ID 数字形式
164 | novel_dict['book_detail_id_str'] = None # 书籍详情ID 字符形式
165 | novel_dict['book_zhan_dian_str'] = None # 书籍站点 (男生,女生,暂无)
166 | novel_dict['book_publish_str'] = 
'起点中文网' # 出版社 默认侵权平台' 167 | novel_dict['book_commeds_int'] = None # 书籍评论数 Pinglunfont 168 | novel_dict['author_grade_float'] = None # 作者评分 169 | novel_dict['author_id_str'] = None # 作者ID 字符形式 ## 新增 170 | novel_dict['author_page_url_str'] = "https:" + ''.join(response_data.xpath('//a[@class="writer"]/@href')) # 作者主页链接 userId 171 | author_info_data = response_data.xpath('//ul[@class="work-state cf"]/li/em/text()') 172 | novel_dict['author_book_number_int'] = author_info_data[0] # 作者书籍总数 173 | novel_dict['author_likes_int'] = None # 作者获赞总数 174 | novel_dict['author_all_words_number_str'] = author_info_data[1] # 作者累计创作字数 175 | novel_dict['author_produce_days_str'] = author_info_data[2] # 作者累计创作天数 176 | novel_dict['author_fens_number_int'] = None # 作者粉丝数 177 | novel_dict['author_head_image_url_str'] = "https:" + "".join(response_data.xpath('//div[@class="author-photo"]/a/img/@src')) # 作者头像URL 178 | # novel_dict[''] = '' # 179 | return novel_dict 180 | 181 | 182 | # 统一的调用 search_novels 183 | search_novel_info = SFQingNovel(use_proxy=True).get_novel_info 184 | if __name__ == "__main__": 185 | result = search_novel_info('https://book.qidian.com/info/1010734492') 186 | print(result) 187 | -------------------------------------------------------------------------------- /起点中文网详情字体加密破解/字体文件解析.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 享受雷霆感受雨露 3 | # author xyy,time:2020/9/29 4 | from fontTools.ttLib import TTFont 5 | import requests 6 | # 统一请求函数 7 | def unify_requests(method="GET",url="",headers={},proxies={},data={},verify=False,cookies={}): 8 | if method=="GET": 9 | response = requests.get(url, headers=headers,proxies=proxies,data=data,cookies=cookies,timeout=5) 10 | return response 11 | else: 12 | response = requests.post(url, headers=headers,proxies=proxies,data=data,verify=verify,cookies=cookies,timeout=5) 13 | return response 14 | 15 | 16 | def zhuan_xml(): 17 | 18 | # 加载字体文件: 19 | font = TTFont('/Users/quanlifang/Desktop/my_git_file/Python_Spider_All/起点中文网详情字体加密破解/FkMwMtuL.ttf') 20 | 21 | # 转为xml文件: 22 | font.saveXML('/Users/quanlifang/Desktop/my_git_file/Python_Spider_All/起点中文网详情字体加密破解/FkMwMtuL.xml') 23 | def get_font_yingse(font_url): 24 | response = unify_requests(url=font_url) 25 | content = response.content 26 | # world = TTFont('/Users/quanlifang/Desktop/my_git_file/Python_Spider_All/起点中文网详情字体加密破解/wwOMhmLd.ttf') 27 | world = TTFont('/Users/quanlifang/Desktop/my_git_file/Python_Spider_All/起点中文网详情字体加密破解/wwOMhmLd.ttf') 28 | # 读取响应的映射关系 29 | # uni_list = world['cmap'].tables[0].ttFont.getGlyphOrder() # 'cmap' 表示汉字对应的映射 为unicode编码 30 | # print(uni_list) # 按顺序拿到各个字符的unicode编码 31 | # print(world.getGlyphOrder()) 32 | # print(world.getBestCmap()) # 获得对应的字符对应的值 33 | # {100097: 'zero', 100099: 'nine', 100100: 'eight', 100101: 'five', 100102: 'seven', 100103: 'two', 100104: 'one', 100105: 'four', 100106: 'period', 100107: 'six', 100108: 'three'} 34 | # info = world.getGlyphOrder() 35 | # dic_info = {} 36 | # for each in info: 37 | # # print(world['glyf'][each].coordinates) 38 | # # print(list(world['glyf'][each].coordinates)) # 获得一个个的元组 判断比较就好了 39 | # dic_info[each] = list(world['glyf'][each].coordinates) 40 | # print(dic_info) 41 | # {} 字典里面 对应的值对应的 list 42 | # print(dir(world)) # 按顺序拿到各个字符的unicode编码 43 | # print(dir(world["cmap"].tableTag)) # 按顺序拿到各个字符的unicode编码 44 | # print(dir(world["glyf"])) # 按顺序拿到各个字符的unicode编码 45 | # exit(0) 46 | # print(world['glyf']['one'].coordinates) 47 | # print(world['glyf']['one']) 48 | # 
print(dir(world['glyf']['one'])) 49 | # print(world.getGlyphOrder()) 50 | # print(world.tables) 51 | # print(dir(world)) 52 | # unicode_list= [eval(r"u'\u" + uni[3:] + "'") for uni in uni_list[2:]] 53 | # unicode_list= [uni.encode('utf-8').decode('unicode-escape') for uni in unicode_list] 54 | # print('unicode_list = ', unicode_list) 55 | # 56 | # font = TTFont('898a472b.woff') # 打开文件 57 | # font.saveXML('898a472b.xml') # 保存为xml文件 58 | # 59 | # # 解析xml文件 60 | # from xml.etree import ElementTree as ET 61 | # 62 | # tree = ET.parse('898a472b.xml') 63 | # root = tree.getroot() # 一个Element对象 64 | # childs = root.getchildren() 65 | # 66 | # for c in childs: 67 | # for cc in c.getchildren(): 68 | # datas = {} 69 | # datas[cc.tag] = cc.attrib 70 | # print(datas, '\n\n\n') 71 | return world.getBestCmap() 72 | if __name__ == '__main__': 73 | pass 74 | # tet() 75 | # zhuan_xml() 76 | 77 | """:cvar 78 | 79 | 映射 字典 80 | {'.notdef': [(256, 0), (256, 1280), (1280, 1280), (1280, 0), (288, 32), (1248, 32), (1248, 1248), (288, 1248)], 'period': [(186, 0), (186, 205), (391, 205), (391, 0)], 'zero': [(85, 723), (85, 983), (192, 1300), (403, 1472), (563, 1472), (681, 1472), (859, 1377), (975, 1198), (1041, 941), (1041, 723), (1041, 465), (935, 148), (724, -25), (563, -25), (351, -25), (230, 127), (85, 310), (270, 723), (270, 362), (439, 123), (563, 123), (687, 123), (856, 363), (856, 723), (856, 1085), (687, 1323), (561, 1323), (437, 1323), (363, 1218), (270, 1084)], 'one': [(763, 0), (583, 0), (583, 1147), (518, 1085), (307, 961), (223, 930), (223, 1104), (374, 1175), (600, 1377), (647, 1472), (763, 1472)], 'two': [(1031, 173), (1031, 0), (62, 0), (60, 65), (83, 125), (120, 224), (283, 416), (437, 542), (676, 738), (844, 967), (844, 1069), (844, 1176), (691, 1323), (568, 1323), (438, 1323), (282, 1167), (281, 1029), (96, 1048), (115, 1255), (363, 1472), (572, 1472), (783, 1472), (1029, 1238), (1029, 1065), (1029, 977), (957, 807), (790, 619), (596, 455), (434, 319), (342, 222), (312, 173)], 'three': [(86, 387), (266, 411), (297, 258), (446, 123), (553, 123), (680, 123), (855, 299), (855, 429), (855, 553), (693, 714), (568, 714), (517, 714), (441, 694), (461, 852), (479, 850), (490, 850), (605, 850), (789, 970), (789, 1095), (789, 1194), (655, 1324), (549, 1324), (444, 1324), (304, 1192), (284, 1060), (104, 1092), (137, 1273), (371, 1472), (545, 1472), (665, 1472), (867, 1369), (974, 1191), (974, 1091), (974, 996), (872, 840), (772, 794), (902, 764), (1046, 575), (1046, 433), (1046, 241), (766, -26), (552, -26), (359, -26), (104, 204)], 'four': [(662, 0), (662, 351), (26, 351), (26, 516), (695, 1466), (842, 1466), (842, 516), (1040, 516), (1040, 351), (842, 351), (842, 0), (662, 516), (662, 1177), (203, 516)], 'five': [(85, 384), (274, 400), (295, 262), (448, 123), (556, 123), (686, 123), (866, 319), (866, 481), (866, 635), (693, 813), (553, 813), (466, 813), (326, 734), (286, 671), (117, 693), (259, 1446), (988, 1446), (988, 1274), (403, 1274), (324, 880), (456, 972), (601, 972), (793, 972), (1057, 706), (1057, 497), (1057, 298), (941, 153), (800, -25), (556, -25), (356, -25), (103, 199)], 'six': [(1019, 1107), (840, 1093), (816, 1199), (772, 1247), (699, 1324), (592, 1324), (506, 1324), (441, 1276), (356, 1214), (258, 976), (256, 756), (321, 855), (509, 951), (612, 951), (792, 951), (1045, 686), (1045, 476), (1045, 338), (926, 101), (718, -25), (586, -25), (361, -25), (77, 306), (77, 686), (77, 1111), (234, 1304), (371, 1472), (603, 1472), (776, 1472), (997, 1278), (284, 475), (284, 382), (363, 212), (505, 
123), (583, 123), (697, 123), (861, 307), (861, 465), (861, 617), (699, 792), (576, 792), (454, 792), (284, 617)], 'seven': [(97, 1274), (97, 1447), (1046, 1447), (1046, 1307), (906, 1158), (631, 664), (556, 403), (502, 219), (487, 0), (302, 0), (305, 173), (435, 663), (678, 1118), (815, 1274)], 'eight': [(362, 795), (250, 836), (142, 988), (142, 1094), (142, 1254), (372, 1472), (563, 1472), (755, 1472), (989, 1249), (989, 1089), (989, 987), (882, 836), (773, 795), (908, 751), (1049, 555), (1049, 419), (1049, 231), (783, -25), (566, -25), (349, -25), (83, 232), (83, 424), (83, 567), (228, 760), (326, 1100), (326, 996), (460, 864), (567, 864), (671, 864), (804, 995), (804, 1090), (804, 1189), (667, 1324), (565, 1324), (462, 1324), (326, 1192), (268, 423), (268, 346), (341, 202), (485, 123), (568, 123), (697, 123), (865, 289), (865, 417), (865, 547), (692, 717), (562, 717), (435, 717), (268, 549)], 'nine': [(112, 339), (285, 355), (307, 233), (431, 123), (528, 123), (611, 123), (736, 199), (816, 326), (870, 542), (870, 654), (870, 666), (869, 690), (815, 604), (628, 497), (519, 497), (337, 497), (85, 761), (85, 977), (85, 1200), (348, 1472), (546, 1472), (689, 1472), (926, 1318), (1049, 1033), (1049, 763), (1049, 482), (927, 149), (686, -25), (524, -25), (352, -25), (134, 166), (849, 986), (849, 1141), (684, 1323), (568, 1323), (448, 1323), (270, 1127), (270, 971), (270, 831), (439, 656), (563, 656), (688, 656), (849, 831)]} 81 | 82 | 83 | 84 | """ 85 | 86 | -------------------------------------------------------------------------------- /起点中文网详情字体加密破解/字体解密记录: -------------------------------------------------------------------------------- 1 | # 使用手册 2 | 执行 qidian_novel_info_spider.py 3 | 更换url 即可 4 | 5 | wwOMhmLd 6 | ['.notdef', 'period', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'] 7 | GlyphCoordinates([(763, 0),(583, 0),(583, 1147),(518, 1085),(307, 961),(223, 930),(223, 1104),(374, 1175),(600, 1377),(647, 1472),(763, 1472)]) 8 | 9 | FkMwMtuL 10 | ['.notdef', 'period', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'] 11 | GlyphCoordinates([(763, 0),(583, 0),(583, 1147),(518, 1085),(307, 961),(223, 930),(223, 1104),(374, 1175),(600, 1377),(647, 1472),(763, 1472)]) 12 | 13 | 两次结果 发现没有变化 ,但是以防万一,还是进行一个校验 增加一个值的误差值 14 | 15 | 16 | 第二次升级, 17 | 每一次 都是下载文件直接比对,这样多进程的时候是会容易出现问题的,同一个文件多次改动 18 | 19 | 升级 ttfont 20 | 改源代码 21 | 新增 XYYTTFont 函数 (改写了一些 原函数 TTFont)之前是file 地址,现在直接传入文件的 content 流 22 | 23 | 24 | 字体加密解决思路 网上一堆,就是字体文件的加密解密 25 | 两个点: 26 | world = TTFont('wwOMhmLd.ttf') 27 | # print(world.getGlyphOrder()) # 字典的value 字体文件有哪些值 对应的值 28 | ['.notdef', 'period', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'] 29 | 30 | # print(world.getBestCmap()) # 获得对应的字符对应的值 31 | {100181: 'two', 100183: 'zero', 100184: 'three', 100185: 'eight', 100186: 'seven', 100187: 'six', 100188: 'period', 100189: 'four', 100190: 'five', 100191: 'nine', 100192: 'one'} 32 | 33 | 34 | 35 | --------------------------------------------------------------------------------
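A minimal sketch of the tolerance check described in 字体解密记录 above: compare the same glyph's coordinates across two downloaded fonts, allowing a small per-point error margin, and load fonts straight from response bytes instead of writing them to disk (the XYYTTFont idea). The helper names here are illustrative, not part of the repo:

from io import BytesIO
from fontTools.ttLib import TTFont

def load_font_from_bytes(content: bytes) -> TTFont:
    # TTFont also accepts a file-like object, so no temp file is needed
    return TTFont(BytesIO(content))

def same_glyph(coords_a, coords_b, tol=2) -> bool:
    # point-by-point comparison with a tolerance of `tol` font units
    if len(coords_a) != len(coords_b):
        return False
    return all(abs(xa - xb) <= tol and abs(ya - yb) <= tol
               for (xa, ya), (xb, yb) in zip(coords_a, coords_b))

# e.g. same_glyph(font_a['glyf']['one'].coordinates, font_b['glyf']['one'].coordinates)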