├── .gitignore ├── LICENSE ├── README.md ├── extract.py ├── extractor ├── README.md ├── __init__.py ├── acfun.py ├── baidutieba.py ├── bilibili.py ├── changba.py ├── changya.py ├── douyin.py ├── haokan.py ├── ku6.py ├── kuaishou.py ├── kugou.py ├── kuwo.py ├── lequ.py ├── lizhiFM.py ├── lofter.py ├── migu_music.py ├── momo.py ├── music163 │ ├── __init__.py │ ├── encrypt.py │ └── music163.py ├── open163.py ├── pearvideo.py ├── peiyinxiu.py ├── pic58.py ├── pipigaoxiao.py ├── pipix.py ├── qianqian.py ├── qingshipin.py ├── qmgx.py ├── qqmusic.py ├── quanminkge.py ├── quanminxsp.py ├── qutoutiao.py ├── sing5.py ├── sohuTV.py ├── ted.py ├── tuchong.py ├── tudou.py ├── wechat_article_cover.py ├── weibo.py ├── weishi.py ├── xiaokaxiu.py ├── xinpianchang.py ├── zhihu_video.py ├── zuiyou_video.py └── zuiyou_voice.py ├── misc.py ├── requirements.txt ├── screenshot ├── example.gif └── run.gif ├── utils.py └── web ├── README.md ├── __init__.py ├── _response.py ├── app.py ├── config.py ├── error.py ├── example.env ├── funcs.py ├── log.py └── views.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .vim/coc-settings.json 131 | download/ 132 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 wongxy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Status update
 2 | 
 3 | This is an old project that has gone unmaintained for a long time; the code quality and style leave much to be desired, though some of the crawlers still work. The current plan is to build a simple link-extraction API service on the FastAPI framework. It will remain simple and rough, but it is good enough for learning or day-to-day use.
 4 | 
 5 | Just switch to the [fastapi branch](https://github.com/xiyaowong/spiders/tree/fastapi)
 6 | 
 7 | ---
 8 | 
 9 | - #### These are all fairly simple crawlers; an experienced reader should understand them at a glance. For beginners, some parts are still worth a look.
10 | 
11 | - #### Details on the crawler files are here: [extractor](/extractor)
12 | 
13 | ---
14 | 
15 | ```shell
16 | pip3 install -r requirements.txt
17 | python3 extract.py
18 | ```
19 | 
20 | You may also need to install Node.js
21 | 
22 | - #### screenshot
23 | 
24 | ![example.gif](https://cdn.jsdelivr.net/gh/xiyaowong/spiders/screenshot/run.gif)
25 | 
26 | - #### release
27 | 
28 | - #### **star**:star: & **fork** are welcome
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | from datetime import datetime
 4 | from queue import Queue
 5 | from threading import Thread
 6 | 
 7 | import utils
 8 | from extractor import (acfun, baidutieba, bilibili, changya, douyin, haokan,
 9 |                        ku6, kuaishou, kugou, kuwo, lizhiFM, lofter, migu_music,
10 |                        momo, music163, open163, pearvideo, pic58, pipigaoxiao,
11 |                        pipix, qianqian, qingshipin, qqmusic, quanminkge,
12 |                        qutoutiao, sing5, sohuTV, ted, tuchong, tudou, weibo,
13 |                        weishi, xiaokaxiu, xinpianchang, zhihu_video,
14 |                        zuiyou_voice)
15 | from misc import printTips
16 | 
17 | here = os.path.abspath(os.path.dirname(__file__))
18 | 
19 | crawlers = {
20 |     'acfun': acfun,
21 |     'tieba': baidutieba,
22 |     'bili': bilibili,
23 |     'changya': changya,
24 |     'douyin': douyin,
25 |     'haokan': haokan,
26 |     'ku6': ku6,
27 |     'chenzhongtech': kuaishou,
28 |     'kuaishou': kuaishou,
29 |     'kugou': kugou,
30 |     'kuwo': kuwo,
31 |     'lizhi': lizhiFM,
32 |     'lofter': lofter,
33 |     'music.163': music163,
34 |     'open.163': open163,
35 |     'pearvideo': pearvideo,
36 |     'ippzone': pipigaoxiao,
37 |     'pipix': pipix,
38 |     'music.taihe': qianqian,
39 |     'qingshipin': qingshipin,
40 |     'y.qq': qqmusic,
41 |     'kg': quanminkge,
42 |     'qutoutiao': qutoutiao,
43 |     '5sing': sing5,
44 |     'weibo': weibo,
45 |     'weishi': weishi,
46 |     'xiaokaxiu': xiaokaxiu,
47 |     'xinpianchang': xinpianchang,
48 |     'zhihu': zhihu_video,
49 |     'zuiyou': zuiyou_voice,
50 |     'sohu': sohuTV,
51 |     'ted': ted,
52 |     'tudou': tudou,
53 |     'momo': momo,
54 |     'music.migu': migu_music,
55 |     '58pic': pic58,
56 |     'tuchong': tuchong
57 | }
58 | 
59 | 
60 | class Task:
61 |     def __init__(self, url, save_path='', file_name=None, file_type='unknown'):
62 |         self.url = url
63 |         self.save_path = save_path
64 |         self.file_name = file_name or str(datetime.now())
65 |         self.file_type = file_type
66 | 
67 | 
68 | def data2tasks(data: dict) -> list:
69 |     title = data.get("title")
70 |     author = data.get("author")
71 |     audioName = data.get("audioName")
72 |     videoName = data.get("videoName")
73 |     imgs = data.get("imgs")
74 |     audios = data.get("audios")
75 |     videos = data.get("videos")
76 |     text = data.get("text")
77 |     msg = data.get("msg")
78 | 
79 |     if msg:
80 |         print(msg)
81 |         print()
82 |     if text:
83 |         print(text)
84 |         print()
85 |     tasks = []
86 |     if imgs:
87 |         img_tasks = [Task(img, 'download/images', file_type='jpg') for img in imgs]
88 |         tasks.extend(img_tasks)
89 |     if audios:
90 |         file_name = (audioName or "") + "-" + (author or "")
91 |         audio_tasks = [Task(audio, 'download/audios', file_name=file_name, file_type='mp3') for audio in audios]
92
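        # Aside: utils.py is not shown in this section. A hedged sketch of the
        # interface extract.py relies on (assumed shape inferred from the calls
        # below, not the actual file):
        #
        #     def retry(times):
        #         """Decorator: re-run the wrapped function on exception,
        #         up to `times` extra attempts."""
        #
        #     def download(file_url, save_path, file_name, file_type):
        #         """Stream file_url into <save_path>/<file_name>.<file_type>."""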
| tasks.extend(audio_tasks) 93 | if videos: 94 | file_name = (videoName or title or "") 95 | video_tasks = [Task(video, 'download/videos', file_name=file_name, file_type='mp4') for video in videos] 96 | tasks.extend(video_tasks) 97 | return tasks 98 | 99 | 100 | @utils.retry(2) 101 | def dl(dl_queue: Queue): 102 | while not dl_queue.empty(): 103 | task = dl_queue.get() # type: Task 104 | utils.download(file_url=task.url, 105 | save_path=task.save_path, 106 | file_name=task.file_name, 107 | file_type=task.file_type) 108 | 109 | 110 | def get_data(url): 111 | for c_name, c_func in crawlers.items(): 112 | if c_name in url: 113 | data = c_func.get(url) 114 | print(data) 115 | return data 116 | print(f'链接【\033[31m{url}\033[0m】不支持') 117 | return None 118 | 119 | 120 | @utils.retry(2) 121 | def parse_urls(text: str) -> list: 122 | urls = re.findall( 123 | r"https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\.[-A-Za-z]+[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", text) 124 | return urls 125 | 126 | 127 | if __name__ == "__main__": 128 | printTips() 129 | while True: 130 | what = input("输入链接http开头(输入任意不包含链接的内容就能退出):") 131 | urls = parse_urls(what) 132 | if not urls: 133 | print("bye~") 134 | break 135 | print(f""" 136 | ╭━━━━━━━━━━━━━╮ 137 | │ 一共{len(urls)}个链接 │ 138 | ╰━━━━━━━━━━━━━╯ 139 | """) 140 | all_task = [] 141 | for idx, url in enumerate(urls): 142 | print(f"正在解析第{idx+1}个链接【{url}】") 143 | data = get_data(url) 144 | if data: 145 | all_task.extend(data2tasks(data)) 146 | 147 | queue = Queue(maxsize=100) 148 | for t in all_task: 149 | queue.put(t) 150 | 151 | print() 152 | print(f'{len(all_task)} tasks!') 153 | print() 154 | ts = [Thread(target=dl, args=(queue, )) for _ in range(min(len(all_task), 6))] 155 | for t in ts: 156 | t.start() 157 | 158 | for t in ts: 159 | t.join() 160 | -------------------------------------------------------------------------------- /extractor/README.md: -------------------------------------------------------------------------------- 1 | ### 这里是一些小爬虫集合 2 | 3 | --- 4 | 5 | 每个平台对应一个文件,每个文件里面有一个`get(url: str)`函数 统一形式如下(里面使用 f-string 需要 python3.6+): 6 | 7 | ```python 8 | """ 9 | Args: 10 | url: str 11 | Returns: 12 | data: dict 13 | |_ { 14 | title: str, 15 | author: str, 16 | audioName: str, 17 | videoName: str, 18 | imgs: List[str], 19 | audios: List[str], 20 | videos: List[str], 21 | text: str, 22 | msg: str 23 | } 24 | Tips: 25 | data里面的各个字段只有当爬取到相关内容时才会存在,除了msg(不过这个没啥大用) 26 | ☆ 爬取未成功也会返回data,而且不一定为空 27 | """ 28 | ``` 29 | 30 | # 默认输入的链接都正确:grin: 31 | 32 | --- 33 | 34 | | 平台 | 资源内容 | 完成状态 | 35 | | :--------------------: | :------------------: | :----------------: | 36 | | bilibili(哔哩哔哩) | 封面、视频 | :white_check_mark: | 37 | | changya(唱鸭) | 音频 | :white_check_mark: | 38 | | douyin(抖音) | 无水印视频 | :white_check_mark: | 39 | | kugou(酷狗) | 音频 | :white_check_mark: | 40 | | kuwo(酷我) | 音频 | :white_check_mark: | 41 | | lizhiFM(荔枝 FM) | 音频 | :white_check_mark: | 42 | | music163(网易云音乐) | 音频、视频、mv | :white_check_mark: | 43 | | qqmusic(QQ 音乐) | 音频 | :white_check_mark: | 44 | | pipigaoxiao(皮皮搞笑) | 无水印视频 | :white_check_mark: | 45 | | quanminkge(全民 K 歌) | 音频或视频 | :white_check_mark: | 46 | | weibo(微博) | 视频 | :white_check_mark: | 47 | | weishi(微视) | 无水印视频 | :white_check_mark: | 48 | | zhihu(知乎) | 视频 | :white_check_mark: | 49 | | zuiyou_voice(最右) | 音频(语音帖评论) | :white_check_mark: | 50 | | zuiyou_video(最右) | 视频 | :white_check_mark: | 51 | | qianqian(千千音乐) | 音频 | :white_check_mark: | 52 | | 5sing(5sing) | 音频 | :white_check_mark: | 53 | | pipix(皮皮虾) | 无水印视频 | 
:white_check_mark: | 54 | | qingshipin(轻视频) | 无水印视频 | :white_check_mark: | 55 | | qutoutiao(趣头条) | 视频 | :dash: | 56 | | ku6(酷 6 网) | 视频 | :white_check_mark: | 57 | | lofter(乐乎) | 视频 | :white_check_mark: | 58 | | open163(网易公开课) | 免费视频 | :white_check_mark: | 59 | | xinpianchang(新片场) | 视频 | :white_check_mark: | 60 | | baidutieba(百度贴吧) | 视频 | :white_check_mark: | 61 | | kuaishou(快手) | 无水印视频、长图视频 | :white_check_mark: | 62 | | acfun(AcFun 弹幕网) | 视频 | :white_check_mark: | 63 | | haokan(百度好看视频) | 视频 | :white_check_mark: | 64 | | pearvideo(梨视频) | 视频 | :white_check_mark: | 65 | | xiaokaxiu(小咖秀) | 无水印视频 | :white_check_mark: | 66 | | sohuTV(搜狐视频) | 视频 | :white_check_mark: | 67 | | ted(TED) | 视频 | :white_check_mark: | 68 | | tudou(土豆视频) | 视频 | :white_check_mark: | 69 | | quanminxsp(全民小视频) | 视频 | :white_check_mark: | 70 | | lequ(乐趣) | 背景动图、音频 | :white_check_mark: | 71 | | peiyinxiu(配音秀) | 视频 | :white_check_mark: | 72 | | tuchong(图虫) | 图片 | :white_check_mark: | 73 | | changba(唱吧) | 视频 | :white_check_mark: | 74 | | migu(咪咕音乐) | 音频 | :white_check_mark: | 75 | | momo(陌陌) | 视频 | :white_check_mark: | 76 | | 58pic(千图网) | 图片 | :white_check_mark: | 77 | | qmgx(全民搞笑) | 无水印视频 | :white_check_mark: | 78 | -------------------------------------------------------------------------------- /extractor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "wongxy github:xiyaowong" 2 | __all__ = [ 3 | "bilibili", 4 | "changya", 5 | "douyin", 6 | "kugou", 7 | "kuwo", 8 | "lizhiFM", 9 | "music163", 10 | "pipigaoxiao", 11 | "quanminkge", 12 | "weibo", 13 | "zhihu_video", 14 | "weishi", 15 | "zuiyou_voice", 16 | "zuiyou_video", 17 | "qqmusic", 18 | "qianqian", 19 | "sing5", 20 | "pipix", 21 | "qingshipin", 22 | "qutoutiao", 23 | "ku6", 24 | "lofter", 25 | "open163", 26 | "xinpianchang", 27 | "baidutieba", 28 | "kuaishou", 29 | "acfun", 30 | "haokan", 31 | "pearvideo", 32 | "xiaokaxiu", 33 | "sohuTV", 34 | "ted", 35 | "tudou", 36 | "quanminxsp", 37 | "lequ", 38 | "peiyinxiu", 39 | "tuchong", 40 | "changba", 41 | "migu_music", 42 | "momo", 43 | "pic58", 44 | "qmgx" 45 | ] 46 | -------------------------------------------------------------------------------- /extractor/acfun.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str) -> dict: 7 | """ 8 | title、videos 9 | """ 10 | data = {} 11 | headers = { 12 | "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25" 13 | } 14 | info_url = "https://api-new.acfunchina.com/rest/app/play/playInfo/mp4?videoId={}&resourceId={}&resourceType=2&mkey=AAHewK3eIAAyMjAzNTI2NDMAAhAAMEP1uwS3Vi7NYAAAAJumF4MyTTFh5HGoyjW6ZpdjKymALUy9jZbsMTBVx-F10EhxyvpMtGQbBCYipvkMShM3iMNwbMd9DM6r2rnOYRVEdr6MaJS4yxxlA_Sl3JNWup57qBCQzOSC7SZnbEsHTQ%3D%3D&market=xiaomi&product=ACFUN_APP&sys_version=10&app_version=6.20.0.915&boardPlatform=sdm845&sys_name=android&socName=UNKNOWN&appMode=0" 15 | # info_url = "https://m.acfun.cn/rest/mobile-direct/play/playInfo/singleQuality?videoId={}&resourceId={}&resourceType=2&mkey=AAHewK3eIAAyMjA5NTQ0MDACARAAMEP1uwPvjQhfQAAAAIAq7FtjRH%2Fn9rSMzs1AUNhmIS6eARtddADGgoGewjnABMg39tddqp9dTUq%2Ffd7MBisH5JpVc1bpf64a%2Bz3qrdI%3D" 16 | 17 | # get videoId, resourceIds 18 | re_title = r'(.*?)' 19 | re_videoId = r'"vid":"(\d+)",' 20 | re_resourceId = r'"ac":"(\d+)",' 21 | 22 | try: 23 | rep_html = requests.get(url, headers=headers, timeout=10) 24 | 25 | 
title = re.findall(re_title, rep_html.text)[0] 26 | videoId = re.findall(re_videoId, rep_html.text)[0] 27 | resourceId = re.findall(re_resourceId, rep_html.text)[0] 28 | 29 | rep_info = requests.get(info_url.format(videoId, resourceId), headers=headers, timeout=10) 30 | 31 | video = rep_info.json()["playInfo"]["streams"][0]["playUrls"][0] 32 | except (IndexError, TypeError): 33 | data["msg"] = "获取失败" 34 | else: 35 | data["title"] = title 36 | data["videos"] = [video] 37 | 38 | return data 39 | 40 | 41 | if __name__ == "__main__": 42 | url = "https://m.acfun.cn/v/?ac=14134176&part=2" 43 | print(get(url)) 44 | -------------------------------------------------------------------------------- /extractor/baidutieba.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str) -> dict: 7 | """ 8 | videos 9 | """ 10 | data = {} 11 | headers = { 12 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", 13 | } 14 | rep = requests.get(url, headers=headers, timeout=10) 15 | if rep.status_code == 200: 16 | data["videos"] = re.findall(r'data-video="(.*?)"', rep.text) 17 | else: 18 | data["msg"] = "获取失败" 19 | 20 | return data 21 | 22 | 23 | if __name__ == "__main__": 24 | # url = "https://tieba.baidu.com/p/6098286801?share=9105&fr=share&sfc=copy&client_type=2&client_version=11.3.8.2&st=1585294971&unique=190E4CEC3908756B412C7ABAE54C772F&red_tag=2618234446" 25 | url = input("url: ") 26 | print(get(url)) 27 | -------------------------------------------------------------------------------- /extractor/bilibili.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str) -> dict: 7 | """ 8 | imgs、videos 9 | """ 10 | data = {} 11 | headers = { 12 | "user-agent": 13 | "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1", 14 | "Referer": "https://www.bilibili.com/", 15 | } 16 | 17 | av_number_pattern = r'(BV[0-9a-zA-Z]*)' 18 | cover_pattern = r"readyPoster: '(.*?)'," 19 | video_pattern = r"readyVideoUrl: '(.*?)'," 20 | title_pattern = r'title":"(.*?)",' 21 | 22 | av = re.findall(av_number_pattern, url) 23 | if av: 24 | av = av[0] 25 | else: 26 | data["msg"] = "链接可能不正确,因为我无法匹配到av号" 27 | return data 28 | url = f"https://www.bilibili.com/video/{av}" 29 | 30 | with requests.get(url, headers=headers, timeout=10) as rep: 31 | if rep.status_code == 200: 32 | cover_url = re.findall(cover_pattern, rep.text) 33 | if cover_url: 34 | cover_url = cover_url[0] 35 | if '@' in cover_url: 36 | cover_url = cover_url[:cover_url.index('@')] 37 | data["imgs"] = ['https:' + cover_url] 38 | 39 | video_url = re.findall(video_pattern, rep.text) 40 | title_text = re.findall(title_pattern, rep.text) 41 | if video_url: 42 | video_url = video_url[0] 43 | data["videos"] = [video_url] 44 | if title_text: 45 | data["videoName"] = title_text[0] 46 | else: 47 | data["msg"] = "获取失败" 48 | return data 49 | 50 | 51 | if __name__ == "__main__": 52 | print(get(input("url: "))) 53 | -------------------------------------------------------------------------------- /extractor/changba.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import execjs 4 | import requests 5 | 6 | js_code = """l=new 
Array(-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1);function u(t){var e,o,n,a,i,r,s;for(r=t.length,i=0,s="";i>4);do{if(61==(n=255&t.charCodeAt(i++)))return s;n=l[n]}while(i>2);do{if(61==(a=255&t.charCodeAt(i++)))return s;a=l[a]}while(i dict: 11 | """ 12 | videos 13 | """ 14 | data = {} 15 | headers = { 16 | "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36" 17 | } 18 | rep = requests.get(url, headers=headers, timeout=10) 19 | if rep.status_code != 200: 20 | return {"msg": "获取失败"} 21 | 22 | enc_video_url = re.findall(r"video_url: '(.*?)',", rep.text)[0] 23 | video_url = "https:" + js.call("u", (enc_video_url,)) 24 | data["videos"] = [video_url] 25 | return data 26 | 27 | 28 | if __name__ == "__main__": 29 | print(get(input("url: "))) 30 | -------------------------------------------------------------------------------- /extractor/changya.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str) -> dict: 7 | """ 8 | author、audioName、audios 9 | """ 10 | data = {} 11 | headers = { 12 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", 13 | } 14 | audio_url_pattern = r'', html) 24 | data["author"] = author 25 | data["audioName"] = audioName 26 | data["imgs"] = imgs 27 | data["audios"] = audios 28 | except Exception: 29 | data["msg"] = {"msg": "获取失败"} 30 | 31 | return data 32 | 33 | 34 | if __name__ == "__main__": 35 | url = "https://api.bestdjb.com/promote/song-share/6477f04370cc22e7d9c2d3ac4265a92a?app_version=1.4.3" 36 | print(get(url)) 37 | -------------------------------------------------------------------------------- /extractor/lizhiFM.py: -------------------------------------------------------------------------------- 1 | # from urllib.parse import urlparse 2 | import re 3 | 4 | import requests 5 | 6 | 7 | def get(url: str) -> dict: 8 | """ 9 | author、audioName、audios 10 | """ 11 | data = {} 12 | headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"} 13 | info_url = "https://m.lizhi.fm/vodapi/voice/info/{id}" 14 | 15 | # path = urlparse(url).path 16 | # voiceId = path.split("/")[-1] 17 | voiceId = re.findall(r"/(\d{1,})", url) 18 | if not voiceId: 19 | data["msg"] = "链接无效,解析未成功" 20 | return data 21 | else: 22 | voiceId = voiceId[-1] 23 | 24 | with requests.get(info_url.format(id=voiceId), headers=headers, timeout=10) as rep: 25 | if rep.status_code == 200 and rep.json().get("code") == 0: 26 | info = rep.json() 27 | userName = info.get("data").get("userVoice").get("userInfo").get("name") 28 | voiceName = info.get("data").get("userVoice").get("voiceInfo").get("name") 29 | voiceUrl= info.get("data").get("userVoice").get("voicePlayProperty").get("trackUrl") 30 | data["author"] = userName 31 | data["audioName"] = voiceName 32 | data["audios"] = [voiceUrl] 33 | else: 34 | data["msg"] = "未能解析成功" 35 | 36 | return data 37 | 38 | 39 | 40 | 41 | if __name__ == "__main__": 42 | url = input("url: ") 43 | print(get(url)) 
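
# Aside on extractor/changba.py above: the lookup table `l` in its js_code is
# the standard Base64 alphabet (43 -> 62 for '+', 47 -> 63 for '/',
# 48-57 -> 52-61 for digits, 65-90 -> 0-25 for 'A'-'Z', 97-122 -> 26-51 for
# 'a'-'z'), and u(t) is a plain Base64 decoder. If that reading is right, the
# execjs/Node.js dependency could be replaced with a sketch like this:

import base64

def decode_video_url(enc: str) -> str:
    """Pure-Python equivalent of changba's js.call("u", (enc,))."""
    padded = enc + "=" * (-len(enc) % 4)  # restore padding if it was stripped
    return base64.b64decode(padded).decode("utf-8")

# e.g. video_url = "https:" + decode_video_url(enc_video_url)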
-------------------------------------------------------------------------------- /extractor/lofter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str) -> dict: 7 | """ 8 | videos 9 | """ 10 | data = {} 11 | rep = requests.get(url, timeout=10) 12 | if rep.status_code == 200: 13 | data["videos"] = re.findall(r' 1 else singerName[0] 31 | 32 | # audioName 33 | audioName = json["songName"] 34 | 35 | # contentId 36 | c_item = json.get("qq") # type:dict 37 | 38 | if not c_item: 39 | return {"msg": "获取失败"} 40 | contentId = c_item["productId"] 41 | 42 | # toneFlag 43 | toneFlag = "HQ" if json["hasHQqq"] == "1" else "LQ" 44 | 45 | video_url = player_url.format(copyrightId=copyrightId, 46 | contentId=contentId, 47 | toneFlag=toneFlag, 48 | resourceType=2) 49 | 50 | data["author"] = author 51 | data["audioName"] = audioName 52 | data["videos"] = [video_url] 53 | 54 | return data 55 | 56 | 57 | if __name__ == "__main__": 58 | url = "http://music.migu.cn/v3/music/song/69910422841" 59 | print(get(url)) 60 | -------------------------------------------------------------------------------- /extractor/momo.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str): 7 | """ 8 | title、imgs、videos 9 | """ 10 | data = {} 11 | headers = { 12 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36" 13 | } 14 | api = "https://m.immomo.com/inc/microvideo/share/profiles" 15 | 16 | ar = re.findall(r'/(ar.*?)\.html', url) 17 | if not ar: 18 | return {"msg": "失败"} 19 | ar = ar[0] 20 | 21 | payload = { 22 | "feedids": ar, 23 | "name": "", 24 | "avatar": "", 25 | } 26 | 27 | rep = requests.post(api, data=payload, headers=headers, timeout=6) 28 | if rep.status_code == 200 and rep.json()["ec"] == 200: 29 | info = rep.json()["data"] 30 | title = info["list"][0]["content"] 31 | img = info["list"][0]["video"]["cover"]["l"] 32 | video = info["list"][0]["video"]["video_url"] 33 | 34 | data["title"] = data["videoName"] = title 35 | data["imgs"] = [img] 36 | data["videos"] = [video] 37 | else: 38 | data["msg"] = "失败" 39 | 40 | return data 41 | 42 | 43 | if __name__ == "__main__": 44 | from pprint import pprint 45 | url = "https://m.immomo.com/s/moment/new-share-v2/ar8422649104.html" 46 | pprint(get(url)) 47 | -------------------------------------------------------------------------------- /extractor/music163/__init__.py: -------------------------------------------------------------------------------- 1 | from .music163 import Wangyiyun 2 | 3 | 4 | def get(url: str) -> dict: 5 | """ 6 | aduios或者videos 7 | """ 8 | data = {} 9 | wangyiyun = Wangyiyun() 10 | resource_url = wangyiyun.get(url) 11 | if not resource_url: 12 | return {"msg": "获取失败"} 13 | if "mv" in url or "video" in url: 14 | data["videos"] = [resource_url] 15 | elif "song" in url: 16 | data["audios"] = [resource_url] 17 | return data 18 | 19 | 20 | __all__ = ["get"] 21 | -------------------------------------------------------------------------------- /extractor/music163/encrypt.py: -------------------------------------------------------------------------------- 1 | # 原理:https://www.zhihu.com/question/36081767 代码块直接copy的:https://github.com/CharlesPikachu/Music-Downloader 2 | import base64 3 | import codecs 4 | import json 5 | import os 6 | 7 | from Crypto.Cipher import AES 8 | 9 | 10 | class Cracker(): 11 | modulus 
= '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7' 12 | nonce = '0CoJUm6Qyw8W8jud' 13 | pubKey = '010001' 14 | 15 | @classmethod 16 | def get(cls, text): 17 | text = json.dumps(text) 18 | secKey = cls._createSecretKey(16) 19 | encText = cls._aesEncrypt(cls._aesEncrypt(text, cls.nonce), secKey) 20 | encSecKey = cls._rsaEncrypt(secKey, cls.pubKey, cls.modulus) 21 | post_data = {'params': encText, 'encSecKey': encSecKey} 22 | return post_data 23 | 24 | @classmethod 25 | def _aesEncrypt(cls, text, secKey): 26 | pad = 16 - len(text) % 16 27 | if isinstance(text, bytes): 28 | text = text.decode('utf-8') 29 | text = text + str(pad * chr(pad)) 30 | secKey = secKey.encode('utf-8') 31 | encryptor = AES.new(secKey, 2, b'0102030405060708') 32 | text = text.encode('utf-8') 33 | ciphertext = encryptor.encrypt(text) 34 | ciphertext = base64.b64encode(ciphertext) 35 | return ciphertext 36 | 37 | @classmethod 38 | def _rsaEncrypt(cls, text, pubKey, modulus): 39 | text = text[::-1] 40 | rs = int(codecs.encode(text.encode('utf-8'), 'hex_codec'), 16)**int(pubKey, 16) % int(modulus, 16) 41 | return format(rs, 'x').zfill(256) 42 | 43 | @classmethod 44 | def _createSecretKey(cls, size): 45 | return (''.join(map(lambda xx: (hex(ord(xx))[2:]), str(os.urandom(size)))))[0:16] 46 | 47 | 48 | if __name__ == "__main__": 49 | print(Cracker.get("Hello World")) 50 | -------------------------------------------------------------------------------- /extractor/music163/music163.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import unquote 3 | 4 | import requests 5 | 6 | from .encrypt import Cracker 7 | 8 | 9 | class Wangyiyun(): 10 | def __init__(self): 11 | self.headers = { 12 | 'Referer': 'https://music.163.com/', 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.32 Safari/537.36' 14 | } 15 | self.music_url = 'http://music.163.com/weapi/song/enhance/player/url?csrf_token=' 16 | self.mv_url = "https://music.163.com/weapi/song/enhance/play/mv/url?csrf_token=" 17 | 18 | def get(self, url): 19 | """ 20 | 返回资源链接 21 | """ 22 | 23 | if "video" in url: 24 | return self.get_video(url) 25 | 26 | id = self.get_id(url) 27 | if "mv" in url: 28 | params = {"id": id, "r": "1080", "csrf_token": ""} 29 | data = self.__postRequests(self.mv_url, params) 30 | if data: 31 | return data["data"]["url"] 32 | elif "song" in url: 33 | params = {'ids': [int(id)], 'br': 320000, 'csrf_token': ''} 34 | data = self.__postRequests(self.music_url, params) 35 | if data: 36 | return data["data"][0]["url"] 37 | return None 38 | 39 | def get_video(self, url): 40 | id = self.get_id(url) 41 | url = f"http://music.163.com/video/{id}/" 42 | rep = requests.get(url, headers=self.headers, timeout=6) 43 | if rep.status_code == 200: 44 | encoded_url = re.findall(r'', rep.text)[0] 45 | return unquote(encoded_url) 46 | return None 47 | 48 | # 匹配id 49 | def get_id(self, raw_url) -> str: 50 | pattern1 = re.compile(r'\?id=(\w+)') 51 | pattern2 = re.compile(r'song/(\w+)/') 52 | pattern3 = re.compile(r'mv/(\w+)/') 53 | pattern4 = re.compile(r'video/(\w+)/') 54 | if "?id" in raw_url: 55 | id = re.findall(pattern1, raw_url) 56 | elif "song" in raw_url: 57 | id = re.findall(pattern2, raw_url) 58 | elif "mv" in raw_url: 59 | id = 
re.findall(pattern3, raw_url)
60 |         elif "video" in raw_url:
61 |             id = re.findall(pattern4, raw_url)
62 |         else:
63 |             id = []  # no pattern matched; avoids an UnboundLocalError below
64 |         return id[0] if id else None
65 | 
66 |     def __postRequests(self, url, params, timeout=6):
67 |         post_data = Cracker.get(params)
68 |         rep = requests.post(url,
69 |                             data=post_data,
70 |                             timeout=timeout,
71 |                             headers=self.headers)
72 |         if rep.json()['code'] == 200:
73 |             return rep.json()
74 |         return None
75 | 
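# Aside: a hedged usage sketch for the weapi flow above — Cracker.get (from
# encrypt.py) wraps a plain params dict into the {"params", "encSecKey"} form
# that music.163.com's weapi endpoints expect, exactly as __postRequests does.
# The song id below is illustrative, not taken from the repo:

import requests
from extractor.music163.encrypt import Cracker

def weapi_post(url: str, params: dict, timeout: int = 6) -> dict:
    """POST weapi-encrypted params and return the parsed JSON reply."""
    headers = {
        "Referer": "https://music.163.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.32 Safari/537.36",
    }
    return requests.post(url, data=Cracker.get(params), headers=headers, timeout=timeout).json()

# e.g.:
# weapi_post("http://music.163.com/weapi/song/enhance/player/url?csrf_token=",
#            {"ids": [186016], "br": 320000, "csrf_token": ""})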
--------------------------------------------------------------------------------
/extractor/open163.py:
--------------------------------------------------------------------------------
 1 | # pylint: disable=W0123
 2 | import re
 3 | import requests
 4 | 
 5 | 
 6 | def get(url: str) -> dict:
 7 |     """
 8 |     videos
 9 |     """
10 |     data = {}
11 |     data["videos"] = []
12 |     headers = {
13 |         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
14 |     }
15 |     re_url = r'mid:(.*?),.*?mp4SdUrlOrign:(.*?),.*?mp4HdUrlOrign:(.*?),.*?mp4ShdUrlOrign:(.*?),'
16 |     rep = requests.get(url, headers=headers, timeout=10)
17 |     items = re.findall(re_url, rep.text)
18 |     for item in items:
19 |         # 倒序取最高画质
20 |         for video_url in item[::-1]:  # type: str
21 |             # print(url)
22 |             if "http" in video_url:
23 |                 video_url = eval(video_url).replace("\\u002F", "/")
24 |                 data["videos"].append(video_url)
25 |                 break
26 |     return data
27 | 
28 | 
29 | if __name__ == "__main__":
30 |     url = "http://open.163.com/newview/movie/free?pid=M8LI1JCE6&mid=M8LI3BQ60"
31 |     print(get(url))
--------------------------------------------------------------------------------
/extractor/pearvideo.py:
--------------------------------------------------------------------------------
 1 | # hdflvUrl="",sdflvUrl="",hdUrl="",sdUrl="",ldUrl="",srcUrl="https://video.pearvideo.com/mp4/adshort/20200328/cont-1665047-11947733-122441_adpkg-ad_hd.mp4",
 2 | # data-title="奥运推迟后东京新冠确诊数翻倍,《纽约时报》发文质疑" data-summary="从3月23日起,东京地区的新冠病毒确诊数就连续4天上涨。在24日官宣东京奥运推迟之后,第二天确诊数更是直接翻倍。《纽约时报》写了一篇文章,列出了各种数据,质疑此前东京为了奥运会而牺牲检测。"
 3 | 
 4 | import re
 5 | import requests
 6 | 
 7 | 
 8 | def get(url: str) -> dict:
 9 |     """
10 |     title、videos、text
11 |     """
12 |     data = {}
13 |     headers = {
14 |         "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
15 |     }
16 |     try:
17 |         rep = requests.get(url, headers=headers, timeout=10)
18 |         data["title"], data["text"] = re.findall(r'data-title="(.*?)" data-summary="(.*?)"', rep.text)[0]
19 |         data["videos"] = re.findall(r'srcUrl="(.*?\.mp4)",', rep.text)
20 |     except (ConnectionError, IndexError, TypeError):
21 |         data["msg"] = "获取失败"
22 | 
23 |     return data
24 | 
25 | 
26 | if __name__ == "__main__":
27 |     url = "https://www.pearvideo.com/video_1664989"
28 |     print(get(url))
--------------------------------------------------------------------------------
/extractor/peiyinxiu.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import requests
 4 | 
 5 | 
 6 | def get(url: str) -> dict:
 7 |     """
 8 |     title、videos
 9 |     """
10 |     data = {}
11 |     headers = {
12 |         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
13 |     }
14 |     rep = requests.get(url, headers=headers, timeout=10)
15 |     if rep.status_code != 200:
16 |         return {"msg": "获取失败"}
17 |     html = rep.text
18 |     data["title"] = re.findall(r'data-title="(.*?)"', html)[0]
19 |     data["videos"] = re.findall(r"\sfilmurl: '(.*?)',", html)
20 |     return data
21 | 
22 | 
23 | if __name__ == "__main__":
24 |     url = "http://peiyinxiu.com/m/127066455"
25 |     print(get(url))
--------------------------------------------------------------------------------
/extractor/pic58.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import requests
 4 | 
 5 | 
 6 | def get(url: str) -> dict:
 7 |     """https://www.58pic.com/newpic/*.html
 8 | 
 9 |     imgs
10 |     """
11 |     rep = requests.get(url, timeout=6)
12 |     if not rep.ok:
13 |         return {"msg": "失败"}
14 |     pre_url = re.findall(r'<img src="(preview\.qiantucdn\.com.*?)"', rep.text)  # reconstructed pattern: a schemeless preview URL, rewritten below
15 |     if not pre_url:
16 |         return {"msg": "失败"}
17 |     pre_url = pre_url[0]  # type: str
18 |     img_url = pre_url.replace("preview.qiantucdn.com", "https://pic.qiantucdn.com")
19 |     return {"imgs": [img_url], "msg": f"下载时需要设置referer: {url}"}
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     # url = input("url: ")
24 |     url = "https://www.58pic.com/newpic/34673009.html"
25 |     print(get(url))
--------------------------------------------------------------------------------
/extractor/pipigaoxiao.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import re
 3 | 
 4 | import requests
 5 | 
 6 | 
 7 | def get(url: str) -> dict:
 8 |     """
 9 |     videos
10 |     """
11 |     data = {}
12 |     headers = {
13 |         "Host": "share.ippzone.com",
14 |         "Connection": "keep-alive",
15 |         "Content-Length": "45",
16 |         "Origin": "http://share.ippzone.com",
17 |         "User-Agent":
18 |         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
19 |         "Content-Type": "text/plain;charset=UTF-8",
20 |         "Accept": "*/*",
21 |         "Referer": "http://share.ippzone.com/",
22 |         "Accept-Encoding": "gzip, deflate",
23 |         "Accept-Language": "zh-CN,zh;q=0.9",
24 |     }
25 | 
26 |     post_url = "http://share.ippzone.com/ppapi/share/fetch_content"
27 | 
28 |     pid = re.findall(r"/(\d{1,})", url)
29 |     if not pid:
30 |         data["msg"] = "链接无效,无法获取有效数据"
31 |         return data
32 |     else:
33 |         pid = int(pid[0])
34 | 
35 |     post_data = {
36 |         "pid": pid,
37 |         "type": "post",
38 |     }
39 | 
40 |     with requests.post(post_url, headers=headers, data=json.dumps(post_data), timeout=10) as rep:
41 |         if rep.status_code == 200 and rep.json().get("ret") == 1:
42 |             id = rep.json().get("data").get("post").get("imgs")[0].get("id")
43 |             play_url = rep.json().get('data').get('post').get('videos').get(str(id)).get('url')
44 |             data["videos"] = [play_url]
45 |         else:
46 |             data["msg"] = "资源获取失败,请确认输入是否正确"
47 | 
48 |     return data
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     print(get(input("url: ")))
--------------------------------------------------------------------------------
/extractor/pipix.py:
--------------------------------------------------------------------------------
 1 | # author: wongxy
 2 | # --------------
 3 | # https://h5.pipix.com/item/******************
 4 | import re
 5 | import requests
 6 | 
 7 | 
 8 | def get(url: str) -> dict:
 9 |     """
10 |     title、audios
11 |     """
12 |     data = {}
13 |     headers = {
14 |         "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
15 |     }
16 |     item_id = re.findall(r"item/(\d+)", url)
17 |     if not item_id:
18 |         return {"msg": "获取失败"}
19 |     item_id = item_id[0]
20 |     info_url = f"https://h5.pipix.com/bds/webapi/item/detail/?item_id={item_id}&source=share"
21 |     with requests.get(info_url, headers=headers, timeout=10) as rep:
22 |         if rep.status_code != 200 or rep.json().get("status_code") != 0:
23 |             return {"msg": "获取失败"}
24 |         info =
rep.json()["data"]["item"] 25 | data["title"] = info["share"]["title"] 26 | data["audios"] = [info["origin_video_download"]["url_list"][0]["url"]] 27 | 28 | 29 | 30 | return data 31 | 32 | 33 | if __name__ == "__main__": 34 | print(get(input("url: "))) -------------------------------------------------------------------------------- /extractor/qianqian.py: -------------------------------------------------------------------------------- 1 | # qianqian music 2 | # music.taihe.com 3 | import re 4 | import requests 5 | 6 | 7 | def get(url: str) -> dict: 8 | """ 9 | url sample: http://music.taihe.com/song/******** 10 | 11 | author、audioName、imgs、audios 12 | """ 13 | data = {} 14 | headers = { 15 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36" 16 | } 17 | songinfo_format_url = "http://musicapi.taihe.com/v1/restserver/ting?method=baidu.ting.song.playAAC&format=jsonp&songid={songid}&from=web" 18 | 19 | songid = re.findall(r"song/(\d+)", url) 20 | if not songid: 21 | data["msg"] = "无法获取有效消息" 22 | return data 23 | songid = songid[0] 24 | songinfo_url = songinfo_format_url.format(songid=songid) 25 | with requests.get(songinfo_url, headers=headers, timeout=10) as rep: 26 | if rep.status_code != 200: 27 | data["msg"] = "无法获取有效消息" 28 | return data 29 | result = rep.json() 30 | data["author"] = result["songinfo"]["artist"] 31 | data["audioName"] = result["songinfo"]["title"] 32 | data["imgs"] = [result["songinfo"]["album_1000_1000"]] 33 | data["audios"] = [result["bitrate"]["show_link"] or result["bitrate"]["file_link"]] 34 | 35 | return data 36 | 37 | 38 | if __name__ == "__main__": 39 | import pprint 40 | pprint.pprint(get(input("url: "))) -------------------------------------------------------------------------------- /extractor/qingshipin.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | def get(url: str): 5 | """ 6 | author、title、imgs、videos 7 | """ 8 | data = {} 9 | headers = { 10 | "User-Agent": 11 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like\ 12 | Gecko) Chrome/80.0.3987.149 Safari/537.36" 13 | } 14 | detail_url = url.replace("video/?", "bbq/app-bbq/sv/detail?sv") 15 | with requests.get(detail_url, headers=headers, timeout=10) as rep: 16 | 17 | if rep.status_code != 200: 18 | return {"msg": "error occurred!"} 19 | 20 | json = rep.json() 21 | if json["code"] != 0: 22 | return {"msg": "error occurred!"} 23 | 24 | author = json["data"]["user_info"]["uname"] 25 | title = json["data"]["title"] 26 | imgs = [json["data"]["cover_url"]] 27 | videos = [json["data"]["play"]["url"]] 28 | 29 | data["author"] = author 30 | data["title"] = title 31 | data["imgs"] = imgs 32 | data["videos"] = videos 33 | return data 34 | 35 | 36 | if __name__ == "__main__": 37 | url = input("url: ") 38 | print(get(url)) 39 | -------------------------------------------------------------------------------- /extractor/qmgx.py: -------------------------------------------------------------------------------- 1 | """ 2 | 全民搞笑 https://longxia.music.xiaomi.com/share/video/****** 3 | """ 4 | import re 5 | 6 | import requests 7 | 8 | 9 | def get(url: str) -> dict: 10 | """ 11 | title、videoName、videos 12 | """ 13 | data = {} 14 | vid = re.findall(r'video/(\d+)', url) 15 | if vid: 16 | api = 'https://longxia.music.xiaomi.com/api/share?contentType=video&contentId={}'.format(vid[0]) 17 | headers = { 18 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36' 19 | } 20 | rep = requests.get(api, headers=headers, timeout=5) 21 | if rep.status_code == 200 and rep.json()['code'] == 200: 22 | info = rep.json()['data']['videoInfo']['videoInfo'] 23 | data['title'] = data['videoName'] = info['desc'] 24 | data['videos'] = [info['url']] 25 | return data 26 | return {'msg': 'failed'} 27 | 28 | 29 | if __name__ == "__main__": 30 | print(get('https://longxia.music.xiaomi.com/share/video/6624743459453734912?sharerUserId')) 31 | -------------------------------------------------------------------------------- /extractor/qqmusic.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | 4 | import requests 5 | 6 | 7 | def get(url: str): 8 | """ 9 | author、audioName、audios 10 | """ 11 | data = {} 12 | ios_headers = { 13 | "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1", 14 | "referer": "http://y.qq.com" 15 | } 16 | 17 | # find: songid、songmid and author、audioName 18 | with requests.get(url, headers=ios_headers, timeout=10) as rep: 19 | if rep.status_code != 200: 20 | return {"msg": "链接无效"} 21 | html = rep.text 22 | songid = re.findall(r'songid":(\d+),', html) 23 | songmid = re.findall(r'"songmid":"(.*?)",', html) 24 | if not (songid or songmid): 25 | return {"msg": "提取重要信息失败"} 26 | songid = songid[0] 27 | songmid = songmid[0] 28 | data["audioName"] = re.findall(r'"songname":"(.*?)"', html)[0] 29 | data["author"] = re.findall(r'"name":"(.*?)",', html)[0] 30 | 31 | # vkey 32 | vkey_url = 'https://u.y.qq.com/cgi-bin/musicu.fcg' 33 | params = { 34 | 'data': json.dumps({"req": {"module": "CDN.SrfCdnDispatchServer", "method": "GetCdnDispatch", "param": {"guid": "3982823384", "calltype": 0, "userip": ""}}, "req_0": {"module": "vkey.GetVkeyServer", "method": "CgiGetVkey", "param": {"guid": "3982823384", "songmid": [songmid], "songtype": [0], "uin": "0", "loginflag": 1, "platform": "20"}}, "comm": {"uin": 0, "format": "json", "ct": 24, "cv": 0}}) 35 | } 36 | with requests.get(vkey_url, params=params, headers=ios_headers, timeout=10) as rep: 37 | if rep.json()["code"] != 0 and rep.json()['req_0']['code'] != 0: 38 | return {"msg": "提取重要信息失败"} 39 | data["audios"] = [ 40 | "https://isure.stream.qqmusic.qq.com/{}".format(rep.json()['req_0']['data']['midurlinfo'][0]['purl']) 41 | ] 42 | 43 | return data 44 | 45 | 46 | if __name__ == "__main__": 47 | # print(get(input("url: "))) 48 | url = 'https://y.qq.com/n/yqq/song/003tdyG9003JqW.html' 49 | print(get(url)) 50 | 51 | 52 | # "A000", "ape", 800 53 | # "F000", "flac", 800 54 | # "M800", "mp3", 320 55 | # "C400", "m4a", 128 56 | # "M500", "mp3", 128 57 | -------------------------------------------------------------------------------- /extractor/quanminkge.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url:str) -> dict: 7 | ''' 8 | author、audioName、audios、videos 9 | ''' 10 | data = {} 11 | headers = { 12 | "accept": 13 | "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 14 | "accept-encoding": 15 | "gzip, deflate, br", 16 | "accept-language": 17 | "zh-CN,zh;q=0.9", 18 | "cache-control": 19 | "max-age=0", 20 | "sec-fetch-mode": 21 | "navigate", 22 | "sec-fetch-site": 23 | "none", 24 | "upgrade-insecure-requests": 25 | "1", 26 | "user-agent": 27 | 
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36", 28 | } 29 | singer_pattern = r',"nick":"(.*?)",' 30 | song_name_pattern = r'"song_name":"(.*?)",' 31 | audio_pattern = r'"playurl":"(.*?)",' 32 | video_pattern = r',"playurl_video":"(.*?)",' 33 | 34 | with requests.get(url=url, headers=headers, timeout=10) as rep: 35 | if rep.status_code == 200: 36 | html = rep.text 37 | singer = re.findall(singer_pattern, html) 38 | song_name = re.findall(song_name_pattern, html) 39 | audio_url = re.findall(audio_pattern, html) 40 | video_url = re.findall(video_pattern, html) 41 | if singer: data["author"] = singer[0] 42 | if song_name: data["audioName"] = song_name[0] 43 | if audio_url: data["audios"] = [url for url in audio_url if url != ""] 44 | if video_url: data["videos"] = [url for url in video_url if url != ""] 45 | else: 46 | data["msg"] = "获取失败" 47 | 48 | return data 49 | 50 | 51 | if __name__ == "__main__": 52 | data = get(input("url: ")) 53 | print(data) -------------------------------------------------------------------------------- /extractor/quanminxsp.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str) -> dict: 7 | """ 8 | title、videos 9 | """ 10 | data = {} 11 | headers = { 12 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 13 | "Accept-Encoding": "gzip, deflate, br", 14 | "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", 15 | "Cache-Control": "max-age=0", 16 | "Connection": "keep-alive", 17 | "Cookie": "COMMON_LID=d8d795e732f64cd28cbbce9ee76688af; Hm_lvt_a42a9a9e9ea0c8ce010e90569767e0f4=1585966701; Hm_lpvt_a42a9a9e9ea0c8ce010e90569767e0f4=1585969995", 18 | "DNT": "1", 19 | "Host": "quanmin.hao222.com", 20 | "Sec-Fetch-Dest": "document", 21 | "Sec-Fetch-Mode": "navigate", 22 | "Sec-Fetch-Site": "none", 23 | "Sec-Fetch-User": "?1", 24 | "Upgrade-Insecure-Requests": "1", 25 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", 26 | } 27 | re_video = r'' 28 | re_title = r'' 29 | 30 | with requests.get(url, headers=headers, timeout=10) as rep: 31 | if rep.status_code == 200: 32 | title = re.findall(re_title, rep.text) 33 | video = re.findall(re_video, rep.text) 34 | if title: 35 | data["title"] = title[0] 36 | if video: 37 | data["videos"] = video 38 | else: 39 | data["msg"] = "失败" 40 | return data 41 | 42 | 43 | if __name__ == "__main__": 44 | url = "https://quanmin.hao222.com/sv2?source=share-h5&pd=qm_share_mvideo&vid=3877781674274744362&shareTime=1585969946&shareid=0746467921&shared_cuid=0ivn8laMv8l9uHuI_PSua_uS2u_Wav8dYu2ku_iCStloiBaR_8S08jf2QP0Hf1uea1FmA&shared_uid=gO2Ri_aIvtelA" 45 | print(get(url)) 46 | -------------------------------------------------------------------------------- /extractor/qutoutiao.py: -------------------------------------------------------------------------------- 1 | import json 2 | from urllib.parse import urlparse, parse_qs 3 | 4 | import requests 5 | 6 | # TODO: 支持小视频 7 | 8 | def get(url: str) -> dict: 9 | """ 10 | author、title、videos 11 | """ 12 | data = {} 13 | if "new.3qtt.cn" in url: # 短连接转长连接 14 | url = requests.get(url).url 15 | 16 | data_url_format = "http://api.1sapp.com/content/getRecommendV3?key={key}&content_id={content_id}&limit=1" 17 | play_host = "http://v4.qutoutiao.net/" 18 | 19 | query = 
--------------------------------------------------------------------------------
/extractor/sing5.py:
--------------------------------------------------------------------------------
 1 | # author: wongxy
 2 | # --------------
 3 | # 5sing.kugou.com
 4 | import re
 5 | import json
 6 | 
 7 | import requests
 8 | 
 9 | 
10 | def get(url: str) -> dict:
11 |     """
12 |     author、audioName、audios
13 |     """
14 |     data = {}
15 |     headers = {
16 |         "User-Agent":
17 |         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
18 |     }
19 | 
20 |     songinfo_format_url = "http://service.5sing.kugou.com/song/getsongurl?&songid={songid}&songtype=fc&from=web&version=6.6.72"
21 | 
22 |     songid = re.findall(r"/(\d+)", url.replace("5sing", ""))
23 |     if not songid:
24 |         return {"msg": "无法从链接获取关键信息"}
25 |     songid = songid[0]
26 | 
27 |     songinfo_url = songinfo_format_url.format(songid=songid)
28 |     with requests.get(songinfo_url, headers=headers, timeout=10) as rep:
29 |         if rep.status_code != 200:
30 |             return {"msg": "获取失败, 链接可能无效"}
31 |         json_ = json.loads(rep.text[1: -1])
32 |         if json_["code"] != 0:
33 |             return {"msg": "获取失败, 链接可能无效"}
34 |         info = json_["data"]
35 |         data["author"] = info["user"]["NN"]
36 |         data["audioName"] = info["songName"]
37 |         data["audios"] = [
38 |             info.get("squrl") or info.get("hqurl") or info.get("lqurl")
39 |         ]
40 | 
41 |     return data
42 | 
43 | 
44 | if __name__ == "__main__":
45 |     from pprint import pprint
46 |     pprint(get("http://5sing.kugou.com/fc/15717150.html"))
47 |     # print(get(input("url: ")))
--------------------------------------------------------------------------------
/extractor/sohuTV.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import requests
 4 | 
 5 | 
 6 | def get(url: str) -> dict:
 7 |     """
 8 |     title、videoName、videos
 9 |     """
10 |     data = {}
11 |     session = requests.Session()
12 |     ERROR = {"msg": "获取失败"}
13 |     headers = {
14 |         "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
15 |     }
16 |     videoInfo_url = "https://my.tv.sohu.com/play/videonew.do"
17 |     playInfo_url = "https://data.vod.itc.cn/ip"
18 | 
19 |     with session.get(url, headers=headers, timeout=10) as rep_html:
20 |         if rep_html.status_code != 200:
21 |             return ERROR
22 |         vid = re.findall(r",vid: '(\d+)'", rep_html.text)
23 |         if not vid:
24 |             return ERROR
25 |         vid = vid[0]
26 | 
27 |     videoInfo_params = {
28 |         "vid": vid,
29 |         "ver": 31,
30 |         "ssl": 1,
31 |         "referer": url
32 |     }
33 |     with session.get(videoInfo_url, params=videoInfo_params, timeout=10) as videoInfo_rep:
34 |         if videoInfo_rep.status_code != 200:
35 |             return ERROR
36 |         videoInfo = videoInfo_rep.json()["data"]
37 |         tvName = videoInfo["tvName"]
38 |         data["title"] = data["videoName"] = tvName
39 | 
40 |         video_path = videoInfo["su"][0]
41 |         key = videoInfo["hc"][0] if videoInfo.get("hc") else videoInfo["ck"][0]
42 |         if not video_path or not key:
43 |             return ERROR
44 | 
45 |     playInfo_params = {
46 |         "new": video_path,
47 |         "num": 1,
48 |         "key": key,
49 |     }
50 |     with session.get(playInfo_url, params=playInfo_params, timeout=10) as playInfo_rep:
51 |         if playInfo_rep.status_code != 200:
52 |             return ERROR
53 |         play_url = playInfo_rep.json()["servers"][0]["url"]
54 |         data["videos"] = [play_url]
55 | 
56 |     return data
57 | 
58 | 
59 | if __name__ == "__main__":
60 |     url = input("url: ")
61 |     print(get(url))
62 | 
--------------------------------------------------------------------------------
/extractor/ted.py:
--------------------------------------------------------------------------------
 1 | # https://www.ted.com/talks/*
 2 | import re
 3 | 
 4 | import requests
 5 | 
 6 | 
 7 | def get(url: str) -> dict:
 8 |     """
 9 |     title、videoName、videos
10 |     """
11 |     data = {}
12 |     headers = {
13 |         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
14 |     }
15 |     rep = requests.get(url, headers=headers, timeout=10)
16 |     if rep.status_code == 200:
17 |         text = rep.text
18 |         try:
19 |             title = re.findall(r'<title>(.*?)</title>', text)[0]  # reconstructed pattern
20 |             mp4 = re.findall(r'"(https://download\.ted\.com.*?mp4\?apikey=.*?)"', text)[-1]
21 |             data["title"] = data["videoName"] = title
22 |             data["videos"] = [mp4]
23 |         except IndexError as e:
24 |             data["msg"] = "获取失败:" + str(e)
25 |     else:
26 |         data["msg"] = "获取失败"
27 | 
28 |     return data
29 | 
30 | 
31 | if __name__ == "__main__":
32 |     url = "https://www.ted.com/talks/bill_gates_how_we_must_respond_to_the_coronavirus_pandemic"
33 |     print(get(url))
34 | 
--------------------------------------------------------------------------------
/extractor/tuchong.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import requests
 4 | 
 5 | 
 6 | def get(url: str) -> dict:
 7 |     """
 8 |     title、imgs
 9 |     """
10 |     data = {}
11 |     headers = {
12 |         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
13 |     }
14 |     rep = requests.get(url, headers=headers, timeout=6)
15 |     if rep.status_code == 200:
16 |         title = re.findall(r'<title>(.*?)</title>', rep.text)  # reconstructed pattern
17 |         if title:
18 |             data["title"] = title[0]
19 |         data["imgs"] = re.findall(r'photo-image" src="(.*?)"', rep.text)
20 |     else:
21 |         data["msg"] = "获取失败"
22 |     return data
23 | 
24 | 
25 | if __name__ == "__main__":
26 |     from pprint import pprint
27 |     pprint(get(input("url: ")))
28 | 
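# Aside: both pic58.py and tuchong.py above return image URLs that some CDNs
# only serve when a Referer header is sent (pic58's msg says as much). A
# hedged download sketch, assuming the page URL works as the referer:

import requests

def download_with_referer(img_url: str, referer: str, path: str) -> None:
    """Stream an image to disk, sending the page URL as Referer."""
    headers = {"Referer": referer, "User-Agent": "Mozilla/5.0"}
    with requests.get(img_url, headers=headers, stream=True, timeout=10) as rep:
        rep.raise_for_status()
        with open(path, "wb") as f:
            for chunk in rep.iter_content(chunk_size=8192):
                f.write(chunk)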
--------------------------------------------------------------------------------
/extractor/tudou.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import time
 3 | 
 4 | import requests
 5 | 
 6 | 
 7 | def get(url: str) -> dict:
 8 |     """
 9 |     :param url: 视频链接,免费电视剧单集
10 | 
11 |     :return title: 视频名
12 |     :return videoName: 同title
13 |     :return videos: 视频链接,多个片段。最后一个是视频流地址(m3u8)
14 |     """
15 |     data = {}
16 |     headers_html = {
17 |         "referer": url,
18 |         "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
19 |         "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
20 |     }
21 |     headers_info = {
22 |         "referer": url,
23 |         "accept": "application/json",
24 |         "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
25 |     }
26 |     host = "https://ups.youku.com/ups/get.json"
27 | 
28 |     rep = requests.get(url, headers=headers_html, timeout=10)
29 |     if rep.status_code != 200:
30 |         return {"msg": "访问链接失败"}
31 |     vid = re.findall(r'"vid":"(\d+)"', rep.text)
32 |     if not vid:
33 |         return {"msg": "获取视频id失败"}
34 |     vid = vid[0]
35 | 
36 |     params = {
37 |         # 调试了半天,才发现ckey可以通用,但是暂时不知道过期时间
38 |         "ckey": "122#wppJ/JoGEExRyDpZy4pjEJponDJE7SNEEP7ZpJRBuDPpJFQLpCGwoHZDpJEL7SwBEyGZpJLlu4Ep+FQLpoGUEELWn4yE7SNEEP7ZpERBuDPE+BQPpC76EJponDJLKMQEImb2XDnTtByWAfaPwr8S14Rqur0Nj1sih8TwWMzZF+NtTPnZULbEnh9G8WlODWp1uOjeDLVr8PG6+4EEyFfDqM3bDEpxngR4ul5EDOgPm4AiJDbEfC3mqM3WE8pangL4ul0EDLVr8CpU+4EEyFfDqMfbDEpxnSp4uOIEELXZ8oL6JwTEyF3F7S32EJpadSxwuAuRiRFmYFRiZDPACVgIudh3VaGrVnUkqUbD72siAEVR1Qr4OWZjlGSrnzPs2rh4OY+Z6EbOEBJ8OnDsYwNsTdEhishHohd6L2J+K8z7LZpSitQjj8hrDOAV/ttFwMbpN7KrcdwvCJ7TbxjR5Q0rJaMPlfUv9IYPLIY9KNNy24RBro4psistlkgxw4vO3WXa4M00NlsAH1XADAp8l3+COupmS7LbhxHS2BKVRDZkDyD+xnYIaRahNuJDv7pLt830IQHgDvnq1gJBE75mVDgemdAGyc4ruFk4++Ar9T6gZbfiuacVvtDgzBcEo0r6bi+rvYQuaMy=",
39 |         "utid": "otL9FkVfwnwCASv6yQTaubZ5",  # expires at: 2030-03-25T07:32:14.712Z
40 |         "vid": vid,
41 |         "client_ts": int(time.time()),
42 |         "ccode": "050F",
43 |         "client_ip": "192.168.1.1",
44 |     }
45 |     rep = requests.get(host, params=params, headers=headers_info, timeout=10)
46 |     if rep.status_code != 200 or "error" in rep.json()["data"]:
47 |         return {"msg": "获取视频信息失败"}
48 |     info = rep.json()["data"]
49 |     title = info["video"]["title"]
50 |     stream = info["stream"]  # type: list
51 |     # 取最高画质
52 |     best_stream = sorted(stream, key=lambda item: item["width"])[-1]
53 |     videos = [url_item["cdn_url"] for url_item in best_stream["segs"]]
54 |     m3u8_url = best_stream["m3u8_url"]
55 |     videos.append(m3u8_url)
56 | 
57 |     data["title"] = data["videoName"] = title
58 |     data["videos"] = videos
59 | 
60 |     return data
61 | 
62 | 
63 | if __name__ == "__main__":
64 |     from pprint import pprint
65 |     pprint(get(input("url: ")))
66 | 
--------------------------------------------------------------------------------
/extractor/wechat_article_cover.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import requests
 4 | 
 5 | 
 6 | def get(url: str) -> dict:
 7 |     """
 8 |     imgs、text
 9 |     """
10 |     data = {}
11 |     headers = {
12 |         "user-agent":
13 |         "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/ 53\
14 | 6.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
15 |     }
16 |     with requests.get(url, headers=headers, timeout=10) as rep:
17 |         if rep.status_code != 200:
18 |             return {"msg": "错误"}
19 |         img = re.findall(r'<meta property="og:image" content="(.*?)"', rep.text)  # reconstructed pattern
20 |         if img:
21 |             data["imgs"] = [img[0]]
22 |         text = re.findall(r'<meta name="description" content="(.*?)"', rep.text)  # reconstructed pattern
23 |         if text:
24 |             data["text"] = text[0]
25 |     return data
26 | 
27 | 
28 | if __name__ == "__main__":
29 | 
30 |     url = input("url: ")
31 |     print(get(url))
32 | 
--------------------------------------------------------------------------------
/extractor/weibo.py:
--------------------------------------------------------------------------------
/extractor/wechat_article_cover.py:
--------------------------------------------------------------------------------
import re

import requests


def get(url: str) -> dict:
    """
    imgs、text
    """
    data = {}
    headers = {
        "user-agent":
        "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
    }
    with requests.get(url, headers=headers, timeout=10) as rep:
        if rep.status_code != 200:
            return {"msg": "错误"}
        # cover and summary are read from meta tags; adjust if the markup changes
        img = re.findall(r'<meta property="og:image" content="(.*?)"', rep.text)
        if img:
            data["imgs"] = [img[0]]
        text = re.findall(r'<meta name="description" content="(.*?)"', rep.text)
        if text:
            data["text"] = text[0]
    return data


if __name__ == "__main__":

    url = input("url: ")
    print(get(url))
--------------------------------------------------------------------------------
/extractor/weibo.py:
--------------------------------------------------------------------------------
import re

import requests


def get(url: str) -> dict:
    """
    title、videos
    """
    data = {}
    headers = {'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1'}

    title_re = r'"title": "(.*?)",'
    mp4_720p_mp4_re = r'"mp4_720p_mp4": "(.*?)",'
    mp4_hd_mp4_re = r'"mp4_hd_mp4": "(.*?)",'
    mp4_ld_mp4_re = r'"mp4_ld_mp4": "(.*?)"'

    with requests.get(url, headers=headers, timeout=10) as rep:
        if rep.status_code == 200:
            text = rep.text
            title = re.findall(title_re, text)
            mp4_720p_mp4 = re.findall(mp4_720p_mp4_re, text)
            mp4_hd_mp4 = re.findall(mp4_hd_mp4_re, text)
            mp4_ld_mp4 = re.findall(mp4_ld_mp4_re, text)
            if title:
                data["title"] = title[0]
                data["videos"] = mp4_720p_mp4 or mp4_hd_mp4 or mp4_ld_mp4
        else:
            data["msg"] = "获取失败"

    return data


if __name__ == "__main__":
    url = input('url: ')
    print(get(url))
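weibo.py picks a quality tier by chaining the findall results with `or`: re.findall returns an empty list when a pattern has no match, and an empty list is falsy, so the first tier actually present in the page wins. The same mechanism in isolation:

```python
# re.findall returns [] for a pattern with no match, and [] is falsy,
# so `a or b or c` selects the first quality tier that actually matched.
mp4_720p = []                                  # 720p not present in the page
mp4_hd = ["https://example.com/hd.mp4"]
mp4_ld = ["https://example.com/ld.mp4"]

videos = mp4_720p or mp4_hd or mp4_ld
print(videos)  # ['https://example.com/hd.mp4']
```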
--------------------------------------------------------------------------------
/extractor/weishi.py:
--------------------------------------------------------------------------------
import json
from urllib.parse import urlparse, parse_qs

import requests


headers = {
    "accept": "application/json",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "content-type": "application/json",
    "cookie": "pgv_pvi=9657849856; pgv_pvid=2069474799; RK=aHJszqfoXm; ptcz=0fc0035b9509215f060561393c09f6cde3bccc1953e79c2b5b1ec450e4e67f19; LW_uid=s1i5E5d4v2a1p5n702J1O2y0q8; eas_sid=M1k5T5N4O28185X7J291x2K1A3; o_cookie=286183317; pac_uid=1_286183317; ied_qq=o0286183317; LW_sid=x1Y5D6W4h4H516F6X9l9V8S8Z3; tvfe_boss_uuid=fbb4b39b5afeb49b; psrf_qqopenid=A140C50D3D791392EA89131C8B01FE1D; psrf_qqaccess_token=D2F43F3C25900E66193345D276AF9559; psrf_qqrefresh_token=E48409D7E8E4F3D5C3869F104380AB3E; psrf_qqunionid=002C01991CFB436BCD8A27A0EE1DB9FF; qm_keyst=Q_H_L_2ajiOt50eapmue6Eg1-l_W6XztEBr_u0vZJAPs4xctJZNJdsEZONiDnNJ206icA; psrf_musickey_createtime=1574482649; psrf_access_token_expiresAt=1582258649; person_id_bak=5295507715828209; person_id_wsbeacon=5689667751647505; wsreq_logseq=336060008",
    "origin": "https://h5.weishi.qq.com",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1",
    "x-requested-with": "XMLHttpRequest",
}


# link shared while not logged in
# url = "https://h5.weishi.qq.com/weishi/wsplay/challenge?feedid=6YV0vjeP71IHTsV08&challegeid=100026&spid=8039370850869145600&qua=v1_and_weishi_6.5.0_588_312027000_d&chid=127081004&pkg=&attach=cp_reserves3_1190370002"
def _get_not_logged(url: str) -> dict:
    data = {}
    post_url = "https://h5.weishi.qq.com/webapp/json/challenge_feedrank/GetChallengeFeedDetail?t=0.2602280426206063&g_tk="

    query = parse_qs(urlparse(url).query)
    try:
        feedid = query.get('feedid')[0]
        challenge_id = query.get('challegeid')[0]
    except (TypeError, IndexError):
        data["msg"] = "获取失败"
        return data

    payload = {
        "feedid": feedid,
        "challenge_id": challenge_id,
        "type": 0,
    }

    with requests.post(post_url, headers=headers, data=json.dumps(payload), timeout=10) as rep:
        if rep.status_code == 200:
            video_info = rep.json().get("data").get('feedinfos')[0]
            title = video_info.get("feed_desc")
            play_url = video_info.get("video_url")
            data["title"] = title
            data["videos"] = [play_url]
        else:
            data["msg"] = "获取失败"

    return data


# link shared while logged in
# url = "https://h5.weishi.qq.com/weishi/feed/770BSyaon1IQcqdbr/wsfeed?wxplay=1&id=770BSyaon1IQcqdbr&spid=8039370850869145600&qua=v1_and_weishi_6.5.0_588_312027000_d&chid=100081014&pkg=3670&attach=cp_reserves3_1000370011"
def _get_logged(url: str) -> dict:
    data = {}
    post_url = "https://h5.weishi.qq.com/webapp/json/weishi/WSH5GetPlayPage?t=0.16820895093158983&g_tk="

    query = parse_qs(urlparse(url).query)
    try:
        feedid = query.get('id')[0]
    except (TypeError, IndexError):
        data["msg"] = "获取失败"
        return data

    payload = {
        "feedid": feedid,
        "recommendtype": 0,
        "datalvl": "all",
        "_weishi_mapExt": {}
    }

    with requests.post(post_url, headers=headers, data=json.dumps(payload), timeout=10) as rep:
        if rep.status_code == 200:
            video_info = rep.json().get('data').get('feeds')[0]
            title = video_info.get("feed_desc")
            play_url = video_info.get("video_url")
            data["title"] = title
            data["videos"] = [play_url]
        else:
            data["msg"] = "获取失败"

    return data


def get(url: str) -> dict:
    return _get_not_logged(url) if url.startswith("https://h5.weishi.qq.com/weishi/wsplay/challenge") else _get_logged(url)


if __name__ == "__main__":
    print(get(input("url: ")))
--------------------------------------------------------------------------------
/extractor/xiaokaxiu.py:
--------------------------------------------------------------------------------
# @wongxy
import time
from hashlib import md5
from urllib.parse import urlparse, parse_qs

import requests


def get(url: str) -> dict:
    """
    title、videos
    """
    data = {}

    try:
        qs = parse_qs(urlparse(url).query)
        video_id = qs["id"][0]
    except KeyError:
        return {"msg": "无法匹配视频id"}

    timestamp = str(int(time.time()))

    info_url = "https://appapi.xiaokaxiu.com/api/v1/web/share/video/" + video_id + "?time=" + timestamp

    temp = "S14OnTD#Qvdv3L=3vm" + "&time=" + timestamp
    x_sign = md5(temp.encode("utf-8")).hexdigest()
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "x-sign": x_sign,
    }
    rep = requests.get(info_url, headers=headers, timeout=10)
    if rep.status_code == 200 and rep.json()["code"] == 0:
        video_info = rep.json()["data"]
        title = video_info["video"]["title"]
        video_url = video_info["video"]["url"][0]
        data["title"] = title
        data["videos"] = [video_url]
        return data
    return {"msg": "获取失败"}


if __name__ == "__main__":
    url = "https://mobile.xiaokaxiu.com/video?id=6552158363189252096"
    print(get(url))
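The `x-sign` header in xiaokaxiu.py is nothing more than an MD5 hex digest over a fixed salt plus the same `&time=<unix ts>` suffix that is sent as a query parameter. Recomputed in isolation:

```python
# Recompute the x-sign header: md5 over the fixed salt from the extractor
# plus the "&time=<timestamp>" suffix that also appears in the query string.
import time
from hashlib import md5

timestamp = str(int(time.time()))
x_sign = md5(("S14OnTD#Qvdv3L=3vm" + "&time=" + timestamp).encode("utf-8")).hexdigest()
print(timestamp, x_sign)
```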
--------------------------------------------------------------------------------
/extractor/xinpianchang.py:
--------------------------------------------------------------------------------
import re

import requests


def get(url: str) -> dict:
    """
    title、imgs、videos (several qualities available)
    """
    data = {}
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    session = requests.Session()
    rep = session.get(url, headers=headers, timeout=10)
    if rep.status_code != 200:
        return {"msg": "获取失败"}
    try:
        vid = re.findall(r'vid: "(.*?)",', rep.text)[0]
    except IndexError:
        return {"msg": "获取失败"}

    video_info_url = "http://openapi-vtom.vmovier.com/v3/video/{vid}?expand=resource".format(vid=vid)
    rep = session.get(video_info_url, headers=headers, timeout=10)
    if rep.status_code != 200 or rep.json()["status"] != 0:
        return {"msg": "获取失败"}

    video_data = rep.json()["data"]

    title = video_data["video"]["title"]
    cover = video_data["video"]["cover"]
    video_list = video_data["resource"]["progressive"]  # type: list

    # videos = []
    # for item in video_list:
    #     videos.append(item.get("https_url") or item.get("url"))
    video = video_list[0].get("https_url") or video_list[0].get("url")

    data["title"] = title
    data["imgs"] = [cover]
    data["videos"] = [video]

    return data


if __name__ == "__main__":
    # url = "https://www.xinpianchang.com/a10628284"
    url = input("url: ")
    print(get(url))
--------------------------------------------------------------------------------
/extractor/zhihu_video.py:
--------------------------------------------------------------------------------
import re

import requests


def get(url: str) -> dict:
    """
    videos
    """
    data = {}
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}
    video_info_url = "https://lens.zhihu.com/api/v4/videos/{id}"

    videos = []

    with requests.get(url, headers=headers, timeout=10) as rep:
        if rep.status_code == 200:
            ids = re.findall(r'www.zhihu.com/video/(\d{1,})', rep.text)
            ids = list(set(ids))  # drop duplicates
        else:
            data["msg"] = "视频获取失败,可能是这个页面没有视频"
            return data

    if not ids:
        data["msg"] = "视频获取失败,可能是这个页面没有视频"
        return data

    for video_id in ids:
        rep = requests.get(video_info_url.format(id=video_id), headers=headers, timeout=10)
        if rep.status_code == 200:
            playlist = rep.json().get("playlist")
            temp = playlist.get("HD") or playlist.get("SD") or playlist.get("LD")
            if temp:
                play_url = temp.get("play_url")
                videos.append(play_url)
    data["videos"] = [video for video in videos if video]
    return data


if __name__ == "__main__":
    url = input("url: ")
    print(get(url))
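A side note on `list(set(ids))` in zhihu_video.py: it deduplicates, but it does not preserve the order in which the video ids appeared on the page. If page order matters, `dict.fromkeys` is the usual order-preserving alternative:

```python
# set() deduplicates but loses page order; dict.fromkeys keeps first-seen order
# (dicts preserve insertion order on Python 3.7+).
ids = ["111", "222", "111", "333"]
print(list(set(ids)))            # order not guaranteed
print(list(dict.fromkeys(ids)))  # ['111', '222', '333']
```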
--------------------------------------------------------------------------------
/extractor/zuiyou_video.py:
--------------------------------------------------------------------------------
import re

import requests


def get(url: str) -> dict:
    """
    title、videoName、videos
    """
    data = {}
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    # both patterns depend on the share-page markup and may need adjusting
    re_title = r'<title>(.*?)</title>'
    re_video = r'<video src="(.*?)">'

    with requests.get(url, headers=headers, timeout=10) as rep:
        if rep.status_code == 200:
            title = re.findall(re_title, rep.text)
            video = re.findall(re_video, rep.text)
            if title:
                data["title"] = data["videoName"] = title[0]
            if video:
                data["videos"] = video
        else:
            data["msg"] = "失败"

    return data


if __name__ == "__main__":
    url = "https://share.izuiyou.com/detail/147486886?zy_to=applink&to=applink"
    print(get(url))
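Patterns like `re_title` and `re_video` are tied to the share page's markup and fail silently when it changes. A quick way to sanity-check them is to run them against a saved copy of a page (the file name is hypothetical):

```python
# Quick sanity check for markup-dependent patterns against a saved page.
# "page.html" is a hypothetical local copy of a share page.
import re

with open("page.html", encoding="utf-8") as f:
    html = f.read()

for name, pattern in [("title", r"<title>(.*?)</title>"),
                      ("video", r'<video src="(.*?)">')]:
    matches = re.findall(pattern, html)
    print(name, "->", matches[:1] or "no match")
```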
--------------------------------------------------------------------------------
/extractor/zuiyou_voice.py:
--------------------------------------------------------------------------------
import json
from urllib.parse import urlparse

import requests


def get(url: str) -> dict:
    """
    text、audios
    """
    data = {}
    headers = {
        "Connection": "keep-alive",
        "Content-Type": "text/plain;charset=UTF-8",
        "Host": "share.izuiyou.com",
        "Origin": "https://share.izuiyou.com",
        "Referer": url,
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
    }
    post_url = "https://share.izuiyou.com/api/review/share_review"

    path = urlparse(url).path
    temp = path.split("/")
    pid = temp[-2]
    rid = temp[-1]

    payload = {
        "h_av": "3.0",
        "h_dt": 9,
        "h_nt": 9,
        "h_ch": "web_app",
        "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        "pid": f"{pid}",
        "rid": f"{rid}"
    }
    play_host = "http://tbvideo.ixiaochuan.cn/"
    with requests.post(post_url, data=json.dumps(payload), headers=headers, timeout=10) as rep:
        if rep.status_code == 200:
            try:
                audio_info = rep.json().get("data").get("review").get("audio")
                voice_text = audio_info.get("voice_text")
                # uri = audio_info.get("uri")
                org_uri = audio_info.get("org_uri")
                data["text"] = voice_text
                data["audios"] = [play_host + org_uri]
            except (TypeError, AttributeError):
                data["msg"] = "获取失败"
        else:
            data["msg"] = "获取失败"

    return data


if __name__ == "__main__":
    print(get(input("url: ")))
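zuiyou_voice.py reads the post id (`pid`) and review id (`rid`) straight out of the share link's path: they are its last two segments. The same slicing in isolation (the link shape is illustrative):

```python
# pid and rid are the last two path segments of the share link.
from urllib.parse import urlparse

url = "https://share.izuiyou.com/voice/123456/7890123"  # hypothetical share link
segments = urlparse(url).path.split("/")
pid, rid = segments[-2], segments[-1]
print(pid, rid)  # 123456 7890123
```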
--------------------------------------------------------------------------------
/misc.py:
--------------------------------------------------------------------------------
from prettytable import PrettyTable

logo = r"""
 _______  _______ _________ ______    _______  _______
(  ____ \(  ____ )\__   __/(  __  \ (  ____ \(  ____ )
| (    \/| (    )|   ) (   | (  \  )| (    \/| (    )|
| (_____ | (____)|   | |   | |   ) || (__    | (____)|
(_____  )|  _____)   | |   | |   | ||  __)   |     __)
      ) || (         | |   | |   ) || (      | (\ (
/\____) || )      ___) (___| (__/  )| (____/\| ) \ \__
\_______)|/       \_______/(______/ (_______/|/   \__/"""


def printTips():
    platforms = [
        ["哔哩哔哩", "封面、视频"],
        ["唱鸭", "音频"],
        ["抖音", "无水印视频"],
        ["酷狗", "音频"],
        ["酷我", "音频"],
        ["荔枝FM", "音频"],
        ["网易云音乐", "音频、mv、视频"],
        ["QQ音乐", "音频"],
        ["皮皮搞笑", "无水印视频"],
        ["全民K歌", "音频&视频"],
        ["微博", "视频"],
        ["微视", "无水印视频"],
        ["知乎", "视频"],
        ["最右", "音频(语音帖评论)"],
        ["千千音乐", "音频"],
        ["5sing", "音频"],
        ["皮皮虾", "无水印视频"],
        ["轻视频", "无水印视频"],
        ["趣头条", "视频"],
        ["酷6网", "视频"],
        ["乐乎", "视频"],
        ["网易公开课", "视频(免费)"],
        ["新片场", "视频"],
        ["百度贴吧", "视频"],
        ["快手", "无水印视频、长图视频"],
        ["AcFun弹幕网", "视频"],
        ["百度好看视频", "视频"],
        ["梨视频", "视频"],
        ["小咖秀", "无水印视频"],
        ["搜狐视频", "视频"],
        ["土豆视频", "视频(免费电视剧等)"],
        ["TED", "视频"],
        ["图虫", "图片"],
        ["其他", "。。。"]
    ]
    table = PrettyTable(["支持平台", "支持内容"])
    for platform in platforms:
        table.add_row(platform)
    print(logo)
    print("""
╭━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╮
│ @wongxy \033[36;4mhttps://github.com/xiyaowong\033[0m │
╰━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╯""")
    print("爬取并下载部分资源")
    print(table)


if __name__ == "__main__":
    printTips()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
# extractor
requests
pycryptodome

# extract
prettytable
click

# web
flask
flask-cors
python-dotenv
--------------------------------------------------------------------------------
/screenshot/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiyaowong/spiders/304f1d32d07b6c42feb8ddbcb83dac90558be503/screenshot/example.gif
--------------------------------------------------------------------------------
/screenshot/run.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiyaowong/spiders/304f1d32d07b6c42feb8ddbcb83dac90558be503/screenshot/run.gif
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import os
import re
import time
from datetime import datetime
from functools import wraps

import click
import requests


def remove_file(path):
    if os.path.isfile(path):
        os.remove(path)


def filter_name(name):
    """
    Sanitize a file name.
    """
    regexp = re.compile(r'(/|\\|:|\?|\*|\||"|\'|<|>|\$)')
    space = re.compile(r'\s{2,}')
    return space.sub(" ", regexp.sub("", name))


def check_dir(path):
    """
    Check whether the directory exists: return True if it does,
    otherwise create it and return False.
    """
    if not os.path.exists(path):
        os.makedirs(path)
        return False
    return True


def retry(n=3, delay=0.5):
    def deco(func):
        @wraps(func)
        def wrapper(*a, **kw):
            count = 1
            while True:
                try:
                    return func(*a, **kw)
                except Exception as e:
                    if count == n + 1:
                        break
                    print('[{}]运行错误,{}s后进行第{}次重试 Err: {}'.format(func.__name__, delay, count, e))
                    count += 1
                    time.sleep(delay)
            print('重试结束,[{}]运行失败'.format(func.__name__))
            return False
        return wrapper
    return deco


def download(file_url, file_name=None, file_type=None, save_path="download", headers=None, timeout=15):
    """
    :param file_url: resource URL to download
    :param file_name: file name to save as, defaults to the current datetime
    :param file_type: file type (extension)
    :param save_path: save directory, "download" by default, no trailing "/"
    :param headers: HTTP request headers, defaults to an iPhone UA
    :param timeout: request timeout in seconds
    """
    if file_name is None:
        file_name = str(datetime.now())
    file_name = filter_name(file_name)

    if file_type is None:
        if "." in file_url:
            file_type = file_url.split(".")[-1]
        else:
            file_type = "unknown"

    check_dir(save_path)

    file_name = file_name + "." + file_type

    if headers is None:
        headers = {
            "User-Agent":
            "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1"
        }

    # download notices
    if os.path.exists(f"{save_path}/{file_name}"):
        print(f'\033[33m{file_name}已存在,不再下载!\033[0m')
        return True
    print(f"Downloading {file_name}")
    try:
        with requests.get(file_url, headers=headers, stream=True, timeout=timeout) as rep:
            if rep.status_code != 200:
                print("\033[31m下载失败\033[0m")
                return False
            # Content-Length may be absent; fall back to 0 so the bar still renders
            file_size = int(rep.headers.get('Content-Length', 0))
            label = '{:.2f}MB'.format(file_size / (1024 * 1024))
            with click.progressbar(length=file_size, label=label) as progressbar:
                with open(f"{save_path}/{file_name}", "wb") as f:
                    for chunk in rep.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
                            progressbar.update(1024)
        print(f"\033[32m{file_name}下载成功\033[0m")
    except Exception as e:
        print('下载失败: ', e)
        remove_file(f"{save_path}/{file_name}")
        return False
    return True
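A small usage sketch for the `retry` decorator defined above: the wrapped callable is re-invoked up to `n` times, and `False` comes back once every attempt has failed (assumes the repository root is on `sys.path`):

```python
from utils import retry  # assumes the repository root is on sys.path

@retry(n=2, delay=0.1)
def flaky(counter=[0]):  # mutable default used as a simple call counter
    counter[0] += 1
    if counter[0] < 2:
        raise RuntimeError("transient failure")
    return "ok"

print(flaky())  # first call raises, the retry succeeds -> "ok"
```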
--------------------------------------------------------------------------------
/web/README.md:
--------------------------------------------------------------------------------
#### A minimal HTTP API for the extractors



```shell
pip install gunicorn
```

```shell
gunicorn app:app

or

python app.py
```

Put the link after `/extract?url=`; submitting the `url` parameter via POST also works.
--------------------------------------------------------------------------------
/web/__init__.py:
--------------------------------------------------------------------------------
from ._response import response
--------------------------------------------------------------------------------
/web/_response.py:
--------------------------------------------------------------------------------
from flask import jsonify
from werkzeug.http import HTTP_STATUS_CODES


def response(code=200, data=None, error=None, msg=None):
    """
    :param code: status code
    :param data: response data
    :param error: error details
    :param msg: human-readable message
    """

    if code is not None and code >= 400:
        error = HTTP_STATUS_CODES.get(code, "unknown error")

    payload = {
        "code": code,
        "data": data,
        "err": error,
        "message": msg or HTTP_STATUS_CODES.get(code, "unknown status"),
    }
    _response = jsonify(payload)
    _response.status_code = code
    return _response
--------------------------------------------------------------------------------
/web/app.py:
--------------------------------------------------------------------------------
import os
import sys
# assumes the app is started from inside web/, so this puts the repository root on sys.path
sys.path.append(os.path.dirname(os.getcwd()))

from flask import Flask
from flask_cors import CORS

from web import config, error, views, log


def create_app() -> Flask:
    app = Flask(__name__)
    app.config.from_object(config)
    CORS(app)

    views.init_app(app)
    error.init_app(app)
    log.init_app(app)

    if app.config["ENV"] == "development":
        print(app.url_map)

    return app


app = create_app()

if __name__ == "__main__":
    app.run()
--------------------------------------------------------------------------------
/web/config.py:
--------------------------------------------------------------------------------
import os

from dotenv import load_dotenv

load_dotenv()


ENV = os.getenv("FLASK_ENV") or "production"
SECRET_KEY = os.getenv("SECRET_KEY") or "wongxy"
# environment variables are strings, so "False" must not be treated as truthy
DEBUG = os.getenv("DEBUG", "False").lower() in ("1", "true", "yes")
--------------------------------------------------------------------------------
/web/error.py:
--------------------------------------------------------------------------------
# pylint: disable=unused-argument
from flask import Flask

from web import response


def init_app(app: Flask):
    @app.errorhandler(400)
    def _error_400(e):
        return response(400)

    @app.errorhandler(500)
    def _error_500(e):
        return response(500)

    @app.errorhandler(404)
    def _error_404(e):
        return response(404)

    @app.errorhandler(405)
    def _error_405(e):
        return response(405)
--------------------------------------------------------------------------------
/web/example.env:
--------------------------------------------------------------------------------
# FLASK_ENV=development
FLASK_ENV=production
DEBUG=False
SECRET_KEY="a string you never guess"
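With the app running as described in web/README.md, the endpoint can be exercised with `requests`; a sketch assuming the default Flask development address:

```python
# Exercise the running API (assumes the default Flask dev address).
import requests

rep = requests.get(
    "http://127.0.0.1:5000/extract/",
    params={"url": "https://www.ted.com/talks/bill_gates_how_we_must_respond_to_the_coronavirus_pandemic"},
    timeout=10,
)
print(rep.json())  # {"code": ..., "data": {...}, "err": ..., "message": ...}
```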
--------------------------------------------------------------------------------
/web/funcs.py:
--------------------------------------------------------------------------------
import re

from flask import current_app

from extractor import (acfun, baidutieba, bilibili, changya, douyin, haokan,
                       ku6, kuaishou, kugou, kuwo, lizhiFM, lofter, migu_music,
                       momo, music163, open163, pearvideo, pic58, pipigaoxiao,
                       pipix, qianqian, qingshipin, qqmusic, quanminkge,
                       qutoutiao, sing5, sohuTV, ted, tuchong, tudou, weibo,
                       weishi, xiaokaxiu, xinpianchang, zhihu_video,
                       zuiyou_voice)

from web import response


crawlers = {
    'acfun': acfun,
    'tieba': baidutieba,
    'bili': bilibili,
    'changya': changya,
    'douyin': douyin,
    'haokan': haokan,
    'ku6': ku6,
    'chenzhongtech': kuaishou,
    'kuaishou': kuaishou,
    'kugou': kugou,
    'kuwo': kuwo,
    'lizhi': lizhiFM,
    'lofter': lofter,
    'music.163': music163,
    'open.163': open163,
    'pearvideo': pearvideo,
    'ippzone': pipigaoxiao,
    'pipix': pipix,
    'music.taihe': qianqian,
    'qingshipin': qingshipin,
    'y.qq': qqmusic,
    'kg': quanminkge,
    'qutoutiao': qutoutiao,
    '5sing': sing5,
    'weibo': weibo,
    'weishi': weishi,
    'xiaokaxiu': xiaokaxiu,
    'xinpianchang': xinpianchang,
    'zhihu': zhihu_video,
    'zuiyou': zuiyou_voice,
    'sohu': sohuTV,
    'ted': ted,
    'tudou': tudou,
    'momo': momo,
    'music.migu': migu_music,
    '58pic': pic58,
    'tuchong': tuchong
}


def extract(url: str):  # pylint: disable=too-many-statements
    try:
        url = re.findall(
            r"https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\.[-A-Za-z]+[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url)
        if not url:
            return response(404, msg="无法匹配链接")
        url = url[0]

        data = None
        for c_name, c_func in crawlers.items():
            if c_name in url:
                data = c_func.get(url)  # type: dict
                break
        if data is not None:
            # drop keys whose values are empty
            for key, value in data.copy().items():
                if not value:
                    data.pop(key)
            return response(data=data, msg=data.get("msg"))
        else:
            return response(404, msg="不支持的链接")
    except Exception as e:
        current_app.logger.error(e)
        current_app.logger.exception(e)
        return response(500, error=str(e), msg="服务器错误")
--------------------------------------------------------------------------------
/web/log.py:
--------------------------------------------------------------------------------
import logging
import os
from logging.handlers import RotatingFileHandler

from flask import Flask
from flask.logging import default_handler

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

LOG_PATH = os.path.join(BASE_DIR, 'logs')

if not os.path.exists(LOG_PATH):
    os.makedirs(LOG_PATH)

LOG_PATH_ALL = os.path.join(LOG_PATH, 'all.log')

LOG_FILE_MAX_BYTES = 10 * 1024 * 1024
LOG_FILE_BACKUP_COUNT = 10


def init_app(app: Flask):
    app.logger.removeHandler(default_handler)

    formatter = logging.Formatter(
        "%(asctime)s [%(levelname)s] [%(filename)s] %(message)s"
    )

    file_handler = RotatingFileHandler(
        filename=LOG_PATH_ALL,
        mode='a',
        maxBytes=LOG_FILE_MAX_BYTES,
        backupCount=LOG_FILE_BACKUP_COUNT,
        encoding='utf-8'
    )

    file_handler.setFormatter(formatter)
    file_handler.setLevel(logging.WARNING)

    for logger in (app.logger,
                   logging.getLogger('werkzeug')):
        logger.addHandler(file_handler)
--------------------------------------------------------------------------------
/web/views.py:
--------------------------------------------------------------------------------
from flask import Flask, request

from web import funcs, response


def home():
    data = ":)"
    return response(data=data)


def extract():
    if "url" not in request.values:
        return response(400, msg="Missing parameter.")
    url = request.values["url"]
    return funcs.extract(url)


def init_app(app: Flask):
    app.add_url_rule("/", "home", home)
    app.add_url_rule("/extract/", "extract", extract)
--------------------------------------------------------------------------------
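The routes can also be exercised in-process, without starting a server, through Flask's built-in test client; a sketch assuming it is run from the web/ directory, like app.py itself:

```python
# Exercise the routes in-process with Flask's test client
# (run from the web/ directory, like app.py itself).
from app import app

client = app.test_client()
print(client.get("/").get_json())  # {"code": 200, "data": ":)", "err": None, "message": "OK"}
```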