├── .gitignore
├── LICENSE
├── README.md
├── extract.py
├── extractor
│   ├── README.md
│   ├── __init__.py
│   ├── acfun.py
│   ├── baidutieba.py
│   ├── bilibili.py
│   ├── changba.py
│   ├── changya.py
│   ├── douyin.py
│   ├── haokan.py
│   ├── ku6.py
│   ├── kuaishou.py
│   ├── kugou.py
│   ├── kuwo.py
│   ├── lequ.py
│   ├── lizhiFM.py
│   ├── lofter.py
│   ├── migu_music.py
│   ├── momo.py
│   ├── music163
│   │   ├── __init__.py
│   │   ├── encrypt.py
│   │   └── music163.py
│   ├── open163.py
│   ├── pearvideo.py
│   ├── peiyinxiu.py
│   ├── pic58.py
│   ├── pipigaoxiao.py
│   ├── pipix.py
│   ├── qianqian.py
│   ├── qingshipin.py
│   ├── qmgx.py
│   ├── qqmusic.py
│   ├── quanminkge.py
│   ├── quanminxsp.py
│   ├── qutoutiao.py
│   ├── sing5.py
│   ├── sohuTV.py
│   ├── ted.py
│   ├── tuchong.py
│   ├── tudou.py
│   ├── wechat_article_cover.py
│   ├── weibo.py
│   ├── weishi.py
│   ├── xiaokaxiu.py
│   ├── xinpianchang.py
│   ├── zhihu_video.py
│   ├── zuiyou_video.py
│   └── zuiyou_voice.py
├── misc.py
├── requirements.txt
├── screenshot
│   ├── example.gif
│   └── run.gif
├── utils.py
└── web
    ├── README.md
    ├── __init__.py
    ├── _response.py
    ├── app.py
    ├── config.py
    ├── error.py
    ├── example.env
    ├── funcs.py
    ├── log.py
    └── views.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 | .vim/coc-settings.json
131 | download/
132 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 wongxy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Update
2 |
3 | This is an old project that went unmaintained for a long time; the code quality and style leave much to be desired, but some of the scrapers still work. The current plan is to build a simple extraction API service with the FastAPI framework. It will remain simple and rough, but it is fine for learning or everyday use.
4 |
5 | Just switch to the [fastapi branch](https://github.com/xiyaowong/spiders/tree/fastapi).
6 |
7 | ---
8 |
9 | - #### These are all fairly simple scrapers. Experienced readers should understand them at a glance; beginners may still find a few things worth studying.
10 |
11 | - #### Details of the scraper files are here: [extractor](/extractor)
12 |
13 | ---
14 |
15 | ```shell
16 | pip3 install -r requirements.txt
17 | python3 extract.py
18 | ```
19 |
20 | You may also need to install Node.js (some extractors execute JavaScript through `execjs`)
21 |
22 | - #### screenshot
23 |
24 | 
25 |
26 | - #### release
27 |
28 | - #### **star**:star: & **fork** are welcome
29 |
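30 | ---
31 |
32 | The extractor modules can also be used directly from Python, together with the download helper that `extract.py` itself uses (a minimal sketch; the weibo link is a placeholder):
33 |
34 | ```python
35 | import utils
36 | from extractor import weibo
37 |
38 | data = weibo.get("https://m.weibo.cn/status/...")  # placeholder link
39 | if data.get("videos"):
40 |     # same keyword arguments extract.py passes to utils.download
41 |     utils.download(file_url=data["videos"][0],
42 |                    save_path="download/videos",
43 |                    file_name=data.get("title", "weibo"),
44 |                    file_type="mp4")
45 | ```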
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from datetime import datetime
4 | from queue import Queue
5 | from threading import Thread
6 |
7 | import utils
8 | from extractor import (acfun, baidutieba, bilibili, changya, douyin, haokan,
9 | ku6, kuaishou, kugou, kuwo, lizhiFM, lofter, migu_music,
10 | momo, music163, open163, pearvideo, pic58, pipigaoxiao,
11 | pipix, qianqian, qingshipin, qqmusic, quanminkge,
12 | qutoutiao, sing5, sohuTV, ted, tuchong, tudou, weibo,
13 | weishi, xiaokaxiu, xinpianchang, zhihu_video,
14 | zuiyou_voice)
15 | from misc import printTips
16 |
17 | here = os.path.abspath(os.path.dirname(__file__))
18 |
19 | crawlers = {
20 | 'acfun': acfun,
21 | 'tieba': baidutieba,
22 | 'bili': bilibili,
23 | 'changya': changya,
24 | 'douyin': douyin,
25 | 'haokan': haokan,
26 | 'ku6': ku6,
27 | 'chenzhongtech': kuaishou,
28 | 'kuaishou': kuaishou,
29 | 'kugou': kugou,
30 | 'kuwo': kuwo,
31 | 'lizhi': lizhiFM,
32 | 'lofter': lofter,
33 | 'music.163': music163,
34 | 'open.163': open163,
35 | 'pearvideo': pearvideo,
36 | 'ippzone': pipigaoxiao,
37 | 'pipix': pipix,
38 | 'music.taihe': qianqian,
39 | 'qingshipin': qingshipin,
40 | 'y.qq': qqmusic,
41 | 'kg': quanminkge,
42 | 'qutoutiao': qutoutiao,
43 | '5sing': sing5,
44 | 'weibo': weibo,
45 | 'weishi': weishi,
46 | 'xiaokaxiu': xiaokaxiu,
47 | 'xinpianchang': xinpianchang,
48 | 'zhihu': zhihu_video,
49 | 'zuiyou': zuiyou_voice,
50 | 'sohu': sohuTV,
51 | 'ted': ted,
52 | 'tudou': tudou,
53 | 'momo': momo,
54 | 'music.migu': migu_music,
55 | '58pic': pic58,
56 | 'tuchong': tuchong
57 | }
58 |
59 |
60 | class Task:
61 | def __init__(self, url, save_path='', file_name=None, file_type='unknown'):
62 | self.url = url
63 | self.save_path = save_path
64 | self.file_name = file_name or str(datetime.now())
65 | self.file_type = file_type
66 |
67 |
68 | def data2tasks(data: dict) -> list:
69 | title = data.get("title")
70 | author = data.get("author")
71 | audioName = data.get("audioName")
72 | videoName = data.get("videoName")
73 | imgs = data.get("imgs")
74 | audios = data.get("audios")
75 | videos = data.get("videos")
76 | text = data.get("text")
77 | msg = data.get("msg")
78 |
79 | if msg:
80 | print(msg)
81 | print()
82 | if text:
83 | print(text)
84 | print()
85 | tasks = []
86 | if imgs:
87 | img_tasks = [Task(img, 'download/images', file_type='jpg') for img in imgs]
88 | tasks.extend(img_tasks)
89 | if audios:
90 | file_name = (audioName or "") + "-" + (author or "")
91 | audio_tasks = [Task(audio, 'download/audios', file_name=file_name, file_type='mp3') for audio in audios]
92 | tasks.extend(audio_tasks)
93 | if videos:
94 | file_name = (videoName or title or "")
95 | video_tasks = [Task(video, 'download/videos', file_name=file_name, file_type='mp4') for video in videos]
96 | tasks.extend(video_tasks)
97 | return tasks
98 |
99 |
100 | @utils.retry(2)
101 | def dl(dl_queue: Queue):
102 | while not dl_queue.empty():
103 | task = dl_queue.get() # type: Task
104 | utils.download(file_url=task.url,
105 | save_path=task.save_path,
106 | file_name=task.file_name,
107 | file_type=task.file_type)
108 |
109 |
110 | def get_data(url):
111 | for c_name, c_func in crawlers.items():
112 | if c_name in url:
113 | data = c_func.get(url)
114 | print(data)
115 | return data
116 |     print(f'Link 【\033[31m{url}\033[0m】 is not supported')
117 | return None
118 |
119 |
120 | @utils.retry(2)
121 | def parse_urls(text: str) -> list:
122 | urls = re.findall(
123 | r"https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\.[-A-Za-z]+[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", text)
124 | return urls
125 |
126 |
127 | if __name__ == "__main__":
128 | printTips()
129 | while True:
130 |         what = input("Enter a link starting with http (enter anything without a link to quit): ")
131 | urls = parse_urls(what)
132 | if not urls:
133 | print("bye~")
134 | break
135 | print(f"""
136 | ╭━━━━━━━━━━━━━╮
137 | │ {len(urls)} link(s) in total │
138 | ╰━━━━━━━━━━━━━╯
139 | """)
140 | all_task = []
141 | for idx, url in enumerate(urls):
142 |         print(f"Parsing link {idx+1}: 【{url}】")
143 | data = get_data(url)
144 | if data:
145 | all_task.extend(data2tasks(data))
146 |
147 | queue = Queue(maxsize=100)
148 | for t in all_task:
149 | queue.put(t)
150 |
151 | print()
152 | print(f'{len(all_task)} tasks!')
153 | print()
154 | ts = [Thread(target=dl, args=(queue, )) for _ in range(min(len(all_task), 6))]
155 | for t in ts:
156 | t.start()
157 |
158 | for t in ts:
159 | t.join()
160 |
--------------------------------------------------------------------------------
/extractor/README.md:
--------------------------------------------------------------------------------
1 | ### A collection of small scrapers
2 |
3 | ---
4 |
5 | Each platform has its own file, and every file exposes a `get(url: str)` function with the uniform shape below (f-strings are used, so Python 3.6+ is required):
6 |
7 | ```python
8 | """
9 | Args:
10 | url: str
11 | Returns:
12 | data: dict
13 | |_ {
14 | title: str,
15 | author: str,
16 | audioName: str,
17 | videoName: str,
18 | imgs: List[str],
19 | audios: List[str],
20 | videos: List[str],
21 | text: str,
22 | msg: str
23 | }
24 | Tips:
25 |     Fields in data are present only when the corresponding content was actually scraped; the exception is msg (which is not very useful anyway)
26 |     ☆ data is returned even when scraping fails, and it is not necessarily empty (see the usage sketch at the end of this page)
27 | """
28 | ```
29 |
30 | # Input links are assumed to be valid :grin:
31 |
32 | ---
33 |
34 | | Platform | Content | Status |
35 | | :--------------------: | :------------------: | :----------------: |
36 | | bilibili(哔哩哔哩) | cover, video | :white_check_mark: |
37 | | changya(唱鸭) | audio | :white_check_mark: |
38 | | douyin(抖音) | watermark-free video | :white_check_mark: |
39 | | kugou(酷狗) | audio | :white_check_mark: |
40 | | kuwo(酷我) | audio | :white_check_mark: |
41 | | lizhiFM(荔枝 FM) | audio | :white_check_mark: |
42 | | music163(网易云音乐) | audio, video, MV | :white_check_mark: |
43 | | qqmusic(QQ 音乐) | audio | :white_check_mark: |
44 | | pipigaoxiao(皮皮搞笑) | watermark-free video | :white_check_mark: |
45 | | quanminkge(全民 K 歌) | audio or video | :white_check_mark: |
46 | | weibo(微博) | video | :white_check_mark: |
47 | | weishi(微视) | watermark-free video | :white_check_mark: |
48 | | zhihu(知乎) | video | :white_check_mark: |
49 | | zuiyou_voice(最右) | audio (voice-post comments) | :white_check_mark: |
50 | | zuiyou_video(最右) | video | :white_check_mark: |
51 | | qianqian(千千音乐) | audio | :white_check_mark: |
52 | | 5sing(5sing) | audio | :white_check_mark: |
53 | | pipix(皮皮虾) | watermark-free video | :white_check_mark: |
54 | | qingshipin(轻视频) | watermark-free video | :white_check_mark: |
55 | | qutoutiao(趣头条) | video | :dash: |
56 | | ku6(酷 6 网) | video | :white_check_mark: |
57 | | lofter(乐乎) | video | :white_check_mark: |
58 | | open163(网易公开课) | free video | :white_check_mark: |
59 | | xinpianchang(新片场) | video | :white_check_mark: |
60 | | baidutieba(百度贴吧) | video | :white_check_mark: |
61 | | kuaishou(快手) | watermark-free video, long-image video | :white_check_mark: |
62 | | acfun(AcFun 弹幕网) | video | :white_check_mark: |
63 | | haokan(百度好看视频) | video | :white_check_mark: |
64 | | pearvideo(梨视频) | video | :white_check_mark: |
65 | | xiaokaxiu(小咖秀) | watermark-free video | :white_check_mark: |
66 | | sohuTV(搜狐视频) | video | :white_check_mark: |
67 | | ted(TED) | video | :white_check_mark: |
68 | | tudou(土豆视频) | video | :white_check_mark: |
69 | | quanminxsp(全民小视频) | video | :white_check_mark: |
70 | | lequ(乐趣) | background GIF, audio | :white_check_mark: |
71 | | peiyinxiu(配音秀) | video | :white_check_mark: |
72 | | tuchong(图虫) | images | :white_check_mark: |
73 | | changba(唱吧) | video | :white_check_mark: |
74 | | migu(咪咕音乐) | audio | :white_check_mark: |
75 | | momo(陌陌) | video | :white_check_mark: |
76 | | 58pic(千图网) | images | :white_check_mark: |
77 | | qmgx(全民搞笑) | watermark-free video | :white_check_mark: |
78 |
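79 | For example, a caller just branches on whichever keys came back (a minimal sketch; the BV id in the link is illustrative):
80 |
81 | ```python
82 | from extractor import bilibili
83 |
84 | data = bilibili.get("https://www.bilibili.com/video/BV1xx411c7xx")  # illustrative link
85 | if "videos" in data:
86 |     # videoName and videos are only present when extraction succeeded
87 |     print(data.get("videoName"), data["videos"][0])
88 | else:
89 |     print(data.get("msg", "nothing extracted"))
90 | ```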
--------------------------------------------------------------------------------
/extractor/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = "wongxy github:xiyaowong"
2 | __all__ = [
3 | "bilibili",
4 | "changya",
5 | "douyin",
6 | "kugou",
7 | "kuwo",
8 | "lizhiFM",
9 | "music163",
10 | "pipigaoxiao",
11 | "quanminkge",
12 | "weibo",
13 | "zhihu_video",
14 | "weishi",
15 | "zuiyou_voice",
16 | "zuiyou_video",
17 | "qqmusic",
18 | "qianqian",
19 | "sing5",
20 | "pipix",
21 | "qingshipin",
22 | "qutoutiao",
23 | "ku6",
24 | "lofter",
25 | "open163",
26 | "xinpianchang",
27 | "baidutieba",
28 | "kuaishou",
29 | "acfun",
30 | "haokan",
31 | "pearvideo",
32 | "xiaokaxiu",
33 | "sohuTV",
34 | "ted",
35 | "tudou",
36 | "quanminxsp",
37 | "lequ",
38 | "peiyinxiu",
39 | "tuchong",
40 | "changba",
41 | "migu_music",
42 | "momo",
43 | "pic58",
44 | "qmgx"
45 | ]
46 |
--------------------------------------------------------------------------------
/extractor/acfun.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | title、videos
9 | """
10 | data = {}
11 | headers = {
12 | "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
13 | }
14 | info_url = "https://api-new.acfunchina.com/rest/app/play/playInfo/mp4?videoId={}&resourceId={}&resourceType=2&mkey=AAHewK3eIAAyMjAzNTI2NDMAAhAAMEP1uwS3Vi7NYAAAAJumF4MyTTFh5HGoyjW6ZpdjKymALUy9jZbsMTBVx-F10EhxyvpMtGQbBCYipvkMShM3iMNwbMd9DM6r2rnOYRVEdr6MaJS4yxxlA_Sl3JNWup57qBCQzOSC7SZnbEsHTQ%3D%3D&market=xiaomi&product=ACFUN_APP&sys_version=10&app_version=6.20.0.915&boardPlatform=sdm845&sys_name=android&socName=UNKNOWN&appMode=0"
15 | # info_url = "https://m.acfun.cn/rest/mobile-direct/play/playInfo/singleQuality?videoId={}&resourceId={}&resourceType=2&mkey=AAHewK3eIAAyMjA5NTQ0MDACARAAMEP1uwPvjQhfQAAAAIAq7FtjRH%2Fn9rSMzs1AUNhmIS6eARtddADGgoGewjnABMg39tddqp9dTUq%2Ffd7MBisH5JpVc1bpf64a%2Bz3qrdI%3D"
16 |
17 | # get videoId, resourceIds
18 |     re_title = r'<title>(.*?)</title>'
19 | re_videoId = r'"vid":"(\d+)",'
20 | re_resourceId = r'"ac":"(\d+)",'
21 |
22 | try:
23 | rep_html = requests.get(url, headers=headers, timeout=10)
24 |
25 | title = re.findall(re_title, rep_html.text)[0]
26 | videoId = re.findall(re_videoId, rep_html.text)[0]
27 | resourceId = re.findall(re_resourceId, rep_html.text)[0]
28 |
29 | rep_info = requests.get(info_url.format(videoId, resourceId), headers=headers, timeout=10)
30 |
31 | video = rep_info.json()["playInfo"]["streams"][0]["playUrls"][0]
32 | except (IndexError, TypeError):
33 |         data["msg"] = "failed to fetch"
34 | else:
35 | data["title"] = title
36 | data["videos"] = [video]
37 |
38 | return data
39 |
40 |
41 | if __name__ == "__main__":
42 | url = "https://m.acfun.cn/v/?ac=14134176&part=2"
43 | print(get(url))
44 |
--------------------------------------------------------------------------------
/extractor/baidutieba.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | videos
9 | """
10 | data = {}
11 | headers = {
12 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
13 | }
14 | rep = requests.get(url, headers=headers, timeout=10)
15 | if rep.status_code == 200:
16 | data["videos"] = re.findall(r'data-video="(.*?)"', rep.text)
17 | else:
18 |         data["msg"] = "failed to fetch"
19 |
20 | return data
21 |
22 |
23 | if __name__ == "__main__":
24 | # url = "https://tieba.baidu.com/p/6098286801?share=9105&fr=share&sfc=copy&client_type=2&client_version=11.3.8.2&st=1585294971&unique=190E4CEC3908756B412C7ABAE54C772F&red_tag=2618234446"
25 | url = input("url: ")
26 | print(get(url))
27 |
--------------------------------------------------------------------------------
/extractor/bilibili.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | imgs、videos
9 | """
10 | data = {}
11 | headers = {
12 | "user-agent":
13 | "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
14 | "Referer": "https://www.bilibili.com/",
15 | }
16 |
17 | av_number_pattern = r'(BV[0-9a-zA-Z]*)'
18 | cover_pattern = r"readyPoster: '(.*?)',"
19 | video_pattern = r"readyVideoUrl: '(.*?)',"
20 | title_pattern = r'title":"(.*?)",'
21 |
22 | av = re.findall(av_number_pattern, url)
23 | if av:
24 | av = av[0]
25 | else:
26 |         data["msg"] = "the link may be invalid: no BV id could be matched"
27 | return data
28 | url = f"https://www.bilibili.com/video/{av}"
29 |
30 | with requests.get(url, headers=headers, timeout=10) as rep:
31 | if rep.status_code == 200:
32 | cover_url = re.findall(cover_pattern, rep.text)
33 | if cover_url:
34 | cover_url = cover_url[0]
35 | if '@' in cover_url:
36 | cover_url = cover_url[:cover_url.index('@')]
37 | data["imgs"] = ['https:' + cover_url]
38 |
39 | video_url = re.findall(video_pattern, rep.text)
40 | title_text = re.findall(title_pattern, rep.text)
41 | if video_url:
42 | video_url = video_url[0]
43 | data["videos"] = [video_url]
44 | if title_text:
45 | data["videoName"] = title_text[0]
46 | else:
47 |             data["msg"] = "failed to fetch"
48 | return data
49 |
50 |
51 | if __name__ == "__main__":
52 | print(get(input("url: ")))
53 |
--------------------------------------------------------------------------------
/extractor/changba.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import execjs
4 | import requests
5 |
6 | js_code = """l=new Array(-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1);function u(t){var e,o,n,a,i,r,s;for(r=t.length,i=0,s="";i>4);do{if(61==(n=255&t.charCodeAt(i++)))return s;n=l[n]}while(i>2);do{if(61==(a=255&t.charCodeAt(i++)))return s;a=l[a]}while(i"""
7 |
8 | js = execjs.compile(js_code)
9 |
10 | def get(url: str) -> dict:
11 | """
12 | videos
13 | """
14 | data = {}
15 | headers = {
16 | "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
17 | }
18 | rep = requests.get(url, headers=headers, timeout=10)
19 | if rep.status_code != 200:
20 |         return {"msg": "failed to fetch"}
21 |
22 | enc_video_url = re.findall(r"video_url: '(.*?)',", rep.text)[0]
23 | video_url = "https:" + js.call("u", (enc_video_url,))
24 | data["videos"] = [video_url]
25 | return data
26 |
27 |
28 | if __name__ == "__main__":
29 | print(get(input("url: ")))
30 |
--------------------------------------------------------------------------------
/extractor/changya.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | author、audioName、audios
9 | """
10 | data = {}
11 | headers = {
12 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
13 | }
14 | audio_url_pattern = r'', html)
24 | data["author"] = author
25 | data["audioName"] = audioName
26 | data["imgs"] = imgs
27 | data["audios"] = audios
28 | except Exception:
29 |         data["msg"] = "failed to fetch"
30 |
31 | return data
32 |
33 |
34 | if __name__ == "__main__":
35 | url = "https://api.bestdjb.com/promote/song-share/6477f04370cc22e7d9c2d3ac4265a92a?app_version=1.4.3"
36 | print(get(url))
37 |
--------------------------------------------------------------------------------
/extractor/lizhiFM.py:
--------------------------------------------------------------------------------
1 | # from urllib.parse import urlparse
2 | import re
3 |
4 | import requests
5 |
6 |
7 | def get(url: str) -> dict:
8 | """
9 | author、audioName、audios
10 | """
11 | data = {}
12 | headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"}
13 | info_url = "https://m.lizhi.fm/vodapi/voice/info/{id}"
14 |
15 | # path = urlparse(url).path
16 | # voiceId = path.split("/")[-1]
17 | voiceId = re.findall(r"/(\d{1,})", url)
18 | if not voiceId:
19 |         data["msg"] = "invalid link; parsing failed"
20 | return data
21 | else:
22 | voiceId = voiceId[-1]
23 |
24 | with requests.get(info_url.format(id=voiceId), headers=headers, timeout=10) as rep:
25 | if rep.status_code == 200 and rep.json().get("code") == 0:
26 | info = rep.json()
27 | userName = info.get("data").get("userVoice").get("userInfo").get("name")
28 | voiceName = info.get("data").get("userVoice").get("voiceInfo").get("name")
29 |             voiceUrl = info.get("data").get("userVoice").get("voicePlayProperty").get("trackUrl")
30 | data["author"] = userName
31 | data["audioName"] = voiceName
32 | data["audios"] = [voiceUrl]
33 | else:
34 |             data["msg"] = "parsing failed"
35 |
36 | return data
37 |
38 |
39 |
40 |
41 | if __name__ == "__main__":
42 | url = input("url: ")
43 | print(get(url))
--------------------------------------------------------------------------------
/extractor/lofter.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | videos
9 | """
10 | data = {}
11 | rep = requests.get(url, timeout=10)
12 | if rep.status_code == 200:
13 |         data["videos"] = re.findall(r'<video src="(.*?)"', rep.text)
--------------------------------------------------------------------------------
/extractor/migu_music.py:
--------------------------------------------------------------------------------
30 |     author = ",".join(singerName) if len(singerName) > 1 else singerName[0]
31 |
32 | # audioName
33 | audioName = json["songName"]
34 |
35 | # contentId
36 | c_item = json.get("qq") # type:dict
37 |
38 | if not c_item:
39 | return {"msg": "获取失败"}
40 | contentId = c_item["productId"]
41 |
42 | # toneFlag
43 | toneFlag = "HQ" if json["hasHQqq"] == "1" else "LQ"
44 |
45 | video_url = player_url.format(copyrightId=copyrightId,
46 | contentId=contentId,
47 | toneFlag=toneFlag,
48 | resourceType=2)
49 |
50 | data["author"] = author
51 | data["audioName"] = audioName
52 | data["videos"] = [video_url]
53 |
54 | return data
55 |
56 |
57 | if __name__ == "__main__":
58 | url = "http://music.migu.cn/v3/music/song/69910422841"
59 | print(get(url))
60 |
--------------------------------------------------------------------------------
/extractor/momo.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str):
7 | """
8 | title、imgs、videos
9 | """
10 | data = {}
11 | headers = {
12 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
13 | }
14 | api = "https://m.immomo.com/inc/microvideo/share/profiles"
15 |
16 | ar = re.findall(r'/(ar.*?)\.html', url)
17 | if not ar:
18 |         return {"msg": "failed"}
19 | ar = ar[0]
20 |
21 | payload = {
22 | "feedids": ar,
23 | "name": "",
24 | "avatar": "",
25 | }
26 |
27 | rep = requests.post(api, data=payload, headers=headers, timeout=6)
28 | if rep.status_code == 200 and rep.json()["ec"] == 200:
29 | info = rep.json()["data"]
30 | title = info["list"][0]["content"]
31 | img = info["list"][0]["video"]["cover"]["l"]
32 | video = info["list"][0]["video"]["video_url"]
33 |
34 | data["title"] = data["videoName"] = title
35 | data["imgs"] = [img]
36 | data["videos"] = [video]
37 | else:
38 |         data["msg"] = "failed"
39 |
40 | return data
41 |
42 |
43 | if __name__ == "__main__":
44 | from pprint import pprint
45 | url = "https://m.immomo.com/s/moment/new-share-v2/ar8422649104.html"
46 | pprint(get(url))
47 |
--------------------------------------------------------------------------------
/extractor/music163/__init__.py:
--------------------------------------------------------------------------------
1 | from .music163 import Wangyiyun
2 |
3 |
4 | def get(url: str) -> dict:
5 | """
6 |     audios or videos
7 | """
8 | data = {}
9 | wangyiyun = Wangyiyun()
10 | resource_url = wangyiyun.get(url)
11 | if not resource_url:
12 |         return {"msg": "failed to fetch"}
13 | if "mv" in url or "video" in url:
14 | data["videos"] = [resource_url]
15 | elif "song" in url:
16 | data["audios"] = [resource_url]
17 | return data
18 |
19 |
20 | __all__ = ["get"]
21 |
--------------------------------------------------------------------------------
/extractor/music163/encrypt.py:
--------------------------------------------------------------------------------
1 | # How it works: https://www.zhihu.com/question/36081767 ; the code is copied directly from: https://github.com/CharlesPikachu/Music-Downloader
2 | import base64
3 | import codecs
4 | import json
5 | import os
6 |
7 | from Crypto.Cipher import AES
8 |
9 |
10 | class Cracker():
11 | modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
12 | nonce = '0CoJUm6Qyw8W8jud'
13 | pubKey = '010001'
14 |
15 | @classmethod
16 | def get(cls, text):
17 | text = json.dumps(text)
18 | secKey = cls._createSecretKey(16)
19 | encText = cls._aesEncrypt(cls._aesEncrypt(text, cls.nonce), secKey)
20 | encSecKey = cls._rsaEncrypt(secKey, cls.pubKey, cls.modulus)
21 | post_data = {'params': encText, 'encSecKey': encSecKey}
22 | return post_data
23 |
24 | @classmethod
25 | def _aesEncrypt(cls, text, secKey):
26 | pad = 16 - len(text) % 16
27 | if isinstance(text, bytes):
28 | text = text.decode('utf-8')
29 | text = text + str(pad * chr(pad))
30 | secKey = secKey.encode('utf-8')
31 |         encryptor = AES.new(secKey, AES.MODE_CBC, b'0102030405060708')
32 | text = text.encode('utf-8')
33 | ciphertext = encryptor.encrypt(text)
34 | ciphertext = base64.b64encode(ciphertext)
35 | return ciphertext
36 |
37 | @classmethod
38 | def _rsaEncrypt(cls, text, pubKey, modulus):
39 | text = text[::-1]
40 | rs = int(codecs.encode(text.encode('utf-8'), 'hex_codec'), 16)**int(pubKey, 16) % int(modulus, 16)
41 | return format(rs, 'x').zfill(256)
42 |
43 | @classmethod
44 | def _createSecretKey(cls, size):
45 | return (''.join(map(lambda xx: (hex(ord(xx))[2:]), str(os.urandom(size)))))[0:16]
46 |
47 |
48 | if __name__ == "__main__":
49 | print(Cracker.get("Hello World"))
50 |
--------------------------------------------------------------------------------
/extractor/music163/music163.py:
--------------------------------------------------------------------------------
1 | import re
2 | from urllib.parse import unquote
3 |
4 | import requests
5 |
6 | from .encrypt import Cracker
7 |
8 |
9 | class Wangyiyun():
10 | def __init__(self):
11 | self.headers = {
12 | 'Referer': 'https://music.163.com/',
13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.32 Safari/537.36'
14 | }
15 | self.music_url = 'http://music.163.com/weapi/song/enhance/player/url?csrf_token='
16 | self.mv_url = "https://music.163.com/weapi/song/enhance/play/mv/url?csrf_token="
17 |
18 | def get(self, url):
19 | """
20 |         Return the direct resource URL
21 | """
22 |
23 | if "video" in url:
24 | return self.get_video(url)
25 |
26 | id = self.get_id(url)
27 | if "mv" in url:
28 | params = {"id": id, "r": "1080", "csrf_token": ""}
29 | data = self.__postRequests(self.mv_url, params)
30 | if data:
31 | return data["data"]["url"]
32 | elif "song" in url:
33 | params = {'ids': [int(id)], 'br': 320000, 'csrf_token': ''}
34 | data = self.__postRequests(self.music_url, params)
35 | if data:
36 | return data["data"][0]["url"]
37 | return None
38 |
39 | def get_video(self, url):
40 | id = self.get_id(url)
41 | url = f"http://music.163.com/video/{id}/"
42 | rep = requests.get(url, headers=self.headers, timeout=6)
43 | if rep.status_code == 200:
44 | encoded_url = re.findall(r'', rep.text)[0]
45 | return unquote(encoded_url)
46 | return None
47 |
48 |     # extract the resource id from the URL
49 | def get_id(self, raw_url) -> str:
50 | pattern1 = re.compile(r'\?id=(\w+)')
51 | pattern2 = re.compile(r'song/(\w+)/')
52 | pattern3 = re.compile(r'mv/(\w+)/')
53 | pattern4 = re.compile(r'video/(\w+)/')
54 | if "?id" in raw_url:
55 | id = re.findall(pattern1, raw_url)
56 | elif "song" in raw_url:
57 | id = re.findall(pattern2, raw_url)
58 | elif "mv" in raw_url:
59 | id = re.findall(pattern3, raw_url)
60 | elif "video" in raw_url:
61 | id = re.findall(pattern4, raw_url)
62 |         else:
63 |             return None
64 |         return id[0] if id else None
65 |
66 | def __postRequests(self, url, params, timeout=6):
67 | post_data = Cracker.get(params)
68 | rep = requests.post(url,
69 | data=post_data,
70 | timeout=timeout,
71 | headers=self.headers)
72 | if rep.json()['code'] == 200:
73 | return rep.json()
74 | return None
75 |
--------------------------------------------------------------------------------
/extractor/open163.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=W0123
2 | import re
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | videos
9 | """
10 | data = {}
11 | data["videos"] = []
12 | headers = {
13 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
14 | }
15 | re_url = r'mid:(.*?),.*?mp4SdUrlOrign:(.*?),.*?mp4HdUrlOrign:(.*?),.*?mp4ShdUrlOrign:(.*?),'
16 | rep = requests.get(url, headers=headers, timeout=10)
17 | items = re.findall(re_url, rep.text)
18 | for item in items:
19 |         # iterate in reverse so the highest quality wins
20 | for video_url in item[::-1]: # type: str
21 | # print(url)
22 | if "http" in video_url:
23 | video_url = eval(video_url).replace("\\u002F", "/")
24 | data["videos"].append(video_url)
25 | break
26 | return data
27 |
28 |
29 | if __name__ == "__main__":
30 | url = "http://open.163.com/newview/movie/free?pid=M8LI1JCE6&mid=M8LI3BQ60"
31 | print(get(url))
32 |
--------------------------------------------------------------------------------
/extractor/pearvideo.py:
--------------------------------------------------------------------------------
1 | # hdflvUrl="",sdflvUrl="",hdUrl="",sdUrl="",ldUrl="",srcUrl="https://video.pearvideo.com/mp4/adshort/20200328/cont-1665047-11947733-122441_adpkg-ad_hd.mp4",
2 | # data-title="奥运推迟后东京新冠确诊数翻倍,《纽约时报》发文质疑" data-summary="从3月23日起,东京地区的新冠病毒确诊数就连续4天上涨。在24日官宣东京奥运推迟之后,第二天确诊数更是直接翻倍。《纽约时报》写了一篇文章,列出了各种数据,质疑此前东京为了奥运会而牺牲检测。"
3 |
4 | import re
5 | import requests
6 |
7 |
8 | def get(url: str) -> dict:
9 | """
10 | title、videos、text
11 | """
12 | data = {}
13 | headers = {
14 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
15 | }
16 | try:
17 | rep = requests.get(url, headers=headers, timeout=10)
18 | data["title"], data["text"] = re.findall(r'data-title="(.*?)" data-summary="(.*?)"', rep.text)[0]
19 | data["videos"] = re.findall(r'srcUrl="(.*?\.mp4)",', rep.text)
20 | except (ConnectionError, IndexError, TypeError):
21 |         data["msg"] = "failed to fetch"
22 |
23 | return data
24 |
25 |
26 | if __name__ == "__main__":
27 | url = "https://www.pearvideo.com/video_1664989"
28 | print(get(url))
29 |
--------------------------------------------------------------------------------
/extractor/peiyinxiu.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | title、videos
9 | """
10 | data = {}
11 | headers = {
12 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
13 | }
14 | rep = requests.get(url, headers=headers, timeout=10)
15 | if rep.status_code != 200:
16 |         return {"msg": "failed to fetch"}
17 | html = rep.text
18 | data["title"] = re.findall(r'data-title="(.*?)"', html)[0]
19 | data["videos"] = re.findall(r"\sfilmurl: '(.*?)',", html)
20 | return data
21 |
22 |
23 | if __name__ == "__main__":
24 | url = "http://peiyinxiu.com/m/127066455"
25 | print(get(url))
26 |
--------------------------------------------------------------------------------
/extractor/pic58.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """https://www.58pic.com/newpic/*.html
8 |
9 | imgs
10 | """
11 | rep = requests.get(url, timeout=6)
12 | if not rep.ok:
13 |         return {"msg": "failed"}
14 | pre_url = re.findall(r'', rep.text)
15 | if not pre_url:
16 |         return {"msg": "failed"}
17 | pre_url = pre_url[0] # type: str
18 | img_url = pre_url.replace("preview.qiantucdn.com", "https://pic.qiantucdn.com")
19 |     return {"imgs": [img_url], "msg": f"set the Referer header to {url} when downloading"}
20 |
21 |
22 | if __name__ == "__main__":
23 | # url = input("url: ")
24 | url = "https://www.58pic.com/newpic/34673009.html"
25 | print(get(url))
26 |
--------------------------------------------------------------------------------
/extractor/pipigaoxiao.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | import requests
5 |
6 |
7 | def get(url: str) -> dict:
8 | """
9 | videos
10 | """
11 | data = {}
12 | headers = {
13 | "Host": "share.ippzone.com",
14 | "Connection": "keep-alive",
15 | "Content-Length": "45",
16 | "Origin": "http://share.ippzone.com",
17 | "User-Agent":
18 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
19 | "Content-Type": "text/plain;charset=UTF-8",
20 | "Accept": "*/*",
21 | "Referer": "http://share.ippzone.com/",
22 | "Accept-Encoding": "gzip, deflate",
23 | "Accept-Language": "zh-CN,zh;q=0.9",
24 | }
25 |
26 | post_url = "http://share.ippzone.com/ppapi/share/fetch_content"
27 |
28 | pid = re.findall(r"/(\d{1,})", url)
29 | if not pid:
30 |         data["msg"] = "invalid link: no usable data"
31 | return data
32 | else:
33 | pid = int(pid[0])
34 |
35 | post_data = {
36 | "pid": pid,
37 | "type": "post",
38 | }
39 |
40 | with requests.post(post_url, headers=headers, data=json.dumps(post_data), timeout=10) as rep:
41 | if rep.status_code == 200 and rep.json().get("ret") == 1:
42 | id = rep.json().get("data").get("post").get("imgs")[0].get("id")
43 | play_url = rep.json().get('data').get('post').get('videos').get(str(id)).get('url')
44 | data["videos"] = [play_url]
45 | else:
46 |             data["msg"] = "failed to fetch the resource; please check the input"
47 |
48 | return data
49 |
50 |
51 | if __name__ == "__main__":
52 | print(get(input("url: ")))
--------------------------------------------------------------------------------
/extractor/pipix.py:
--------------------------------------------------------------------------------
1 | # author: wongxy
2 | # --------------
3 | # https://h5.pipix.com/item/******************
4 | import re
5 | import requests
6 |
7 |
8 | def get(url: str) -> dict:
9 | """
10 |     title, videos
11 | """
12 | data = {}
13 | headers = {
14 | "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
15 | }
16 | item_id = re.findall(r"item/(\d+)", url)
17 | if not item_id:
18 |         return {"msg": "failed to fetch"}
19 | item_id = item_id[0]
20 | info_url = f"https://h5.pipix.com/bds/webapi/item/detail/?item_id={item_id}&source=share"
21 | with requests.get(info_url, headers=headers, timeout=10) as rep:
22 | if rep.status_code != 200 or rep.json().get("status_code") != 0:
23 |             return {"msg": "failed to fetch"}
24 | info = rep.json()["data"]["item"]
25 | data["title"] = info["share"]["title"]
26 |         data["videos"] = [info["origin_video_download"]["url_list"][0]["url"]]
27 |
28 |
29 |
30 | return data
31 |
32 |
33 | if __name__ == "__main__":
34 | print(get(input("url: ")))
--------------------------------------------------------------------------------
/extractor/qianqian.py:
--------------------------------------------------------------------------------
1 | # qianqian music
2 | # music.taihe.com
3 | import re
4 | import requests
5 |
6 |
7 | def get(url: str) -> dict:
8 | """
9 | url sample: http://music.taihe.com/song/********
10 |
11 | author、audioName、imgs、audios
12 | """
13 | data = {}
14 | headers = {
15 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
16 | }
17 | songinfo_format_url = "http://musicapi.taihe.com/v1/restserver/ting?method=baidu.ting.song.playAAC&format=jsonp&songid={songid}&from=web"
18 |
19 | songid = re.findall(r"song/(\d+)", url)
20 | if not songid:
21 |         data["msg"] = "could not fetch valid data"
22 | return data
23 | songid = songid[0]
24 | songinfo_url = songinfo_format_url.format(songid=songid)
25 | with requests.get(songinfo_url, headers=headers, timeout=10) as rep:
26 | if rep.status_code != 200:
27 |             data["msg"] = "could not fetch valid data"
28 | return data
29 | result = rep.json()
30 | data["author"] = result["songinfo"]["artist"]
31 | data["audioName"] = result["songinfo"]["title"]
32 | data["imgs"] = [result["songinfo"]["album_1000_1000"]]
33 | data["audios"] = [result["bitrate"]["show_link"] or result["bitrate"]["file_link"]]
34 |
35 | return data
36 |
37 |
38 | if __name__ == "__main__":
39 | import pprint
40 | pprint.pprint(get(input("url: ")))
--------------------------------------------------------------------------------
/extractor/qingshipin.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 |
4 | def get(url: str):
5 | """
6 | author、title、imgs、videos
7 | """
8 | data = {}
9 | headers = {
10 |         "User-Agent":
11 |         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
12 |         "(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
13 |     }
14 | detail_url = url.replace("video/?", "bbq/app-bbq/sv/detail?sv")
15 | with requests.get(detail_url, headers=headers, timeout=10) as rep:
16 |
17 | if rep.status_code != 200:
18 | return {"msg": "error occurred!"}
19 |
20 | json = rep.json()
21 | if json["code"] != 0:
22 | return {"msg": "error occurred!"}
23 |
24 | author = json["data"]["user_info"]["uname"]
25 | title = json["data"]["title"]
26 | imgs = [json["data"]["cover_url"]]
27 | videos = [json["data"]["play"]["url"]]
28 |
29 | data["author"] = author
30 | data["title"] = title
31 | data["imgs"] = imgs
32 | data["videos"] = videos
33 | return data
34 |
35 |
36 | if __name__ == "__main__":
37 | url = input("url: ")
38 | print(get(url))
39 |
--------------------------------------------------------------------------------
/extractor/qmgx.py:
--------------------------------------------------------------------------------
1 | """
2 | 全民搞笑 https://longxia.music.xiaomi.com/share/video/******
3 | """
4 | import re
5 |
6 | import requests
7 |
8 |
9 | def get(url: str) -> dict:
10 | """
11 | title、videoName、videos
12 | """
13 | data = {}
14 | vid = re.findall(r'video/(\d+)', url)
15 | if vid:
16 | api = 'https://longxia.music.xiaomi.com/api/share?contentType=video&contentId={}'.format(vid[0])
17 | headers = {
18 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
19 | }
20 | rep = requests.get(api, headers=headers, timeout=5)
21 | if rep.status_code == 200 and rep.json()['code'] == 200:
22 | info = rep.json()['data']['videoInfo']['videoInfo']
23 | data['title'] = data['videoName'] = info['desc']
24 | data['videos'] = [info['url']]
25 | return data
26 | return {'msg': 'failed'}
27 |
28 |
29 | if __name__ == "__main__":
30 | print(get('https://longxia.music.xiaomi.com/share/video/6624743459453734912?sharerUserId'))
31 |
--------------------------------------------------------------------------------
/extractor/qqmusic.py:
--------------------------------------------------------------------------------
1 | import re
2 | import json
3 |
4 | import requests
5 |
6 |
7 | def get(url: str):
8 | """
9 | author、audioName、audios
10 | """
11 | data = {}
12 | ios_headers = {
13 | "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
14 | "referer": "http://y.qq.com"
15 | }
16 |
17 | # find: songid、songmid and author、audioName
18 | with requests.get(url, headers=ios_headers, timeout=10) as rep:
19 | if rep.status_code != 200:
20 | return {"msg": "链接无效"}
21 | html = rep.text
22 | songid = re.findall(r'songid":(\d+),', html)
23 | songmid = re.findall(r'"songmid":"(.*?)",', html)
24 |         if not (songid and songmid):
25 | return {"msg": "提取重要信息失败"}
26 | songid = songid[0]
27 | songmid = songmid[0]
28 | data["audioName"] = re.findall(r'"songname":"(.*?)"', html)[0]
29 | data["author"] = re.findall(r'"name":"(.*?)",', html)[0]
30 |
31 | # vkey
32 | vkey_url = 'https://u.y.qq.com/cgi-bin/musicu.fcg'
33 | params = {
34 | 'data': json.dumps({"req": {"module": "CDN.SrfCdnDispatchServer", "method": "GetCdnDispatch", "param": {"guid": "3982823384", "calltype": 0, "userip": ""}}, "req_0": {"module": "vkey.GetVkeyServer", "method": "CgiGetVkey", "param": {"guid": "3982823384", "songmid": [songmid], "songtype": [0], "uin": "0", "loginflag": 1, "platform": "20"}}, "comm": {"uin": 0, "format": "json", "ct": 24, "cv": 0}})
35 | }
36 | with requests.get(vkey_url, params=params, headers=ios_headers, timeout=10) as rep:
37 |         if rep.json()["code"] != 0 or rep.json()['req_0']['code'] != 0:
38 | return {"msg": "提取重要信息失败"}
39 | data["audios"] = [
40 | "https://isure.stream.qqmusic.qq.com/{}".format(rep.json()['req_0']['data']['midurlinfo'][0]['purl'])
41 | ]
42 |
43 | return data
44 |
45 |
46 | if __name__ == "__main__":
47 | # print(get(input("url: ")))
48 | url = 'https://y.qq.com/n/yqq/song/003tdyG9003JqW.html'
49 | print(get(url))
50 |
51 |
52 | # "A000", "ape", 800
53 | # "F000", "flac", 800
54 | # "M800", "mp3", 320
55 | # "C400", "m4a", 128
56 | # "M500", "mp3", 128
57 |
--------------------------------------------------------------------------------
/extractor/quanminkge.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url:str) -> dict:
7 | '''
8 | author、audioName、audios、videos
9 | '''
10 | data = {}
11 | headers = {
12 | "accept":
13 | "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
14 | "accept-encoding":
15 | "gzip, deflate, br",
16 | "accept-language":
17 | "zh-CN,zh;q=0.9",
18 | "cache-control":
19 | "max-age=0",
20 | "sec-fetch-mode":
21 | "navigate",
22 | "sec-fetch-site":
23 | "none",
24 | "upgrade-insecure-requests":
25 | "1",
26 | "user-agent":
27 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
28 | }
29 | singer_pattern = r',"nick":"(.*?)",'
30 | song_name_pattern = r'"song_name":"(.*?)",'
31 | audio_pattern = r'"playurl":"(.*?)",'
32 | video_pattern = r',"playurl_video":"(.*?)",'
33 |
34 | with requests.get(url=url, headers=headers, timeout=10) as rep:
35 | if rep.status_code == 200:
36 | html = rep.text
37 | singer = re.findall(singer_pattern, html)
38 | song_name = re.findall(song_name_pattern, html)
39 | audio_url = re.findall(audio_pattern, html)
40 | video_url = re.findall(video_pattern, html)
41 | if singer: data["author"] = singer[0]
42 | if song_name: data["audioName"] = song_name[0]
43 | if audio_url: data["audios"] = [url for url in audio_url if url != ""]
44 | if video_url: data["videos"] = [url for url in video_url if url != ""]
45 | else:
46 |             data["msg"] = "failed to fetch"
47 |
48 | return data
49 |
50 |
51 | if __name__ == "__main__":
52 | data = get(input("url: "))
53 | print(data)
--------------------------------------------------------------------------------
/extractor/quanminxsp.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | title、videos
9 | """
10 | data = {}
11 | headers = {
12 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
13 | "Accept-Encoding": "gzip, deflate, br",
14 | "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
15 | "Cache-Control": "max-age=0",
16 | "Connection": "keep-alive",
17 | "Cookie": "COMMON_LID=d8d795e732f64cd28cbbce9ee76688af; Hm_lvt_a42a9a9e9ea0c8ce010e90569767e0f4=1585966701; Hm_lpvt_a42a9a9e9ea0c8ce010e90569767e0f4=1585969995",
18 | "DNT": "1",
19 | "Host": "quanmin.hao222.com",
20 | "Sec-Fetch-Dest": "document",
21 | "Sec-Fetch-Mode": "navigate",
22 | "Sec-Fetch-Site": "none",
23 | "Sec-Fetch-User": "?1",
24 | "Upgrade-Insecure-Requests": "1",
25 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
26 | }
27 | re_video = r''
28 | re_title = r''
29 |
30 | with requests.get(url, headers=headers, timeout=10) as rep:
31 | if rep.status_code == 200:
32 | title = re.findall(re_title, rep.text)
33 | video = re.findall(re_video, rep.text)
34 | if title:
35 | data["title"] = title[0]
36 | if video:
37 | data["videos"] = video
38 | else:
39 |             data["msg"] = "failed"
40 | return data
41 |
42 |
43 | if __name__ == "__main__":
44 | url = "https://quanmin.hao222.com/sv2?source=share-h5&pd=qm_share_mvideo&vid=3877781674274744362&shareTime=1585969946&shareid=0746467921&shared_cuid=0ivn8laMv8l9uHuI_PSua_uS2u_Wav8dYu2ku_iCStloiBaR_8S08jf2QP0Hf1uea1FmA&shared_uid=gO2Ri_aIvtelA"
45 | print(get(url))
46 |
--------------------------------------------------------------------------------
/extractor/qutoutiao.py:
--------------------------------------------------------------------------------
1 | import json
2 | from urllib.parse import urlparse, parse_qs
3 |
4 | import requests
5 |
6 | # TODO: support short videos
7 |
8 | def get(url: str) -> dict:
9 | """
10 | author、title、videos
11 | """
12 | data = {}
13 |     if "new.3qtt.cn" in url:  # expand short links to long links
14 | url = requests.get(url).url
15 |
16 | data_url_format = "http://api.1sapp.com/content/getRecommendV3?key={key}&content_id={content_id}&limit=1"
17 | play_host = "http://v4.qutoutiao.net/"
18 |
19 | query = urlparse(url).query
20 | querys = parse_qs(query)
21 | content_id = querys["content_id"][0]
22 | key = querys["key"][0]
23 | data_url = data_url_format.format(content_id=content_id, key=key)
24 |
25 | rep = requests.get(data_url, timeout=10)
26 | if rep.status_code != 200 or rep.json()["code"] != 0:
27 |         return {"msg": "failed to fetch"}
28 |
29 | # from pprint import pprint
30 | # pprint(rep.json())
31 | json_url = rep.json()["data"]["data"][0]["urlJson"]
32 | rep = requests.get(json_url, timeout=10)
33 | if rep.status_code != 200:
34 |         return {"msg": "failed to fetch"}
35 |     # unwrap the JSONP payload
36 | video_data = json.loads(rep.text.replace("cb(", "").replace(")", ""))
37 | detail = video_data["detail"].replace("\\", "")
38 | video_data["detail"] = json.loads(detail)
39 |
40 | data["author"] = video_data["nickname"]
41 | data["title"] = video_data["title"]
42 | address = video_data["detail"]["address"]
43 |
44 | urls = [add["url"] for add in address]
45 |     for q in ["hd.mp4", "hhd.mp4", "ld.mp4", "hld.mp4"]:
46 |         for i in urls:
47 |             if q in i:
48 |                 data["videos"] = [play_host + i]
49 |                 break
50 |         if "videos" in data: break
51 |     return data
52 |
53 |
54 | if __name__ == "__main__":
55 | from pprint import pprint
56 | pprint(get(input("url: ")))
57 |
--------------------------------------------------------------------------------
/extractor/sing5.py:
--------------------------------------------------------------------------------
1 | # author: wongxy
2 | # --------------
3 | # 5sing.kugou.com
4 | import re
5 | import json
6 |
7 | import requests
8 |
9 |
10 | def get(url: str) -> dict:
11 | """
12 | author、audioName、audios
13 | """
14 | data = {}
15 | headers = {
16 | "User-Agent":
17 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
18 | }
19 |
20 | songinfo_format_url = "http://service.5sing.kugou.com/song/getsongurl?&songid={songid}&songtype=fc&from=web&version=6.6.72"
21 |
22 | songid = re.findall(r"/(\d+)", url.replace("5sing", ""))
23 | if not songid:
24 |         return {"msg": "could not extract key info from the link"}
25 | songid = songid[0]
26 |
27 | songinfo_url = songinfo_format_url.format(songid=songid)
28 | with requests.get(songinfo_url, headers=headers, timeout=10) as rep:
29 | if rep.status_code != 200:
30 |             return {"msg": "failed to fetch; the link may be invalid"}
31 | json_ = json.loads(rep.text[1: -1])
32 | if json_["code"] != 0:
33 |             return {"msg": "failed to fetch; the link may be invalid"}
34 | info = json_["data"]
35 | data["author"] = info["user"]["NN"]
36 | data["audioName"] = info["songName"]
37 | data["audios"] = [
38 | info.get("squrl") or info.get("hqurl") or info.get("lqurl")
39 | ]
40 |
41 | return data
42 |
43 |
44 | if __name__ == "__main__":
45 | from pprint import pprint
46 | pprint(get("http://5sing.kugou.com/fc/15717150.html"))
47 | # print(get(input("url: ")))
48 |
--------------------------------------------------------------------------------
/extractor/sohuTV.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | title、videoName、videos
9 | """
10 | data = {}
11 | session = requests.Session()
12 |     ERROR = {"msg": "failed to fetch"}
13 | headers = {
14 | "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
15 | }
16 | videoInfo_url = "https://my.tv.sohu.com/play/videonew.do"
17 | playInfo_url = "https://data.vod.itc.cn/ip"
18 |
19 | with session.get(url, headers=headers, timeout=10) as rep_html:
20 | if rep_html.status_code != 200:
21 | return ERROR
22 | vid = re.findall(r",vid: '(\d+)'", rep_html.text)
23 | if not vid:
24 | return ERROR
25 | vid = vid[0]
26 |
27 | videoInfo_params = {
28 | "vid": vid,
29 | "ver": 31,
30 | "ssl": 1,
31 | "referer": url
32 | }
33 | with session.get(videoInfo_url, params=videoInfo_params, timeout=10) as videoInfo_rep:
34 | if videoInfo_rep.status_code != 200:
35 | return ERROR
36 | videoInfo = videoInfo_rep.json()["data"]
37 | tvName = videoInfo["tvName"]
38 | data["title"] = data["videoName"] = tvName
39 |
40 | video_path = videoInfo["su"][0]
41 | key = videoInfo["hc"][0] if videoInfo.get("hc") else videoInfo["ck"][0]
42 | if not video_path or not key:
43 | return ERROR
44 |
45 | playInfo_params = {
46 | "new": video_path,
47 | "num": 1,
48 | "key": key,
49 | }
50 | with session.get(playInfo_url, params=playInfo_params, timeout=10) as playInfo_rep:
51 | if playInfo_rep.status_code != 200:
52 | return ERROR
53 | play_url = playInfo_rep.json()["servers"][0]["url"]
54 | data["videos"] = [play_url]
55 |
56 | return data
57 |
58 |
59 | if __name__ == "__main__":
60 | url = input("url: ")
61 | print(get(url))
62 |
--------------------------------------------------------------------------------
/extractor/ted.py:
--------------------------------------------------------------------------------
1 | # https://www.ted.com/talks/*
2 | import re
3 |
4 | import requests
5 |
6 |
7 | def get(url: str) -> dict:
8 | """
9 | title、videoName、videos
10 | """
11 | data = {}
12 | headers = {
13 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
14 | }
15 | rep = requests.get(url, headers=headers, timeout=10)
16 | if rep.status_code == 200:
17 | text = rep.text
18 | try:
19 | title = re.findall(r'', text)[0]
20 | mp4 = re.findall(r'"(https://download\.ted\.com.*?mp4\?apikey=.*?)"', text)[-1]
21 | data["title"] = data["videoName"] = title
22 | data["videos"] = [mp4]
23 |         except IndexError as e:
24 |             data["msg"] = "failed to fetch: " + str(e)
25 | else:
26 |         data["msg"] = "failed to fetch"
27 |
28 | return data
29 |
30 |
31 | if __name__ == "__main__":
32 | url = "https://www.ted.com/talks/bill_gates_how_we_must_respond_to_the_coronavirus_pandemic"
33 | print(get(url))
34 |
--------------------------------------------------------------------------------
/extractor/tuchong.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | title、imgs
9 | """
10 | data = {}
11 | headers = {
12 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
13 | }
14 | rep = requests.get(url, headers=headers, timeout=6)
15 | if rep.status_code == 200:
16 | title = re.findall(r'', rep.text)
17 | if title:
18 | data["title"] = title[0]
19 | data["imgs"] = re.findall(r'photo-image" src="(.*?)"', rep.text)
20 | else:
21 |         data["msg"] = "failed to fetch"
22 | return data
23 |
24 |
25 | if __name__ == "__main__":
26 | from pprint import pprint
27 | pprint(get(input("url: ")))
28 |
--------------------------------------------------------------------------------
/extractor/tudou.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 |
4 | import requests
5 |
6 |
7 | def get(url: str) -> dict:
8 | """
9 |     :param url: video link (a single episode of a free show)
10 |
11 |     :return title: video title
12 |     :return videoName: same as title
13 |     :return videos: video URLs, split into segments; the last entry is the stream address (m3u8)
14 | """
15 | data = {}
16 | headers_html = {
17 | "referer": url,
18 | "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
19 | "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
20 | }
21 | headers_info = {
22 | "referer": url,
23 | "accept": "application/json",
24 | "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
25 | }
26 | host = "https://ups.youku.com/ups/get.json"
27 |
28 | rep = requests.get(url, headers=headers_html, timeout=10)
29 | if rep.status_code != 200:
30 |         return {"msg": "failed to fetch the page"}
31 | vid = re.findall(r'"vid":"(\d+)"', rep.text)
32 | if not vid:
33 |         return {"msg": "failed to extract the video id"}
34 | vid = vid[0]
35 |
36 | params = {
37 | # 调试了半天,才发现ckey可以通用,但是暂时不知道过期时间
38 | "ckey": "122#wppJ/JoGEExRyDpZy4pjEJponDJE7SNEEP7ZpJRBuDPpJFQLpCGwoHZDpJEL7SwBEyGZpJLlu4Ep+FQLpoGUEELWn4yE7SNEEP7ZpERBuDPE+BQPpC76EJponDJLKMQEImb2XDnTtByWAfaPwr8S14Rqur0Nj1sih8TwWMzZF+NtTPnZULbEnh9G8WlODWp1uOjeDLVr8PG6+4EEyFfDqM3bDEpxngR4ul5EDOgPm4AiJDbEfC3mqM3WE8pangL4ul0EDLVr8CpU+4EEyFfDqMfbDEpxnSp4uOIEELXZ8oL6JwTEyF3F7S32EJpadSxwuAuRiRFmYFRiZDPACVgIudh3VaGrVnUkqUbD72siAEVR1Qr4OWZjlGSrnzPs2rh4OY+Z6EbOEBJ8OnDsYwNsTdEhishHohd6L2J+K8z7LZpSitQjj8hrDOAV/ttFwMbpN7KrcdwvCJ7TbxjR5Q0rJaMPlfUv9IYPLIY9KNNy24RBro4psistlkgxw4vO3WXa4M00NlsAH1XADAp8l3+COupmS7LbhxHS2BKVRDZkDyD+xnYIaRahNuJDv7pLt830IQHgDvnq1gJBE75mVDgemdAGyc4ruFk4++Ar9T6gZbfiuacVvtDgzBcEo0r6bi+rvYQuaMy=",
39 | "utid": "otL9FkVfwnwCASv6yQTaubZ5", # expires at: 2030-03-25T07:32:14.712Z
40 | "vid": vid,
41 | "client_ts": int(time.time()),
42 | "ccode": "050F",
43 | "client_ip": "192.168.1.1",
44 | }
45 | rep = requests.get(host, params=params, headers=headers_info, timeout=10)
46 | if rep.status_code != 200 or "error" in rep.json()["data"]:
47 |         return {"msg": "failed to fetch video info"}
48 | info = rep.json()["data"]
49 | title = info["video"]["title"]
50 | stream = info["stream"] # type: list
51 |     # pick the highest resolution
52 |     best_stream = sorted(stream, key=lambda item: item["width"])[-1]
53 |     videos = [url_item["cdn_url"] for url_item in best_stream["segs"]]
54 |     m3u8_url = best_stream["m3u8_url"]
55 | videos.append(m3u8_url)
56 |
57 | data["title"] = data["videoName"] = title
58 | data["videos"] = videos
59 |
60 | return data
61 |
62 |
63 | if __name__ == "__main__":
64 | from pprint import pprint
65 | pprint(get(input("url: ")))
66 |
--------------------------------------------------------------------------------
/extractor/wechat_article_cover.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | imgs, text
9 | """
10 | data = {}
11 | headers = {
12 | "user-agent":
13 | "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/"
14 | "536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
15 | }
16 | with requests.get(url, headers=headers, timeout=10) as rep:
17 | if rep.status_code != 200:
18 | return {"msg": "错误"}
19 | img = re.findall(r'<meta property="og:image" content="(.*?)" />', rep.text)
20 | if img:
21 | data["imgs"] = [img[0]]
22 | text = re.findall(r'<meta property="og:description" content="(.*?)" />', rep.text)
23 | if text:
24 | data["text"] = text[0]
25 | return data
26 |
27 |
28 | if __name__ == "__main__":
29 |
30 | url = input("url: ")
31 | print(get(url))
32 |
--------------------------------------------------------------------------------
/extractor/weibo.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | title, videos
9 | """
10 | data = {}
11 | headers = {'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1'}
12 |
13 | title_re = r'"title": "(.*?)",'
14 | mp4_720p_mp4_re = r'"mp4_720p_mp4": "(.*?)",'
15 | mp4_hd_mp4_re = r'"mp4_hd_mp4": "(.*?)",'
16 | mp4_ld_mp4_re = r'"mp4_ld_mp4": "(.*?)"'
17 |
18 | with requests.get(url, headers=headers, timeout=10) as rep:
19 | if rep.status_code == 200:
20 | text = rep.text
21 | title = re.findall(title_re, text)
22 | mp4_720p_mp4 = re.findall(mp4_720p_mp4_re, text)
23 | mp4_hd_mp4 = re.findall(mp4_hd_mp4_re, text)
24 | mp4_ld_mp4 = re.findall(mp4_ld_mp4_re, text)
25 | if title:
26 | data["title"] = title[0]
27 | data["videos"] = mp4_720p_mp4 or mp4_hd_mp4 or mp4_ld_mp4
28 | else:
29 | data["msg"] = "获取失败"
30 |
31 | return data
32 |
33 |
34 | if __name__ == "__main__":
35 | url = input('url: ')
36 | print(get(url))
--------------------------------------------------------------------------------
/extractor/weishi.py:
--------------------------------------------------------------------------------
1 | import json
2 | from urllib.parse import urlparse, parse_qs
3 |
4 | import requests
5 |
6 |
8 | headers = {
9 | "accept": "application/json",
10 | "accept-encoding": "gzip, deflate, br",
11 | "accept-language": "zh-CN,zh;q=0.9",
12 | "content-length": "63",
13 | "content-type": "application/json",
14 | "cookie": "pgv_pvi=9657849856; pgv_pvid=2069474799; RK=aHJszqfoXm; ptcz=0fc0035b9509215f060561393c09f6cde3bccc1953e79c2b5b1ec450e4e67f19; LW_uid=s1i5E5d4v2a1p5n702J1O2y0q8; eas_sid=M1k5T5N4O28185X7J291x2K1A3; o_cookie=286183317; pac_uid=1_286183317; ied_qq=o0286183317; LW_sid=x1Y5D6W4h4H516F6X9l9V8S8Z3; tvfe_boss_uuid=fbb4b39b5afeb49b; psrf_qqopenid=A140C50D3D791392EA89131C8B01FE1D; psrf_qqaccess_token=D2F43F3C25900E66193345D276AF9559; psrf_qqrefresh_token=E48409D7E8E4F3D5C3869F104380AB3E; psrf_qqunionid=002C01991CFB436BCD8A27A0EE1DB9FF; qm_keyst=Q_H_L_2ajiOt50eapmue6Eg1-l_W6XztEBr_u0vZJAPs4xctJZNJdsEZONiDnNJ206icA; psrf_musickey_createtime=1574482649; psrf_access_token_expiresAt=1582258649; person_id_bak=5295507715828209; person_id_wsbeacon=5689667751647505; wsreq_logseq=336060008",
15 | "origin": "https://h5.weishi.qq.com",
16 | "sec-fetch-mode": "cors",
17 | "sec-fetch-site": "same-origin",
18 | "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1",
19 | "x-requested-with": "XMLHttpRequest",
20 | }
21 |
22 |
23 | # link shared while not logged in
24 | # url = "https://h5.weishi.qq.com/weishi/wsplay/challenge?feedid=6YV0vjeP71IHTsV08&challegeid=100026&spid=8039370850869145600&qua=v1_and_weishi_6.5.0_588_312027000_d&chid=127081004&pkg=&attach=cp_reserves3_1190370002"
25 | def _get_not_logged(url: str) -> dict:
26 | data = {}
27 | post_url = "https://h5.weishi.qq.com/webapp/json/challenge_feedrank/GetChallengeFeedDetail?t=0.2602280426206063&g_tk="
28 |
29 | query = parse_qs(urlparse(url).query)
30 | try:
31 | feedid = query.get('feedid')[0]
32 | challenge_id = query.get('challegeid')[0]
33 | except (TypeError, IndexError):
34 | data["msg"] = "failed to fetch"
35 | return data
36 |
37 | payload = {
38 | "feedid": feedid,
39 | "challenge_id": challenge_id,
40 | "type": 0,
41 | }
42 |
43 | with requests.post(post_url, headers=headers, data=json.dumps(payload), timeout=10) as rep:
44 | if rep.status_code == 200:
45 | video_info = rep.json().get("data").get('feedinfos')[0]
46 | title = video_info.get("feed_desc")
47 | play_url = video_info.get("video_url")
48 | data["title"] = title
49 | data["videos"] = [play_url]
50 | else:
51 | data["msg"] = "获取失败"
52 |
53 | return data
54 |
55 |
56 | # link shared while logged in
57 | # url = "https://h5.weishi.qq.com/weishi/feed/770BSyaon1IQcqdbr/wsfeed?wxplay=1&id=770BSyaon1IQcqdbr&spid=8039370850869145600&qua=v1_and_weishi_6.5.0_588_312027000_d&chid=100081014&pkg=3670&attach=cp_reserves3_1000370011"
58 | def _get_logged(url: str) -> dict:
59 | data = {}
60 | post_url = "https://h5.weishi.qq.com/webapp/json/weishi/WSH5GetPlayPage?t=0.16820895093158983&g_tk="
61 |
62 | query = parse_qs(urlparse(url).query)
63 | try:
64 | feedid = query.get('id')[0]
65 | except (TypeError, IndexError):
66 | data["msg"] = "failed to fetch"
67 | return data
68 |
69 | payload = {
70 | "feedid": feedid,
71 | "recommendtype": 0,
72 | "datalvl": "all",
73 | "_weishi_mapExt": {}
74 | }
75 |
76 | with requests.post(post_url, headers=headers, data=json.dumps(payload), timeout=10) as rep:
77 | if rep.status_code == 200:
78 | video_info = rep.json().get('data').get('feeds')[0]
79 | title = video_info.get("feed_desc")
80 | play_url = video_info.get("video_url")
81 | data["title"] = title
82 | data["videos"] = [play_url]
83 | else:
84 | data["msg"] = "获取失败"
85 |
86 | return data
87 |
88 |
89 | def get(url: str) -> dict:
90 | return _get_not_logged(url) if url.startswith("https://h5.weishi.qq.com/weishi/wsplay/challenge") else _get_logged(url)
91 |
92 |
93 | if __name__ == "__main__":
94 | print(get(input("url: ")))
95 |
--------------------------------------------------------------------------------
/extractor/xiaokaxiu.py:
--------------------------------------------------------------------------------
1 | # @wongxy
2 | import time
3 | from hashlib import md5
4 | from urllib.parse import urlparse, parse_qs
5 |
6 | import requests
7 |
8 |
9 | def get(url: str) -> dict:
10 | """
11 | title, videos
12 | """
13 | data = {}
14 |
15 | try:
16 | qs = parse_qs(urlparse(url).query)
17 | video_id = qs["id"][0]
18 | except KeyError:
19 | return {"msg": "无法匹配视频id"}
20 |
21 | timestamp = str(int(time.time()))
22 |
23 | info_url = "https://appapi.xiaokaxiu.com/api/v1/web/share/video/" + video_id + "?time=" + timestamp
24 |
25 | temp = "S14OnTD#Qvdv3L=3vm" + "&time=" + timestamp
26 | x_sign = md5(temp.encode("utf-8")).hexdigest()
27 | headers = {
28 | "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
29 | "x-sign": x_sign,
30 | }
31 | rep = requests.get(info_url, headers=headers, timeout=10)
32 | if rep.status_code == 200 and rep.json()["code"] == 0:
33 | video_info = rep.json()["data"]
34 | title = video_info["video"]["title"]
35 | video_url = video_info["video"]["url"][0]
36 | data["title"] = title
37 | data["videos"] = [video_url]
38 | return data
39 | return {"msg": "获取失败"}
40 |
41 |
42 | if __name__ == "__main__":
43 | url = "https://mobile.xiaokaxiu.com/video?id=6552158363189252096"
44 | print(get(url))
45 |
--------------------------------------------------------------------------------
/extractor/xinpianchang.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | title, imgs, videos
9 | """
10 | data = {}
11 | headers = {
12 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
13 | }
14 | session = requests.Session()
15 | rep = session.get(url, headers=headers, timeout=10)
16 | if rep.status_code != 200:
17 | return {"msg": "获取失败"}
18 | try:
19 | vid = re.findall(r'vid: "(.*?)",', rep.text)[0]
20 | except IndexError:
21 | return {"msg": "获取失败"}
22 |
23 | video_info_url = "http://openapi-vtom.vmovier.com/v3/video/{vid}?expand=resource".format(
24 | vid=vid)
25 | rep = session.get(video_info_url, headers=headers, timeout=10)
26 | if rep.status_code != 200 or rep.json()["status"] != 0:
27 | return {"msg": "获取失败"}
28 |
29 | video_data = rep.json()["data"]
30 |
31 | title = video_data["video"]["title"]
32 | cover = video_data["video"]["cover"]
33 | video_list = video_data["resource"]["progressive"] # type: list
34 |
35 | # videos = []
36 | # for item in video_list:
37 | # videos.append(item.get("https_url") or item.get("url"))
38 | video = video_list[0].get("https_url") or video_list[0].get("url")
39 |
40 | data["title"] = title
41 | data["imgs"] = [cover]
42 | data["videos"] = [video]
43 |
44 | return data
45 |
46 |
47 | if __name__ == "__main__":
48 | # url = "https://www.xinpianchang.com/a10628284"
49 | url = input("url: ")
50 | print(get(url))
51 |
--------------------------------------------------------------------------------
/extractor/zhihu_video.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | """
9 | data = {}
10 | headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",}
11 | video_info_url = "https://lens.zhihu.com/api/v4/videos/{id}"
12 |
13 | videos = []
14 |
15 | with requests.get(url, headers=headers, timeout=10) as rep:
16 | if rep.status_code == 200:
17 | ids = re.findall(r'www\.zhihu\.com/video/(\d+)', rep.text)
18 | ids = list(set(ids))  # deduplicate
19 | else:
20 | data["msg"] = "视频获取失败,可能是这个页面没有视频"
21 | return data
22 |
23 | if not ids:
24 | data["msg"] = "视频获取失败,可能是这个页面没有视频"
25 | return data
26 |
27 | for vid in ids:
28 | rep = requests.get(video_info_url.format(id=vid), headers=headers, timeout=10)
29 | if rep.status_code == 200:
30 | playlist = rep.json().get("playlist")
31 | temp = playlist.get("HD") or playlist.get("SD") or playlist.get("LD")
32 | if temp:
33 | play_url = temp.get("play_url")
34 | videos.append(play_url)
35 | data["videos"] = [video for video in videos if video]
36 | return data
37 |
38 |
39 |
40 | if __name__ == "__main__":
41 | url = input("url: ")
42 | print(get(url))
--------------------------------------------------------------------------------
/extractor/zuiyou_video.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def get(url: str) -> dict:
7 | """
8 | title, videoName, videos
9 | """
10 | data = {}
11 | headers = {
12 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
13 | }
14 | re_title = r'<h1>(.*?)</h1>'
15 | re_video = r'<video src="(.*?)">'
16 |
17 | with requests.get(url, headers=headers, timeout=10) as rep:
18 | if rep.status_code == 200:
19 | title = re.findall(re_title, rep.text)
20 | video = re.findall(re_video, rep.text)
21 | if title:
22 | data["title"] = data["videoName"] = title[0]
23 | if video:
24 | data["videos"] = video
25 | else:
26 | data["msg"] = "失败"
27 |
28 | return data
29 |
30 |
31 | if __name__ == "__main__":
32 | url = "https://share.izuiyou.com/detail/147486886?zy_to=applink&to=applink"
33 | print(get(url))
34 |
--------------------------------------------------------------------------------
/extractor/zuiyou_voice.py:
--------------------------------------------------------------------------------
1 | import json
2 | from urllib.parse import urlparse
3 |
4 | import requests
5 |
6 |
7 | def get(url: str) -> dict:
8 | """
9 | text, audios
10 | """
11 | data = {}
12 | headers = {
13 | "Connection": "keep-alive",
14 | "Content-Length": "209",
15 | "Content-Type": "text/plain;charset=UTF-8",
16 | "Host": "share.izuiyou.com",
17 | "Origin": "https://share.izuiyou.com",
18 | "Referer": url,
19 | "Sec-Fetch-Mode": "cors",
20 | "Sec-Fetch-Site": "same-origin",
21 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
22 | }
23 | post_url = "https://share.izuiyou.com/api/review/share_review"
24 |
25 | path = urlparse(url).path
26 | temp = path.split("/")
27 | pid = temp[-2]
28 | rid = temp[-1]
29 |
30 | payload = {
31 | "h_av": "3.0",
32 | "h_dt": 9,
33 | "h_nt": 9,
34 | "h_ch": "web_app",
35 | "ua":
36 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
37 | "pid": f"{pid}",
38 | "rid": f"{rid}"
39 | }
40 | play_host = "http://tbvideo.ixiaochuan.cn/"
41 | with requests.post(post_url, data=json.dumps(payload), headers=headers, timeout=10) as rep:
42 | if rep.status_code == 200:
43 | try:
44 | audio_info = rep.json().get("data").get("review").get("audio")
45 | voice_text = audio_info.get("voice_text")
46 | # uri = audio_info.get("uri")
47 | org_uri = audio_info.get("org_uri")
48 | data["text"] = voice_text
49 | data["audios"] = [play_host + org_uri]
50 | except (TypeError, AttributeError):
51 | data["msg"] = "获取失败"
52 | else:
53 | data["msg"] = "获取失败"
54 |
55 | return data
56 |
57 |
58 |
59 | if __name__ == "__main__":
60 | print(get(input("url: ")))
--------------------------------------------------------------------------------
/misc.py:
--------------------------------------------------------------------------------
1 | from prettytable import PrettyTable
2 |
3 | logo = r"""
4 | _______ _______ _________ ______ _______ _______
5 | ( ____ \( ____ )\__ __/( __ \ ( ____ \( ____ )
6 | | ( \/| ( )| ) ( | ( \ )| ( \/| ( )|
7 | | (_____ | (____)| | | | | ) || (__ | (____)|
8 | (_____ )| _____) | | | | | || __) | __)
9 | ) || ( | | | | ) || ( | (\ (
10 | /\____) || ) ___) (___| (__/ )| (____/\| ) \ \__
11 | \_______)|/ \_______/(______/ (_______/|/ \__/"""
12 |
13 |
14 | def printTips():
15 | platforms = [
16 | ["Bilibili", "cover, video"],
17 | ["Changya", "audio"],
18 | ["Douyin", "watermark-free video"],
19 | ["Kugou", "audio"],
20 | ["Kuwo", "audio"],
21 | ["Lizhi FM", "audio"],
22 | ["NetEase Cloud Music", "audio, MV, video"],
23 | ["QQ Music", "audio"],
24 | ["Pipigaoxiao", "watermark-free video"],
25 | ["Quanmin K Ge", "audio & video"],
26 | ["Weibo", "video"],
27 | ["Weishi", "watermark-free video"],
28 | ["Zhihu", "video"],
29 | ["Zuiyou", "audio (voice-post comments)"],
30 | ["Qianqian Music", "audio"],
31 | ["5sing", "audio"],
32 | ["Pipixia", "watermark-free video"],
33 | ["Qingshipin", "watermark-free video"],
34 | ["Qutoutiao", "video"],
35 | ["Ku6", "video"],
36 | ["Lofter", "video"],
37 | ["NetEase Open Courses", "video (free)"],
38 | ["Xinpianchang", "video"],
39 | ["Baidu Tieba", "video"],
40 | ["Kuaishou", "watermark-free video, long-image video"],
41 | ["AcFun", "video"],
42 | ["Baidu Haokan", "video"],
43 | ["Pearvideo", "video"],
44 | ["Xiaokaxiu", "watermark-free video"],
45 | ["Sohu TV", "video"],
46 | ["Tudou", "video (free TV series etc.)"],
47 | ["TED", "video"],
48 | ["Tuchong", "images"],
49 | ["Others", "..."]
50 | ]
51 | table = PrettyTable(["Supported platforms", "Supported content"])
52 | for platform in platforms:
53 | table.add_row(platform)
54 | print(logo)
55 | print("""
56 | ╭━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╮
57 | │ @wongxy \033[36;4mhttps://github.com/xiyaowong\033[0m │
58 | ╰━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╯""")
59 | print("Crawl and download resources from the platforms above")
60 | print(table)
61 |
62 |
63 | if __name__ == "__main__":
64 | printTips()
65 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # extractor
2 | requests
3 | pycryptodome
4 |
5 | # extract
6 | prettytable
7 | click
8 |
9 | # web
10 | flask
11 | flask-cors
12 | python-dotenv
13 |
--------------------------------------------------------------------------------
/screenshot/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiyaowong/spiders/304f1d32d07b6c42feb8ddbcb83dac90558be503/screenshot/example.gif
--------------------------------------------------------------------------------
/screenshot/run.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiyaowong/spiders/304f1d32d07b6c42feb8ddbcb83dac90558be503/screenshot/run.gif
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import time
4 | from datetime import datetime
5 | from functools import wraps
6 |
7 | import click
8 | import requests
9 |
10 |
11 | def remove_file(path):
12 | if os.path.isfile(path):
13 | os.remove(path)
14 |
15 |
16 | def filter_name(name):
17 | """
18 | 过滤文件名
19 | """
20 | regexp = re.compile(r'(/|\\|:|\?|\*|\||"|\'|<|>|\$)')
21 | space = re.compile(r'\s{2,}')
22 | return space.sub(" ", regexp.sub("", name))
23 |
24 |
25 | def check_dir(path):
26 | """
27 | 检查文件夹是否存在,存在返回True;不存在则创建,返回False
28 | """
29 | if not os.path.exists(path):
30 | os.makedirs(path)
31 | return False
32 | return True
33 |
34 |
35 | def retry(n=3, delay=0.5):
36 | def deco(func):
37 | @wraps(func)
38 | def wrapper(*a, **kw):
39 | count = 1
40 | while True:
41 | try:
42 | return func(*a, **kw)
43 | except Exception as e:
44 | if count == n + 1:
45 | break
46 | print('[{}] raised an error; retry #{} starts in {}s. Err: {}'.format(func.__name__, count, delay, e))
47 | count += 1
48 | time.sleep(delay)
49 | print('retries exhausted; [{}] failed'.format(func.__name__))
50 | return False
51 | return wrapper
52 | return deco
53 |
54 |
55 | def download(file_url, file_name=None, file_type=None, save_path="download", headers=None, timeout=15):
56 | """
57 | :param file_url: 下载资源链接
58 | :param file_name: 保存文件名,默认为当前日期时间
59 | :param file_type: 文件类型(扩展名)
60 | :param save_path: 保存路径,默认为download,后面不要"/"
61 | :param headers: http请求头,默认为iphone
62 | """
63 | if file_name is None:
64 | file_name = str(datetime.now())
65 | file_name = filter_name(file_name)
66 |
67 | if file_type is None:
68 | if "." in file_url:
69 | file_type = file_url.split(".")[-1]
70 | else:
71 | file_type = "unknown"
72 |
73 | check_dir(save_path)
74 |
75 | file_name = file_name + "." + file_type
76 |
77 | if headers is None:
78 | headers = {
79 | "User-Agent":
80 | "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1"
81 | }
82 |
83 | # skip if already downloaded
84 | if os.path.exists(f"{save_path}/{file_name}"):
85 | print(f'\033[33m{file_name} already exists, skipping download!\033[0m')
86 | return True
87 | print(f"Downloading {file_name}")
88 | try:
89 | with requests.get(file_url, headers=headers, stream=True, timeout=timeout) as rep:
90 | if rep.status_code != 200:
91 | print("\033[31mDownload failed\033[0m")
92 | return False
93 | file_size = int(rep.headers.get('Content-Length', 0))
94 | label = '{:.2f}MB'.format(file_size / (1024 * 1024))
95 | with click.progressbar(length=file_size, label=label) as progressbar:
96 | with open(f"{save_path}/{file_name}", "wb") as f:
97 | for chunk in rep.iter_content(chunk_size=1024):
98 | if chunk:
99 | f.write(chunk)
100 | progressbar.update(1024)
101 | print(f"\033[32m{file_name}下载成功\033[0m")
102 | except Exception as e:
103 | print('下载失败: ', e)
104 | remove_file(f"{save_path}/{file_name}")
105 | return True
106 |
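108 | if __name__ == "__main__":
109 | # A minimal usage sketch, not part of the library API: it pairs retry() with
110 | # download() to fetch this repo's own screenshot. The URL comes from the
111 | # screenshot folder above; file_name/file_type are arbitrary example values.
112 | @retry(n=2, delay=1)
113 | def demo():
114 | return download(
115 | "https://raw.githubusercontent.com/xiyaowong/spiders/304f1d32d07b6c42feb8ddbcb83dac90558be503/screenshot/run.gif",
116 | file_name="run", file_type="gif")
117 |
118 | demo()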
--------------------------------------------------------------------------------
/web/README.md:
--------------------------------------------------------------------------------
1 | #### A simple web API for the extractors
2 |
3 |
4 |
5 | ```shell
6 | pip install gunicorn
7 | ```
8 |
9 | ```shell
10 | gunicorn app:app
11 |
12 | or
13 |
14 | python app.py
15 | ```
16 |
17 | Put the target link after `/extract/?url=`; a POST request with a `url` parameter works too.
18 |
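19 | For example, assuming gunicorn's default bind address of `127.0.0.1:8000` and reusing a sample link from the extractors:
20 |
21 | ```shell
22 | curl "http://127.0.0.1:8000/extract/?url=https://mobile.xiaokaxiu.com/video?id=6552158363189252096"
23 | ```
24 |
25 | URL-encode the link first if it contains `&`. The response is the JSON envelope built by `web/_response.py`: `{"code": ..., "data": ..., "err": ..., "message": ...}`.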
--------------------------------------------------------------------------------
/web/__init__.py:
--------------------------------------------------------------------------------
1 | from ._response import response
2 |
3 |
--------------------------------------------------------------------------------
/web/_response.py:
--------------------------------------------------------------------------------
1 | from flask import jsonify
2 | from werkzeug.http import HTTP_STATUS_CODES
3 |
4 |
5 | def response(code=200, data=None, error=None, msg=None):
6 | """
7 | :param code: 状态码
8 | :param data: 返回数据
9 | :param error: 错误信息
10 | :param msg: 提示信息
11 | """
12 |
13 | if code is not None and code >= 400:
14 | error = HTTP_STATUS_CODES.get(code, "unknown error")
15 |
16 | payload = {
17 | "code": code,
18 | "data": data,
19 | "err": error,
20 | "message": msg or HTTP_STATUS_CODES.get(code, "unknown status"),
21 | }
22 | _response = jsonify(payload)
23 | _response.status_code = code
24 | return _response
25 |
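26 | # For example, response(200, data={"title": "demo"}) (with a made-up payload)
27 | # serializes to:
28 | # {"code": 200, "data": {"title": "demo"}, "err": null, "message": "OK"}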
--------------------------------------------------------------------------------
/web/app.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.append(os.path.dirname(os.getcwd()))  # make the project root importable when running from web/
4 |
5 | from flask import Flask
6 | from flask_cors import CORS
7 |
8 | from web import config, error, views, log
9 |
10 |
11 |
12 | def create_app() -> Flask:
13 | app = Flask(__name__)
14 | app.config.from_object(config)
15 | CORS(app)
16 |
17 | views.init_app(app)
18 | error.init_app(app)
19 | log.init_app(app)
20 |
21 | if app.config["ENV"] == "development":
22 | print(app.url_map)
23 |
24 | return app
25 |
26 |
27 | app = create_app()
28 |
29 | if __name__ == "__main__":
30 | app.run()
31 |
--------------------------------------------------------------------------------
/web/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from dotenv import load_dotenv
4 |
5 | load_dotenv()
6 |
7 | ENV = os.getenv("FLASK_ENV") or "production"
8 | SECRET_KEY = os.getenv("SECRET_KEY") or "wongxy"
9 | DEBUG = (os.getenv("DEBUG") or "False").lower() in ("1", "true")
10 |
--------------------------------------------------------------------------------
/web/error.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=unused-argument
2 | from flask import Flask
3 |
4 | from web import response
8 |
9 |
10 | def init_app(app: Flask):
11 | @app.errorhandler(400)
12 | def _error_400(e):
13 | return response(400)
14 |
15 | @app.errorhandler(500)
16 | def _error_500(e):
17 | return response(500)
18 |
19 | @app.errorhandler(404)
20 | def _error_404(e):
21 | return response(404)
22 |
23 | @app.errorhandler(405)
24 | def _error_405(e):
25 | return response(405)
26 |
--------------------------------------------------------------------------------
/web/example.env:
--------------------------------------------------------------------------------
1 | # FLASK_ENV=development
2 | FLASK_ENV=production
3 | DEBUG=False
4 | SECRET_KEY="a string you never guess"
5 |
--------------------------------------------------------------------------------
/web/funcs.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from flask import current_app
4 |
5 | from extractor import (acfun, baidutieba, bilibili, changya, douyin, haokan,
6 | ku6, kuaishou, kugou, kuwo, lizhiFM, lofter, migu_music,
7 | momo, music163, open163, pearvideo, pic58, pipigaoxiao,
8 | pipix, qianqian, qingshipin, qqmusic, quanminkge,
9 | qutoutiao, sing5, sohuTV, ted, tuchong, tudou, weibo,
10 | weishi, xiaokaxiu, xinpianchang, zhihu_video,
11 | zuiyou_voice)
12 |
13 | from web import response
14 |
15 |
16 |
17 | crawlers = {
18 | 'acfun': acfun,
19 | 'tieba': baidutieba,
20 | 'bili': bilibili,
21 | 'changya': changya,
22 | 'douyin': douyin,
23 | 'haokan': haokan,
24 | 'ku6': ku6,
25 | 'chenzhongtech': kuaishou,
26 | 'kuaishou': kuaishou,
27 | 'kugou': kugou,
28 | 'kuwo': kuwo,
29 | 'lizhi': lizhiFM,
30 | 'lofter': lofter,
31 | 'music.163': music163,
32 | 'open.163': open163,
33 | 'pearvideo': pearvideo,
34 | 'ippzone': pipigaoxiao,
35 | 'pipix': pipix,
36 | 'music.taihe': qianqian,
37 | 'qingshipin': qingshipin,
38 | 'y.qq': qqmusic,
39 | 'kg': quanminkge,
40 | 'qutoutiao': qutoutiao,
41 | '5sing': sing5,
42 | 'weibo': weibo,
43 | 'weishi': weishi,
44 | 'xiaokaxiu': xiaokaxiu,
45 | 'xinpianchang': xinpianchang,
46 | 'zhihu': zhihu_video,
47 | 'zuiyou': zuiyou_voice,
48 | 'sohu': sohuTV,
49 | 'ted': ted,
50 | 'tudou': tudou,
51 | 'momo': momo,
52 | 'music.migu': migu_music,
53 | '58pic': pic58,
54 | 'tuchong': tuchong
55 | }
56 |
57 |
58 | def extract(url: str): # pylint: disable=too-many-statements
59 | try:
60 | url = re.findall(
61 | r"https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\.[-A-Za-z]+[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url)
62 | if not url:
63 | return response(404, msg="no link matched")
64 | url = url[0]
65 |
66 | data = None
67 | for c_name, c_func in crawlers.items():
68 | if c_name in url:
69 | data = c_func.get(url) # type: dict
70 | break
71 | if data is not None:
72 | # drop keys whose values are empty
73 | for key, value in data.copy().items():
74 | if not value:
75 | data.pop(key)
76 | return response(data=data, msg=data.get("msg"))
77 | else:
78 | return response(404, msg="unsupported link")
79 | except Exception as e:
80 | current_app.logger.error(e)
81 | current_app.logger.exception(e)
82 | return response(500, error=e, msg="server error")
83 |
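84 | # For example, extract("check this https://www.xinpianchang.com/a10628284 out")
85 | # pulls the URL out of the surrounding text, matches the 'xinpianchang' key in
86 | # crawlers, and dispatches to xinpianchang.get(url), wrapping the returned dict
87 | # in the JSON envelope from web/_response.py.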
--------------------------------------------------------------------------------
/web/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from logging.handlers import RotatingFileHandler
4 |
5 | from flask import Flask
6 | from flask.logging import default_handler
7 |
8 | BASE_DIR = os.path.dirname(os.path.abspath(__file__))
9 |
10 | LOG_PATH = os.path.join(BASE_DIR, 'logs')
11 |
12 | if not os.path.exists(LOG_PATH):
13 | os.makedirs(LOG_PATH)
14 |
15 | LOG_PATH_ALL = os.path.join(LOG_PATH, 'all.log')
16 |
17 | LOG_FILE_MAX_BYTES = 10 * 1024 * 1024
18 | LOG_FILE_BACKUP_COUNT = 10
19 |
20 |
21 | def init_app(app: Flask):
22 | app.logger.removeHandler(default_handler)
23 |
24 | formatter = logging.Formatter(
25 | "%(asctime)s [%(levelname)s] [%(filename)s] %(message)s"
26 | )
27 |
28 | file_handler = RotatingFileHandler(
29 | filename=LOG_PATH_ALL,
30 | mode='a',
31 | maxBytes=LOG_FILE_MAX_BYTES,
32 | backupCount=LOG_FILE_BACKUP_COUNT,
33 | encoding='utf-8'
34 | )
35 |
36 | file_handler.setFormatter(formatter)
37 | file_handler.setLevel(logging.WARNING)
38 |
39 | for logger in (app.logger,
40 | logging.getLogger('werkzeug')):
41 | logger.addHandler(file_handler)
42 |
--------------------------------------------------------------------------------
/web/views.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request
2 |
3 | from web import funcs
4 | from web import response
5 |
6 |
7 | def home():
8 | data = ":)"
9 | return response(data=data)
10 |
11 |
12 | def extract():
13 | if "url" not in request.values:
14 | return response(400, msg="Missing parameter.")
15 | url = request.values["url"]
16 | return funcs.extract(url)
17 |
18 |
19 | def init_app(app: Flask):
20 | app.add_url_rule("/", "home", home)
21 | app.add_url_rule("/extract/", "extract", extract)
22 |
--------------------------------------------------------------------------------