├── .gitignore ├── LICENSE ├── README.md ├── extract.py ├── extractor ├── README.md ├── __init__.py ├── acfun.py ├── baidutieba.py ├── bilibili.py ├── changba.py ├── changya.py ├── douyin.py ├── haokan.py ├── ku6.py ├── kuaishou.py ├── kugou.py ├── kuwo.py ├── lequ.py ├── lizhiFM.py ├── lofter.py ├── migu_music.py ├── momo.py ├── music163 │ ├── __init__.py │ ├── encrypt.py │ └── music163.py ├── open163.py ├── pearvideo.py ├── peiyinxiu.py ├── pic58.py ├── pipigaoxiao.py ├── pipix.py ├── qianqian.py ├── qingshipin.py ├── qmgx.py ├── qqmusic.py ├── quanminkge.py ├── quanminxsp.py ├── qutoutiao.py ├── sing5.py ├── sohuTV.py ├── ted.py ├── tuchong.py ├── tudou.py ├── wechat_article_cover.py ├── weibo.py ├── weishi.py ├── xiaokaxiu.py ├── xinpianchang.py ├── zhihu_video.py ├── zuiyou_video.py └── zuiyou_voice.py ├── misc.py ├── requirements.txt ├── screenshot ├── example.gif └── run.gif ├── utils.py └── web ├── README.md ├── __init__.py ├── _response.py ├── app.py ├── config.py ├── error.py ├── example.env ├── funcs.py ├── log.py └── views.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .vim/coc-settings.json 131 | download/ 132 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 wongxy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Status update
 2 | 
 3 | This is an old project that has gone unmaintained for a long time; the code quality and style leave much to be desired, though some of the crawlers still work. The current plan is to build a simple link-extraction API service on the FastAPI framework. It will remain simple and rough, but it is good enough for learning or day-to-day use.
 4 | 
 5 | Just switch to the [fastapi branch](https://github.com/xiyaowong/spiders/tree/fastapi)
 6 | 
 7 | ---
 8 | 
 9 | - #### These are all fairly simple crawlers; an experienced reader should understand them at a glance. For beginners, some parts are still worth a look.
10 | 
11 | - #### Details on the crawler files are here: [extractor](/extractor)
12 | 
13 | ---
14 | 
15 | ```shell
16 | pip3 install -r requirements.txt
17 | python3 extract.py
18 | ```
19 | 
20 | You may also need to install Node.js
21 | 
22 | - #### screenshot
23 | 
24 | ![example.gif](https://cdn.jsdelivr.net/gh/xiyaowong/spiders/screenshot/run.gif)
25 | 
26 | - #### release
27 | 
28 | - #### **star**:star: & **fork** are welcome
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | from datetime import datetime
 4 | from queue import Queue
 5 | from threading import Thread
 6 | 
 7 | import utils
 8 | from extractor import (acfun, baidutieba, bilibili, changya, douyin, haokan,
 9 |                        ku6, kuaishou, kugou, kuwo, lizhiFM, lofter, migu_music,
10 |                        momo, music163, open163, pearvideo, pic58, pipigaoxiao,
11 |                        pipix, qianqian, qingshipin, qqmusic, quanminkge,
12 |                        qutoutiao, sing5, sohuTV, ted, tuchong, tudou, weibo,
13 |                        weishi, xiaokaxiu, xinpianchang, zhihu_video,
14 |                        zuiyou_voice)
15 | from misc import printTips
16 | 
17 | here = os.path.abspath(os.path.dirname(__file__))
18 | 
19 | crawlers = {
20 |     'acfun': acfun,
21 |     'tieba': baidutieba,
22 |     'bili': bilibili,
23 |     'changya': changya,
24 |     'douyin': douyin,
25 |     'haokan': haokan,
26 |     'ku6': ku6,
27 |     'chenzhongtech': kuaishou,
28 |     'kuaishou': kuaishou,
29 |     'kugou': kugou,
30 |     'kuwo': kuwo,
31 |     'lizhi': lizhiFM,
32 |     'lofter': lofter,
33 |     'music.163': music163,
34 |     'open.163': open163,
35 |     'pearvideo': pearvideo,
36 |     'ippzone': pipigaoxiao,
37 |     'pipix': pipix,
38 |     'music.taihe': qianqian,
39 |     'qingshipin': qingshipin,
40 |     'y.qq': qqmusic,
41 |     'kg': quanminkge,
42 |     'qutoutiao': qutoutiao,
43 |     '5sing': sing5,
44 |     'weibo': weibo,
45 |     'weishi': weishi,
46 |     'xiaokaxiu': xiaokaxiu,
47 |     'xinpianchang': xinpianchang,
48 |     'zhihu': zhihu_video,
49 |     'zuiyou': zuiyou_voice,
50 |     'sohu': sohuTV,
51 |     'ted': ted,
52 |     'tudou': tudou,
53 |     'momo': momo,
54 |     'music.migu': migu_music,
55 |     '58pic': pic58,
56 |     'tuchong': tuchong
57 | }
58 | 
59 | 
60 | class Task:
61 |     def __init__(self, url, save_path='', file_name=None, file_type='unknown'):
62 |         self.url = url
63 |         self.save_path = save_path
64 |         self.file_name = file_name or str(datetime.now())
65 |         self.file_type = file_type
66 | 
67 | 
68 | def data2tasks(data: dict) -> list:
69 |     title = data.get("title")
70 |     author = data.get("author")
71 |     audioName = data.get("audioName")
72 |     videoName = data.get("videoName")
73 |     imgs = data.get("imgs")
74 |     audios = data.get("audios")
75 |     videos = data.get("videos")
76 |     text = data.get("text")
77 |     msg = data.get("msg")
78 | 
79 |     if msg:
80 |         print(msg)
81 |         print()
82 |     if text:
83 |         print(text)
84 |         print()
85 |     tasks = []
86 |     if imgs:
87 |         img_tasks = [Task(img, 'download/images', file_type='jpg') for img in imgs]
88 |         tasks.extend(img_tasks)
89 |     if audios:
90 |         file_name = (audioName or "") + "-" + (author or "")
91 |         audio_tasks = [Task(audio, 'download/audios', file_name=file_name, file_type='mp3') for audio in audios]
92
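        # Aside: utils.py is not shown in this section. A hedged sketch of the
        # interface extract.py relies on (assumed shape inferred from the calls
        # below, not the actual file):
        #
        #     def retry(times):
        #         """Decorator: re-run the wrapped function on exception,
        #         up to `times` extra attempts."""
        #
        #     def download(file_url, save_path, file_name, file_type):
        #         """Stream file_url into <save_path>/<file_name>.<file_type>."""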
| tasks.extend(audio_tasks) 93 | if videos: 94 | file_name = (videoName or title or "") 95 | video_tasks = [Task(video, 'download/videos', file_name=file_name, file_type='mp4') for video in videos] 96 | tasks.extend(video_tasks) 97 | return tasks 98 | 99 | 100 | @utils.retry(2) 101 | def dl(dl_queue: Queue): 102 | while not dl_queue.empty(): 103 | task = dl_queue.get() # type: Task 104 | utils.download(file_url=task.url, 105 | save_path=task.save_path, 106 | file_name=task.file_name, 107 | file_type=task.file_type) 108 | 109 | 110 | def get_data(url): 111 | for c_name, c_func in crawlers.items(): 112 | if c_name in url: 113 | data = c_func.get(url) 114 | print(data) 115 | return data 116 | print(f'链接【\033[31m{url}\033[0m】不支持') 117 | return None 118 | 119 | 120 | @utils.retry(2) 121 | def parse_urls(text: str) -> list: 122 | urls = re.findall( 123 | r"https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\.[-A-Za-z]+[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", text) 124 | return urls 125 | 126 | 127 | if __name__ == "__main__": 128 | printTips() 129 | while True: 130 | what = input("输入链接http开头(输入任意不包含链接的内容就能退出):") 131 | urls = parse_urls(what) 132 | if not urls: 133 | print("bye~") 134 | break 135 | print(f""" 136 | ╭━━━━━━━━━━━━━╮ 137 | │ 一共{len(urls)}个链接 │ 138 | ╰━━━━━━━━━━━━━╯ 139 | """) 140 | all_task = [] 141 | for idx, url in enumerate(urls): 142 | print(f"正在解析第{idx+1}个链接【{url}】") 143 | data = get_data(url) 144 | if data: 145 | all_task.extend(data2tasks(data)) 146 | 147 | queue = Queue(maxsize=100) 148 | for t in all_task: 149 | queue.put(t) 150 | 151 | print() 152 | print(f'{len(all_task)} tasks!') 153 | print() 154 | ts = [Thread(target=dl, args=(queue, )) for _ in range(min(len(all_task), 6))] 155 | for t in ts: 156 | t.start() 157 | 158 | for t in ts: 159 | t.join() 160 | -------------------------------------------------------------------------------- /extractor/README.md: -------------------------------------------------------------------------------- 1 | ### 这里是一些小爬虫集合 2 | 3 | --- 4 | 5 | 每个平台对应一个文件,每个文件里面有一个`get(url: str)`函数 统一形式如下(里面使用 f-string 需要 python3.6+): 6 | 7 | ```python 8 | """ 9 | Args: 10 | url: str 11 | Returns: 12 | data: dict 13 | |_ { 14 | title: str, 15 | author: str, 16 | audioName: str, 17 | videoName: str, 18 | imgs: List[str], 19 | audios: List[str], 20 | videos: List[str], 21 | text: str, 22 | msg: str 23 | } 24 | Tips: 25 | data里面的各个字段只有当爬取到相关内容时才会存在,除了msg(不过这个没啥大用) 26 | ☆ 爬取未成功也会返回data,而且不一定为空 27 | """ 28 | ``` 29 | 30 | # 默认输入的链接都正确:grin: 31 | 32 | --- 33 | 34 | | 平台 | 资源内容 | 完成状态 | 35 | | :--------------------: | :------------------: | :----------------: | 36 | | bilibili(哔哩哔哩) | 封面、视频 | :white_check_mark: | 37 | | changya(唱鸭) | 音频 | :white_check_mark: | 38 | | douyin(抖音) | 无水印视频 | :white_check_mark: | 39 | | kugou(酷狗) | 音频 | :white_check_mark: | 40 | | kuwo(酷我) | 音频 | :white_check_mark: | 41 | | lizhiFM(荔枝 FM) | 音频 | :white_check_mark: | 42 | | music163(网易云音乐) | 音频、视频、mv | :white_check_mark: | 43 | | qqmusic(QQ 音乐) | 音频 | :white_check_mark: | 44 | | pipigaoxiao(皮皮搞笑) | 无水印视频 | :white_check_mark: | 45 | | quanminkge(全民 K 歌) | 音频或视频 | :white_check_mark: | 46 | | weibo(微博) | 视频 | :white_check_mark: | 47 | | weishi(微视) | 无水印视频 | :white_check_mark: | 48 | | zhihu(知乎) | 视频 | :white_check_mark: | 49 | | zuiyou_voice(最右) | 音频(语音帖评论) | :white_check_mark: | 50 | | zuiyou_video(最右) | 视频 | :white_check_mark: | 51 | | qianqian(千千音乐) | 音频 | :white_check_mark: | 52 | | 5sing(5sing) | 音频 | :white_check_mark: | 53 | | pipix(皮皮虾) | 无水印视频 | 
:white_check_mark: | 54 | | qingshipin(轻视频) | 无水印视频 | :white_check_mark: | 55 | | qutoutiao(趣头条) | 视频 | :dash: | 56 | | ku6(酷 6 网) | 视频 | :white_check_mark: | 57 | | lofter(乐乎) | 视频 | :white_check_mark: | 58 | | open163(网易公开课) | 免费视频 | :white_check_mark: | 59 | | xinpianchang(新片场) | 视频 | :white_check_mark: | 60 | | baidutieba(百度贴吧) | 视频 | :white_check_mark: | 61 | | kuaishou(快手) | 无水印视频、长图视频 | :white_check_mark: | 62 | | acfun(AcFun 弹幕网) | 视频 | :white_check_mark: | 63 | | haokan(百度好看视频) | 视频 | :white_check_mark: | 64 | | pearvideo(梨视频) | 视频 | :white_check_mark: | 65 | | xiaokaxiu(小咖秀) | 无水印视频 | :white_check_mark: | 66 | | sohuTV(搜狐视频) | 视频 | :white_check_mark: | 67 | | ted(TED) | 视频 | :white_check_mark: | 68 | | tudou(土豆视频) | 视频 | :white_check_mark: | 69 | | quanminxsp(全民小视频) | 视频 | :white_check_mark: | 70 | | lequ(乐趣) | 背景动图、音频 | :white_check_mark: | 71 | | peiyinxiu(配音秀) | 视频 | :white_check_mark: | 72 | | tuchong(图虫) | 图片 | :white_check_mark: | 73 | | changba(唱吧) | 视频 | :white_check_mark: | 74 | | migu(咪咕音乐) | 音频 | :white_check_mark: | 75 | | momo(陌陌) | 视频 | :white_check_mark: | 76 | | 58pic(千图网) | 图片 | :white_check_mark: | 77 | | qmgx(全民搞笑) | 无水印视频 | :white_check_mark: | 78 | -------------------------------------------------------------------------------- /extractor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "wongxy github:xiyaowong" 2 | __all__ = [ 3 | "bilibili", 4 | "changya", 5 | "douyin", 6 | "kugou", 7 | "kuwo", 8 | "lizhiFM", 9 | "music163", 10 | "pipigaoxiao", 11 | "quanminkge", 12 | "weibo", 13 | "zhihu_video", 14 | "weishi", 15 | "zuiyou_voice", 16 | "zuiyou_video", 17 | "qqmusic", 18 | "qianqian", 19 | "sing5", 20 | "pipix", 21 | "qingshipin", 22 | "qutoutiao", 23 | "ku6", 24 | "lofter", 25 | "open163", 26 | "xinpianchang", 27 | "baidutieba", 28 | "kuaishou", 29 | "acfun", 30 | "haokan", 31 | "pearvideo", 32 | "xiaokaxiu", 33 | "sohuTV", 34 | "ted", 35 | "tudou", 36 | "quanminxsp", 37 | "lequ", 38 | "peiyinxiu", 39 | "tuchong", 40 | "changba", 41 | "migu_music", 42 | "momo", 43 | "pic58", 44 | "qmgx" 45 | ] 46 | -------------------------------------------------------------------------------- /extractor/acfun.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str) -> dict: 7 | """ 8 | title、videos 9 | """ 10 | data = {} 11 | headers = { 12 | "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25" 13 | } 14 | info_url = "https://api-new.acfunchina.com/rest/app/play/playInfo/mp4?videoId={}&resourceId={}&resourceType=2&mkey=AAHewK3eIAAyMjAzNTI2NDMAAhAAMEP1uwS3Vi7NYAAAAJumF4MyTTFh5HGoyjW6ZpdjKymALUy9jZbsMTBVx-F10EhxyvpMtGQbBCYipvkMShM3iMNwbMd9DM6r2rnOYRVEdr6MaJS4yxxlA_Sl3JNWup57qBCQzOSC7SZnbEsHTQ%3D%3D&market=xiaomi&product=ACFUN_APP&sys_version=10&app_version=6.20.0.915&boardPlatform=sdm845&sys_name=android&socName=UNKNOWN&appMode=0" 15 | # info_url = "https://m.acfun.cn/rest/mobile-direct/play/playInfo/singleQuality?videoId={}&resourceId={}&resourceType=2&mkey=AAHewK3eIAAyMjA5NTQ0MDACARAAMEP1uwPvjQhfQAAAAIAq7FtjRH%2Fn9rSMzs1AUNhmIS6eARtddADGgoGewjnABMg39tddqp9dTUq%2Ffd7MBisH5JpVc1bpf64a%2Bz3qrdI%3D" 16 | 17 | # get videoId, resourceIds 18 | re_title = r'(.*?)' 19 | re_videoId = r'"vid":"(\d+)",' 20 | re_resourceId = r'"ac":"(\d+)",' 21 | 22 | try: 23 | rep_html = requests.get(url, headers=headers, timeout=10) 24 | 25 | 
title = re.findall(re_title, rep_html.text)[0] 26 | videoId = re.findall(re_videoId, rep_html.text)[0] 27 | resourceId = re.findall(re_resourceId, rep_html.text)[0] 28 | 29 | rep_info = requests.get(info_url.format(videoId, resourceId), headers=headers, timeout=10) 30 | 31 | video = rep_info.json()["playInfo"]["streams"][0]["playUrls"][0] 32 | except (IndexError, TypeError): 33 | data["msg"] = "获取失败" 34 | else: 35 | data["title"] = title 36 | data["videos"] = [video] 37 | 38 | return data 39 | 40 | 41 | if __name__ == "__main__": 42 | url = "https://m.acfun.cn/v/?ac=14134176&part=2" 43 | print(get(url)) 44 | -------------------------------------------------------------------------------- /extractor/baidutieba.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str) -> dict: 7 | """ 8 | videos 9 | """ 10 | data = {} 11 | headers = { 12 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", 13 | } 14 | rep = requests.get(url, headers=headers, timeout=10) 15 | if rep.status_code == 200: 16 | data["videos"] = re.findall(r'data-video="(.*?)"', rep.text) 17 | else: 18 | data["msg"] = "获取失败" 19 | 20 | return data 21 | 22 | 23 | if __name__ == "__main__": 24 | # url = "https://tieba.baidu.com/p/6098286801?share=9105&fr=share&sfc=copy&client_type=2&client_version=11.3.8.2&st=1585294971&unique=190E4CEC3908756B412C7ABAE54C772F&red_tag=2618234446" 25 | url = input("url: ") 26 | print(get(url)) 27 | -------------------------------------------------------------------------------- /extractor/bilibili.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str) -> dict: 7 | """ 8 | imgs、videos 9 | """ 10 | data = {} 11 | headers = { 12 | "user-agent": 13 | "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1", 14 | "Referer": "https://www.bilibili.com/", 15 | } 16 | 17 | av_number_pattern = r'(BV[0-9a-zA-Z]*)' 18 | cover_pattern = r"readyPoster: '(.*?)'," 19 | video_pattern = r"readyVideoUrl: '(.*?)'," 20 | title_pattern = r'title":"(.*?)",' 21 | 22 | av = re.findall(av_number_pattern, url) 23 | if av: 24 | av = av[0] 25 | else: 26 | data["msg"] = "链接可能不正确,因为我无法匹配到av号" 27 | return data 28 | url = f"https://www.bilibili.com/video/{av}" 29 | 30 | with requests.get(url, headers=headers, timeout=10) as rep: 31 | if rep.status_code == 200: 32 | cover_url = re.findall(cover_pattern, rep.text) 33 | if cover_url: 34 | cover_url = cover_url[0] 35 | if '@' in cover_url: 36 | cover_url = cover_url[:cover_url.index('@')] 37 | data["imgs"] = ['https:' + cover_url] 38 | 39 | video_url = re.findall(video_pattern, rep.text) 40 | title_text = re.findall(title_pattern, rep.text) 41 | if video_url: 42 | video_url = video_url[0] 43 | data["videos"] = [video_url] 44 | if title_text: 45 | data["videoName"] = title_text[0] 46 | else: 47 | data["msg"] = "获取失败" 48 | return data 49 | 50 | 51 | if __name__ == "__main__": 52 | print(get(input("url: "))) 53 | -------------------------------------------------------------------------------- /extractor/changba.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import execjs 4 | import requests 5 | 6 | js_code = """l=new 
Array(-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1);function u(t){var e,o,n,a,i,r,s;for(r=t.length,i=0,s="";i>4);do{if(61==(n=255&t.charCodeAt(i++)))return s;n=l[n]}while(i>2);do{if(61==(a=255&t.charCodeAt(i++)))return s;a=l[a]}while(i dict: 11 | """ 12 | videos 13 | """ 14 | data = {} 15 | headers = { 16 | "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36" 17 | } 18 | rep = requests.get(url, headers=headers, timeout=10) 19 | if rep.status_code != 200: 20 | return {"msg": "获取失败"} 21 | 22 | enc_video_url = re.findall(r"video_url: '(.*?)',", rep.text)[0] 23 | video_url = "https:" + js.call("u", (enc_video_url,)) 24 | data["videos"] = [video_url] 25 | return data 26 | 27 | 28 | if __name__ == "__main__": 29 | print(get(input("url: "))) 30 | -------------------------------------------------------------------------------- /extractor/changya.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str) -> dict: 7 | """ 8 | author、audioName、audios 9 | """ 10 | data = {} 11 | headers = { 12 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", 13 | } 14 | audio_url_pattern = r'', html) 24 | data["author"] = author 25 | data["audioName"] = audioName 26 | data["imgs"] = imgs 27 | data["audios"] = audios 28 | except Exception: 29 | data["msg"] = {"msg": "获取失败"} 30 | 31 | return data 32 | 33 | 34 | if __name__ == "__main__": 35 | url = "https://api.bestdjb.com/promote/song-share/6477f04370cc22e7d9c2d3ac4265a92a?app_version=1.4.3" 36 | print(get(url)) 37 | -------------------------------------------------------------------------------- /extractor/lizhiFM.py: -------------------------------------------------------------------------------- 1 | # from urllib.parse import urlparse 2 | import re 3 | 4 | import requests 5 | 6 | 7 | def get(url: str) -> dict: 8 | """ 9 | author、audioName、audios 10 | """ 11 | data = {} 12 | headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"} 13 | info_url = "https://m.lizhi.fm/vodapi/voice/info/{id}" 14 | 15 | # path = urlparse(url).path 16 | # voiceId = path.split("/")[-1] 17 | voiceId = re.findall(r"/(\d{1,})", url) 18 | if not voiceId: 19 | data["msg"] = "链接无效,解析未成功" 20 | return data 21 | else: 22 | voiceId = voiceId[-1] 23 | 24 | with requests.get(info_url.format(id=voiceId), headers=headers, timeout=10) as rep: 25 | if rep.status_code == 200 and rep.json().get("code") == 0: 26 | info = rep.json() 27 | userName = info.get("data").get("userVoice").get("userInfo").get("name") 28 | voiceName = info.get("data").get("userVoice").get("voiceInfo").get("name") 29 | voiceUrl= info.get("data").get("userVoice").get("voicePlayProperty").get("trackUrl") 30 | data["author"] = userName 31 | data["audioName"] = voiceName 32 | data["audios"] = [voiceUrl] 33 | else: 34 | data["msg"] = "未能解析成功" 35 | 36 | return data 37 | 38 | 39 | 40 | 41 | if __name__ == "__main__": 42 | url = input("url: ") 43 | print(get(url)) 
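
# Aside on extractor/changba.py above: the lookup table `l` in its js_code is
# the standard Base64 alphabet (43 -> 62 for '+', 47 -> 63 for '/',
# 48-57 -> 52-61 for digits, 65-90 -> 0-25 for 'A'-'Z', 97-122 -> 26-51 for
# 'a'-'z'), and u(t) is a plain Base64 decoder. If that reading is right, the
# execjs/Node.js dependency could be replaced with a sketch like this:

import base64

def decode_video_url(enc: str) -> str:
    """Pure-Python equivalent of changba's js.call("u", (enc,))."""
    padded = enc + "=" * (-len(enc) % 4)  # restore padding if it was stripped
    return base64.b64decode(padded).decode("utf-8")

# e.g. video_url = "https:" + decode_video_url(enc_video_url)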
-------------------------------------------------------------------------------- /extractor/lofter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str) -> dict: 7 | """ 8 | videos 9 | """ 10 | data = {} 11 | rep = requests.get(url, timeout=10) 12 | if rep.status_code == 200: 13 | data["videos"] = re.findall(r' 1 else singerName[0] 31 | 32 | # audioName 33 | audioName = json["songName"] 34 | 35 | # contentId 36 | c_item = json.get("qq") # type:dict 37 | 38 | if not c_item: 39 | return {"msg": "获取失败"} 40 | contentId = c_item["productId"] 41 | 42 | # toneFlag 43 | toneFlag = "HQ" if json["hasHQqq"] == "1" else "LQ" 44 | 45 | video_url = player_url.format(copyrightId=copyrightId, 46 | contentId=contentId, 47 | toneFlag=toneFlag, 48 | resourceType=2) 49 | 50 | data["author"] = author 51 | data["audioName"] = audioName 52 | data["videos"] = [video_url] 53 | 54 | return data 55 | 56 | 57 | if __name__ == "__main__": 58 | url = "http://music.migu.cn/v3/music/song/69910422841" 59 | print(get(url)) 60 | -------------------------------------------------------------------------------- /extractor/momo.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str): 7 | """ 8 | title、imgs、videos 9 | """ 10 | data = {} 11 | headers = { 12 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36" 13 | } 14 | api = "https://m.immomo.com/inc/microvideo/share/profiles" 15 | 16 | ar = re.findall(r'/(ar.*?)\.html', url) 17 | if not ar: 18 | return {"msg": "失败"} 19 | ar = ar[0] 20 | 21 | payload = { 22 | "feedids": ar, 23 | "name": "", 24 | "avatar": "", 25 | } 26 | 27 | rep = requests.post(api, data=payload, headers=headers, timeout=6) 28 | if rep.status_code == 200 and rep.json()["ec"] == 200: 29 | info = rep.json()["data"] 30 | title = info["list"][0]["content"] 31 | img = info["list"][0]["video"]["cover"]["l"] 32 | video = info["list"][0]["video"]["video_url"] 33 | 34 | data["title"] = data["videoName"] = title 35 | data["imgs"] = [img] 36 | data["videos"] = [video] 37 | else: 38 | data["msg"] = "失败" 39 | 40 | return data 41 | 42 | 43 | if __name__ == "__main__": 44 | from pprint import pprint 45 | url = "https://m.immomo.com/s/moment/new-share-v2/ar8422649104.html" 46 | pprint(get(url)) 47 | -------------------------------------------------------------------------------- /extractor/music163/__init__.py: -------------------------------------------------------------------------------- 1 | from .music163 import Wangyiyun 2 | 3 | 4 | def get(url: str) -> dict: 5 | """ 6 | aduios或者videos 7 | """ 8 | data = {} 9 | wangyiyun = Wangyiyun() 10 | resource_url = wangyiyun.get(url) 11 | if not resource_url: 12 | return {"msg": "获取失败"} 13 | if "mv" in url or "video" in url: 14 | data["videos"] = [resource_url] 15 | elif "song" in url: 16 | data["audios"] = [resource_url] 17 | return data 18 | 19 | 20 | __all__ = ["get"] 21 | -------------------------------------------------------------------------------- /extractor/music163/encrypt.py: -------------------------------------------------------------------------------- 1 | # 原理:https://www.zhihu.com/question/36081767 代码块直接copy的:https://github.com/CharlesPikachu/Music-Downloader 2 | import base64 3 | import codecs 4 | import json 5 | import os 6 | 7 | from Crypto.Cipher import AES 8 | 9 | 10 | class Cracker(): 11 | modulus 
= '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7' 12 | nonce = '0CoJUm6Qyw8W8jud' 13 | pubKey = '010001' 14 | 15 | @classmethod 16 | def get(cls, text): 17 | text = json.dumps(text) 18 | secKey = cls._createSecretKey(16) 19 | encText = cls._aesEncrypt(cls._aesEncrypt(text, cls.nonce), secKey) 20 | encSecKey = cls._rsaEncrypt(secKey, cls.pubKey, cls.modulus) 21 | post_data = {'params': encText, 'encSecKey': encSecKey} 22 | return post_data 23 | 24 | @classmethod 25 | def _aesEncrypt(cls, text, secKey): 26 | pad = 16 - len(text) % 16 27 | if isinstance(text, bytes): 28 | text = text.decode('utf-8') 29 | text = text + str(pad * chr(pad)) 30 | secKey = secKey.encode('utf-8') 31 | encryptor = AES.new(secKey, 2, b'0102030405060708') 32 | text = text.encode('utf-8') 33 | ciphertext = encryptor.encrypt(text) 34 | ciphertext = base64.b64encode(ciphertext) 35 | return ciphertext 36 | 37 | @classmethod 38 | def _rsaEncrypt(cls, text, pubKey, modulus): 39 | text = text[::-1] 40 | rs = int(codecs.encode(text.encode('utf-8'), 'hex_codec'), 16)**int(pubKey, 16) % int(modulus, 16) 41 | return format(rs, 'x').zfill(256) 42 | 43 | @classmethod 44 | def _createSecretKey(cls, size): 45 | return (''.join(map(lambda xx: (hex(ord(xx))[2:]), str(os.urandom(size)))))[0:16] 46 | 47 | 48 | if __name__ == "__main__": 49 | print(Cracker.get("Hello World")) 50 | -------------------------------------------------------------------------------- /extractor/music163/music163.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import unquote 3 | 4 | import requests 5 | 6 | from .encrypt import Cracker 7 | 8 | 9 | class Wangyiyun(): 10 | def __init__(self): 11 | self.headers = { 12 | 'Referer': 'https://music.163.com/', 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.32 Safari/537.36' 14 | } 15 | self.music_url = 'http://music.163.com/weapi/song/enhance/player/url?csrf_token=' 16 | self.mv_url = "https://music.163.com/weapi/song/enhance/play/mv/url?csrf_token=" 17 | 18 | def get(self, url): 19 | """ 20 | 返回资源链接 21 | """ 22 | 23 | if "video" in url: 24 | return self.get_video(url) 25 | 26 | id = self.get_id(url) 27 | if "mv" in url: 28 | params = {"id": id, "r": "1080", "csrf_token": ""} 29 | data = self.__postRequests(self.mv_url, params) 30 | if data: 31 | return data["data"]["url"] 32 | elif "song" in url: 33 | params = {'ids': [int(id)], 'br': 320000, 'csrf_token': ''} 34 | data = self.__postRequests(self.music_url, params) 35 | if data: 36 | return data["data"][0]["url"] 37 | return None 38 | 39 | def get_video(self, url): 40 | id = self.get_id(url) 41 | url = f"http://music.163.com/video/{id}/" 42 | rep = requests.get(url, headers=self.headers, timeout=6) 43 | if rep.status_code == 200: 44 | encoded_url = re.findall(r'', rep.text)[0] 45 | return unquote(encoded_url) 46 | return None 47 | 48 | # 匹配id 49 | def get_id(self, raw_url) -> str: 50 | pattern1 = re.compile(r'\?id=(\w+)') 51 | pattern2 = re.compile(r'song/(\w+)/') 52 | pattern3 = re.compile(r'mv/(\w+)/') 53 | pattern4 = re.compile(r'video/(\w+)/') 54 | if "?id" in raw_url: 55 | id = re.findall(pattern1, raw_url) 56 | elif "song" in raw_url: 57 | id = re.findall(pattern2, raw_url) 58 | elif "mv" in raw_url: 59 | id = 
re.findall(pattern3, raw_url)
60 |         elif "video" in raw_url:
61 |             id = re.findall(pattern4, raw_url)
62 |         else:
63 |             id = []  # no pattern matched; avoids an UnboundLocalError below
64 |         return id[0] if id else None
65 | 
66 |     def __postRequests(self, url, params, timeout=6):
67 |         post_data = Cracker.get(params)
68 |         rep = requests.post(url,
69 |                             data=post_data,
70 |                             timeout=timeout,
71 |                             headers=self.headers)
72 |         if rep.json()['code'] == 200:
73 |             return rep.json()
74 |         return None
75 | 
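# Aside: a hedged usage sketch for the weapi flow above — Cracker.get (from
# encrypt.py) wraps a plain params dict into the {"params", "encSecKey"} form
# that music.163.com's weapi endpoints expect, exactly as __postRequests does.
# The song id below is illustrative, not taken from the repo:

import requests
from extractor.music163.encrypt import Cracker

def weapi_post(url: str, params: dict, timeout: int = 6) -> dict:
    """POST weapi-encrypted params and return the parsed JSON reply."""
    headers = {
        "Referer": "https://music.163.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.32 Safari/537.36",
    }
    return requests.post(url, data=Cracker.get(params), headers=headers, timeout=timeout).json()

# e.g.:
# weapi_post("http://music.163.com/weapi/song/enhance/player/url?csrf_token=",
#            {"ids": [186016], "br": 320000, "csrf_token": ""})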
--------------------------------------------------------------------------------
/extractor/open163.py:
--------------------------------------------------------------------------------
 1 | # pylint: disable=W0123
 2 | import re
 3 | import requests
 4 | 
 5 | 
 6 | def get(url: str) -> dict:
 7 |     """
 8 |     videos
 9 |     """
10 |     data = {}
11 |     data["videos"] = []
12 |     headers = {
13 |         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
14 |     }
15 |     re_url = r'mid:(.*?),.*?mp4SdUrlOrign:(.*?),.*?mp4HdUrlOrign:(.*?),.*?mp4ShdUrlOrign:(.*?),'
16 |     rep = requests.get(url, headers=headers, timeout=10)
17 |     items = re.findall(re_url, rep.text)
18 |     for item in items:
19 |         # 倒序取最高画质
20 |         for video_url in item[::-1]:  # type: str
21 |             # print(url)
22 |             if "http" in video_url:
23 |                 video_url = eval(video_url).replace("\\u002F", "/")
24 |                 data["videos"].append(video_url)
25 |                 break
26 |     return data
27 | 
28 | 
29 | if __name__ == "__main__":
30 |     url = "http://open.163.com/newview/movie/free?pid=M8LI1JCE6&mid=M8LI3BQ60"
31 |     print(get(url))
--------------------------------------------------------------------------------
/extractor/pearvideo.py:
--------------------------------------------------------------------------------
 1 | # hdflvUrl="",sdflvUrl="",hdUrl="",sdUrl="",ldUrl="",srcUrl="https://video.pearvideo.com/mp4/adshort/20200328/cont-1665047-11947733-122441_adpkg-ad_hd.mp4",
 2 | # data-title="奥运推迟后东京新冠确诊数翻倍,《纽约时报》发文质疑" data-summary="从3月23日起,东京地区的新冠病毒确诊数就连续4天上涨。在24日官宣东京奥运推迟之后,第二天确诊数更是直接翻倍。《纽约时报》写了一篇文章,列出了各种数据,质疑此前东京为了奥运会而牺牲检测。"
 3 | 
 4 | import re
 5 | import requests
 6 | 
 7 | 
 8 | def get(url: str) -> dict:
 9 |     """
10 |     title、videos、text
11 |     """
12 |     data = {}
13 |     headers = {
14 |         "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
15 |     }
16 |     try:
17 |         rep = requests.get(url, headers=headers, timeout=10)
18 |         data["title"], data["text"] = re.findall(r'data-title="(.*?)" data-summary="(.*?)"', rep.text)[0]
19 |         data["videos"] = re.findall(r'srcUrl="(.*?\.mp4)",', rep.text)
20 |     except (ConnectionError, IndexError, TypeError):
21 |         data["msg"] = "获取失败"
22 | 
23 |     return data
24 | 
25 | 
26 | if __name__ == "__main__":
27 |     url = "https://www.pearvideo.com/video_1664989"
28 |     print(get(url))
--------------------------------------------------------------------------------
/extractor/peiyinxiu.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import requests
 4 | 
 5 | 
 6 | def get(url: str) -> dict:
 7 |     """
 8 |     title、videos
 9 |     """
10 |     data = {}
11 |     headers = {
12 |         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
13 |     }
14 |     rep = requests.get(url, headers=headers, timeout=10)
15 |     if rep.status_code != 200:
16 |         return {"msg": "获取失败"}
17 |     html = rep.text
18 |     data["title"] = re.findall(r'data-title="(.*?)"', html)[0]
19 |     data["videos"] = re.findall(r"\sfilmurl: '(.*?)',", html)
20 |     return data
21 | 
22 | 
23 | if __name__ == "__main__":
24 |     url = "http://peiyinxiu.com/m/127066455"
25 |     print(get(url))
--------------------------------------------------------------------------------
/extractor/pic58.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import requests
 4 | 
 5 | 
 6 | def get(url: str) -> dict:
 7 |     """https://www.58pic.com/newpic/*.html
 8 | 
 9 |     imgs
10 |     """
11 |     rep = requests.get(url, timeout=6)
12 |     if not rep.ok:
13 |         return {"msg": "失败"}
14 |     pre_url = re.findall(r'<img src="(preview\.qiantucdn\.com.*?)"', rep.text)  # reconstructed pattern: a schemeless preview URL, rewritten below
15 |     if not pre_url:
16 |         return {"msg": "失败"}
17 |     pre_url = pre_url[0]  # type: str
18 |     img_url = pre_url.replace("preview.qiantucdn.com", "https://pic.qiantucdn.com")
19 |     return {"imgs": [img_url], "msg": f"下载时需要设置referer: {url}"}
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     # url = input("url: ")
24 |     url = "https://www.58pic.com/newpic/34673009.html"
25 |     print(get(url))
--------------------------------------------------------------------------------
/extractor/pipigaoxiao.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import re
 3 | 
 4 | import requests
 5 | 
 6 | 
 7 | def get(url: str) -> dict:
 8 |     """
 9 |     videos
10 |     """
11 |     data = {}
12 |     headers = {
13 |         "Host": "share.ippzone.com",
14 |         "Connection": "keep-alive",
15 |         "Content-Length": "45",
16 |         "Origin": "http://share.ippzone.com",
17 |         "User-Agent":
18 |         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
19 |         "Content-Type": "text/plain;charset=UTF-8",
20 |         "Accept": "*/*",
21 |         "Referer": "http://share.ippzone.com/",
22 |         "Accept-Encoding": "gzip, deflate",
23 |         "Accept-Language": "zh-CN,zh;q=0.9",
24 |     }
25 | 
26 |     post_url = "http://share.ippzone.com/ppapi/share/fetch_content"
27 | 
28 |     pid = re.findall(r"/(\d{1,})", url)
29 |     if not pid:
30 |         data["msg"] = "链接无效,无法获取有效数据"
31 |         return data
32 |     else:
33 |         pid = int(pid[0])
34 | 
35 |     post_data = {
36 |         "pid": pid,
37 |         "type": "post",
38 |     }
39 | 
40 |     with requests.post(post_url, headers=headers, data=json.dumps(post_data), timeout=10) as rep:
41 |         if rep.status_code == 200 and rep.json().get("ret") == 1:
42 |             id = rep.json().get("data").get("post").get("imgs")[0].get("id")
43 |             play_url = rep.json().get('data').get('post').get('videos').get(str(id)).get('url')
44 |             data["videos"] = [play_url]
45 |         else:
46 |             data["msg"] = "资源获取失败,请确认输入是否正确"
47 | 
48 |     return data
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     print(get(input("url: ")))
--------------------------------------------------------------------------------
/extractor/pipix.py:
--------------------------------------------------------------------------------
 1 | # author: wongxy
 2 | # --------------
 3 | # https://h5.pipix.com/item/******************
 4 | import re
 5 | import requests
 6 | 
 7 | 
 8 | def get(url: str) -> dict:
 9 |     """
10 |     title、audios
11 |     """
12 |     data = {}
13 |     headers = {
14 |         "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
15 |     }
16 |     item_id = re.findall(r"item/(\d+)", url)
17 |     if not item_id:
18 |         return {"msg": "获取失败"}
19 |     item_id = item_id[0]
20 |     info_url = f"https://h5.pipix.com/bds/webapi/item/detail/?item_id={item_id}&source=share"
21 |     with requests.get(info_url, headers=headers, timeout=10) as rep:
22 |         if rep.status_code != 200 or rep.json().get("status_code") != 0:
23 |             return {"msg": "获取失败"}
24 |         info =
rep.json()["data"]["item"] 25 | data["title"] = info["share"]["title"] 26 | data["audios"] = [info["origin_video_download"]["url_list"][0]["url"]] 27 | 28 | 29 | 30 | return data 31 | 32 | 33 | if __name__ == "__main__": 34 | print(get(input("url: "))) -------------------------------------------------------------------------------- /extractor/qianqian.py: -------------------------------------------------------------------------------- 1 | # qianqian music 2 | # music.taihe.com 3 | import re 4 | import requests 5 | 6 | 7 | def get(url: str) -> dict: 8 | """ 9 | url sample: http://music.taihe.com/song/******** 10 | 11 | author、audioName、imgs、audios 12 | """ 13 | data = {} 14 | headers = { 15 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36" 16 | } 17 | songinfo_format_url = "http://musicapi.taihe.com/v1/restserver/ting?method=baidu.ting.song.playAAC&format=jsonp&songid={songid}&from=web" 18 | 19 | songid = re.findall(r"song/(\d+)", url) 20 | if not songid: 21 | data["msg"] = "无法获取有效消息" 22 | return data 23 | songid = songid[0] 24 | songinfo_url = songinfo_format_url.format(songid=songid) 25 | with requests.get(songinfo_url, headers=headers, timeout=10) as rep: 26 | if rep.status_code != 200: 27 | data["msg"] = "无法获取有效消息" 28 | return data 29 | result = rep.json() 30 | data["author"] = result["songinfo"]["artist"] 31 | data["audioName"] = result["songinfo"]["title"] 32 | data["imgs"] = [result["songinfo"]["album_1000_1000"]] 33 | data["audios"] = [result["bitrate"]["show_link"] or result["bitrate"]["file_link"]] 34 | 35 | return data 36 | 37 | 38 | if __name__ == "__main__": 39 | import pprint 40 | pprint.pprint(get(input("url: "))) -------------------------------------------------------------------------------- /extractor/qingshipin.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | def get(url: str): 5 | """ 6 | author、title、imgs、videos 7 | """ 8 | data = {} 9 | headers = { 10 | "User-Agent": 11 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like\ 12 | Gecko) Chrome/80.0.3987.149 Safari/537.36" 13 | } 14 | detail_url = url.replace("video/?", "bbq/app-bbq/sv/detail?sv") 15 | with requests.get(detail_url, headers=headers, timeout=10) as rep: 16 | 17 | if rep.status_code != 200: 18 | return {"msg": "error occurred!"} 19 | 20 | json = rep.json() 21 | if json["code"] != 0: 22 | return {"msg": "error occurred!"} 23 | 24 | author = json["data"]["user_info"]["uname"] 25 | title = json["data"]["title"] 26 | imgs = [json["data"]["cover_url"]] 27 | videos = [json["data"]["play"]["url"]] 28 | 29 | data["author"] = author 30 | data["title"] = title 31 | data["imgs"] = imgs 32 | data["videos"] = videos 33 | return data 34 | 35 | 36 | if __name__ == "__main__": 37 | url = input("url: ") 38 | print(get(url)) 39 | -------------------------------------------------------------------------------- /extractor/qmgx.py: -------------------------------------------------------------------------------- 1 | """ 2 | 全民搞笑 https://longxia.music.xiaomi.com/share/video/****** 3 | """ 4 | import re 5 | 6 | import requests 7 | 8 | 9 | def get(url: str) -> dict: 10 | """ 11 | title、videoName、videos 12 | """ 13 | data = {} 14 | vid = re.findall(r'video/(\d+)', url) 15 | if vid: 16 | api = 'https://longxia.music.xiaomi.com/api/share?contentType=video&contentId={}'.format(vid[0]) 17 | headers = { 18 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36' 19 | } 20 | rep = requests.get(api, headers=headers, timeout=5) 21 | if rep.status_code == 200 and rep.json()['code'] == 200: 22 | info = rep.json()['data']['videoInfo']['videoInfo'] 23 | data['title'] = data['videoName'] = info['desc'] 24 | data['videos'] = [info['url']] 25 | return data 26 | return {'msg': 'failed'} 27 | 28 | 29 | if __name__ == "__main__": 30 | print(get('https://longxia.music.xiaomi.com/share/video/6624743459453734912?sharerUserId')) 31 | -------------------------------------------------------------------------------- /extractor/qqmusic.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | 4 | import requests 5 | 6 | 7 | def get(url: str): 8 | """ 9 | author、audioName、audios 10 | """ 11 | data = {} 12 | ios_headers = { 13 | "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1", 14 | "referer": "http://y.qq.com" 15 | } 16 | 17 | # find: songid、songmid and author、audioName 18 | with requests.get(url, headers=ios_headers, timeout=10) as rep: 19 | if rep.status_code != 200: 20 | return {"msg": "链接无效"} 21 | html = rep.text 22 | songid = re.findall(r'songid":(\d+),', html) 23 | songmid = re.findall(r'"songmid":"(.*?)",', html) 24 | if not (songid or songmid): 25 | return {"msg": "提取重要信息失败"} 26 | songid = songid[0] 27 | songmid = songmid[0] 28 | data["audioName"] = re.findall(r'"songname":"(.*?)"', html)[0] 29 | data["author"] = re.findall(r'"name":"(.*?)",', html)[0] 30 | 31 | # vkey 32 | vkey_url = 'https://u.y.qq.com/cgi-bin/musicu.fcg' 33 | params = { 34 | 'data': json.dumps({"req": {"module": "CDN.SrfCdnDispatchServer", "method": "GetCdnDispatch", "param": {"guid": "3982823384", "calltype": 0, "userip": ""}}, "req_0": {"module": "vkey.GetVkeyServer", "method": "CgiGetVkey", "param": {"guid": "3982823384", "songmid": [songmid], "songtype": [0], "uin": "0", "loginflag": 1, "platform": "20"}}, "comm": {"uin": 0, "format": "json", "ct": 24, "cv": 0}}) 35 | } 36 | with requests.get(vkey_url, params=params, headers=ios_headers, timeout=10) as rep: 37 | if rep.json()["code"] != 0 and rep.json()['req_0']['code'] != 0: 38 | return {"msg": "提取重要信息失败"} 39 | data["audios"] = [ 40 | "https://isure.stream.qqmusic.qq.com/{}".format(rep.json()['req_0']['data']['midurlinfo'][0]['purl']) 41 | ] 42 | 43 | return data 44 | 45 | 46 | if __name__ == "__main__": 47 | # print(get(input("url: "))) 48 | url = 'https://y.qq.com/n/yqq/song/003tdyG9003JqW.html' 49 | print(get(url)) 50 | 51 | 52 | # "A000", "ape", 800 53 | # "F000", "flac", 800 54 | # "M800", "mp3", 320 55 | # "C400", "m4a", 128 56 | # "M500", "mp3", 128 57 | -------------------------------------------------------------------------------- /extractor/quanminkge.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url:str) -> dict: 7 | ''' 8 | author、audioName、audios、videos 9 | ''' 10 | data = {} 11 | headers = { 12 | "accept": 13 | "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 14 | "accept-encoding": 15 | "gzip, deflate, br", 16 | "accept-language": 17 | "zh-CN,zh;q=0.9", 18 | "cache-control": 19 | "max-age=0", 20 | "sec-fetch-mode": 21 | "navigate", 22 | "sec-fetch-site": 23 | "none", 24 | "upgrade-insecure-requests": 25 | "1", 26 | "user-agent": 27 | 
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36", 28 | } 29 | singer_pattern = r',"nick":"(.*?)",' 30 | song_name_pattern = r'"song_name":"(.*?)",' 31 | audio_pattern = r'"playurl":"(.*?)",' 32 | video_pattern = r',"playurl_video":"(.*?)",' 33 | 34 | with requests.get(url=url, headers=headers, timeout=10) as rep: 35 | if rep.status_code == 200: 36 | html = rep.text 37 | singer = re.findall(singer_pattern, html) 38 | song_name = re.findall(song_name_pattern, html) 39 | audio_url = re.findall(audio_pattern, html) 40 | video_url = re.findall(video_pattern, html) 41 | if singer: data["author"] = singer[0] 42 | if song_name: data["audioName"] = song_name[0] 43 | if audio_url: data["audios"] = [url for url in audio_url if url != ""] 44 | if video_url: data["videos"] = [url for url in video_url if url != ""] 45 | else: 46 | data["msg"] = "获取失败" 47 | 48 | return data 49 | 50 | 51 | if __name__ == "__main__": 52 | data = get(input("url: ")) 53 | print(data) -------------------------------------------------------------------------------- /extractor/quanminxsp.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def get(url: str) -> dict: 7 | """ 8 | title、videos 9 | """ 10 | data = {} 11 | headers = { 12 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 13 | "Accept-Encoding": "gzip, deflate, br", 14 | "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", 15 | "Cache-Control": "max-age=0", 16 | "Connection": "keep-alive", 17 | "Cookie": "COMMON_LID=d8d795e732f64cd28cbbce9ee76688af; Hm_lvt_a42a9a9e9ea0c8ce010e90569767e0f4=1585966701; Hm_lpvt_a42a9a9e9ea0c8ce010e90569767e0f4=1585969995", 18 | "DNT": "1", 19 | "Host": "quanmin.hao222.com", 20 | "Sec-Fetch-Dest": "document", 21 | "Sec-Fetch-Mode": "navigate", 22 | "Sec-Fetch-Site": "none", 23 | "Sec-Fetch-User": "?1", 24 | "Upgrade-Insecure-Requests": "1", 25 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", 26 | } 27 | re_video = r'' 28 | re_title = r'' 29 | 30 | with requests.get(url, headers=headers, timeout=10) as rep: 31 | if rep.status_code == 200: 32 | title = re.findall(re_title, rep.text) 33 | video = re.findall(re_video, rep.text) 34 | if title: 35 | data["title"] = title[0] 36 | if video: 37 | data["videos"] = video 38 | else: 39 | data["msg"] = "失败" 40 | return data 41 | 42 | 43 | if __name__ == "__main__": 44 | url = "https://quanmin.hao222.com/sv2?source=share-h5&pd=qm_share_mvideo&vid=3877781674274744362&shareTime=1585969946&shareid=0746467921&shared_cuid=0ivn8laMv8l9uHuI_PSua_uS2u_Wav8dYu2ku_iCStloiBaR_8S08jf2QP0Hf1uea1FmA&shared_uid=gO2Ri_aIvtelA" 45 | print(get(url)) 46 | -------------------------------------------------------------------------------- /extractor/qutoutiao.py: -------------------------------------------------------------------------------- 1 | import json 2 | from urllib.parse import urlparse, parse_qs 3 | 4 | import requests 5 | 6 | # TODO: 支持小视频 7 | 8 | def get(url: str) -> dict: 9 | """ 10 | author、title、videos 11 | """ 12 | data = {} 13 | if "new.3qtt.cn" in url: # 短连接转长连接 14 | url = requests.get(url).url 15 | 16 | data_url_format = "http://api.1sapp.com/content/getRecommendV3?key={key}&content_id={content_id}&limit=1" 17 | play_host = "http://v4.qutoutiao.net/" 18 | 19 | query = 
--------------------------------------------------------------------------------
/extractor/sing5.py:
--------------------------------------------------------------------------------
 1 | # author: wongxy
 2 | # --------------
 3 | # 5sing.kugou.com
 4 | import re
 5 | import json
 6 | 
 7 | import requests
 8 | 
 9 | 
10 | def get(url: str) -> dict:
11 |     """
12 |     author、audioName、audios
13 |     """
14 |     data = {}
15 |     headers = {
16 |         "User-Agent":
17 |         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
18 |     }
19 | 
20 |     songinfo_format_url = "http://service.5sing.kugou.com/song/getsongurl?&songid={songid}&songtype=fc&from=web&version=6.6.72"
21 | 
22 |     songid = re.findall(r"/(\d+)", url.replace("5sing", ""))
23 |     if not songid:
24 |         return {"msg": "无法从链接获取关键信息"}
25 |     songid = songid[0]
26 | 
27 |     songinfo_url = songinfo_format_url.format(songid=songid)
28 |     with requests.get(songinfo_url, headers=headers, timeout=10) as rep:
29 |         if rep.status_code != 200:
30 |             return {"msg": "获取失败, 链接可能无效"}
31 |         json_ = json.loads(rep.text[1: -1])
32 |         if json_["code"] != 0:
33 |             return {"msg": "获取失败, 链接可能无效"}
34 |         info = json_["data"]
35 |         data["author"] = info["user"]["NN"]
36 |         data["audioName"] = info["songName"]
37 |         data["audios"] = [
38 |             info.get("squrl") or info.get("hqurl") or info.get("lqurl")
39 |         ]
40 | 
41 |     return data
42 | 
43 | 
44 | if __name__ == "__main__":
45 |     from pprint import pprint
46 |     pprint(get("http://5sing.kugou.com/fc/15717150.html"))
47 |     # print(get(input("url: ")))
--------------------------------------------------------------------------------
/extractor/sohuTV.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import requests
 4 | 
 5 | 
 6 | def get(url: str) -> dict:
 7 |     """
 8 |     title、videoName、videos
 9 |     """
10 |     data = {}
11 |     session = requests.Session()
12 |     ERROR = {"msg": "获取失败"}
13 |     headers = {
14 |         "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
15 |     }
16 |     videoInfo_url = "https://my.tv.sohu.com/play/videonew.do"
17 |     playInfo_url = "https://data.vod.itc.cn/ip"
18 | 
19 |     with session.get(url, headers=headers, timeout=10) as rep_html:
20 |         if rep_html.status_code != 200:
21 |             return ERROR
22 |         vid = re.findall(r",vid: '(\d+)'", rep_html.text)
23 |         if not vid:
24 |             return ERROR
25 |         vid = vid[0]
26 | 
27 |     videoInfo_params = {
28 |         "vid": vid,
29 |         "ver": 31,
30 |         "ssl": 1,
31 |         "referer": url
32 |     }
33 |     with session.get(videoInfo_url, params=videoInfo_params, timeout=10) as videoInfo_rep:
34 |         if videoInfo_rep.status_code != 200:
35 |             return ERROR
36 |         videoInfo = videoInfo_rep.json()["data"]
37 |         tvName = videoInfo["tvName"]
38 |         data["title"] = data["videoName"] = tvName
39 | 
40 |         video_path = videoInfo["su"][0]
41 |         key = videoInfo["hc"][0] if videoInfo.get("hc") else videoInfo["ck"][0]
42 |         if not video_path or not key:
43 |             return ERROR
44 | 
45 |     playInfo_params = {
46 |         "new": video_path,
47 |         "num": 1,
48 |         "key": key,
49 |     }
50 |     with session.get(playInfo_url, params=playInfo_params, timeout=10) as playInfo_rep:
51 |         if playInfo_rep.status_code != 200:
52 |             return ERROR
53 |         play_url = playInfo_rep.json()["servers"][0]["url"]
54 |         data["videos"] = [play_url]
55 | 
56 |     return data
57 | 
58 | 
59 | if __name__ == "__main__":
60 |     url = input("url: ")
61 |     print(get(url))
62 | 
--------------------------------------------------------------------------------
/extractor/ted.py:
--------------------------------------------------------------------------------
 1 | # https://www.ted.com/talks/*
 2 | import re
 3 | 
 4 | import requests
 5 | 
 6 | 
 7 | def get(url: str) -> dict:
 8 |     """
 9 |     title、videoName、videos
10 |     """
11 |     data = {}
12 |     headers = {
13 |         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
14 |     }
15 |     rep = requests.get(url, headers=headers, timeout=10)
16 |     if rep.status_code == 200:
17 |         text = rep.text
18 |         try:
19 |             title = re.findall(r'<title>(.*?)</title>', text)[0]  # reconstructed pattern
20 |             mp4 = re.findall(r'"(https://download\.ted\.com.*?mp4\?apikey=.*?)"', text)[-1]
21 |             data["title"] = data["videoName"] = title
22 |             data["videos"] = [mp4]
23 |         except IndexError as e:
24 |             data["msg"] = "获取失败:" + str(e)
25 |     else:
26 |         data["msg"] = "获取失败"
27 | 
28 |     return data
29 | 
30 | 
31 | if __name__ == "__main__":
32 |     url = "https://www.ted.com/talks/bill_gates_how_we_must_respond_to_the_coronavirus_pandemic"
33 |     print(get(url))
34 | 
--------------------------------------------------------------------------------
/extractor/tuchong.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import requests
 4 | 
 5 | 
 6 | def get(url: str) -> dict:
 7 |     """
 8 |     title、imgs
 9 |     """
10 |     data = {}
11 |     headers = {
12 |         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
13 |     }
14 |     rep = requests.get(url, headers=headers, timeout=6)
15 |     if rep.status_code == 200:
16 |         title = re.findall(r'<title>(.*?)</title>', rep.text)  # reconstructed pattern
17 |         if title:
18 |             data["title"] = title[0]
19 |         data["imgs"] = re.findall(r'photo-image" src="(.*?)"', rep.text)
20 |     else:
21 |         data["msg"] = "获取失败"
22 |     return data
23 | 
24 | 
25 | if __name__ == "__main__":
26 |     from pprint import pprint
27 |     pprint(get(input("url: ")))
28 | 
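# Aside: both pic58.py and tuchong.py above return image URLs that some CDNs
# only serve when a Referer header is sent (pic58's msg says as much). A
# hedged download sketch, assuming the page URL works as the referer:

import requests

def download_with_referer(img_url: str, referer: str, path: str) -> None:
    """Stream an image to disk, sending the page URL as Referer."""
    headers = {"Referer": referer, "User-Agent": "Mozilla/5.0"}
    with requests.get(img_url, headers=headers, stream=True, timeout=10) as rep:
        rep.raise_for_status()
        with open(path, "wb") as f:
            for chunk in rep.iter_content(chunk_size=8192):
                f.write(chunk)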
--------------------------------------------------------------------------------
/extractor/tudou.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import time
 3 | 
 4 | import requests
 5 | 
 6 | 
 7 | def get(url: str) -> dict:
 8 |     """
 9 |     :param url: 视频链接,免费电视剧单集
10 | 
11 |     :return title: 视频名
12 |     :return videoName: 同title
13 |     :return videos: 视频链接,多个片段。最后一个是视频流地址(m3u8)
14 |     """
15 |     data = {}
16 |     headers_html = {
17 |         "referer": url,
18 |         "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
19 |         "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
20 |     }
21 |     headers_info = {
22 |         "referer": url,
23 |         "accept": "application/json",
24 |         "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
25 |     }
26 |     host = "https://ups.youku.com/ups/get.json"
27 | 
28 |     rep = requests.get(url, headers=headers_html, timeout=10)
29 |     if rep.status_code != 200:
30 |         return {"msg": "访问链接失败"}
31 |     vid = re.findall(r'"vid":"(\d+)"', rep.text)
32 |     if not vid:
33 |         return {"msg": "获取视频id失败"}
34 |     vid = vid[0]
35 | 
36 |     params = {
37 |         # 调试了半天,才发现ckey可以通用,但是暂时不知道过期时间
38 |         "ckey": "122#wppJ/JoGEExRyDpZy4pjEJponDJE7SNEEP7ZpJRBuDPpJFQLpCGwoHZDpJEL7SwBEyGZpJLlu4Ep+FQLpoGUEELWn4yE7SNEEP7ZpERBuDPE+BQPpC76EJponDJLKMQEImb2XDnTtByWAfaPwr8S14Rqur0Nj1sih8TwWMzZF+NtTPnZULbEnh9G8WlODWp1uOjeDLVr8PG6+4EEyFfDqM3bDEpxngR4ul5EDOgPm4AiJDbEfC3mqM3WE8pangL4ul0EDLVr8CpU+4EEyFfDqMfbDEpxnSp4uOIEELXZ8oL6JwTEyF3F7S32EJpadSxwuAuRiRFmYFRiZDPACVgIudh3VaGrVnUkqUbD72siAEVR1Qr4OWZjlGSrnzPs2rh4OY+Z6EbOEBJ8OnDsYwNsTdEhishHohd6L2J+K8z7LZpSitQjj8hrDOAV/ttFwMbpN7KrcdwvCJ7TbxjR5Q0rJaMPlfUv9IYPLIY9KNNy24RBro4psistlkgxw4vO3WXa4M00NlsAH1XADAp8l3+COupmS7LbhxHS2BKVRDZkDyD+xnYIaRahNuJDv7pLt830IQHgDvnq1gJBE75mVDgemdAGyc4ruFk4++Ar9T6gZbfiuacVvtDgzBcEo0r6bi+rvYQuaMy=",
39 |         "utid": "otL9FkVfwnwCASv6yQTaubZ5",  # expires at: 2030-03-25T07:32:14.712Z
40 |         "vid": vid,
41 |         "client_ts": int(time.time()),
42 |         "ccode": "050F",
43 |         "client_ip": "192.168.1.1",
44 |     }
45 |     rep = requests.get(host, params=params, headers=headers_info, timeout=10)
46 |     if rep.status_code != 200 or "error" in rep.json()["data"]:
47 |         return {"msg": "获取视频信息失败"}
48 |     info = rep.json()["data"]
49 |     title = info["video"]["title"]
50 |     stream = info["stream"]  # type: list
51 |     # 取最高画质
52 |     best_stream = sorted(stream, key=lambda item: item["width"])[-1]
53 |     videos = [url_item["cdn_url"] for url_item in best_stream["segs"]]
54 |     m3u8_url = best_stream["m3u8_url"]
55 |     videos.append(m3u8_url)
56 | 
57 |     data["title"] = data["videoName"] = title
58 |     data["videos"] = videos
59 | 
60 |     return data
61 | 
62 | 
63 | if __name__ == "__main__":
64 |     from pprint import pprint
65 |     pprint(get(input("url: ")))
66 | 
--------------------------------------------------------------------------------
/extractor/wechat_article_cover.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import requests
 4 | 
 5 | 
 6 | def get(url: str) -> dict:
 7 |     """
 8 |     imgs、text
 9 |     """
10 |     data = {}
11 |     headers = {
12 |         "user-agent":
13 |         "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/ 53\
14 | 6.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
15 |     }
16 |     with requests.get(url, headers=headers, timeout=10) as rep:
17 |         if rep.status_code != 200:
18 |             return {"msg": "错误"}
19 |         img = re.findall(r'<meta property="og:image" content="(.*?)"', rep.text)  # reconstructed pattern
20 |         if img:
21 |             data["imgs"] = [img[0]]
22 |         text = re.findall(r'<meta name="description" content="(.*?)"', rep.text)  # reconstructed pattern
23 |         if text:
24 |             data["text"] = text[0]
25 |     return data
26 | 
27 | 
28 | if __name__ == "__main__":
29 | 
30 |     url = input("url: ")
31 |     print(get(url))
32 | 
--------------------------------------------------------------------------------
/extractor/weibo.py:
--------------------------------------------------------------------------------
/extractor/wechat_article_cover.py:
--------------------------------------------------------------------------------
import re

import requests


def get(url: str) -> dict:
    """
    imgs、text
    """
    data = {}
    headers = {
        "user-agent":
        "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
    }
    with requests.get(url, headers=headers, timeout=10) as rep:
        if rep.status_code != 200:
            return {"msg": "错误"}
        # cover and summary are read from meta tags; adjust if the markup changes
        img = re.findall(r'<meta property="og:image" content="(.*?)"', rep.text)
        if img:
            data["imgs"] = [img[0]]
        text = re.findall(r'<meta name="description" content="(.*?)"', rep.text)
        if text:
            data["text"] = text[0]
    return data


if __name__ == "__main__":

    url = input("url: ")
    print(get(url))
--------------------------------------------------------------------------------
/extractor/weibo.py:
--------------------------------------------------------------------------------
import re

import requests


def get(url: str) -> dict:
    """
    title、videos
    """
    data = {}
    headers = {'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1'}

    title_re = r'"title": "(.*?)",'
    mp4_720p_mp4_re = r'"mp4_720p_mp4": "(.*?)",'
    mp4_hd_mp4_re = r'"mp4_hd_mp4": "(.*?)",'
    mp4_ld_mp4_re = r'"mp4_ld_mp4": "(.*?)"'

    with requests.get(url, headers=headers, timeout=10) as rep:
        if rep.status_code == 200:
            text = rep.text
            title = re.findall(title_re, text)
            mp4_720p_mp4 = re.findall(mp4_720p_mp4_re, text)
            mp4_hd_mp4 = re.findall(mp4_hd_mp4_re, text)
            mp4_ld_mp4 = re.findall(mp4_ld_mp4_re, text)
            if title:
                data["title"] = title[0]
                data["videos"] = mp4_720p_mp4 or mp4_hd_mp4 or mp4_ld_mp4
        else:
            data["msg"] = "获取失败"

    return data


if __name__ == "__main__":
    url = input('url: ')
    print(get(url))
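weibo.py picks a quality tier by chaining the findall results with `or`: re.findall returns an empty list when a pattern has no match, and an empty list is falsy, so the first tier actually present in the page wins. The same mechanism in isolation:

```python
# re.findall returns [] for a pattern with no match, and [] is falsy,
# so `a or b or c` selects the first quality tier that actually matched.
mp4_720p = []                                  # 720p not present in the page
mp4_hd = ["https://example.com/hd.mp4"]
mp4_ld = ["https://example.com/ld.mp4"]

videos = mp4_720p or mp4_hd or mp4_ld
print(videos)  # ['https://example.com/hd.mp4']
```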
--------------------------------------------------------------------------------
/extractor/weishi.py:
--------------------------------------------------------------------------------
import json
from urllib.parse import urlparse, parse_qs

import requests


headers = {
    "accept": "application/json",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "content-type": "application/json",
    "cookie": "pgv_pvi=9657849856; pgv_pvid=2069474799; RK=aHJszqfoXm; ptcz=0fc0035b9509215f060561393c09f6cde3bccc1953e79c2b5b1ec450e4e67f19; LW_uid=s1i5E5d4v2a1p5n702J1O2y0q8; eas_sid=M1k5T5N4O28185X7J291x2K1A3; o_cookie=286183317; pac_uid=1_286183317; ied_qq=o0286183317; LW_sid=x1Y5D6W4h4H516F6X9l9V8S8Z3; tvfe_boss_uuid=fbb4b39b5afeb49b; psrf_qqopenid=A140C50D3D791392EA89131C8B01FE1D; psrf_qqaccess_token=D2F43F3C25900E66193345D276AF9559; psrf_qqrefresh_token=E48409D7E8E4F3D5C3869F104380AB3E; psrf_qqunionid=002C01991CFB436BCD8A27A0EE1DB9FF; qm_keyst=Q_H_L_2ajiOt50eapmue6Eg1-l_W6XztEBr_u0vZJAPs4xctJZNJdsEZONiDnNJ206icA; psrf_musickey_createtime=1574482649; psrf_access_token_expiresAt=1582258649; person_id_bak=5295507715828209; person_id_wsbeacon=5689667751647505; wsreq_logseq=336060008",
    "origin": "https://h5.weishi.qq.com",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1",
    "x-requested-with": "XMLHttpRequest",
}


# link shared while not logged in
# url = "https://h5.weishi.qq.com/weishi/wsplay/challenge?feedid=6YV0vjeP71IHTsV08&challegeid=100026&spid=8039370850869145600&qua=v1_and_weishi_6.5.0_588_312027000_d&chid=127081004&pkg=&attach=cp_reserves3_1190370002"
def _get_not_logged(url: str) -> dict:
    data = {}
    post_url = "https://h5.weishi.qq.com/webapp/json/challenge_feedrank/GetChallengeFeedDetail?t=0.2602280426206063&g_tk="

    query = parse_qs(urlparse(url).query)
    try:
        feedid = query.get('feedid')[0]
        challenge_id = query.get('challegeid')[0]
    except (TypeError, IndexError):
        data["msg"] = "获取失败"
        return data

    payload = {
        "feedid": feedid,
        "challenge_id": challenge_id,
        "type": 0,
    }

    with requests.post(post_url, headers=headers, data=json.dumps(payload), timeout=10) as rep:
        if rep.status_code == 200:
            video_info = rep.json().get("data").get('feedinfos')[0]
            title = video_info.get("feed_desc")
            play_url = video_info.get("video_url")
            data["title"] = title
            data["videos"] = [play_url]
        else:
            data["msg"] = "获取失败"

    return data


# link shared while logged in
# url = "https://h5.weishi.qq.com/weishi/feed/770BSyaon1IQcqdbr/wsfeed?wxplay=1&id=770BSyaon1IQcqdbr&spid=8039370850869145600&qua=v1_and_weishi_6.5.0_588_312027000_d&chid=100081014&pkg=3670&attach=cp_reserves3_1000370011"
def _get_logged(url: str) -> dict:
    data = {}
    post_url = "https://h5.weishi.qq.com/webapp/json/weishi/WSH5GetPlayPage?t=0.16820895093158983&g_tk="

    query = parse_qs(urlparse(url).query)
    try:
        feedid = query.get('id')[0]
    except (TypeError, IndexError):
        data["msg"] = "获取失败"
        return data

    payload = {
        "feedid": feedid,
        "recommendtype": 0,
        "datalvl": "all",
        "_weishi_mapExt": {}
    }

    with requests.post(post_url, headers=headers, data=json.dumps(payload), timeout=10) as rep:
        if rep.status_code == 200:
            video_info = rep.json().get('data').get('feeds')[0]
            title = video_info.get("feed_desc")
            play_url = video_info.get("video_url")
            data["title"] = title
            data["videos"] = [play_url]
        else:
            data["msg"] = "获取失败"

    return data


def get(url: str) -> dict:
    return _get_not_logged(url) if url.startswith("https://h5.weishi.qq.com/weishi/wsplay/challenge") else _get_logged(url)


if __name__ == "__main__":
    print(get(input("url: ")))
--------------------------------------------------------------------------------
/extractor/xiaokaxiu.py:
--------------------------------------------------------------------------------
# @wongxy
import time
from hashlib import md5
from urllib.parse import urlparse, parse_qs

import requests


def get(url: str) -> dict:
    """
    title、videos
    """
    data = {}

    try:
        qs = parse_qs(urlparse(url).query)
        video_id = qs["id"][0]
    except KeyError:
        return {"msg": "无法匹配视频id"}

    timestamp = str(int(time.time()))

    info_url = "https://appapi.xiaokaxiu.com/api/v1/web/share/video/" + video_id + "?time=" + timestamp

    temp = "S14OnTD#Qvdv3L=3vm" + "&time=" + timestamp
    x_sign = md5(temp.encode("utf-8")).hexdigest()
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "x-sign": x_sign,
    }
    rep = requests.get(info_url, headers=headers, timeout=10)
    if rep.status_code == 200 and rep.json()["code"] == 0:
        video_info = rep.json()["data"]
        title = video_info["video"]["title"]
        video_url = video_info["video"]["url"][0]
        data["title"] = title
        data["videos"] = [video_url]
        return data
    return {"msg": "获取失败"}


if __name__ == "__main__":
    url = "https://mobile.xiaokaxiu.com/video?id=6552158363189252096"
    print(get(url))
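The `x-sign` header in xiaokaxiu.py is nothing more than an MD5 hex digest over a fixed salt plus the same `&time=<unix ts>` suffix that is sent as a query parameter. Recomputed in isolation:

```python
# Recompute the x-sign header: md5 over the fixed salt from the extractor
# plus the "&time=<timestamp>" suffix that also appears in the query string.
import time
from hashlib import md5

timestamp = str(int(time.time()))
x_sign = md5(("S14OnTD#Qvdv3L=3vm" + "&time=" + timestamp).encode("utf-8")).hexdigest()
print(timestamp, x_sign)
```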
--------------------------------------------------------------------------------
/extractor/xinpianchang.py:
--------------------------------------------------------------------------------
import re

import requests


def get(url: str) -> dict:
    """
    title、imgs、videos (several qualities available)
    """
    data = {}
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    session = requests.Session()
    rep = session.get(url, headers=headers, timeout=10)
    if rep.status_code != 200:
        return {"msg": "获取失败"}
    try:
        vid = re.findall(r'vid: "(.*?)",', rep.text)[0]
    except IndexError:
        return {"msg": "获取失败"}

    video_info_url = "http://openapi-vtom.vmovier.com/v3/video/{vid}?expand=resource".format(vid=vid)
    rep = session.get(video_info_url, headers=headers, timeout=10)
    if rep.status_code != 200 or rep.json()["status"] != 0:
        return {"msg": "获取失败"}

    video_data = rep.json()["data"]

    title = video_data["video"]["title"]
    cover = video_data["video"]["cover"]
    video_list = video_data["resource"]["progressive"]  # type: list

    # videos = []
    # for item in video_list:
    #     videos.append(item.get("https_url") or item.get("url"))
    video = video_list[0].get("https_url") or video_list[0].get("url")

    data["title"] = title
    data["imgs"] = [cover]
    data["videos"] = [video]

    return data


if __name__ == "__main__":
    # url = "https://www.xinpianchang.com/a10628284"
    url = input("url: ")
    print(get(url))
--------------------------------------------------------------------------------
/extractor/zhihu_video.py:
--------------------------------------------------------------------------------
import re

import requests


def get(url: str) -> dict:
    """
    videos
    """
    data = {}
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}
    video_info_url = "https://lens.zhihu.com/api/v4/videos/{id}"

    videos = []

    with requests.get(url, headers=headers, timeout=10) as rep:
        if rep.status_code == 200:
            ids = re.findall(r'www.zhihu.com/video/(\d{1,})', rep.text)
            ids = list(set(ids))  # drop duplicates
        else:
            data["msg"] = "视频获取失败,可能是这个页面没有视频"
            return data

    if not ids:
        data["msg"] = "视频获取失败,可能是这个页面没有视频"
        return data

    for video_id in ids:
        rep = requests.get(video_info_url.format(id=video_id), headers=headers, timeout=10)
        if rep.status_code == 200:
            playlist = rep.json().get("playlist")
            temp = playlist.get("HD") or playlist.get("SD") or playlist.get("LD")
            if temp:
                play_url = temp.get("play_url")
                videos.append(play_url)
    data["videos"] = [video for video in videos if video]
    return data


if __name__ == "__main__":
    url = input("url: ")
    print(get(url))
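A side note on `list(set(ids))` in zhihu_video.py: it deduplicates, but it does not preserve the order in which the video ids appeared on the page. If page order matters, `dict.fromkeys` is the usual order-preserving alternative:

```python
# set() deduplicates but loses page order; dict.fromkeys keeps first-seen order
# (dicts preserve insertion order on Python 3.7+).
ids = ["111", "222", "111", "333"]
print(list(set(ids)))            # order not guaranteed
print(list(dict.fromkeys(ids)))  # ['111', '222', '333']
```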
--------------------------------------------------------------------------------
/extractor/zuiyou_video.py:
--------------------------------------------------------------------------------
import re

import requests


def get(url: str) -> dict:
    """
    title、videoName、videos
    """
    data = {}
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    # both patterns depend on the share-page markup and may need adjusting
    re_title = r'<title>(.*?)</title>'
    re_video = r'<video src="(.*?)">'

    with requests.get(url, headers=headers, timeout=10) as rep:
        if rep.status_code == 200:
            title = re.findall(re_title, rep.text)
            video = re.findall(re_video, rep.text)
            if title:
                data["title"] = data["videoName"] = title[0]
            if video:
                data["videos"] = video
        else:
            data["msg"] = "失败"

    return data


if __name__ == "__main__":
    url = "https://share.izuiyou.com/detail/147486886?zy_to=applink&to=applink"
    print(get(url))
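Patterns like `re_title` and `re_video` are tied to the share page's markup and fail silently when it changes. A quick way to sanity-check them is to run them against a saved copy of a page (the file name is hypothetical):

```python
# Quick sanity check for markup-dependent patterns against a saved page.
# "page.html" is a hypothetical local copy of a share page.
import re

with open("page.html", encoding="utf-8") as f:
    html = f.read()

for name, pattern in [("title", r"<title>(.*?)</title>"),
                      ("video", r'<video src="(.*?)">')]:
    matches = re.findall(pattern, html)
    print(name, "->", matches[:1] or "no match")
```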
--------------------------------------------------------------------------------
/extractor/zuiyou_voice.py:
--------------------------------------------------------------------------------
import json
from urllib.parse import urlparse

import requests


def get(url: str) -> dict:
    """
    text、audios
    """
    data = {}
    headers = {
        "Connection": "keep-alive",
        "Content-Type": "text/plain;charset=UTF-8",
        "Host": "share.izuiyou.com",
        "Origin": "https://share.izuiyou.com",
        "Referer": url,
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
    }
    post_url = "https://share.izuiyou.com/api/review/share_review"

    path = urlparse(url).path
    temp = path.split("/")
    pid = temp[-2]
    rid = temp[-1]

    payload = {
        "h_av": "3.0",
        "h_dt": 9,
        "h_nt": 9,
        "h_ch": "web_app",
        "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        "pid": f"{pid}",
        "rid": f"{rid}"
    }
    play_host = "http://tbvideo.ixiaochuan.cn/"
    with requests.post(post_url, data=json.dumps(payload), headers=headers, timeout=10) as rep:
        if rep.status_code == 200:
            try:
                audio_info = rep.json().get("data").get("review").get("audio")
                voice_text = audio_info.get("voice_text")
                # uri = audio_info.get("uri")
                org_uri = audio_info.get("org_uri")
                data["text"] = voice_text
                data["audios"] = [play_host + org_uri]
            except (TypeError, AttributeError):
                data["msg"] = "获取失败"
        else:
            data["msg"] = "获取失败"

    return data


if __name__ == "__main__":
    print(get(input("url: ")))
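zuiyou_voice.py reads the post id (`pid`) and review id (`rid`) straight out of the share link's path: they are its last two segments. The same slicing in isolation (the link shape is illustrative):

```python
# pid and rid are the last two path segments of the share link.
from urllib.parse import urlparse

url = "https://share.izuiyou.com/voice/123456/7890123"  # hypothetical share link
segments = urlparse(url).path.split("/")
pid, rid = segments[-2], segments[-1]
print(pid, rid)  # 123456 7890123
```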
--------------------------------------------------------------------------------
/misc.py:
--------------------------------------------------------------------------------
from prettytable import PrettyTable

logo = r"""
 _______  _______ _________ ______    _______  _______
(  ____ \(  ____ )\__   __/(  __  \ (  ____ \(  ____ )
| (    \/| (    )|   ) (   | (  \  )| (    \/| (    )|
| (_____ | (____)|   | |   | |   ) || (__    | (____)|
(_____  )|  _____)   | |   | |   | ||  __)   |     __)
      ) || (         | |   | |   ) || (      | (\ (
/\____) || )      ___) (___| (__/  )| (____/\| ) \ \__
\_______)|/       \_______/(______/ (_______/|/   \__/"""


def printTips():
    platforms = [
        ["哔哩哔哩", "封面、视频"],
        ["唱鸭", "音频"],
        ["抖音", "无水印视频"],
        ["酷狗", "音频"],
        ["酷我", "音频"],
        ["荔枝FM", "音频"],
        ["网易云音乐", "音频、mv、视频"],
        ["QQ音乐", "音频"],
        ["皮皮搞笑", "无水印视频"],
        ["全民K歌", "音频&视频"],
        ["微博", "视频"],
        ["微视", "无水印视频"],
        ["知乎", "视频"],
        ["最右", "音频(语音帖评论)"],
        ["千千音乐", "音频"],
        ["5sing", "音频"],
        ["皮皮虾", "无水印视频"],
        ["轻视频", "无水印视频"],
        ["趣头条", "视频"],
        ["酷6网", "视频"],
        ["乐乎", "视频"],
        ["网易公开课", "视频(免费)"],
        ["新片场", "视频"],
        ["百度贴吧", "视频"],
        ["快手", "无水印视频、长图视频"],
        ["AcFun弹幕网", "视频"],
        ["百度好看视频", "视频"],
        ["梨视频", "视频"],
        ["小咖秀", "无水印视频"],
        ["搜狐视频", "视频"],
        ["土豆视频", "视频(免费电视剧等)"],
        ["TED", "视频"],
        ["图虫", "图片"],
        ["其他", "。。。"]
    ]
    table = PrettyTable(["支持平台", "支持内容"])
    for platform in platforms:
        table.add_row(platform)
    print(logo)
    print("""
╭━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╮
│ @wongxy \033[36;4mhttps://github.com/xiyaowong\033[0m │
╰━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╯""")
    print("爬取并下载部分资源")
    print(table)


if __name__ == "__main__":
    printTips()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
# extractor
requests
pycryptodome

# extract
prettytable
click

# web
flask
flask-cors
python-dotenv
--------------------------------------------------------------------------------
/screenshot/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiyaowong/spiders/304f1d32d07b6c42feb8ddbcb83dac90558be503/screenshot/example.gif
--------------------------------------------------------------------------------
/screenshot/run.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiyaowong/spiders/304f1d32d07b6c42feb8ddbcb83dac90558be503/screenshot/run.gif
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import os
import re
import time
from datetime import datetime
from functools import wraps

import click
import requests


def remove_file(path):
    if os.path.isfile(path):
        os.remove(path)


def filter_name(name):
    """
    Sanitize a file name.
    """
    regexp = re.compile(r'(/|\\|:|\?|\*|\||"|\'|<|>|\$)')
    space = re.compile(r'\s{2,}')
    return space.sub(" ", regexp.sub("", name))


def check_dir(path):
    """
    Check whether the directory exists: return True if it does,
    otherwise create it and return False.
    """
    if not os.path.exists(path):
        os.makedirs(path)
        return False
    return True


def retry(n=3, delay=0.5):
    def deco(func):
        @wraps(func)
        def wrapper(*a, **kw):
            count = 1
            while True:
                try:
                    return func(*a, **kw)
                except Exception as e:
                    if count == n + 1:
                        break
                    print('[{}]运行错误,{}s后进行第{}次重试 Err: {}'.format(func.__name__, delay, count, e))
                    count += 1
                    time.sleep(delay)
            print('重试结束,[{}]运行失败'.format(func.__name__))
            return False
        return wrapper
    return deco


def download(file_url, file_name=None, file_type=None, save_path="download", headers=None, timeout=15):
    """
    :param file_url: resource URL to download
    :param file_name: file name to save as, defaults to the current datetime
    :param file_type: file type (extension)
    :param save_path: save directory, "download" by default, no trailing "/"
    :param headers: HTTP request headers, defaults to an iPhone UA
    :param timeout: request timeout in seconds
    """
    if file_name is None:
        file_name = str(datetime.now())
    file_name = filter_name(file_name)

    if file_type is None:
        if "." in file_url:
            file_type = file_url.split(".")[-1]
        else:
            file_type = "unknown"

    check_dir(save_path)

    file_name = file_name + "." + file_type

    if headers is None:
        headers = {
            "User-Agent":
            "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1"
        }

    # download notices
    if os.path.exists(f"{save_path}/{file_name}"):
        print(f'\033[33m{file_name}已存在,不再下载!\033[0m')
        return True
    print(f"Downloading {file_name}")
    try:
        with requests.get(file_url, headers=headers, stream=True, timeout=timeout) as rep:
            if rep.status_code != 200:
                print("\033[31m下载失败\033[0m")
                return False
            # Content-Length may be absent; fall back to 0 so the bar still renders
            file_size = int(rep.headers.get('Content-Length', 0))
            label = '{:.2f}MB'.format(file_size / (1024 * 1024))
            with click.progressbar(length=file_size, label=label) as progressbar:
                with open(f"{save_path}/{file_name}", "wb") as f:
                    for chunk in rep.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
                            progressbar.update(1024)
        print(f"\033[32m{file_name}下载成功\033[0m")
    except Exception as e:
        print('下载失败: ', e)
        remove_file(f"{save_path}/{file_name}")
        return False
    return True
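A small usage sketch for the `retry` decorator defined above: the wrapped callable is re-invoked up to `n` times, and `False` comes back once every attempt has failed (assumes the repository root is on `sys.path`):

```python
from utils import retry  # assumes the repository root is on sys.path

@retry(n=2, delay=0.1)
def flaky(counter=[0]):  # mutable default used as a simple call counter
    counter[0] += 1
    if counter[0] < 2:
        raise RuntimeError("transient failure")
    return "ok"

print(flaky())  # first call raises, the retry succeeds -> "ok"
```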
--------------------------------------------------------------------------------
/web/README.md:
--------------------------------------------------------------------------------
#### A minimal HTTP API for the extractors



```shell
pip install gunicorn
```

```shell
gunicorn app:app

or

python app.py
```

Put the link after `/extract?url=`; submitting the `url` parameter via POST also works.
--------------------------------------------------------------------------------
/web/__init__.py:
--------------------------------------------------------------------------------
from ._response import response
--------------------------------------------------------------------------------
/web/_response.py:
--------------------------------------------------------------------------------
from flask import jsonify
from werkzeug.http import HTTP_STATUS_CODES


def response(code=200, data=None, error=None, msg=None):
    """
    :param code: status code
    :param data: response data
    :param error: error details
    :param msg: human-readable message
    """

    if code is not None and code >= 400:
        error = HTTP_STATUS_CODES.get(code, "unknown error")

    payload = {
        "code": code,
        "data": data,
        "err": error,
        "message": msg or HTTP_STATUS_CODES.get(code, "unknown status"),
    }
    _response = jsonify(payload)
    _response.status_code = code
    return _response
--------------------------------------------------------------------------------
/web/app.py:
--------------------------------------------------------------------------------
import os
import sys
# assumes the app is started from inside web/, so this puts the repository root on sys.path
sys.path.append(os.path.dirname(os.getcwd()))

from flask import Flask
from flask_cors import CORS

from web import config, error, views, log


def create_app() -> Flask:
    app = Flask(__name__)
    app.config.from_object(config)
    CORS(app)

    views.init_app(app)
    error.init_app(app)
    log.init_app(app)

    if app.config["ENV"] == "development":
        print(app.url_map)

    return app


app = create_app()

if __name__ == "__main__":
    app.run()
--------------------------------------------------------------------------------
/web/config.py:
--------------------------------------------------------------------------------
import os

from dotenv import load_dotenv

load_dotenv()


ENV = os.getenv("FLASK_ENV") or "production"
SECRET_KEY = os.getenv("SECRET_KEY") or "wongxy"
# environment variables are strings, so "False" must not be treated as truthy
DEBUG = os.getenv("DEBUG", "False").lower() in ("1", "true", "yes")
--------------------------------------------------------------------------------
/web/error.py:
--------------------------------------------------------------------------------
# pylint: disable=unused-argument
from flask import Flask

from web import response


def init_app(app: Flask):
    @app.errorhandler(400)
    def _error_400(e):
        return response(400)

    @app.errorhandler(500)
    def _error_500(e):
        return response(500)

    @app.errorhandler(404)
    def _error_404(e):
        return response(404)

    @app.errorhandler(405)
    def _error_405(e):
        return response(405)
--------------------------------------------------------------------------------
/web/example.env:
--------------------------------------------------------------------------------
# FLASK_ENV=development
FLASK_ENV=production
DEBUG=False
SECRET_KEY="a string you never guess"
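With the app running as described in web/README.md, the endpoint can be exercised with `requests`; a sketch assuming the default Flask development address:

```python
# Exercise the running API (assumes the default Flask dev address).
import requests

rep = requests.get(
    "http://127.0.0.1:5000/extract/",
    params={"url": "https://www.ted.com/talks/bill_gates_how_we_must_respond_to_the_coronavirus_pandemic"},
    timeout=10,
)
print(rep.json())  # {"code": ..., "data": {...}, "err": ..., "message": ...}
```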
--------------------------------------------------------------------------------
/web/funcs.py:
--------------------------------------------------------------------------------
import re

from flask import current_app

from extractor import (acfun, baidutieba, bilibili, changya, douyin, haokan,
                       ku6, kuaishou, kugou, kuwo, lizhiFM, lofter, migu_music,
                       momo, music163, open163, pearvideo, pic58, pipigaoxiao,
                       pipix, qianqian, qingshipin, qqmusic, quanminkge,
                       qutoutiao, sing5, sohuTV, ted, tuchong, tudou, weibo,
                       weishi, xiaokaxiu, xinpianchang, zhihu_video,
                       zuiyou_voice)

from web import response


crawlers = {
    'acfun': acfun,
    'tieba': baidutieba,
    'bili': bilibili,
    'changya': changya,
    'douyin': douyin,
    'haokan': haokan,
    'ku6': ku6,
    'chenzhongtech': kuaishou,
    'kuaishou': kuaishou,
    'kugou': kugou,
    'kuwo': kuwo,
    'lizhi': lizhiFM,
    'lofter': lofter,
    'music.163': music163,
    'open.163': open163,
    'pearvideo': pearvideo,
    'ippzone': pipigaoxiao,
    'pipix': pipix,
    'music.taihe': qianqian,
    'qingshipin': qingshipin,
    'y.qq': qqmusic,
    'kg': quanminkge,
    'qutoutiao': qutoutiao,
    '5sing': sing5,
    'weibo': weibo,
    'weishi': weishi,
    'xiaokaxiu': xiaokaxiu,
    'xinpianchang': xinpianchang,
    'zhihu': zhihu_video,
    'zuiyou': zuiyou_voice,
    'sohu': sohuTV,
    'ted': ted,
    'tudou': tudou,
    'momo': momo,
    'music.migu': migu_music,
    '58pic': pic58,
    'tuchong': tuchong
}


def extract(url: str):  # pylint: disable=too-many-statements
    try:
        url = re.findall(
            r"https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\.[-A-Za-z]+[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url)
        if not url:
            return response(404, msg="无法匹配链接")
        url = url[0]

        data = None
        for c_name, c_func in crawlers.items():
            if c_name in url:
                data = c_func.get(url)  # type: dict
                break
        if data is not None:
            # drop keys whose values are empty
            for key, value in data.copy().items():
                if not value:
                    data.pop(key)
            return response(data=data, msg=data.get("msg"))
        else:
            return response(404, msg="不支持的链接")
    except Exception as e:
        current_app.logger.error(e)
        current_app.logger.exception(e)
        return response(500, error=str(e), msg="服务器错误")
--------------------------------------------------------------------------------
/web/log.py:
--------------------------------------------------------------------------------
import logging
import os
from logging.handlers import RotatingFileHandler

from flask import Flask
from flask.logging import default_handler

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

LOG_PATH = os.path.join(BASE_DIR, 'logs')

if not os.path.exists(LOG_PATH):
    os.makedirs(LOG_PATH)

LOG_PATH_ALL = os.path.join(LOG_PATH, 'all.log')

LOG_FILE_MAX_BYTES = 10 * 1024 * 1024
LOG_FILE_BACKUP_COUNT = 10


def init_app(app: Flask):
    app.logger.removeHandler(default_handler)

    formatter = logging.Formatter(
        "%(asctime)s [%(levelname)s] [%(filename)s] %(message)s"
    )

    file_handler = RotatingFileHandler(
        filename=LOG_PATH_ALL,
        mode='a',
        maxBytes=LOG_FILE_MAX_BYTES,
        backupCount=LOG_FILE_BACKUP_COUNT,
        encoding='utf-8'
    )

    file_handler.setFormatter(formatter)
    file_handler.setLevel(logging.WARNING)

    for logger in (app.logger,
                   logging.getLogger('werkzeug')):
        logger.addHandler(file_handler)
--------------------------------------------------------------------------------
/web/views.py:
--------------------------------------------------------------------------------
from flask import Flask, request

from web import funcs, response


def home():
    data = ":)"
    return response(data=data)


def extract():
    if "url" not in request.values:
        return response(400, msg="Missing parameter.")
    url = request.values["url"]
    return funcs.extract(url)


def init_app(app: Flask):
    app.add_url_rule("/", "home", home)
    app.add_url_rule("/extract/", "extract", extract)
--------------------------------------------------------------------------------
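The routes can also be exercised in-process, without starting a server, through Flask's built-in test client; a sketch assuming it is run from the web/ directory, like app.py itself:

```python
# Exercise the routes in-process with Flask's test client
# (run from the web/ directory, like app.py itself).
from app import app

client = app.test_client()
print(client.get("/").get_json())  # {"code": 200, "data": ":)", "err": None, "message": "OK"}
```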