├── .gitignore ├── LICENSE ├── README.md ├── acfun └── acfun.py ├── bilibili ├── bilibili.py └── config.yaml ├── ibizhi ├── config.yaml └── ibizhi.py ├── instgram ├── config.yaml └── instgram.py ├── karaoke └── karaoke.py ├── requirements.txt ├── twitter ├── config.yaml └── twitter.py └── weibo ├── config.yaml └── weibo.py /.gitignore: -------------------------------------------------------------------------------- 1 | # file to ignore 2 | *.yaml 3 | 4 | # folder to ignore 5 | */data 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Thekips 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Media Crawler 2 | 3 | 一些常用社交网站的媒体资源(视频,图片)的爬取脚本 4 | 5 | ### Support 6 | 7 | | 英文名 | 中文名 | 爬取范围 | 8 | | --------- | -------- | ------------------------------------------------------ | 9 | | acfun | A站 | 支持播放页URL的单视频爬取 | 10 | | bilibilii | 哔哩哔哩 | 支持单个账号的所有视频爬取,支持单个bv号的所有视频爬取 | 11 | | instgram | 图享 | 支持单个账号的所有图片和视频爬取 | 12 | | karaoke | 全民K歌 | 支持单个账号的所有歌曲爬取 | 13 | | twitter | 推特 | 支持单个账号的所有图片和视频爬取 | 14 | | weibo | 新浪微博 | 支持单个账号的所有图片和视频爬取 | 15 | 16 | ### Usage 17 | 18 | - **Python依赖:** 19 | 20 | ```shell 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | - **其他依赖:** 25 | 26 | 定位到想要爬取的网站对应的文件夹,填写该文件夹下的`config.yaml`中列举的所有字段。 27 | 28 | | 英文名 | 字段 | 额外依赖 | 29 | | --------- | ----------------------------------- | ------------------------ | 30 | | acfun | | ffmpeg(添加到环境变量) | 31 | | bilibilii | cookie | ffmpeg(添加到环境变量) | 32 | | instgram | cookie | | 33 | | karaoke | | | 34 | | twitter | cookie, x-csrf-token, authorization | | 35 | | weibo | cookie | | 36 | 37 | 提示:打开浏览器的开发者工具,在网络面板里查找与目标网站通信时的包记录,多找几个,可能在`Request Headers`中找到这些字段;填写字段的时候,填至冒号的一个空格后即可,不需要加双引号,如: 38 | 39 | ```yaml 40 | cookie: you_cookie_str_without_quotation_mark 41 | ``` 42 | 43 | - 执行程序,根据提示输入想要爬取的账号ID或URL即可。 44 | 45 | ```shell 46 | # 将*替换成网站对应的英文名 47 | python *.py 48 | ``` 49 | 50 | 51 | -------------------------------------------------------------------------------- /acfun/acfun.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import json 4 | 5 | headers = { 6 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36", 7 | } 8 | 9 | 10 | def mkDir(path): 11 | if not os.path.exists(path): 12 | try: 13 | os.makedirs(path) 14 | except Exception as e: 15 | print(e) 16 | 17 | 18 
class Acfun:
    """Downloader for a single AcFun video given its play-page URL.

    The page embeds a ``ksPlayJson`` object containing the m3u8 stream
    address; the stream is saved to mp4 via ffmpeg (must be on PATH).
    """

    def __init__(self, mid="") -> None:
        # TODO(thekips): Add Episode Download.
        self.mid = mid

    def getJson(self, data: str):
        """Extract and parse the ``ksPlayJson`` object embedded in page HTML.

        Brace-matches forward from the marker to locate the end of the
        (string-escaped) JSON object, unescapes it, and returns the parsed
        dict.
        """
        marker = data.find("ksPlayJson")
        begin = data.find("{", marker)

        # Walk the text keeping a brace depth until the object is balanced.
        depth = 1
        pos = begin
        while depth:
            left = data.find("{", pos + 1)
            right = data.find("}", pos + 1)
            # Fix: `find` returns -1 when no further "{" exists; the old
            # comparison (-1 < right) mis-counted that as another opening
            # brace. Only descend when a real "{" precedes the next "}".
            if left != -1 and left < right:
                pos = left
                depth += 1
            else:
                pos = right
                depth -= 1
        end = pos  # index of the matching closing brace

        videoInfo = data[begin : end + 1]
        # The object is embedded as an escaped JS string literal; collapse
        # the doubled backslashes so json.loads sees plain JSON.
        # NOTE(review): eval() on page-derived text is risky in general;
        # kept because it only re-evaluates the repr of this same slice.
        videoInfo = eval(repr(videoInfo).replace("\\\\", "\\"))
        return json.loads(videoInfo)

    def dlVideo(self, url, path="data/"):
        """Download the video behind play-page `url` into `path`.

        Fetches the page, pulls the first representation's m3u8 address and
        hands it to ffmpeg; videos already on disk are skipped.
        """
        mkDir(path)
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            videoInfo = self.getJson(response.text)
            m3u8 = videoInfo["adaptationSet"][0]["representation"][0]["url"]
            videoId = videoInfo["videoId"]
            path_to_file = path + videoId + ".mp4"

            if os.path.exists(path_to_file):
                print("Video [%s] has been download" % videoId)
                return
            # Quote the output path as well, in case `path` contains spaces.
            cmd = 'ffmpeg -loglevel warning -i "%s" -codec copy "%s"' % (m3u8, path_to_file)
            print('Downloading...')
            os.system(cmd)
        else:
            print(response.status_code)


if __name__ == "__main__":
    acfun = Acfun()
    url = input("Please input url of the video:")
    acfun.dlVideo(url)
INFO_URL = 'https://api.bilibili.com/x/space/acc/info'
MEDIA_URL = 'https://api.bilibili.com/x/space/arc/search'
CID_URL = 'https://api.bilibili.com/x/player/pagelist'
PLAYER_URL = 'https://api.bilibili.com/x/player/playurl'

def mkDir(path):
    """Create `path` (and parents) if it does not exist yet."""
    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except Exception as e:
            print(e)

def enablePath(path):
    '''
    Replace characters that are illegal in file names
    (/ \ : * ? " < > | and whitespace) with an underscore.
    '''
    pattern = r"[\/\\\:\*\?\"\<\>\|\s]"
    return re.sub(pattern, "_", path)

class Bilibili():
    """Crawler for bilibili: downloads all episodes of one video (bvid) or
    every upload of one user (mid), merging DASH video+audio with ffmpeg."""

    def __init__(self, mid='') -> None:
        with open('config.yaml', 'r', encoding='utf-8') as f:
            self.headers = yaml.safe_load(f)
        self.mid = mid
        self.session = requests.session()

    def __getName(self):
        """Return the user's display name, or 'anyone' when no mid is set."""
        if self.mid == '':
            return 'anyone'
        params = (
            ('mid', self.mid),
        )
        res = requests.get(INFO_URL, params=params, headers=self.headers)
        res = json.loads(res.text)

        return res['data']['name']

    def __getMediaID(self):
        """Fetch page `self.index` of the user's uploads.

        Returns a pair `(bvinfo, tidinfo)`: bvinfo is a zip of
        (bvid, created-timestamp) or False when the page is empty;
        tidinfo is currently always False (music crawling not implemented).
        """
        params = (
            ('mid', self.mid),
            ('pn', self.index),
            ('order', 'pubdate'),
        )
        res = requests.get(MEDIA_URL, params=params, headers=self.headers)
        res = json.loads(res.text)

        bvids = jsonpath(res, expr='$.data.list.vlist.[*].bvid')
        bv_times = jsonpath(res, expr='$.data.list.vlist.[*].created')

        return zip(bvids, bv_times) if bvids else False, False

    def __getEpisode(self, bvid):
        """Return a zip of (cid, part-name) for every episode under `bvid`."""
        params = (
            ('bvid', bvid),
        )
        res = requests.get(CID_URL, params=params)
        res = json.loads(res.text)

        cids = jsonpath(res, expr='$.data.[*].cid')
        names = jsonpath(res, expr='$.data.[*].part')

        return zip(cids, names)

    def __getVideoUrl(self, bvid, cid):
        """Return (video_url, audio_url) of the best-quality DASH streams."""
        params = (
            ('bvid', bvid),
            ('cid', cid),
            ('fourk', '1'),
            ('fnval', '16'),
        )

        res = requests.get(PLAYER_URL, headers=self.headers, params=params)
        res = json.loads(res.text)

        # First entry of each dash list is the best quality.
        video_url = jsonpath(res, expr='$.data.dash.video.[*]')[0]['baseUrl']
        audio_url = jsonpath(res, expr='$.data.dash.audio.[*]')[0]['baseUrl']

        return video_url, audio_url

    def __dlMedia(self, url, path_to_file, name, chunk=1024*1024):
        """Download `url` to `path_to_file` with chunked ranged GETs,
        showing a tqdm progress bar labelled `name`."""
        headers = {
            'user-agent': 'Safari/537.36',
            'referer': 'https://www.bilibili.com/',
            'range': 'bytes=0-1',
        }
        # Option request lets the server prepare the resource.
        session = self.session
        session.options(url=url, headers=headers, verify=False)

        # Probe the total size via Content-Range, then walk chunk windows.
        res = session.get(url=url, headers=headers)
        total = int(res.headers['Content-Range'].split('/')[1])  # renamed from `range`: don't shadow the builtin
        l_range, r_range = 0, min(chunk - 1, total - 1)
        bar = tqdm(total=total, unit_divisor=1024, unit='B', unit_scale=True, desc=name, ascii=' #')

        with open(path_to_file, 'wb') as f:
            while True:
                headers.update({'Range': 'bytes=%d-%d' % (l_range, r_range)})

                res = session.get(url=url, headers=headers)
                f.write(res.content)
                bar.update(r_range - l_range + 1)

                if r_range + 1 != total:
                    l_range = r_range + 1
                    r_range = min(r_range + chunk, total - 1)
                else:
                    break

        bar.close()

    def dlVideo(self, bvid, path='data/anyone/', bvtime=None):
        '''
        Download all episodes under a bvid into `path`, stamping the files
        with `bvtime` (publish time; defaults to "now").
        '''
        # Fix: the old default `bvtime=time.time()` was evaluated once at
        # definition time; resolve it per call instead.
        if bvtime is None:
            bvtime = time.time()
        mkDir(path)

        cidinfo = self.__getEpisode(bvid)
        for cid, cname in cidinfo:

            # Go to the next episode if this one already exists.
            cname = enablePath(cname)
            output_path = '%s%d_%s.mp4' % (path, cid, cname)
            if os.path.exists(output_path):
                os.utime(output_path, (bvtime, bvtime))
                print('%s have downloaded.' % output_path)
                continue

            # Download video and audio streams of the episode.
            video_path = '%s%d_%s.video' % (path, cid, cname)
            audio_path = '%s%d_%s.audio' % (path, cid, cname)
            video_url, audio_url = self.__getVideoUrl(bvid, cid)
            self.__dlMedia(video_url, video_path, video_path[len(path) + 1 : ])
            self.__dlMedia(audio_url, audio_path, audio_path[len(path) + 1 : ])

            # Combine video and audio into the final mp4 file.
            cmd = 'ffmpeg -i %s -i %s -y -c copy %s' % (video_path, audio_path, output_path)
            ret = subprocess.call(cmd, shell=True)
            if ret != 0:
                print(' ERROR WHEN COMBINE, WITH CODE %d' % ret)

            os.utime(output_path, (bvtime, bvtime))
            os.remove(video_path)
            os.remove(audio_path)

    def scrawlMedia(self, path=''):
        """Download every upload of `self.mid` into data/<mid>_<name>/."""
        if self.mid == '':
            self.mid = input('Please input bilibili mid of the user:')

        if path == '':
            # Fix: enablePath returns the sanitized string; the old code
            # called it and discarded the result. Sanitize the directory
            # name only, then append the path separator.
            path = enablePath(self.mid + '_' + self.__getName()) + '/'
        print(path)

        self.index = 0
        while True:
            self.index += 1
            bvinfo, tidinfo = self.__getMediaID()
            if not bvinfo and not tidinfo:
                print('No Video Info, EXIT.')
                break

            if bvinfo:
                for bvid, bvtime in bvinfo:
                    self.dlVideo(bvid, 'data/' + path, bvtime)

            if tidinfo:
                pass

if __name__ == '__main__':
    bilibili = Bilibili()

    while True:
        option = input('[1] Download one video.\n[2] Download all video from a user.\n')

        if option == '1':
            bvid = input('Please input video BV number:')
            bilibili.dlVideo(bvid)
            os.system('pause')
            break
        elif option == '2':
            start_time = time.time()
            bilibili.scrawlMedia()
            end_time = time.time()
            print("All finished in %ds!" % (end_time-start_time))
            break
        else:
            print('Please input a number...')
% (end_time-start_time)) 208 | break 209 | else: 210 | print('Please input a number...') 211 | -------------------------------------------------------------------------------- /bilibili/config.yaml: -------------------------------------------------------------------------------- 1 | user-agent: 2 | cookie: -------------------------------------------------------------------------------- /ibizhi/config.yaml: -------------------------------------------------------------------------------- 1 | token: -------------------------------------------------------------------------------- /ibizhi/ibizhi.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | import os 4 | import re 5 | import time 6 | from concurrent.futures import ThreadPoolExecutor 7 | 8 | import js2py 9 | import requests 10 | from pyexiv2 import Image 11 | 12 | # Init for decode 13 | URL_CLIENT = 'https://client.ibzhi.com/http/client' 14 | CryptoJS = js2py.require('crypto-js') 15 | CLASS = ['wallpaper/get_wallpaper_list', 'wallpaper/get_list_by_classify'] 16 | FUNC = [ 17 | lambda page:'{"page":%s,"size":30,"v":3}' % str(page), 18 | lambda page, c_id:'{"page":%s,"size":30,"v":3,"classify_ids":"%s"}' % (str(page), c_id), 19 | ] 20 | EXIF = {"0th": {}, "Exif": {}, "GPS": {}, "1st": {}, "thumbnail": None} 21 | 22 | # Methods. 
# Methods.
def mkDir(path):
    """Create `path` (and parents) if it does not exist yet."""
    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except Exception as e:
            print(e)


def decode_b64_aes(enc):
    """Decode an API payload: base64 -> AES (via crypto-js) -> JSON dict."""
    key = 'aes'
    enc = base64.b64decode(enc).decode('utf-8')
    dec = CryptoJS.AES.decrypt(enc, key).toString(CryptoJS.enc.Utf8)
    dec = dec.encode('latin1').decode('utf-8')
    return json.loads(dec)


def write_pic(content, tag, img_path):
    """Save the downloaded image bytes and embed the tags as keywords."""
    # 将下载的二进制数据创建为图像
    with open(img_path, 'wb') as f:
        f.write(content)

    # Fix: the old pattern r'[;,\\s]+' matched a literal backslash or the
    # letter "s" instead of whitespace; r'[;,\s]+' splits on semicolons,
    # commas and whitespace as intended.
    tag = re.split(r'[;,\s]+', tag)
    tag.append(c_name[c_index])

    # 添加关键字标记到 EXIF 数据中 (field differs by image type)
    img = Image(img_path, encoding='gbk')
    img_type = img.get_mime_type()
    if img_type == 'image/jpeg':
        img.modify_iptc({'Iptc.Application2.Keywords': tag})
    elif img_type == 'image/png':
        img.modify_xmp({'Xmp.dc.subject': tag})
    img.close()

def dl_pic(img_url, tag, c_time, retries=3):
    """Download one wallpaper into PATH/<year-month>/ and tag it.

    Retries at most `retries` times on write failure — the old code
    retried forever via unbounded recursion.
    """
    img_name = re.findall(r'(?<=wallpaper/).*', img_url)[0]
    path = os.path.join(PATH, c_time)
    img_path = os.path.join(path, img_name)

    if os.path.exists(img_path):
        print(f'Exist {img_url} -> {path}')
        return

    for _ in range(retries):
        resp = requests.get(img_url)
        if resp.status_code != 200:
            print(f'Error {img_url} -> {path}')
            return
        mkDir(path)
        try:
            print(f'Get {img_url} -> {path}')
            write_pic(resp.content, tag, img_path)
            return
        except Exception as e:
            print(e)
            print('Try to download the picture again...')

# Request params
def get_params(page, f_index, c_id=None):
    """Build the client-API query dict for the given page / classify id."""
    if page is None: page = 1e16

    params = {
        "token": "",
        "path": CLASS[f_index],
    }
    if f_index == 0:
        params['param'] = FUNC[f_index](page)
    if f_index == 1:
        params['param'] = FUNC[f_index](page, c_id)

    return params

# Initial args.
94 | f_index = int(input('输入 0.获取最近壁纸 1.获取类别壁纸 : ')) 95 | if f_index == 0: 96 | params = lambda page=None : get_params(page, f_index) 97 | PATH = 'data/最近壁纸' 98 | elif f_index == 1: 99 | params = { 100 | 'token': '', 101 | 'param': '{"v":3}', 102 | 'path': 'wallpaper/get_classify_list', 103 | } 104 | resp = requests.get(URL_CLIENT, params=params) 105 | if resp.status_code == 200: 106 | print("正在获取类别,请稍等...") 107 | path_dict = decode_b64_aes(resp.text)['data'] 108 | else: 109 | print('Get class Failure!') 110 | exit() 111 | 112 | c_name = [x['name'] for x in path_dict] 113 | c_ids = [x['_id'] for x in path_dict] 114 | for i in range(len(c_name)): 115 | print(str(i) + '. ' + c_name[i], end=' ') 116 | c_index = int(input('\n请输入类别序号:')) 117 | 118 | params = lambda page=None : get_params(page, f_index, c_ids[c_index]) 119 | PATH = 'data/' + c_name[c_index] 120 | 121 | 122 | # Get total pages num. 123 | resp = requests.get(URL_CLIENT, params=params()) 124 | # print(resp.text) 125 | info = decode_b64_aes(resp.text) 126 | total_pages = info['totalPages'] 127 | print(f'Total Pages Num: {total_pages}') 128 | 129 | while True: 130 | begin_page = input('Input the page num to start with(default=1): ') 131 | if begin_page.isnumeric(): 132 | begin_page = int(begin_page) 133 | break 134 | elif begin_page == '': 135 | begin_page = 1 136 | break 137 | else: 138 | print('Please input num without alphabet!') 139 | 140 | # Start crawler. 
141 | for page in range(total_pages): 142 | if begin_page > page + 1: continue 143 | 144 | print(f'=====Downloading Page {page + 1}=====') 145 | resp = requests.get(URL_CLIENT, params=params(page)) 146 | # print(params()) 147 | wallpaper_list = decode_b64_aes(resp.text) 148 | 149 | page_num = len(wallpaper_list['data']) 150 | print(f'Total Pages Image Num: {page_num}') 151 | 152 | tags = [] 153 | img_urls = [] 154 | ctimes = [] 155 | for wallpaper_info in wallpaper_list['data']: 156 | url = wallpaper_info['originalUrl'] 157 | img_urls.append(re.sub(r'\?.*', '', url)) 158 | 159 | tags.append(wallpaper_info['tag']) 160 | 161 | c_time = time.localtime(wallpaper_info['create_time'] // 1000) 162 | ctimes.append('%d-%02d' % (c_time.tm_year, c_time.tm_mon)) 163 | 164 | with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: 165 | executor.map(dl_pic, img_urls, tags, ctimes) 166 | 167 | os.system('pause') -------------------------------------------------------------------------------- /instgram/config.yaml: -------------------------------------------------------------------------------- 1 | x-ig-app-id: 2 | cookie: -------------------------------------------------------------------------------- /instgram/instgram.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import yaml 4 | import os 5 | import time 6 | from jsonpath import jsonpath 7 | 8 | ID_URL = 'https://i.instagram.com/api/v1/users/web_profile_info/' 9 | QUERY_URL = 'https://i.instagram.com/api/v1/feed/user/%s/username/?count=12' 10 | MEDIA_URL = 'https://i.instagram.com/api/v1/feed/user/%s/' 11 | 12 | def mkDir(path): 13 | if(not os.path.exists(path)): 14 | try: 15 | os.makedirs(path) 16 | except Exception as e: 17 | print(e) 18 | 19 | class Instgram: 20 | 21 | def __init__(self, screen_id): 22 | with open('config.yaml', 'r', encoding='utf-8') as f: 23 | self.headers = yaml.safe_load(f) 24 | self.rest_id = 
class Instgram:
    """Crawler for one Instagram account: downloads every picture and
    video at the highest available resolution."""

    def __init__(self, screen_id):
        with open('config.yaml', 'r', encoding='utf-8') as f:
            self.headers = yaml.safe_load(f)
        self.rest_id = self.getRestID(screen_id)
        self.screen_id = screen_id
        self.cursor = ''
        self.pic_path = 'data/' + screen_id + '/img/'
        self.video_path = 'data/' + screen_id + '/video/'

    def getRestID(self, screen_id):
        """Resolve the account's numeric id from its screen name, or None."""
        params = {
            'username': screen_id,
        }

        try:
            response = requests.get(ID_URL, params=params, headers=self.headers)
            if response.status_code == 200:
                return response.json()['data']['user']['id']
            print('request error, the response error code is: ', response.status_code)
        except Exception as e:
            print(e)
        return None

    def get_media_urls(self, cursor):
        """Fetch one feed page ('' = first page); returns the decoded JSON
        dict or None on failure."""
        try:
            if cursor == '':
                response = requests.get(QUERY_URL % self.screen_id, headers=self.headers)
            else:
                params = {
                    'count': '12',
                    'max_id': cursor,
                }
                response = requests.get(MEDIA_URL % self.rest_id, params=params, headers=self.headers)

            if response.status_code == 200:
                # Round-trip through dumps/loads to get key-sorted dicts.
                return json.loads(json.dumps(response.json(), sort_keys=True))
            print('request error in get_media(), the response error code is: ', response.status_code)

        except Exception as e:
            print(e)

    def getMax(self, candidates):
        """For each candidate list, pick the url of the tallest variant."""
        max_urls = []
        for candidate in candidates:
            max_height = 0
            max_url = ''

            for pic in candidate:
                if pic['height'] > max_height:
                    max_height = pic['height']
                    max_url = pic['url']
            max_urls.append(max_url)

        return max_urls

    def scrawlMedia(self):
        """Page through the feed and download all media until exhausted."""
        mkDir(self.pic_path)
        mkDir(self.video_path)

        while self.cursor != 'END':
            response = self.get_media_urls(self.cursor)
            if not response:
                print('No Response, EXIT.')
                break

            pic_candidates = jsonpath(response, expr='$.items.[*].image_versions2.candidates')
            video_candidates = jsonpath(response, expr='$.items.[*].video_versions')

            # Fix: the old code referenced pic_urls/video_urls outside the
            # guard that assigned them — a NameError when the first page had
            # no media of that kind, and stale urls on later pages.
            if pic_candidates:
                for url in self.getMax(pic_candidates):
                    self.dlPic(url)

            if video_candidates:
                for url in self.getMax(video_candidates):
                    self.dlVideo(url)

            # Advance the cursor; 'END' terminates the loop.
            self.cursor = response['next_max_id'] if response['more_available'] == True else 'END'

    def dlPic(self, url, retries=3):
        """Download one picture unless it already exists locally; bounded
        retry on failure (was: unbounded recursion)."""
        if url == '':
            return

        pic_name = url[url.rfind('/') + 1 : url.rfind('?')]
        path_to_file = self.pic_path + '/' + pic_name
        if os.path.exists(path_to_file):
            return

        for _ in range(retries):
            try:
                pic = requests.get(url)
                if pic:
                    print(pic_name)
                    with open(path_to_file, 'wb') as f:
                        f.write(pic.content)
                return
            except Exception:
                print('try download the picture again...')

    def dlVideo(self, url, retries=3):
        """Download one video unless it already exists locally; bounded
        retry on failure (was: unbounded recursion)."""
        if url == '':
            return

        video_name = url[url.rfind('/') + 1 : url.rfind('?')]
        path_to_file = self.video_path + '/' + video_name
        if os.path.exists(path_to_file):
            return

        for _ in range(retries):
            try:
                video = requests.get(url)
                if video:
                    print(video_name)
                    with open(path_to_file, 'wb') as f:
                        f.write(video.content)
                return
            except Exception:
                print('try download the video again...')

if __name__ == '__main__':
    screen_id = input('Please input instgram id of the user:')
    instgram = Instgram(screen_id)

    start_time = time.time()
    instgram.scrawlMedia()
    end_time = time.time()

    print("All finished in %ds!" % (end_time-start_time))
    os.system('pause')
% (end_time-start_time)) 155 | os.system('pause') 156 | -------------------------------------------------------------------------------- /karaoke/karaoke.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import requests 4 | import re 5 | import music_tag 6 | 7 | # referer can be any site to get through verification. 8 | headers = { 9 | 'referer': 'https://kg.qq.com/index-pc.html', 10 | } 11 | 12 | def mkdir(path): 13 | if not os.path.exists(path): 14 | os.makedirs(path) 15 | 16 | class Karaoke(object): 17 | def __init__(self, uid: str) -> None: 18 | super().__init__() 19 | self.songs_id = [] 20 | self.songs_name = [] 21 | self.songs_date = [] 22 | self.get_songs(uid) 23 | 24 | def get_songs(self, uid: str) -> list: 25 | url = 'https://node.kg.qq.com/cgi/fcgi-bin/kg_ugc_get_homepage?type=get_uinfo&start=%d&num=8&share_uid=%s' 26 | res = requests.get(url % (1, uid), headers=headers) 27 | if res.status_code != 200: 28 | print(res.status_code) 29 | return 30 | 31 | res = res.text 32 | self.artist = re.search(r'(?<="nickname": ").*?(?=",)', res).group() 33 | self.path = 'data/' + self.artist 34 | num = re.search(r'(?<="ugc_total_count":).+?(?=,)', res).group() 35 | total = (int(num)+ 7) // 8 36 | 37 | for start in range(1, total + 1): 38 | res = requests.get(url % (start, uid), headers=headers).text 39 | self.songs_id += re.findall(r'(?<="shareid": ").*?(?=",)', res) 40 | self.songs_name += re.findall(r'(?<="title": ").*?(?=",)', res) 41 | self.songs_date += re.findall(r'(?<="time": ).*?(?=,)', res) 42 | 43 | print(self.songs_name) 44 | if '' in self.songs_name: 45 | index = [] 46 | for i in range(len(self.songs_name)): 47 | if self.songs_name[i] == '': 48 | index.append(i+1) 49 | for i in index: 50 | del self.songs_name[i] 51 | 52 | print(self.songs_name) 53 | print('found %d songs.' 
% len(self.songs_id)) 54 | 55 | def scrawlMedia(self): 56 | mkdir(self.path) 57 | 58 | url = 'https://node.kg.qq.com/cgi/fcgi-bin/fcg_get_play_url?shareid=%s' 59 | for song_id, song_name, song_date in zip(self.songs_id, self.songs_name, self.songs_date): 60 | song_date = time.strftime("%Y-%m-%d", time.localtime(int(song_date))) 61 | self.dlSong(url % song_id, song_name, song_date) 62 | 63 | def dlSong(self, url, name, date): 64 | try: 65 | name += '-' + date + '.m4a' 66 | name = re.sub(r'[\/:*?"<>|]', ' ', name) 67 | path_to_file = self.path + '/' + name 68 | if os.path.exists(path_to_file): return 69 | 70 | song = requests.get(url) 71 | if song: 72 | print(name) 73 | with open(path_to_file, 'wb') as f: 74 | f.write(song.content) 75 | 76 | file = music_tag.load_file(path_to_file) 77 | file['title'] = name[:name.find('-')] 78 | file['artist'] = self.artist 79 | file['year'] = date 80 | file.save() 81 | except: 82 | print('try download the song again...') 83 | self.dlSong(url, name, date) 84 | 85 | if __name__ == '__main__': 86 | uid = input('Please input the karaoke share uid of the user: ') 87 | karaoke = Karaoke(uid) 88 | 89 | start_time = time.time() 90 | karaoke.scrawlMedia() 91 | end_time = time.time() 92 | 93 | print("All finished in %ds!" 
import json
import os
import re
import time

ID_URL = 'https://twitter.com/i/api/graphql/hc-pka9A7gyS3xODIafnrQ/UserByScreenName'
MEDIA_URL = 'https://twitter.com/i/api/2/timeline/media/%s.json'

def mkDir(path):
    """Create `path` (and parents) if missing."""
    if not os.path.exists(path):
        os.makedirs(path)

class Twitter:
    """Crawler that downloads all pictures and videos of one account."""

    def __init__(self, screen_id):
        import yaml  # third-party: imported lazily so the module loads without it
        with open('config.yaml', 'r', encoding='utf-8') as f:
            self.headers = yaml.safe_load(f)
        self.rest_id = self.getRestID(screen_id)
        self.cursor = ''
        self.pic_path = 'data/' + screen_id + '/img/'
        self.video_path = 'data/' + screen_id + '/video/'

    def dlPic(self, url, retries=3):
        """Download the large variant of one picture, skipping files that
        already exist; bounded retry (was: unbounded recursion)."""
        pic_name = url[url.rfind('/')+1:]
        path_to_file = self.pic_path + '/' + pic_name
        if os.path.exists(path_to_file):
            return

        import requests
        for _ in range(retries):
            try:
                pic = requests.get(url + '?name=large')
                if pic:
                    print(pic_name)
                    with open(path_to_file, 'wb') as f:
                        f.write(pic.content)
                return
            except Exception:
                print('try download the picture again...')

    def dlVideo(self, url, retries=3):
        """Download one video, skipping files that already exist; bounded
        retry (was: unbounded recursion)."""
        video_name = url[url.rfind('/') + 1 :]
        path_to_file = self.video_path + '/' + video_name
        if os.path.exists(path_to_file):
            return

        import requests
        for _ in range(retries):
            try:
                video = requests.get(url)
                if video:
                    print(video_name)
                    with open(path_to_file, 'wb') as f:
                        f.write(video.content)
                return
            except Exception:
                print('try download the video again...')

    def getRestID(self, screen_id):
        """Resolve the account's numeric rest_id from its screen name."""
        import requests
        params = (
            ('variables', '{"screen_name":"%s","withHighlightedLabel":true}' % screen_id),
        )

        response = requests.get(ID_URL, headers=self.headers, params=params)
        content = json.loads(response.text)

        return content["data"]["user"]["rest_id"]

    def getMedia(self):
        """Fetch the next media-timeline page into `self.res`."""
        import requests
        params = (
            ('cursor', '{}'.format(self.cursor)),
        )
        if self.cursor == '':
            self.res = requests.get(MEDIA_URL % self.rest_id, headers=self.headers)
        else:
            self.res = requests.get(MEDIA_URL % self.rest_id, headers=self.headers, params=params)

    def scrawlMedia(self):
        """Page through the media timeline until the cursor stops moving."""
        from jsonpath import jsonpath

        mkDir(self.pic_path)
        mkDir(self.video_path)

        cursor = '#'
        while cursor != self.cursor:

            # Handle the response and collect picture/video links.
            self.getMedia()
            res = json.loads(self.res.text)
            res = json.loads(json.dumps(res, sort_keys=True, ensure_ascii=False))
            tweets = res['globalObjects']['tweets']
            pic_urls = jsonpath(tweets, expr='$.[*].entities.media.[*].media_url_https')
            video_urls = jsonpath(tweets, expr='$.[*].extended_entities.media.[*].video_info.variants[0].url')

            # download picture
            if pic_urls:
                for url in pic_urls: self.dlPic(url)

            # download video
            if video_urls:
                for url in video_urls: self.dlVideo(url)

            # Advance: the loop ends when the new cursor equals the old one.
            cursor = self.cursor
            self.cursor = res['timeline']['instructions'][0]['addEntries']['entries'][-1]['content']['operation']['cursor']['value']

if __name__ == '__main__':
    screen_id = input('Please input twitter id of the user: ')
    twitter = Twitter(screen_id)

    start_time = time.time()
    twitter.scrawlMedia()
    end_time = time.time()

    print("All finished in %ds!" % (end_time-start_time))
    os.system('pause')
') 107 | twitter = Twitter(screen_id) 108 | 109 | start_time = time.time() 110 | twitter.scrawlMedia() 111 | end_time = time.time() 112 | 113 | print("All finished in %ds!" % (end_time-start_time)) 114 | os.system('pause') 115 | -------------------------------------------------------------------------------- /weibo/config.yaml: -------------------------------------------------------------------------------- 1 | cookie: -------------------------------------------------------------------------------- /weibo/weibo.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import yaml 3 | import json 4 | from jsonpath import jsonpath 5 | import os 6 | import time 7 | 8 | WEIBO_URL = 'https://m.weibo.cn/api/container/getIndex' 9 | PHOTO_WALL_URL = 'https://m.weibo.cn/api/container/getSecond' 10 | 11 | def mkDir(path): 12 | if not os.path.exists(path): 13 | os.makedirs(path) 14 | 15 | def responseToJson(response): 16 | res = json.loads(response.text) 17 | res = json.loads(json.dumps(res, ensure_ascii=False, sort_keys=True)) 18 | 19 | return res 20 | 21 | class Weibo(object): 22 | def __init__(self, uid): 23 | with open('config.yaml', 'r', encoding='utf-8') as f: 24 | self.headers = yaml.safe_load(f) 25 | self.uid = uid 26 | self.index = 1 #for count. 27 | self.cursor = '' #location for video info. 
class Weibo(object):
    """Crawler that downloads all pictures and videos of one weibo user."""

    def __init__(self, uid):
        with open('config.yaml', 'r', encoding='utf-8') as f:
            self.headers = yaml.safe_load(f)
        self.uid = uid
        self.index = 1    # page counter
        self.cursor = ''  # since_id cursor for video paging
        # Fix: resolve the screen name once instead of issuing the same
        # network request twice.
        screen_name = self.getScreenName()
        self.pic_path = 'data/' + screen_name + '/img/'
        self.video_path = 'data/' + screen_name + '/video/'

    def dlPic(self, url, retries=3):
        """Download one picture, skipping files that already exist;
        bounded retry (was: unbounded recursion)."""
        pic_name = url[url.rfind('/') + 1:]
        path_to_file = self.pic_path + '/' + pic_name
        if os.path.exists(path_to_file):
            return

        for _ in range(retries):
            try:
                pic = requests.get(url)
                if pic:
                    print(pic_name)
                    with open(path_to_file, 'wb') as f:
                        f.write(pic.content)
                return
            except Exception:
                print('try download the picture again...')

    def dlVideo(self, url, retries=3):
        """Download one video; the local name is the 41-char slice ending
        in '.mp4' taken from the url. Bounded retry (was recursion)."""
        video_name = url[url.find('.mp4') - 37: url.find('.mp4') + 4]
        path_to_file = self.video_path + '/' + video_name
        if os.path.exists(path_to_file):
            return

        for _ in range(retries):
            try:
                video = requests.get(url)
                if video:
                    print(video_name)
                    with open(path_to_file, 'wb') as f:
                        f.write(video.content)
                return
            except Exception:
                print('try download the video again...')

    def getScreenName(self):
        """Resolve the user's display name from the uid."""
        params = (
            ('type', 'uid'),
            ('value', self.uid),
            ('containerid', '100505' + self.uid)
        )

        res = requests.get(WEIBO_URL, params=params)
        res = responseToJson(res)
        return res['data']['userInfo']['screen_name']

    def getPicInfo(self):
        """Fetch page `self.index` of the user's photo wall."""
        params = (
            ('containerid', '107803' + self.uid + '_-_photoall'),
            ('count', '24'),
            ('type', 'uid'),
            ('value', self.uid),
            ('page', self.index)
        )

        res = requests.get(PHOTO_WALL_URL, headers=self.headers, params=params)
        return responseToJson(res)

    def getVideoInfo(self):
        """Fetch the next video-channel page (cursor = since_id)."""
        params = (
            ('containerid', '231567' + self.uid),
            ('is_all/[/]', ['1?is_all=1', '1']),
            ('type', 'uid'),
            ('value', self.uid),
            ('since_id', self.cursor),
        )

        # Consistency: this is the same endpoint as WEIBO_URL.
        res = requests.get(WEIBO_URL, headers=self.headers, params=params)
        return responseToJson(res)

    def scrawlMedia(self):
        """Download the photo wall, then the video channel."""
        mkDir(self.pic_path)
        # Get picture links from the response and download them.
        while True:
            res = self.getPicInfo()
            if res['ok'] == 0: break

            pic_urls = jsonpath(res, expr='$.data.cards.[*].pics.[*].pic_big')
            if pic_urls:
                for url in pic_urls: self.dlPic(url)

            print("we have downloaded %d pictures." % (self.index * 24))
            self.index += 1

        self.index = 1
        mkDir(self.video_path)
        # Get video links from the response and download them.
        while True:
            res = self.getVideoInfo()
            if res['ok'] == 0: break

            video_urls = jsonpath(res, expr='$.data.cards.[*].mblog.page_info.urls.mp4_720p_mp4')
            if video_urls:
                for url in video_urls: self.dlVideo(url)

            self.cursor = res['data']['cardlistInfo']['since_id']
            print("We have downloaded %d videos." % (self.index * 20))
            self.index += 1
            # Fix: stop as soon as the server signals the last page instead
            # of issuing one extra request with since_id == 0.
            if self.cursor == 0:
                break

if __name__ == '__main__':
    uid = input('Please input weibo uid of the user: ')
    weibo = Weibo(uid)

    start_time = time.time()
    weibo.scrawlMedia()
    end_time = time.time()

    print("All finished in %ds!" % (end_time-start_time))
    os.system('pause')