├── README.md
├── main.py
├── .gitignore
└── weibo.py

/README.md:
--------------------------------------------------------------------------------
# weibo-image-crawler
Download all of the high-resolution images, live photos, and HD videos from a single Weibo post.

No account login is required.

# Updates
**4/4/2024**: A change to Weibo's API broke the method for crawling paid images. Other approaches are being explored...

**3/23/2024**: Added 2K 60 fps HD video download (if no 2K version exists, the highest-quality video is selected automatically)

**3/23/2024**: Added support for Weibo's new API endpoint

**3/23/2024**: Added support for t.cn short links

**1/7/2024**: Added parallel downloading of images and videos

# Usage
1. Get a link to a Weibo post. Three link forms are supported:
    - t.cn short link: http://t.cn/A6YBzjqQ
    - short-code link: https://weibo.com/2343372547/O6kbQmBwX
    - long-code link: https://weibo.com/2117508734/5014997330298194

    A t.cn short link simply redirects to a regular weibo.com link, so all three forms lead to an ordinary Weibo page.

2. Download all of the high-resolution images / live photos / HD videos from a single post:
```
python main.py -l https://weibo.com/2343372547/O6kbQmBwX
# or
python main.py -l http://t.cn/A6YBzjqQ
# or
python main.py -l https://weibo.com/2117508734/5014997330298194
```

To download from multiple posts, put the Weibo links in a text file, one link per line (a sample file is shown below):
```
python main.py -f links.txt
```
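A `links.txt` can mix any of the supported link forms; for instance, reusing the example links above:
```
https://weibo.com/2343372547/O6kbQmBwX
http://t.cn/A6YBzjqQ
https://weibo.com/2117508734/5014997330298194
```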
To store the downloads under a local `images` directory:
```
python main.py -f links.txt -s images
```
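The CLI is a thin wrapper around `weibo.weibo_image_download`, so the downloader can also be called from your own Python code. A minimal sketch (run from the repository root so that `weibo.py` is importable):
```
from weibo import weibo_image_download

# Files are saved under <save_folder>/<screen_name>_<uid>/
weibo_image_download("https://weibo.com/2343372547/O6kbQmBwX", save_folder="images")
```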
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import argparse
from weibo import weibo_image_download as wb

parser = argparse.ArgumentParser(
    prog='Weibo Image Crawler',
    description='Download all the high-resolution images from weibo links.')
parser.add_argument('-l', '--link', default=None, type=str, metavar='Link',
                    help='A Weibo link to download images from. For example, https://weibo.com/3178232834/MFStocIKp or https://weibo.com/3178232834/4899808463031949')
parser.add_argument('-f', '--file', default=None, type=str, metavar='File',
                    help='A file that contains one weibo link per line.')
parser.add_argument('-s', '--save', default='images', type=str, metavar='Folder',
                    help='Folder to save images.')

def main():
    args = parser.parse_args()

    if args.link is not None:
        wb(args.link, args.save)
    elif args.file is not None:
        with open(args.file, 'r') as f:
            links = [line.strip() for line in f if line.strip()]
        for url in links:
            wb(url, args.save)
    else:
        print("Please provide a weibo link (-l) or a file of links (-f).")

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Custom Folders
images/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/weibo.py:
--------------------------------------------------------------------------------
import re
import requests
from requests.adapters import HTTPAdapter
import os
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
}

# Anonymous (guest) cookies. weibo_pagesource tries them in order until one
# works; they expire over time and may need to be refreshed.
cookies = [
    'XSRF-TOKEN=df6_295NgzFvJXS5Ebir3Lsy; SUB=_2AkMSoQfrf8NxqwFRmfsVyG3mbYp3wgjEieKk_fYwJRMxHRl-yT9vqlIAtRB6OSEpBFcGiARBOAnwKhC5ZrKPs-0Tb0Qo; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWFg0fG6cq3oE2bqhsoAldV; WBPSESS=gJ7ElPMf_3q2cdj5JUfmvGVqkHB92RE2_AwewsrjYWIBFCA1ZPKYgsEdwAzm6brHYlW5B6maWDy-hBEgLCyxVoJJry48tUmcvk0HOSyHP_39vQbgHUQVhjsEpRu0qJLNziegtrfv2J4r-EEdKdga-YSfVBhzDTG8azkZAaaS7Pw=',
    'XSRF-TOKEN=df6_295NgzFvJXS5Ebir3Lsy; _s_tentry=-; Apache=6386009078674.588.1711119828939; SINAGLOBAL=6386009078674.588.1711119828939; ULV=1711119828990:1:1:1:6386009078674.588.1711119828939:; ALF=1713761012; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWuyTE2R_nVVR8Gh5or-UwK5JpX5KzhUgL.FoeNeKM4eh2pehe2dJLoIEBLxK-LBozLB-BLxKBLB.2L1K2LxKqL1KMLB.2LxKqL1-eL1hnt; SUB=_2A25I-vXPDeRhGeVJ6lUY8C_Nyz-IHXVodncHrDV8PUJbn9AGLRfgkW1NT-K3eh5dPZM9ZF7BK1xxHGc5IPJnHuMf; PC_TOKEN=f7723f2711; WBPSESS=HRGrvUX5o6Tu2aaaIhJAc5AQgyd_ChArhzxpTq2G3firFqV8woflsEY-USPTjze-BoOqGxKWzJs_RNqUxg8KLzjyZVqLaIkbmQaoOqD2zfduwsrQg_Im_Rf7wmjlKZHFecpsx1yYhPLc5CTwHO7KuQ==',
]

def extract_redirected_link(short_link):
    try:
        response = requests.get(short_link, headers=headers, allow_redirects=False)
        if response.status_code == 302:
            # Extract the redirection URL from the Location header.
            if 'Location' in response.headers:
                final_url = response.headers['Location']
                return final_url
            else:
                print("Error: Redirection URL not found in response headers.")
                return None
        elif response.status_code == 200:
            # A 200 here means no redirect happened, so the short link did not resolve.
            print(f"Error: {short_link} returned 200 instead of a redirect.")
            return None
        else:
            print(f"Error: Unable to access {short_link}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error: {e}")
        return None

def get_page_id(url):
    # Resolve t.cn short links into their weibo.com targets first.
    if "t.cn" in url:
        url = extract_redirected_link(url)
        if url is None:
            return None

    # Regex matching the UID and the page ID in the link.
    pattern = re.compile(r'https?://weibo\.com/(\d+)/?(\w+)?')

    # Try to match the UID and page ID in the link.
    match = pattern.match(url)
    if match:
        if match.group(2):
            page_id = match.group(2)
        else:
            # If the page ID was not captured separately, fall back to the
            # last path segment of the URL.
            page_id = url.rstrip('/').split('/')[-1]
        return page_id

    # Malformed link: return None.
    return None
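
# Illustrative examples (links taken from the README):
#   get_page_id("https://weibo.com/2343372547/O6kbQmBwX")        -> "O6kbQmBwX"
#   get_page_id("https://weibo.com/2117508734/5014997330298194") -> "5014997330298194"
# The returned page ID is the token passed to Weibo's ajax endpoint below.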

def weibo_pagesource(page_id):

    headers_copy = headers.copy()
    request_link = "https://weibo.com/ajax/statuses/show?id=" + page_id

    for cookie in cookies:
        headers_copy['Cookie'] = cookie

        response = requests.get(request_link, headers=headers_copy)
        if response.ok:
            try:
                data = response.json()
                return data
            except ValueError:
                print("Failed to decode JSON. Response was:", response.text)
        elif response.status_code == 400:
            print("Request failed with status code 400. Trying next cookie.")
            continue
        else:
            print("Request failed with status code:", response.status_code)
            return ""

    print("All cookies failed. Unable to retrieve data.")
    return ""

def get_page_type(response):
    page_type = ''
    if 'mix_media_info' in response.keys():
        page_type = 'multimedia'
    elif 'pic_infos' in response.keys() and 'pic_ids' in response.keys() and 'pic_num' in response.keys():
        page_type = 'images'
    # Note: some picture weibo pages also have the 'page_info' field,
    # e.g., https://weibo.com/2687932353/O594VFGac
    elif 'page_info' in response.keys() and 'media_info' in response['page_info'].keys():
        page_type = 'video'
    else:
        page_type = 'Unknown'

    return page_type

def get_pic_type(response, pic_id):
    # Possible types:
    #   pic: still photo
    #   livephoto: live photo
    media_type = response['pic_infos'][pic_id]['type']
    return media_type


def get_media_urls(response, page_type):

    media_urls = []

    if page_type == 'multimedia':
        for media in response['mix_media_info']['items']:
            if media['type'] == 'pic':
                media_urls += [{'url': media['data']['largest']['url'],
                                'media_id': media['data']['pic_id'],
                                'media_type': 'pic'}]
            elif media['type'] == 'video':
                video_url = media['data']['media_info'].get('mp4_720p_mp4') or \
                            media['data']['media_info'].get('stream_url_hd')

                media_urls += [{'url': video_url,
                                'media_id': media['data']['media_info']['media_id'],
                                'media_type': 'video'}]
            else:
                print('Unknown media type in multi-media page...')

    elif page_type == 'video':
        video_info = response['page_info']['media_info']

        # Prefer the first playback_list entry (the highest-quality stream),
        # falling back to the 720p and HD stream URLs.
        playback_list = video_info.get('playback_list')
        video_url = (playback_list[0]['play_info']['url'] if playback_list else None) or \
                    video_info.get('mp4_720p_mp4') or \
                    video_info.get('stream_url_hd')

        media_urls += [{'url': video_url,
                        'media_id': video_info['media_id'],
                        'media_type': 'video'}]

    elif page_type == 'images':
        pic_ids = response['pic_ids']
        # Note: for a reposted weibo there can be multiple pic_ids; the
        # reposting weibo's own pic_ids is empty, while the original weibo's
        # pic_ids is populated.
        for pic_id in pic_ids:
            pic_type = get_pic_type(response, pic_id)
            if pic_type == "pic":
                media_urls += [{'url': response['pic_infos'][pic_id]['largest']['url'],
                                'media_id': pic_id,
                                'media_type': 'pic'}]
            elif pic_type == "livephoto":
                media_urls += [{'url': response['pic_infos'][pic_id]['video'],
                                'media_id': pic_id,
                                'media_type': 'livephoto'}]
    else:
        print("No URLs collected for this unknown type of weibo page.")

    return media_urls
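
# Shape of the list returned by get_media_urls (values are illustrative, not
# real CDN URLs):
#   [{'url': 'https://wx1.sinaimg.cn/large/<pic_id>.jpg', 'media_id': '<pic_id>', 'media_type': 'pic'},
#    {'url': 'https://f.video.weibocdn.com/<...>.mp4', 'media_id': '<media_id>', 'media_type': 'video'}]
# weibo_image_download below maps each media_type to a file extension.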

def get_user_info(response):
    user = {}
    user['screen_name'] = response['user']['screen_name']
    user['uid'] = str(response['user']['id'])

    return user

def download_media(url, file_path, uid):
    downloader_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Referer': 'https://weibo.com/',
        'Sec-Fetch-Site': 'cross-site',
        'Cookie': 'XSRF-TOKEN=df6_295NgzFvJXS5Ebir3Lsy; SUB=_2AkMSoQfrf8NxqwFRmfsVyG3mbYp3wgjEieKk_fYwJRMxHRl-yT9vqlIAtRB6OSEpBFcGiARBOAnwKhC5ZrKPs-0Tb0Qo; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWFg0fG6cq3oE2bqhsoAldV; WBPSESS=gJ7ElPMf_3q2cdj5JUfmvGVqkHB92RE2_AwewsrjYWIBFCA1ZPKYgsEdwAzm6brHYlW5B6maWDy-hBEgLCyxVoJJry48tUmcvk0HOSyHP_39vQbgHUQVhjsEpRu0qJLNziegtrfv2J4r-EEdKdga-YSfVBhzDTG8azkZAaaS7Pw=; _s_tentry=-; Apache=6386009078674.588.1711119828939; SINAGLOBAL=6386009078674.588.1711119828939; ULV=1711119828990:1:1:1:6386009078674.588.1711119828939:',
    }
    try:
        # Skip files that have already been downloaded.
        if os.path.isfile(file_path):
            return
        s = requests.Session()
        s.mount(url, HTTPAdapter(max_retries=5))
        try_count = 0
        success = False
        MAX_TRY_COUNT = 3
        while try_count < MAX_TRY_COUNT:
            downloaded = s.get(url, headers=downloader_headers, timeout=(5, 10))
            try_count += 1
            # Validate the payload by its format trailer: a JPEG ends with
            # FF D9 and a PNG ends with the IEND chunk (AE 42 60 82); a
            # truncated download fails these checks and is retried.
            fail_flg_1 = url.endswith(("jpg", "jpeg")) and not downloaded.content.endswith(b"\xff\xd9")
            fail_flg_2 = url.endswith("png") and not downloaded.content.endswith(b"\xaeB`\x82")
            fail_flg_3 = url.endswith("mov") and False  # TODO: verify mov file status
            fail_flg_4 = url.endswith(",video") and False  # TODO: verify video stream status
            if fail_flg_1 or fail_flg_2 or fail_flg_3 or fail_flg_4:
                print("[DEBUG] Download failed:", url, "attempt", try_count)
            else:
                success = True
                print("[DEBUG] Download succeeded:", url)
                break
        if success:
            with open(file_path, "wb") as f:
                f.write(downloaded.content)
            print("Saved", file_path)
        else:
            print("[DEBUG] Giving up on", url)
    except Exception:
        # Record the failure so the link can be retried later.
        error_file = "not_downloaded.txt"
        with open(error_file, "a", encoding="utf-8") as f:
            f.write(str(uid) + ":" + file_path + ":" + url + "\n")


def weibo_image_download(url, save_folder="images"):
    print("Downloading URL:", url)

    page_id = get_page_id(url)
    if page_id is None:
        print("Could not extract a page ID from:", url)
        return
    response = weibo_pagesource(page_id)
    if not response:
        print("Could not retrieve the page source for:", url)
        return

    user_info = get_user_info(response)
    user_folder = user_info['screen_name'] + "_" + user_info['uid']
    save_folder = os.path.join(save_folder, user_folder)
    if not os.path.isdir(save_folder):
        os.makedirs(save_folder)

    page_type = get_page_type(response)
    media_urls = get_media_urls(response, page_type)

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for media in media_urls:
            media_url = media['url']
            media_type = media['media_type']
            media_id = media['media_id']
            if media_type == 'pic':
                save_name = os.path.join(save_folder, media_id + '.jpg')
            elif media_type == 'video':
                save_name = os.path.join(save_folder, media_id + '.mp4')
            elif media_type == 'livephoto':
                save_name = os.path.join(save_folder, media_id + '.mov')
            else:
                # Unknown media type: skip it rather than crash on an
                # unbound save_name.
                print("Skipping unknown media type:", media_type)
                continue

            future = executor.submit(download_media, media_url, save_name, user_info['uid'])
            futures.append(future)

        for future in as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                print('There was an exception: %s' % exc)

    print("Finished downloading user:", user_info['screen_name'])
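
# Ad-hoc smoke test. The supported entry point is main.py; running
# `python weibo.py <weibo-link>` directly is only a convenience sketch.
if __name__ == '__main__':
    if len(sys.argv) > 1:
        weibo_image_download(sys.argv[1])
--------------------------------------------------------------------------------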