├── .gitignore
├── requirements.in
├── rate_limits.png
├── likes_public.png
├── stampede_400.jpg
├── tumblr_api_key.png
├── api_registration.png
├── register_application.png
├── requirements.txt
├── LICENSE
├── save_media_files.py
├── save_posts_metadata.py
├── save_likes_metadata.py
├── README.md
└── common.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__
tumblr
--------------------------------------------------------------------------------
/requirements.in:
--------------------------------------------------------------------------------
beautifulsoup4
click
requests
tqdm
youtube-dl
--------------------------------------------------------------------------------
/rate_limits.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexwlchan/backup_tumblr/master/rate_limits.png
--------------------------------------------------------------------------------
/likes_public.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexwlchan/backup_tumblr/master/likes_public.png
--------------------------------------------------------------------------------
/stampede_400.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexwlchan/backup_tumblr/master/stampede_400.jpg
--------------------------------------------------------------------------------
/tumblr_api_key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexwlchan/backup_tumblr/master/tumblr_api_key.png
--------------------------------------------------------------------------------
/api_registration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexwlchan/backup_tumblr/master/api_registration.png
--------------------------------------------------------------------------------
/register_application.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexwlchan/backup_tumblr/master/register_application.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
#
# This file is autogenerated by pip-compile
# To update, run:
#
#    pip-compile --output-file requirements.txt requirements.in
#
beautifulsoup4==4.6.3
certifi==2018.11.29       # via requests
chardet==3.0.4            # via requests
click==7.0
idna==2.7                 # via requests
requests==2.20.1
tqdm==4.28.1
urllib3==1.24.1           # via requests
youtube-dl==2018.12.3
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2018 Alex Chan

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/save_media_files.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8

import os
import sys
import traceback

import click
import tqdm

from common import find_all_metadata_files, save_post_media_files


@click.command(
    help="Save all the media files for your Tumblr posts/likes."
)
@click.option(
    "--metadata", default="tumblr",
    help="Directory where your metadata is saved."
)
def save_all_media_files(metadata):
    all_metadata_files = list(find_all_metadata_files(path=metadata))
    for info_path in tqdm.tqdm(all_metadata_files):
        try:
            save_post_media_files(info_path)
        except Exception:
            post_id = os.path.basename(os.path.dirname(info_path))
            traceback.print_exc()
            print(f"Error trying to save media for post {post_id}")
            print("~")


if __name__ == '__main__':
    # Allow `python save_media_files.py DIR` as a shorthand for
    # `python save_media_files.py --metadata DIR`, and click is still happy.
    if len(sys.argv) == 2 and sys.argv[1] != "--help":
        sys.argv = [sys.argv[0], "--metadata", sys.argv[1]]

    save_all_media_files()
--------------------------------------------------------------------------------
/save_posts_metadata.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8

import os

import click

from common import get_all_posts, save_post_metadata


@click.command(
    help="Save all the metadata from your Tumblr posts."
)
@click.option(
    "--blog_identifier", required=True,
    prompt="What is your blog identifier? e.g. 'alexwlchan.tumblr.com'",
    help="Blog identifier, as used by the Tumblr API"
)
@click.option(
    "--api_key", required=True,
    prompt="What is your API key? Register at https://www.tumblr.com/oauth/apps",
    help="OAuth API key for the Tumblr API (https://www.tumblr.com/oauth/apps)"
)
@click.option(
    "--dst", default="tumblr",
    help="Directory for saving metadata"
)
def save_metadata(blog_identifier, api_key, dst):
    for post_data in get_all_posts(blog_identifier=blog_identifier, api_key=api_key):
        save_post_metadata(
            dst=os.path.join(dst, blog_identifier.replace(".", "_"), "posts"),
            post_data=post_data
        )

    # This note is printed inside the command function because click's
    # standalone mode calls sys.exit() as soon as the command returns --
    # anything placed after the save_metadata() call below would never run.
    print(
        "Note: if the progress bar didn't quite get to 100%, that's okay -- "
        "it's only an estimate, and the Tumblr API doesn't always return everything."
    )


if __name__ == '__main__':
    save_metadata()
--------------------------------------------------------------------------------
/save_likes_metadata.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8

import os

import click

from common import get_all_likes, save_post_metadata


@click.command(
    help="Save all the metadata from your Tumblr likes."
)
@click.option(
    "--blog_identifier", required=True,
    prompt="What is your blog identifier? e.g. 'alexwlchan.tumblr.com'",
    help="Blog identifier, as used by the Tumblr API"
)
@click.option(
    "--api_key", required=True,
    prompt="What is your API key? Register at https://www.tumblr.com/oauth/apps",
    help="OAuth API key for the Tumblr API (https://www.tumblr.com/oauth/apps)"
)
@click.option(
    "--dst", default="tumblr",
    help="Directory for saving metadata"
)
def save_metadata(blog_identifier, api_key, dst):
    for post_data in get_all_likes(blog_identifier=blog_identifier, api_key=api_key):
        save_post_metadata(
            dst=os.path.join(dst, blog_identifier.replace(".", "_"), "likes"),
            post_data=post_data
        )

    # This note is printed inside the command function because click's
    # standalone mode calls sys.exit() as soon as the command returns --
    # anything placed after the save_metadata() call below would never run.
    print(
        "Note: if the progress bar didn't quite get to 100%, that's okay -- "
        "it's only an estimate, and the Tumblr API doesn't always return everything."
    )


if __name__ == '__main__':
    save_metadata()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# backup_tumblr

This is a set of scripts for downloading your posts and likes from Tumblr.

The scripts try to download as much as possible, including:

* Every post and like
* All the metadata about a post that's available through the Tumblr API
* Any media files attached to a post (e.g. photos, videos)

I've had these for private use for a while, and in the wake of Tumblr going on a deletion spree, I'm trying to make them usable by other people.

**If you're having problems, the easiest way to get my attention is by [opening an issue](https://github.com/alexwlchan/backup_tumblr/issues/new).**
If you don't have a GitHub account, there are alternative contact details [on my website](https://alexwlchan.net/contact/).

![](stampede_400.jpg)

Pictured: a group of Tumblr users fleeing the new content moderation policies. Image credit: Wellcome Collection, CC BY.

## Motivation

**These scripts are only for personal use.**
Please don't use them to download posts and then make them publicly accessible without consent.
Your own blog is yours to do what you want with; your likes and other people's posts are not.

Some of what's on Tumblr is deeply personal content, and is either private or requires a login.
Don't put it somewhere where the original creator can't control how it's presented or whether it's visible.

## Getting started

1. Install Python 3.6 or later.
   Instructions on [the Python website](https://www.python.org/downloads/).

2. Check you have pip installed by running the following command at a command prompt:

   ```console
   $ pip3 --version
   pip 18.1 (python 3.6)
   ```

   If you don't have it installed or the command errors, follow the [pip installation instructions](https://pip.pypa.io/en/stable/installing/).

3. Clone this repository:

   ```console
   $ git clone https://github.com/alexwlchan/backup_tumblr.git
   $ cd backup_tumblr
   ```

4. Install the Python dependencies:

   ```console
   $ pip3 install -r requirements.txt
   ```

5. Get yourself a Tumblr API key by registering an app at <https://www.tumblr.com/oauth/apps>.

   If you haven't done it before, start by clicking the **Register application** button:

   ![](register_application.png)

   Then fill in the details for your app.
   Here's an example of what you could use (but put your own email address!):

   ![](api_registration.png)

   You can leave everything else blank.
   Then scroll down and hit the "Register" button.

   ![](rate_limits.png)

   Note: unless you have a _lot_ of posts (20k or more), you shouldn't need to ask for a rate limit removal.

   Once you've registered, you'll have a new entry in the list of applications.
   You need the **OAuth Consumer Key**:

   ![](tumblr_api_key.png)

6. If you're saving your likes, make your likes public by visiting `https://www.tumblr.com/settings/blog/BLOGNAME`, and turning on the "Share posts you like" setting:

   ![](likes_public.png)

   Otherwise the script can't see them!

## Usage

There are three scripts in this repo:

1. `save_posts_metadata.py` saves metadata about all the posts on your blog.
2. `save_likes_metadata.py` saves metadata about all the posts you've liked.
3. `save_media_files.py` saves all the media (images, videos, etc.) from those posts.

They're split into separate scripts because saving metadata is much faster than saving media files.

You should run (1) and/or (2), then run (3).
Something like:

```console
$ python3 save_posts_metadata.py

$ python3 save_likes_metadata.py

$ python3 save_media_files.py
```

If you know what command-line flags are: you can pass arguments (e.g. the API key) as flags.
Use `--help` to see the available flags.

If that sentence meant nothing: don't worry, the scripts will ask you for any information they need.
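
If you do want to pass everything up front, an invocation looks something like this (the blog identifier and API key here are placeholders -- substitute your own):

```console
$ python3 save_posts_metadata.py --blog_identifier yourblog.tumblr.com --api_key YOUR_OAUTH_CONSUMER_KEY --dst tumblr
```

The metadata scripts write one `info.json` per post, and `save_media_files.py` saves any media files alongside it. With the default `--dst`, the output is laid out roughly like this (the post ID and filename are made up):

```
tumblr/
└── yourblog_tumblr_com/
    ├── posts/
    │   └── 17/
    │       └── 171596150988/
    │           ├── info.json
    │           └── tumblr_abcdef123_1280.jpg
    └── likes/
        └── ...
```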

## Unanswered questions and notes

* I have no idea how Tumblr's content blocks interact with the API, or if blocked posts are visible through the API.

* I've seen mixed reports saying that ordering in the dashboard has been broken for the last few days.
  Again, no idea how this interacts with the API.

* Media files can get big.
  I have ~12k likes which are taking ~9GB of disk space.
  The scripts will merrily fill up your disk, so make sure you have plenty of space before you start!

* These scripts are provided "as is".
  File an issue if you have a problem, but I don't have much time for maintenance right now.

* Sometimes the Tumblr API claims to have more posts than it actually returns, and the effect is that the script appears to stop early, e.g. at 96%.

  I'm reading the `total_posts` parameter from the API responses, and paginating through it as expected -- I have no idea what causes the discrepancy.

## Alternatives

These scripts only save the raw API responses and media files.

They *don't* create a pretty index or interface, or make the archive especially searchable.
I like saving the complete response because it gives me as much flexibility as possible, but it means you need more work to do something useful later.
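
As a minimal sketch of what that later work might look like, here's a snippet that tallies your saved posts by type, reusing the `find_all_metadata_files` helper from `common.py` (it assumes you've already run the metadata scripts with the default `tumblr` output directory):

```python
import collections
import json

from common import find_all_metadata_files

# Walk the output directory and count the saved posts by type
# (photo, text, video, ...).
tally = collections.Counter()
for info_path in find_all_metadata_files(path="tumblr"):
    with open(info_path) as infile:
        post_data = json.load(infile)
    tally[post_data["type"]] += 1

for post_type, count in tally.most_common():
    print(f"{post_type}\t{count}")
```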

If you're looking for a more full-featured, well-documented project, I've heard good things about [bbolli/tumblr-utils](https://github.com/bbolli/tumblr-utils).

## Acknowledgements

Hat tip to [@cesy](https://github.com/cesy/) for nudging me to post it, and for providing useful feedback on the initial version.

## Licence

MIT.
--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
# -*- encoding: utf-8

import json
import os
import subprocess
from urllib.error import HTTPError
from urllib.parse import parse_qs, urlparse
from urllib.request import urlretrieve

from bs4 import BeautifulSoup
import requests
import tqdm


class TumblrSession:

    def __init__(self, api_key):
        self.api_key = api_key
        self.base_api_url = "https://api.tumblr.com/v2"
        self.sess = requests.Session()

    def get(self, path, params=None):
        if params is None:
            params = {}
        params["api_key"] = self.api_key
        resp = self.sess.get(self.base_api_url + path, params=params)
        resp.raise_for_status()
        return resp.json()


def save_post_metadata(dst, post_data):
    post_id = post_data["id"]
    out_dir = os.path.join(dst, str(post_id)[:2], str(post_id))
    out_path = os.path.join(out_dir, "info.json")

    if os.path.exists(out_path):
        return

    os.makedirs(out_dir, exist_ok=True)

    json_string = json.dumps(post_data, indent=2, sort_keys=True)
    with open(out_path + ".tmp", "w") as outfile:
        outfile.write(json_string)

    os.rename(out_path + ".tmp", out_path)


def get_all_likes(*, blog_identifier, api_key):
    sess = TumblrSession(api_key=api_key)

    # First get the number of liked posts, so we can give the user some idea
    # of how many there are and how long the script will take.
    api_path = f"/blog/{blog_identifier}/likes"
    resp = sess.get(api_path)

    liked_count = resp["response"]["liked_count"]

    def iterator():
        params = {}
        while True:
            resp = sess.get(api_path, params=params)

            posts = resp["response"]["liked_posts"]
            yield from posts

            # An empty posts list tells us we've finished.
            if not posts:
                break

            # Tumblr helpfully includes some query parameters in the response
            # that we can use to build our next request.
            params.update(resp["response"]["_links"]["next"]["query_params"])

    return tqdm.tqdm(iterator(), total=liked_count)


def get_all_posts(*, blog_identifier, api_key):
    sess = TumblrSession(api_key=api_key)
    api_path = f"/blog/{blog_identifier}/posts"

    # First get the total number of posts, so we can give the user some idea
    # of how many there are and how long the script will take.
    resp = sess.get(api_path)

    total_posts = resp["response"]["total_posts"]

    def iterator():
        params = {
            "reblog_info": True,
            "notes_info": True,
        }

        while True:
            resp = sess.get(api_path, params=params)

            posts = resp["response"]["posts"]
            yield from posts

            # An empty posts list tells us we've finished.
            if not posts:
                break

            # We can only get the last 1000 posts with the offset parameter;
            # instead look at the timestamps of the posts we retrieved and
            # set that as the "before" parameter.
            earliest_timestamp = min(p["timestamp"] for p in posts)
            params["before"] = earliest_timestamp - 1

    return tqdm.tqdm(iterator(), total=total_posts)
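

# An aside on the on-disk layout: the save_*_metadata.py scripts store each
# post at a path like
#
#     tumblr/yourblog_tumblr_com/posts/17/171596150988/info.json
#
# (the post ID here is made up), so finding every saved post is just a matter
# of walking the tree and looking for files called "info.json".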
def find_all_metadata_files(path):
    if not os.path.exists(path):
        raise ValueError(f"Asked to save media files in non-existent dir {path!r}?")

    if not os.path.isdir(path):
        raise ValueError(f"Asked to save media files in non-directory {path!r}?")

    for root, _, filenames in os.walk(path):
        if "info.json" in filenames:
            yield os.path.join(root, "info.json")


def _download_asset(post_dir, url, suffix=""):
    name = os.path.basename(url) + suffix
    out_path = os.path.join(post_dir, name)
    if os.path.exists(out_path):
        return
    try:
        urlretrieve(url, out_path + ".tmp")
        os.rename(out_path + ".tmp", out_path)
    except HTTPError as err:
        print(f"Error trying to download URL {url!r} ({err})")
        return


def _download_with_youtube_dl(post_dir, url, cmd=None):
    """
    Download a video using youtube-dl.
    """

    # The purpose of this marker is to check "have we run youtube-dl before?"
    #
    # Although youtube-dl is smart about not re-downloading files, it has to
    # make a network request before it does that, which is slow and mostly
    # unnecessary. This is a crude way to avoid unnecessary shell-outs and
    # network requests.
    #
    marker = os.path.join(post_dir, ".youtube_dl")
    if os.path.exists(marker):
        return

    if cmd is None:
        cmd = ["youtube-dl", url]

    try:
        subprocess.check_call(cmd, stdout=subprocess.DEVNULL, cwd=post_dir)
    except subprocess.CalledProcessError as err:
        post_id = os.path.basename(post_dir)
        print(f"Unable to download video for post ID {post_id} from {url!r} ({err}).")
        raise
    else:
        # Touch the marker file to record that youtube-dl succeeded.
        with open(marker, "wb"):
            pass


def save_post_media_files(info_path):
    with open(info_path) as infile:
        post_data = json.load(infile)
    post_dir = os.path.dirname(info_path)
    post_id = post_data["id"]

    if post_data["type"] == "photo":
        for photo in post_data["photos"]:
            _download_asset(post_dir=post_dir, url=photo["original_size"]["url"])

    elif post_data["type"] in ("answer", "chat", "link", "quote", "text"):
        return

    elif post_data["type"] == "video":
        players = [p for p in post_data["player"] if p["embed_code"]]

        if post_data["video_type"] == "tumblr":
            _download_asset(post_dir=post_dir, url=post_data["video_url"])

        elif post_data["video_type"] == "youtube":
            if all(not p["embed_code"] for p in post_data["player"]):
                return

            try:
                if post_data["source_url"].startswith("https://www.youtube.com/embed"):
                    source_url = post_data["source_url"]
                else:
                    source_url = parse_qs(urlparse(post_data["source_url"]).query)["z"][0]
            except KeyError:
                best_player = max(players, key=lambda p: p["width"])
                soup = BeautifulSoup(best_player["embed_code"], "html.parser")
                iframe_matches = soup.find_all("iframe", attrs={"id": "youtube_iframe"})
                assert len(iframe_matches) == 1

                source_url = iframe_matches[0].attrs["src"]

            _download_with_youtube_dl(post_dir=post_dir, url=source_url)

        # (YouTube videos are caught by the branch above, so in practice
        # this branch only ever sees Vimeo embeds.)
        elif post_data["video_type"] in ("vimeo", "youtube"):
            best_player = max(players, key=lambda p: p["width"])
            soup = BeautifulSoup(best_player["embed_code"], "html.parser")
            iframe_matches = soup.find_all("iframe")
            assert len(iframe_matches) == 1

            embed_url = iframe_matches[0].attrs["src"]

            _download_with_youtube_dl(post_dir=post_dir, url=embed_url)

        elif (
            post_data["video_type"] == "unknown" and
            post_data.get("source_url", "").startswith("https://t.umblr.com/redirect?z=http%3A%2F%2Fwww.youtube.com")
        ):
            source_url = parse_qs(urlparse(post_data["source_url"]).query)["z"][0]
            _download_with_youtube_dl(post_dir=post_dir, url=source_url)

        elif post_data["video_type"] in ("instagram", "vine"):
            # Normally there's a link to Instagram videos in the "permalink_url"
            # field, but sometimes this is missing. I think it happens when the
            # Instagram video is taken down, and it's no longer viewable on Tumblr.
            # e.g. http://his-shining-tears.tumblr.com/post/146498996350
            try:
                source_url = post_data["permalink_url"]
            except KeyError:
                print(f"Unable to get video URL for {post_id!r}")
            else:
                # For Vine videos, downloading the HD version sometimes fails,
                # but downloading the standard version works. So fall back to
                # that if the initial download fails.
                # https://github.com/alexwlchan/backup_tumblr/issues/3
                try:
                    _download_with_youtube_dl(post_dir=post_dir, url=source_url)
                except subprocess.CalledProcessError:
                    _download_with_youtube_dl(
                        post_dir=post_dir,
                        url=source_url,
                        cmd=["youtube-dl", "-f", "standard", source_url]
                    )

        elif post_data["video_type"] == "flickr":
            source_url = parse_qs(urlparse(post_data["source_url"]).query)["z"][0]
            print(f"Unable to download video for {post_id!r}: {source_url}")

        else:
            print(f"Unable to download video for {post_id!r}; unrecognised video type {post_data['video_type']!r}")

    elif post_data["type"] == "audio":

        # The "player" field contains an <iframe> element with class
        # "tumblr_audio_player", whose "src" URL carries the actual audio
        # file in an "audio_file" query parameter.
        if post_data["audio_type"] == "tumblr":
            player_soup = BeautifulSoup(post_data["player"], "html.parser")
            player_matches = player_soup.find_all(
                "iframe", attrs={"class": "tumblr_audio_player"}
            )
            assert len(player_matches) == 1

            src_url = player_matches[0]["src"]
            query_string = parse_qs(urlparse(src_url).query)
            assert len(query_string["audio_file"]) == 1
            audio_file = query_string["audio_file"][0]
            _download_asset(post_dir=post_dir, url=audio_file)

        elif post_data["audio_type"] in ("spotify", "soundcloud"):
            # We can't fetch these directly; just print the source URL.
            source_url = post_data["audio_source_url"]
            print(
                f"Unable to download audio file for {post_id!r}: {source_url!r}"
            )

        else:
            print(f"Unable to download audio for {post_id!r}")

    else:
        post_type = post_data["type"]
        raise ValueError(f"Unrecognised post type: {post_id!r} ({post_type})")
--------------------------------------------------------------------------------