├── .gitignore
├── requirements.in
├── rate_limits.png
├── likes_public.png
├── stampede_400.jpg
├── tumblr_api_key.png
├── api_registration.png
├── register_application.png
├── requirements.txt
├── LICENSE
├── save_media_files.py
├── save_posts_metadata.py
├── save_likes_metadata.py
├── README.md
└── common.py
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | tumblr
3 |
--------------------------------------------------------------------------------
/requirements.in:
--------------------------------------------------------------------------------
1 | beautifulsoup4
2 | click
3 | requests
4 | tqdm
5 | youtube-dl
6 |
--------------------------------------------------------------------------------
/rate_limits.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexwlchan/backup_tumblr/master/rate_limits.png
--------------------------------------------------------------------------------
/likes_public.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexwlchan/backup_tumblr/master/likes_public.png
--------------------------------------------------------------------------------
/stampede_400.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexwlchan/backup_tumblr/master/stampede_400.jpg
--------------------------------------------------------------------------------
/tumblr_api_key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexwlchan/backup_tumblr/master/tumblr_api_key.png
--------------------------------------------------------------------------------
/api_registration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexwlchan/backup_tumblr/master/api_registration.png
--------------------------------------------------------------------------------
/register_application.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexwlchan/backup_tumblr/master/register_application.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile
3 | # To update, run:
4 | #
5 | # pip-compile --output-file requirements.txt requirements.in
6 | #
7 | beautifulsoup4==4.6.3
8 | certifi==2018.11.29 # via requests
9 | chardet==3.0.4 # via requests
10 | click==7.0
11 | idna==2.7 # via requests
12 | requests==2.20.1
13 | tqdm==4.28.1
14 | urllib3==1.24.1 # via requests
15 | youtube-dl==2018.12.3
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2018 Alex Chan
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 |
--------------------------------------------------------------------------------
/save_media_files.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8
3 |
4 | import os
5 | import sys
6 | import traceback
7 |
8 | import click
9 | import tqdm
10 |
11 | from common import find_all_metadata_files, save_post_media_files
12 |
13 |
14 | @click.command(
15 | help="Save all the media files for your Tumblr posts/likes."
16 | )
17 | @click.option(
18 | "--metadata", default="tumblr",
19 | help="Directory where your metadata is saved."
20 | )
21 | def save_all_media_files(metadata):
22 |     all_metadata_files = list(find_all_metadata_files(path=metadata))
23 |     for info_path in tqdm.tqdm(all_metadata_files):
24 | try:
25 | save_post_media_files(info_path)
26 | except Exception:
27 | post_id = os.path.basename(os.path.dirname(info_path))
28 | traceback.print_exc()
29 | print(f"Error trying to save media for post {post_id}")
30 | print("~")
31 |
32 |
33 | if __name__ == '__main__':
34 |     # Allow "python3 save_media_files.py DIR" as a shorthand for "--metadata DIR".
35 | if len(sys.argv) == 2 and sys.argv[1] != "--help":
36 | sys.argv = [sys.argv[0], "--metadata", sys.argv[1]]
37 |
38 | save_all_media_files()
39 |
--------------------------------------------------------------------------------
/save_posts_metadata.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8
3 |
4 | import os
5 |
6 | import click
7 |
8 | from common import get_all_posts, save_post_metadata
9 |
10 |
11 | @click.command(
12 | help="Save all the metadata from your Tumblr posts."
13 | )
14 | @click.option(
15 | "--blog_identifier", required=True,
16 | prompt="What is your blog identifier? e.g. 'alexwlchan.tumblr.com'",
17 | help="Blog identifier, as used by the Tumblr API"
18 | )
19 | @click.option(
20 | "--api_key", required=True,
21 | prompt="What is your API key? Register at https://www.tumblr.com/oauth/apps",
22 | help="OAuth API key for the Tumblr API (https://www.tumblr.com/oauth/apps)"
23 | )
24 | @click.option(
25 | "--dst", default="tumblr",
26 | help="Directory for saving metadata"
27 | )
28 | def save_metadata(blog_identifier, api_key, dst):
29 | for post_data in get_all_posts(blog_identifier=blog_identifier, api_key=api_key):
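        # Posts end up under e.g. tumblr/alexwlchan_tumblr_com/posts/ --
        # dots in the blog identifier are replaced with underscores.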
30 | save_post_metadata(
31 | dst=os.path.join(dst, blog_identifier.replace(".", "_"), "posts"),
32 | post_data=post_data
33 | )
34 | 
35 |     # click's standalone mode exits the process as soon as the command
36 |     # finishes, so print this note inside the command, not after calling it.
37 |     print(
38 |         "Note: if the progress bar didn't quite get to 100%, that's okay -- "
39 |         "it's only an estimate, and the Tumblr API doesn't always return everything."
40 |     )
41 | 
42 | 
43 | if __name__ == '__main__':
44 |     save_metadata()
45 | 
--------------------------------------------------------------------------------
/save_likes_metadata.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8
3 |
4 | import os
5 |
6 | import click
7 |
8 | from common import get_all_likes, save_post_metadata
9 |
10 |
11 | @click.command(
12 | help="Save all the metadata from your Tumblr likes."
13 | )
14 | @click.option(
15 | "--blog_identifier", required=True,
16 | prompt="What is your blog identifier? e.g. 'alexwlchan.tumblr.com'",
17 | help="Blog identifier, as used by the Tumblr API"
18 | )
19 | @click.option(
20 | "--api_key", required=True,
21 | prompt="What is your API key? Register at https://www.tumblr.com/oauth/apps",
22 | help="OAuth API key for the Tumblr API (https://www.tumblr.com/oauth/apps)"
23 | )
24 | @click.option(
25 | "--dst", default="tumblr",
26 | help="Directory for saving metadata"
27 | )
28 | def save_metadata(blog_identifier, api_key, dst):
29 | for post_data in get_all_likes(blog_identifier=blog_identifier, api_key=api_key):
30 | save_post_metadata(
31 | dst=os.path.join(dst, blog_identifier.replace(".", "_"), "likes"),
32 | post_data=post_data
33 | )
34 | 
35 |     # As in save_posts_metadata.py: click exits the process when the command
36 |     # finishes, so print this note inside the command, not after calling it.
37 |     print(
38 |         "Note: if the progress bar didn't quite get to 100%, that's okay -- "
39 |         "it's only an estimate, and the Tumblr API doesn't always return everything."
40 |     )
41 | 
42 | 
43 | if __name__ == '__main__':
44 |     save_metadata()
45 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # backup_tumblr
2 |
3 | This is a set of scripts for downloading your posts and likes from Tumblr.
4 |
5 | The scripts try to download as much as possible, including:
6 |
7 | * Every post and like
8 | * All the metadata about a post that's available through the Tumblr API
9 | * Any media files attached to a post (e.g. photos, videos)
10 |
11 | I've had these for private use for a while, and in the wake of Tumblr going on a deletion spree, I'm trying to make them usable by other people.
12 |
13 | **If you're having problems, the easiest way to get my attention is by [opening an issue](https://github.com/alexwlchan/backup_tumblr/issues/new).**
14 | If you don't have a GitHub account, there are alternative contact details [on my website](https://alexwlchan.net/contact/).
15 |
16 | 
17 |
18 | Pictured: a group of Tumblr users fleeing the new content moderation policies. Image credit: Wellcome Collection, CC BY.
19 |
20 | ## Motivation
21 |
22 | **These scripts are only for personal use.**
23 | Please don't use them to download posts and then make them publicly accessible without consent.
24 | Your own blog is yours to do what you want with; your likes and other people's posts are not.
25 |
26 | Some of what's on Tumblr is deeply personal content, and is either private or requires a login.
27 | Don't put it somewhere where the original creator can't control how it's presented or whether it's visible.
28 |
29 | ## Getting started
30 |
31 | 1. Install Python 3.6 or later.
32 | Instructions on [the Python website](https://www.python.org/downloads/).
33 |
34 | 2. Check you have pip installed by running the following command at a command prompt:
35 |
36 | ```console
37 | $ pip3 --version
38 | pip 18.1 (python 3.6)
39 | ```
40 |
41 |    If you don't have it installed, or the command errors, follow the [pip installation instructions](https://pip.pypa.io/en/stable/installing/).
42 |
43 | 3. Clone this repository:
44 |
45 | ```console
46 | $ git clone https://github.com/alexwlchan/backup_tumblr.git
47 | $ cd backup_tumblr
48 | ```
49 |
50 | 4. Install the Python dependencies:
51 |
52 | ```console
53 | $ pip3 install -r requirements.txt
54 | ```
55 |
56 | 5. Get yourself a Tumblr API key by registering an app at <https://www.tumblr.com/oauth/apps>.
57 |
58 | If you haven't done it before, start by clicking the **Register application** button:
59 |
60 | 
61 |
62 | Then fill in the details for your app.
63 | Here's an example of what you could use (but put your own email address!):
64 |
65 | 
66 |
67 | You can leave everything else blank.
68 | Then scroll down and hit the "Register" button.
69 |
70 | 
71 |
72 | Note: unless you have a _lot_ of posts (20k or more), you shouldn't need to ask for a rate limit removal.
73 |
74 | Once you've registered, you'll have a new entry in the list of applications.
75 | You need the **OAuth Consumer Key**:
76 |
77 | 
78 |
79 | 6. If you're saving your likes, make them public by visiting `https://www.tumblr.com/settings/blog/BLOGNAME` and turning on the "Share posts you like" setting:
80 |
81 | 
82 |
83 | Otherwise the script can't see them!
84 |
85 | ## Usage
86 |
87 | There are three scripts in this repo:
88 |
89 | 1. `save_posts_metadata.py` saves metadata about all the posts on your blog.
90 | 2. `save_likes_metadata.py` saves metadata about all the posts you've liked.
91 | 3. `save_media_files.py` saves all the media (images, videos, etc.) from those posts.
92 |
93 | They're split into separate scripts because saving metadata is much faster than saving the media files.
94 |
95 | You should run (1) and/or (2), then run (3).
96 | Something like:
97 |
98 | ```console
99 | $ python3 save_posts_metadata.py
100 |
101 | $ python3 save_likes_metadata.py
102 |
103 | $ python3 save_media_files.py
104 | ```
105 |
106 | If you know what command-line flags are: you can pass arguments (e.g. API key) as flags.
107 | Use `--help` to see the available flags.
108 |
109 | If that sentence meant nothing: don't worry, the scripts will ask you for any information they need.
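
For example, a fully non-interactive run (with a made-up blog identifier and API key) might look like:

```console
$ python3 save_posts_metadata.py --blog_identifier alexwlchan.tumblr.com --api_key YOUR_OAUTH_CONSUMER_KEY --dst tumblr
$ python3 save_media_files.py --metadata tumblr
```

`save_likes_metadata.py` takes the same flags as `save_posts_metadata.py`; `save_media_files.py` only needs `--metadata`, the directory where the metadata was saved.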
110 |
111 | ## Unanswered questions and notes
112 |
113 | * I have no idea how Tumblr's content blocks interact with the API, or if blocked posts are visible through the API.
114 |
115 | * I've seen mixed reports saying that ordering in the dashboard has been broken for the last few days.
116 | Again, no idea how this interacts with the API.
117 |
118 | * Media files can get big.
119 | I have ~12k likes which are taking ~9GB of disk space.
120 | The scripts will merrily fill up your disk, so make sure you have plenty of space before you start!
121 |
122 | * These scripts are provided "as is".
123 | File an issue if you have a problem, but I don't have much time for maintenance right now.
124 |
125 | * Sometimes the Tumblr API claims to have more posts than it actually returns, and the effect is that the script appears to stop early, e.g. at 96%.
126 |
127 |   I'm reading the `total_posts` field from the API responses, and paginating through the results as the API expects -- I have no idea what causes the discrepancy.
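
  For the curious, here's a simplified sketch of the pagination loop in `common.py` (`get_all_posts`), which also shows why the script stops on an empty page rather than trusting `total_posts`:

  ```python
  params = {"reblog_info": True, "notes_info": True}
  while True:
      resp = sess.get(api_path, params=params)
      posts = resp["response"]["posts"]
      yield from posts

      # An empty page means the API has nothing more to give us, even if
      # we've seen fewer than total_posts posts.
      if not posts:
          break

      # The offset parameter only reaches back 1000 posts, so page by
      # timestamp instead: ask for posts strictly older than the oldest
      # one on this page.
      params["before"] = min(p["timestamp"] for p in posts) - 1
  ```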
128 |
129 | ## Alternatives
130 |
131 | These scripts only save the raw API responses and media files.
132 |
133 | They *don't* create a pretty index or interface, or make the archive especially searchable.
134 | I like saving the complete response because it gives me as much flexibility as possible, but it means more work later to do something useful with the data.
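
For reference, this is roughly how the saved data is laid out on disk (the post ID and media filename here are made up; the structure comes from `common.py`):

```
tumblr/
└── alexwlchan_tumblr_com/
    ├── posts/
    │   └── 17/                      # first two characters of the post ID
    │       └── 175599884518/
    │           ├── info.json        # the raw API response for this post
    │           └── tumblr_xyz.jpg   # any media files, saved alongside it
    └── likes/
        └── ...
```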
135 |
136 | If you're looking for a more full-featured, well-documented project, I've heard good things about [bbolli/tumblr-utils](https://github.com/bbolli/tumblr-utils).
137 |
138 | ## Acknowledgements
139 |
140 | Hat tip to [@cesy](https://github.com/cesy/) for nudging me to post it, and providing useful feedback on the initial version.
141 |
142 | ## Licence
143 |
144 | MIT.
145 |
--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8
2 |
3 | import json
4 | import os
5 | import subprocess
6 | from urllib.error import HTTPError
7 | from urllib.parse import parse_qs, urlparse
8 | from urllib.request import urlretrieve
9 |
10 | from bs4 import BeautifulSoup
11 | import requests
12 | import tqdm
13 |
14 |
15 | class TumblrSession:
16 |
17 | def __init__(self, api_key):
18 | self.api_key = api_key
19 |         self.base_api_url = "https://api.tumblr.com/v2"
20 | self.sess = requests.Session()
21 |
22 | def get(self, path, params=None):
23 | if params is None:
24 | params = {}
25 | params["api_key"] = self.api_key
26 | resp = self.sess.get(self.base_api_url + path, params=params)
27 | resp.raise_for_status()
28 | return resp.json()
29 |
30 |
31 | def save_post_metadata(dst, post_data):
32 | post_id = post_data["id"]
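    # Shard posts into subdirectories keyed on the first two characters of
    # the post ID, which keeps any single directory from getting huge.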
33 | out_dir = os.path.join(dst, str(post_id)[:2], str(post_id))
34 | out_path = os.path.join(out_dir, "info.json")
35 |
36 | if os.path.exists(out_path):
37 | return
38 |
39 | os.makedirs(out_dir, exist_ok=True)
40 |
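    # Write to a temporary file and rename it into place, so an interrupted
    # run never leaves a truncated info.json behind.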
41 | json_string = json.dumps(post_data, indent=2, sort_keys=True)
42 | with open(out_path + ".tmp", "w") as outfile:
43 | outfile.write(json_string)
44 |
45 | os.rename(out_path + ".tmp", out_path)
46 |
47 |
48 | def get_all_likes(*, blog_identifier, api_key):
49 | sess = TumblrSession(api_key=api_key)
50 |
51 | # First get the number of liked posts, so we can give the user some idea of
52 | # how many there are and how long the script will take.
53 | api_path = f"/blog/{blog_identifier}/likes"
54 | resp = sess.get(api_path)
55 |
56 | liked_count = resp["response"]["liked_count"]
57 |
58 | def iterator():
59 | params = {}
60 | while True:
61 | resp = sess.get(api_path, params=params)
62 |
63 | posts = resp["response"]["liked_posts"]
64 | yield from posts
65 |
66 | # An empty posts list tells us we've finished.
67 | if not posts:
68 | break
69 |
70 | # Tumblr helpfully includes some query parameters in the response that
71 | # we can use to build our next request.
72 | params.update(resp["response"]["_links"]["next"]["query_params"])
73 |
74 | return tqdm.tqdm(iterator(), total=liked_count)
75 |
76 |
77 | def get_all_posts(*, blog_identifier, api_key):
78 | sess = TumblrSession(api_key=api_key)
79 | api_path = f"/blog/{blog_identifier}/posts"
80 |
81 |     # First get the total number of posts, so we can give the user some idea
82 |     # of how many there are and how long the script will take.
83 | resp = sess.get(api_path)
84 |
85 | total_posts = resp["response"]["total_posts"]
86 |
87 | def iterator():
88 | params = {
89 | "reblog_info": True,
90 | "notes_info": True,
91 | }
92 |
93 | while True:
94 | resp = sess.get(api_path, params=params)
95 |
96 | posts = resp["response"]["posts"]
97 | yield from posts
98 |
99 | # An empty posts list tells us we've finished.
100 | if not posts:
101 | break
102 |
103 | # We can only get the last 1000 posts with the offset parameter;
104 | # instead look at the timestamps of the posts we retrieved and
105 | # set that as the "before" parameter.
106 | earliest_timestamp = min(p["timestamp"] for p in posts)
107 | params["before"] = earliest_timestamp - 1
108 |
109 | return tqdm.tqdm(iterator(), total=total_posts)
110 |
111 |
112 | def find_all_metadata_files(path):
113 | if not os.path.exists(path):
114 | raise ValueError(f"Asked to save media files in non-existent dir {path!r}?")
115 |
116 | if not os.path.isdir(path):
117 | raise ValueError(f"Asked to save media files in non-directory {path!r}?")
118 |
119 | for root, _, filenames in os.walk(path):
120 | if "info.json" in filenames:
121 | yield os.path.join(root, "info.json")
122 |
123 |
124 | def _download_asset(post_dir, url, suffix=""):
125 | name = os.path.basename(url) + suffix
126 | out_path = os.path.join(post_dir, name)
127 | if os.path.exists(out_path):
128 | return
129 | try:
130 | urlretrieve(url, out_path + ".tmp")
131 | os.rename(out_path + ".tmp", out_path)
132 | except HTTPError as err:
133 | print(f"Error trying to download URL {url!r} ({err})")
134 | return
135 |
136 |
137 | def _download_with_youtube_dl(post_dir, url, cmd=None):
138 | """
139 | Download a video using youtube-dl.
140 | """
141 |
142 | # The purpose of this marker is to check "have we run youtube_dl before?"
143 | #
144 | # Although youtube_dl is smart about not re-downloading files, it has to make
145 | # a network request before it does that, which is slow and mostly unnecessary.
146 | # This is a crude way to avoid unnecessary shell-outs/network requests.
147 | #
148 | marker = os.path.join(post_dir, ".youtube_dl")
149 | if os.path.exists(marker):
150 | return
151 |
152 | if cmd is None:
153 | cmd = ["youtube-dl", url]
154 |
155 | try:
156 | subprocess.check_call(cmd, stdout=subprocess.DEVNULL, cwd=post_dir)
157 | except subprocess.CalledProcessError as err:
158 | post_id = os.path.basename(post_dir)
159 | print(f"Unable to download video for post ID {post_id} from {url!r} ({err}).")
160 | raise
161 | else:
162 | open(marker, "wb").write(b"")
163 |
164 |
165 | def save_post_media_files(info_path):
166 | post_data = json.load(open(info_path))
167 | post_dir = os.path.dirname(info_path)
168 | post_id = post_data["id"]
169 |
170 | if post_data["type"] == "photo":
171 | for photo in post_data["photos"]:
172 | _download_asset(post_dir=post_dir, url=photo["original_size"]["url"])
173 |
174 | elif post_data["type"] in ("answer", "chat", "link", "quote", "text"):
175 | return
176 |
177 | elif post_data["type"] == "video":
178 | players = [p for p in post_data["player"] if p["embed_code"]]
179 |
180 | if post_data["video_type"] == "tumblr":
181 | _download_asset(post_dir=post_dir, url=post_data["video_url"])
182 |
183 | elif post_data["video_type"] == "youtube":
184 |             if not players:
185 | return
186 |
187 | try:
188 | if post_data["source_url"].startswith("https://www.youtube.com/embed"):
189 | source_url = post_data["source_url"]
190 | else:
191 | source_url = parse_qs(urlparse(post_data["source_url"]).query)["z"][0]
192 | except KeyError:
193 | best_player = max(players, key=lambda p: p["width"])
194 | soup = BeautifulSoup(best_player["embed_code"], "html.parser")
195 | iframe_matches = soup.find_all("iframe", attrs={"id": "youtube_iframe"})
196 | assert len(iframe_matches) == 1
197 |
198 | source_url = iframe_matches[0].attrs["src"]
199 |
200 | _download_with_youtube_dl(post_dir=post_dir, url=source_url)
201 |
202 | elif post_data["video_type"] in ("vimeo", "youtube"):
203 | best_player = max(players, key=lambda p: p["width"])
204 | soup = BeautifulSoup(best_player["embed_code"], "html.parser")
205 | iframe_matches = soup.find_all("iframe")
206 | assert len(iframe_matches) == 1
207 |
208 | embed_url = iframe_matches[0].attrs["src"]
209 |
210 | _download_with_youtube_dl(post_dir=post_dir, url=embed_url)
211 |
212 | elif (
213 | post_data["video_type"] == "unknown" and
214 | post_data.get("source_url", "").startswith("https://t.umblr.com/redirect?z=http%3A%2F%2Fwww.youtube.com")
215 | ):
216 | source_url = parse_qs(urlparse(post_data["source_url"]).query)["z"][0]
217 | _download_with_youtube_dl(post_dir=post_dir, url=source_url)
218 |
219 | elif post_data["video_type"] in ("instagram", "vine"):
220 | # Normally there's a link to Instagram videos in the "permalink_url"
221 | # field, but sometimes this is missing. I think it happens when the
222 | # Instagram video is taken down, and it's no longer viewable on Tumblr.
223 | # e.g. http://his-shining-tears.tumblr.com/post/146498996350
224 | try:
225 | source_url = post_data["permalink_url"]
226 | except KeyError:
227 | print(f"Unable to get video URL for {post_id!r}")
228 | else:
229 |             # For Vine videos, downloading the HD version sometimes fails, but the
230 |             # standard version works. So fall back to that if the initial download fails.
231 | # https://github.com/alexwlchan/backup_tumblr/issues/3
232 | try:
233 | _download_with_youtube_dl(post_dir=post_dir, url=source_url)
234 | except subprocess.CalledProcessError:
235 | _download_with_youtube_dl(
236 | post_dir=post_dir,
237 | url=source_url,
238 |                     cmd=["youtube-dl", "-f", "standard", source_url]
239 | )
240 |
241 | elif post_data["video_type"] == "flickr":
242 | source_url = parse_qs(urlparse(post_data["source_url"]).query)["z"][0]
243 | print(f"Unable to download video for {post_id!r}: {source_url}")
244 |
245 | else:
246 | print(f"Unable to download video for {post_id!r}; unrecognised video type {post_data['video_type']!r}")
247 |
248 | elif post_data["type"] == "audio":
249 |
250 |         # Example contents of the "player" field -- an HTML snippet along
251 |         # the lines of:
252 |         #
253 |         #     <iframe class="tumblr_audio_player"
254 |         #             src="https://www.tumblr.com/audio_player_iframe/...?audio_file=...">
255 |         #     </iframe>
256 |         #
257 |         # The URL of the actual audio file is in the "audio_file" query
258 |         # parameter of the iframe's src, which is what the code below
259 |         # extracts.
260 |         #
261 | if post_data["audio_type"] == "tumblr":
262 | player_soup = BeautifulSoup(post_data["player"], "html.parser")
263 | player_matches = player_soup.find_all(
264 | "iframe", attrs={"class": "tumblr_audio_player"}
265 | )
266 | assert len(player_matches) == 1
267 |
268 | src_url = player_matches[0]["src"]
269 | query_string = parse_qs(urlparse(src_url).query)
270 | assert len(query_string["audio_file"]) == 1
271 | audio_file = query_string["audio_file"][0]
272 | _download_asset(post_dir=post_dir, url=audio_file)
273 |
274 | elif post_data["audio_type"] == "spotify":
275 | source_url = post_data["audio_source_url"]
276 | print(
277 | f"Unable to download audio file for {post_id!r}: {source_url!r}"
278 | )
279 |
280 | elif post_data["audio_type"] == "soundcloud":
281 | source_url = post_data["audio_source_url"]
282 | print(
283 | f"Unable to download audio file for {post_id!r}: {source_url!r}"
284 | )
285 |
286 | else:
287 | print(f"Unable to download audio for {post_id!r}")
288 |
289 | else:
290 | post_type = post_data["type"]
291 | raise ValueError(f"Unrecognised post type: {post_id!r} ({post_type})")
292 |
--------------------------------------------------------------------------------