├── .gitignore ├── requirements.in ├── record_player.jpg ├── add_email_password ├── overcast_account_1.png ├── overcast_account_2.png ├── overcast_account_4.png ├── overcast_account_3a.png ├── overcast_account_3b.png └── README.md ├── download_podcasts.sh ├── LICENSE ├── requirements.txt ├── download_all_episodes_from_rss.py ├── download.py ├── README.md └── download_overcast_podcasts.py /.gitignore: -------------------------------------------------------------------------------- 1 | audiofiles 2 | *.pyc 3 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | httpx 2 | lxml 3 | pip-tools 4 | smartypants 5 | tenacity 6 | urllib3 7 | -------------------------------------------------------------------------------- /record_player.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/overcast-downloader/main/record_player.jpg -------------------------------------------------------------------------------- /add_email_password/overcast_account_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/overcast-downloader/main/add_email_password/overcast_account_1.png -------------------------------------------------------------------------------- /add_email_password/overcast_account_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/overcast-downloader/main/add_email_password/overcast_account_2.png -------------------------------------------------------------------------------- /add_email_password/overcast_account_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/overcast-downloader/main/add_email_password/overcast_account_4.png 
-------------------------------------------------------------------------------- /add_email_password/overcast_account_3a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/overcast-downloader/main/add_email_password/overcast_account_3a.png -------------------------------------------------------------------------------- /add_email_password/overcast_account_3b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/overcast-downloader/main/add_email_password/overcast_account_3b.png -------------------------------------------------------------------------------- /download_podcasts.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | python3 download_overcast_podcasts.py ~/Desktop/overcast.opml --download_dir "/Volumes/Media (Sapphire)/backups/overcast/audiofiles" 7 | # mv ~/Desktop/overcast.opml "/Volumes/Media (Sapphire)/backups/overcast/overcast.$(date +'%Y-%m-%d').xml" 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 Alex Chan 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a 4 | copy of this software and associated documentation files (the "Software"), 5 | to deal in the Software without restriction, including without limitation 6 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | and/or sell copies of the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 
12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 17 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 18 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 19 | OTHER DEALINGS IN THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile 6 | # 7 | anyio==4.1.0 8 | # via httpx 9 | build==1.0.3 10 | # via pip-tools 11 | certifi==2023.11.17 12 | # via 13 | # httpcore 14 | # httpx 15 | click==8.1.7 16 | # via pip-tools 17 | h11==0.14.0 18 | # via httpcore 19 | httpcore==1.0.2 20 | # via httpx 21 | httpx==0.25.2 22 | # via -r requirements.in 23 | idna==3.5 24 | # via 25 | # anyio 26 | # httpx 27 | lxml==4.9.3 28 | # via -r requirements.in 29 | packaging==23.2 30 | # via build 31 | pip-tools==7.3.0 32 | # via -r requirements.in 33 | pyproject-hooks==1.0.0 34 | # via build 35 | smartypants==2.0.1 36 | # via -r requirements.in 37 | sniffio==1.3.0 38 | # via 39 | # anyio 40 | # httpx 41 | tenacity==8.2.3 42 | # via -r requirements.in 43 | urllib3==2.1.0 44 | # via -r requirements.in 45 | wheel==0.41.3 46 | # via pip-tools 47 | 48 | # The following packages are considered to be unsafe in a requirements file: 49 | # pip 50 | # setuptools 51 | -------------------------------------------------------------------------------- /add_email_password/README.md: -------------------------------------------------------------------------------- 1 | # Create a username/password for your account 2 | 3 | 1. 
In the iOS app, tap the icon in the top left-hand corner to open the settings screen. 4 | 5 | ![Screenshot of Overcast, with an icon in the top-left corner highlighted with a red arrow.](overcast_account_1.png) 6 | 7 | 2. Tap "Account" to enter the account settings. 8 | 9 | ![An iOS settings screen, with a list of options. "Settings" is highlighted in orange.](overcast_account_2.png) 10 | 11 | 3. If you already have an email/password set up, you'll see the email at the top of the account settings: 12 | 13 | ![Account settings. The email entry has "example@example.org" filled in.](overcast_account_3a.png) 14 | 15 | If you remember your password, you're done! 16 | If not, tap "Change Password" to set a new password. 17 | 18 | If you don't have an email and password set up, tap "Add Email and Password": 19 | 20 | ![An iOS settings screen, with a list of options. "Add Email and Password" is highlighted in orange.](overcast_account_3b.png) 21 | 22 | 4. Enter your email address and set a password for your account: 23 | 24 | ![An "Add Email To Account" settings screen, with email and password fields](overcast_account_4.png). 25 | -------------------------------------------------------------------------------- /download_all_episodes_from_rss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | The main downloader script will also get a copy of the RSS feed. 4 | 5 | If there are episodes in the RSS feeds that you haven't listened to in Overcast, 6 | but you still want in your podcast archive (for example, if you listened to them 7 | in a different podcast app), you can use this script to download them all. 
@retry(
    retry=(
        retry_if_exception_type(httpx.HTTPError)
        | retry_if_exception_type(urllib3.exceptions.HTTPError)
    ),
    stop=stop_after_attempt(10),
    wait=wait_fixed(60),
)
def download_file(*, url, path, client=None):
    """
    Atomically download a file from ``url`` to ``path``.

    If ``path`` already exists, the file will not be downloaded again.
    This means that different URLs should be saved to different paths.

    This function is meant to be used in cases where the contents of ``url``
    is immutable -- calling it more than once should always return the same bytes.

    HTTP errors raised by httpx or urllib3 are retried by tenacity (up to
    10 attempts, 60 seconds apart); any other exception propagates immediately.

    :param url: The URL to fetch with a GET request.
    :param path: The destination path.  Missing parent directories are created.
    :param client: An optional ``httpx.Client``.  A caller-supplied client is
        used exactly as configured and is left open.  If omitted, a temporary
        client that follows redirects is created and closed before returning.

    Returns the download path.

    """
    # If the URL has already been downloaded, we can skip downloading it again.
    if os.path.exists(path):
        return path

    if os.path.dirname(path):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    # httpx does not follow redirects unless asked to, and a 3xx response
    # makes ``raise_for_status()`` fail.  Podcast enclosure URLs are commonly
    # served through redirects, so the default client has to follow them.
    owns_client = client is None
    if owns_client:
        client = httpx.Client(follow_redirects=True)

    # Download to a temporary path first.  That way, we only get
    # something at the destination path if the download is successful.
    #
    # We download to a path in the same directory so we can do an
    # atomic rename later -- atomic renames don't work across
    # filesystem boundaries.
    tmp_path = f"{path}.{uuid.uuid4()}.tmp"

    try:
        try:
            with client.stream("GET", url) as resp:
                resp.raise_for_status()

                with open(tmp_path, "wb") as out_file:
                    # ``iter_bytes`` undoes any Content-Encoding (e.g. gzip)
                    # so the saved file holds the actual content rather
                    # than the compressed transfer representation.
                    for chunk in resp.iter_bytes():
                        out_file.write(chunk)

        # If something goes wrong, it will probably be retried by tenacity.
        # Log the exception in case a programming bug has been introduced in
        # the ``try`` block or there's a persistent error.
        except Exception as exc:
            print(exc, file=sys.stderr)
            raise

        os.replace(tmp_path, path)
    finally:
        # Never leave a partial download behind, whether the attempt failed
        # or was interrupted.  After a successful rename the temporary path
        # no longer exists, so this is a no-op.
        try:
            os.unlink(tmp_path)
        except FileNotFoundError:
            pass

        if owns_client:
            client.close()

    return path
4 | 5 | I listen to a lot of podcasts, and I use [Overcast]. 6 | Sometimes I want to listen to a podcast I heard a long time ago, but links rot, websites break, and episodes go offline. 7 | This script allows me to download the audio file of every episode I've listened to, so I have a local archive of podcast episodes. 8 | 9 | [Overcast]: https://overcast.fm/ 10 | 11 | ![An old man sitting on a chair outdoors, listening to a record player.](record_player.jpg) 12 | 13 | *Podcasts have a richer sound when you listen to them on vinyl. Image credit: Wellcome Collection. CC BY.* 14 | 15 | 16 | 17 | ## Background 18 | 19 | This is something I've wanted for a while (at least two podcasts I loved have completely disappeared from the web), but I was never sure how to do it. 20 | In a segment in [ATP episode 353][atp353], Marco mentioned an export with a list of every episode you'd ever listened to: 21 | 22 | > I can tell you what Overcast saves in the database table that saves your episode progress, which is by far my biggest database table. 23 | > It saves, for each episode you've interacted with, the current timestamp, and whether it's been completed. […] 24 | > 25 | > If you go to the account section of the website, you can export OPML -- a custom, extended format that I implemented -- so you can actually export all your data. […] 26 | > I can tell you a list of episodes you've completed, or played in some way in Overcast. 27 | 28 | I went digging, and it was exactly what I wanted -- and shortly after, I had a script that downloads every MP3 it refers to. 29 | 30 | [atp353]: https://overcast.fm/+R7DWLpsnY/1:40:21 31 | 32 | 33 | 34 | ## How to use this script 35 | 36 | You need: 37 | 38 | * **An Overcast account with an email and password.** 39 | You can create this in the Overcast iOS app. 40 | If you haven't done this before, or you've forgotten your email/password, read [my instructions](add_email_password) for doing so. 
41 | 42 | * **A working Python 3 installation.** 43 | This script only works with Python 3.6 or later. 44 | 45 | Steps: 46 | 47 | 1. **Get your Overcast OPML file.** 48 | 49 | Log in to the Overcast website at https://overcast.fm/login using your email address and password. 50 | 51 | Once you're logged in, navigate to https://overcast.fm/account. 52 | Under "Export Your Data", click "All data". 53 | This will download an OPML file, which includes a list of every podcast episode you've ever played. 54 | 55 | 2. **Download the Python script.** 56 | 57 | Download the script [`download_overcast_podcasts.py`](download_overcast_podcasts.py), and save it somewhere on your disk. 58 | 59 | 3. **Run the script, passing the path to your OPML file as the first argument.** 60 | For example, if the OPML file is in `~/Downloads/overcast.opml.xml`, run: 61 | 62 | ```console 63 | $ python download_overcast_podcasts.py ~/Downloads/overcast.opml.xml 64 | ``` 65 | 66 | This will start downloading the audio files to a folder called `audiofiles`. 67 | If you'd like to save them somewhere different, pass the `--download_dir` flag. 68 | For example, if you wanted to save the files to `~/Documents/podcasts`, run: 69 | 70 | ```console 71 | $ python download_overcast_podcasts.py ~/Downloads/overcast.opml.xml --download_dir ~/Documents/podcasts 72 | ``` 73 | 74 | The initial download will be very slow, depending on how many podcasts you've listened to, and it uses a lot of disk space. 75 | (At time of writing, I have ~1200 episodes in my export, which take up 61 GB.) 76 | On subsequent runs, the script should only download files that it hasn't saved before, so it should be a lot faster. 77 | 78 | 79 | 80 | ## License 81 | 82 | MIT. 83 | -------------------------------------------------------------------------------- /download_overcast_podcasts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 3 | """ 4 | Download podcast files based on your Overcast export. 
def parse_args(argv):
    """
    Parse command-line arguments.

    :param argv: The list of argument strings, excluding the program name
        (e.g. ``sys.argv[1:]``).
    :returns: A dict with two absolute paths: ``opml_path`` (the Overcast
        OPML export) and ``download_dir`` (where files are saved).
    """
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument(
        "OPML_PATH",
        help="Path to an OPML file downloaded from https://overcast.fm/account",
    )

    # ``--out_dir`` is accepted as an alias because the README's usage
    # example spells the flag that way; both names set ``download_dir``.
    parser.add_argument(
        "--download_dir",
        "--out_dir",
        dest="download_dir",
        default="audiofiles",
        help="directory to save podcast information to",
    )

    args = parser.parse_args(argv)

    return {
        "opml_path": os.path.abspath(args.OPML_PATH),
        "download_dir": os.path.abspath(args.download_dir),
    }
def has_episode_been_downloaded_already(episode, download_dir):
    """
    Return True if this episode's Overcast ID is recorded in the
    ``overcast.db`` SQLite database inside ``download_dir``.

    Returns False if the database cannot be opened (for example, because
    ``download_dir`` does not exist yet) or if the ``downloaded_episodes``
    table has not been created.  Other SQLite errors are re-raised.

    :param episode: A dict as yielded by ``get_episodes``; only
        ``episode["episode"]["overcast_id"]`` is read.
    :param download_dir: Directory containing ``overcast.db``.
    """
    try:
        conn = sqlite3.connect(os.path.join(download_dir, "overcast.db"))
    except sqlite3.OperationalError as err:
        if err.args[0] == "unable to open database file":
            return False
        raise

    try:
        row = conn.execute(
            "SELECT * FROM downloaded_episodes WHERE overcast_id=?",
            (episode["episode"]["overcast_id"],),
        ).fetchone()
    except sqlite3.OperationalError as err:
        if err.args[0] == "no such table: downloaded_episodes":
            return False
        raise
    finally:
        # This runs once per episode in the export, so always release the
        # connection rather than leaving it for the garbage collector.
        conn.close()

    return row is not None


def mark_episode_as_downloaded(episode, download_dir):
    """
    Record this episode's Overcast ID in the ``overcast.db`` SQLite database
    inside ``download_dir``, creating the ``downloaded_episodes`` table if
    it does not exist yet.

    Marking the same episode more than once is harmless.

    :param episode: A dict as yielded by ``get_episodes``; only
        ``episode["episode"]["overcast_id"]`` is read.
    :param download_dir: Directory containing ``overcast.db``.
    """
    conn = sqlite3.connect(os.path.join(download_dir, "overcast.db"))

    try:
        # Let SQLite handle "already exists" instead of matching on the
        # text of an error message.
        conn.execute(
            "CREATE TABLE IF NOT EXISTS downloaded_episodes "
            "(overcast_id text PRIMARY KEY)"
        )
        conn.execute(
            "INSERT OR IGNORE INTO downloaded_episodes VALUES (?)",
            (episode["episode"]["overcast_id"],),
        )
        conn.commit()
    finally:
        conn.close()
def download_episode(episode, download_dir):
    """
    Given a blob of episode data from get_episodes, download the MP3 file and
    save the metadata to ``download_dir``.

    Episodes already recorded in the download database are skipped.
    Otherwise the audio file is saved in a per-podcast folder, a JSON file
    with the episode metadata is written next to it, the podcast's RSS feed
    is saved, and the episode is recorded as downloaded.

    :param episode: A dict with ``podcast`` and ``episode`` entries, as
        yielded by ``get_episodes``.  A ``filename`` key is added to it.
    :param download_dir: Root directory for all downloads.
    """
    if has_episode_been_downloaded_already(episode=episode, download_dir=download_dir):
        return

    # If the MP3 URL is https://example.net/mypodcast/podcast1.mp3 and the
    # title is "Episode 1: My Great Podcast", the filename is
    # ``Episode 1- My Great Podcast.mp3``.
    audio_url = episode["episode"]["enclosure_url"]

    filename = get_filename(download_url=audio_url, title=episode["episode"]["title"])

    # Within the download_dir, put the episodes for each podcast in the
    # same folder.
    podcast_dir = os.path.join(download_dir, _escape(episode["podcast"]["title"]))
    os.makedirs(podcast_dir, exist_ok=True)

    # Download the podcast audio file if it hasn't already been downloaded.
    download_path = os.path.join(podcast_dir, filename)
    base_name = _escape(episode["episode"]["title"])
    json_path = os.path.join(podcast_dir, base_name + ".json")

    # If the audio file already exists, check to see if it's the same episode,
    # or if this podcast isn't using unique filenames.
    #
    # If a podcast has multiple episodes with the same filename in its feed,
    # append the Overcast ID to disambiguate.
    if os.path.exists(download_path):
        try:
            with open(json_path, "r") as infile:
                cached_metadata = json.load(infile)
        except Exception as err:
            print(err, json_path)
            return

        cached_overcast_id = cached_metadata["episode"]["overcast_id"]
        this_overcast_id = episode["episode"]["overcast_id"]

        if cached_overcast_id != this_overcast_id:
            # The filename is the escaped title followed by the extension
            # taken from the URL, so whatever follows the title is the
            # extension.  Insert the ID in front of it regardless of the
            # audio format -- only rewriting ".mp3" would leave other
            # formats pointing at the existing file, which would then be
            # mistaken for a duplicate of itself and deleted.
            extension = filename[len(base_name):]
            filename = "%s_%s%s" % (base_name, this_overcast_id, extension)
            old_download_path = download_path
            download_path = os.path.join(podcast_dir, filename)
            json_path = download_path + ".json"

            print(
                "Downloading %s: %s to %s"
                % (episode["podcast"]["title"], audio_url, filename)
            )
            download_file(url=audio_url, path=download_path)

            try:
                # Never compare (and then delete) a file against itself.
                if download_path != old_download_path and filecmp.cmp(
                    download_path, old_download_path, shallow=False
                ):
                    print("Duplicates detected! %s" % download_path)
                    os.unlink(download_path)
                    download_path = old_download_path
                    # The metadata must name the copy that was kept.
                    filename = os.path.basename(old_download_path)
            except FileNotFoundError:
                # This can occur if the download fails -- say, the episode is
                # in the Overcast catalogue, but no longer available from source.
                pass

        else:
            # Already downloaded and it's the same episode.
            pass

    # This episode has never been downloaded before, so we definitely have
    # to download it fresh.
    else:
        print(
            "Downloading %s: %s to %s"
            % (episode["podcast"]["title"], audio_url, filename)
        )
        download_file(url=audio_url, path=download_path)

    # Save a blob of JSON with some episode metadata
    episode["filename"] = filename

    json_string = json.dumps(episode, indent=2, sort_keys=True)

    with open(json_path, "w") as outfile:
        outfile.write(json_string)

    save_rss_feed(episode=episode, download_dir=download_dir)
    mark_episode_as_downloaded(episode=episode, download_dir=download_dir)
else: 301 | raise 302 | 303 | for episode in get_episodes(xml_string): 304 | download_episode(episode, download_dir=download_dir) 305 | --------------------------------------------------------------------------------