├── .gitignore
├── Dockerfile
├── README.md
├── podsnatch.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
.python-version
*.opml

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.7.6-slim-stretch

WORKDIR /podsnatch
ADD podsnatch.py /podsnatch
ADD requirements.txt /podsnatch

VOLUME ["/input", "/output"]

RUN apt-get update && \
    apt-get install -y libxml2-dev libxslt-dev gcc && \
    pip install -r requirements.txt

ENTRYPOINT ["python", "podsnatch.py"]
CMD ["--opml", "/input.opml", "--output-dir", "/output"]

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PodSnatch

PodSnatch is a simple[1](#footnote1), cross-platform[2](#footnote2) podcast
downloader. Feed it an OPML file and wire it up to a cron job, and it downloads
your podcasts on your schedule. PodSnatch also downloads all the metadata for
each episode and stores it in a plaintext file with the same name as the
episode audio, with `.txt` appended.

1: Only \~100 lines of Python!

2: *Probably*, I've only tested on Mac.

## Usage
```bash
python podsnatch.py --opml <path to OPML file> -o <output directory>
```

If you don't want to deal with all the Python setup crap (and I don't blame you),
you can build the Docker image (see the build command at the end of this section)
and run it with

```bash
docker run -it -v '/path/to/opml.opml:/input.opml' -v '/path/to/output_dir:/output' podsnatch
```

If you want to limit how many episodes are downloaded, use the `-n` argument.
For example, to download the last 3 episodes of each podcast:

```bash
python podsnatch.py --opml <path to OPML file> -o <output directory> -n 3
```
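The `docker run` command above expects an image named `podsnatch` to exist
locally. Assuming you build it straight from the repo root and keep that tag
(the name itself is arbitrary, it just has to match what you pass to
`docker run`), something like this should do it:

```bash
docker build -t podsnatch .
```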
## Contributing
PRs welcome!

--------------------------------------------------------------------------------
/podsnatch.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from lxml import etree as xml
from tqdm import tqdm
import feedparser
import requests
import argparse
import signal
import time
import sys
import re
import os

# Partially-downloaded files get this suffix until the download completes.
TMP_EXT = '.part'


class Show:

    def __init__(self, outline_element):
        self.url = (outline_element.get('xmlUrl') or
                    outline_element.get('xmlurl') or
                    None)
        self.title = (outline_element.get('title') or
                      (outline_element.get('text') or '')[0:50] or
                      self.url.split('/')[-1])
        self.episode_guids = []

    def __str__(self):
        return f'{self.title}: {self.url}'

    def get_dir_name(self):
        return re.sub(r'[\W]+', '_', self.title)


class Episode:

    def __init__(self, item, show):
        self.guid = item.id if 'id' in item else ''
        self.title = item.title if 'title' in item else ''
        self.link = item.link if 'link' in item else ''
        self.description = item.summary if 'summary' in item else ''
        self.content = item.content[0].value if 'content' in item else ''
        self.number = item.itunes_episode if 'itunes_episode' in item else ''
        self.url = item.enclosures[0].href if 'enclosures' in item and item.enclosures else ''
        self.date = item.published_parsed if 'published_parsed' in item else ''

        self.show = show

    def __str__(self):
        return f"""{self.title}
{self.number}
{self.guid}
{self.date}
{self.link}
{self.url}
{self.content if self.content else self.description}
{self.description}"""

    def get_file_name(self):
        url_tail = self.url.split('/')[-1].split('?')[0]
        show_title = re.sub(r'[\W]+', '_', self.show.title)
        ep_title = re.sub(r'[\W]+', '_', self.title)
        formatted_date = (time.strftime('%Y_%m_%d', self.date)
                          if self.date else '')

        name_tokens = [formatted_date, self.number, ep_title, url_tail]
        return '_'.join([s for s in name_tokens if s != ''])


def parse_opml(opml_path):
    tree = xml.parse(opml_path)
    root = tree.getroot()

    shows = root.findall('./body/outline')

    return [Show(x) for x in shows]


def download(url, path, mode):
    # https://stackoverflow.com/a/37573701
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    block_size = 1024

    downloaded_size = 0
    t = tqdm(total=total_size, unit='iB', unit_scale=True)
    with open(path, mode) as f:
        for data in response.iter_content(block_size):
            t.update(len(data))
            f.write(data)
            downloaded_size += len(data)
    t.close()

    if total_size != 0 and t.n != total_size:
        print("ERROR downloading file")

    return downloaded_size


total_downloaded_size = 0
total_downloaded = 0
full_path = ''


def convert_to_size(size):
    """
    Takes a number of bytes and converts it to a human-readable size string.
    """
    size_labels = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
    converted_size = size
    counter = 0
    while converted_size > 1000:
        converted_size /= 1000
        counter += 1

    size_str = f'{converted_size:.2f}{size_labels[counter]}'

    return size_str
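
# A couple of sanity-check examples for convert_to_size (note it uses decimal
# units, dividing by 1000, not binary 1024):
#   convert_to_size(512)       -> '512.00B'
#   convert_to_size(1_500_000) -> '1.50MB'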


def save_podcasts(opml, output, episode_count=None):
    global total_downloaded_size
    global total_downloaded
    global full_path

    shows = parse_opml(opml)

    for show in shows:
        print(f'Processing show {show.title}')
        feed = feedparser.parse(show.url)

        show_path = os.path.join(output, show.get_dir_name())
        os.makedirs(show_path, exist_ok=True)

        cnt_eps_to_dl = (int(episode_count, 10)
                         if episode_count is not None
                         else len(feed.entries))

        i = 0
        show_downloaded = 0
        while show_downloaded < cnt_eps_to_dl and i < len(feed.entries):
            item = feed.entries[i]
            episode = Episode(item, show)

            print(f'Processing episode {episode.title}')

            full_path = os.path.join(show_path, episode.get_file_name())
            print(full_path)

            if not os.path.exists(full_path) and episode.url:
                print('Downloading episode')
                total_downloaded_size += download(episode.url, full_path + TMP_EXT, 'wb')

                os.rename(full_path + TMP_EXT, full_path)

                # Save the episode metadata next to the audio file.
                with open(full_path + ".txt", "w") as handle:
                    handle.write(str(episode))

                show_downloaded += 1
                total_downloaded += 1
            else:
                print('Episode already downloaded!')

            i += 1

    print(f'{total_downloaded} episode(s) totaling {convert_to_size(total_downloaded_size)} downloaded')


def ctrl_c_handler(signum, frame):
    print('Stopping...')

    if os.path.exists(full_path + TMP_EXT):
        os.remove(full_path + TMP_EXT)

    print(f'{total_downloaded} episode(s) totaling {convert_to_size(total_downloaded_size)} downloaded')
    sys.exit(1)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Download podcasts.')

    parser.add_argument('--opml', '-i', dest='opml_loc', action='store',
                        required=True, help='path to OPML file to import')
    parser.add_argument('--output-dir', '-o', dest='output_loc', action='store',
                        required=False, default='.',
                        help='location to save podcasts')
    parser.add_argument('--number-of-episodes', '-n', dest='ep_cnt',
                        action='store', default=None,
                        help='how many episodes to download per show (default: all)')
    args = parser.parse_args()

    signal.signal(signal.SIGINT, ctrl_c_handler)

    save_podcasts(args.opml_loc, args.output_loc, args.ep_cnt)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
feedparser==5.2.1
lxml==4.9.1
requests==2.32.0
tqdm==4.66.3
--------------------------------------------------------------------------------