├── podcastdownloader
│   ├── __init__.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_integration.py
│   │   ├── test_utility_functions.py
│   │   └── test_episode.py
│   ├── exceptions.py
│   ├── writer.py
│   ├── podcast.py
│   ├── utility_functions.py
│   ├── tag_engine.py
│   ├── episode.py
│   └── __main__.py
├── requirements.txt
├── setup.py
├── setup.cfg
├── LICENSE
├── .gitignore
└── README.md

--------------------------------------------------------------------------------
/podcastdownloader/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/podcastdownloader/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp
click
feedparser
multidict
mutagen

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import setuptools

setuptools.setup(setup_requires=['pbr'], pbr=True)

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
name = podcastdownloader
author = Serene-Arc
author-email = serenical@gmail.com

[options]
packages = podcastdownloader

--------------------------------------------------------------------------------
/podcastdownloader/exceptions.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding=utf-8

class PodcastException(Exception):
    pass


class FeedException(PodcastException):
    pass


class EpisodeException(PodcastException):
    pass


class TagEngineError(PodcastException):
    pass

--------------------------------------------------------------------------------
/podcastdownloader/tests/test_integration.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding=utf-8

from pathlib import Path

import pytest
from click.testing import CliRunner

from podcastdownloader.__main__ import cli


@pytest.mark.parametrize('test_args', (
    [],
))
def test_download_no_feeds(test_args: list[str], tmp_path: Path):
    runner = CliRunner()
    result = runner.invoke(cli, ['download', '-vv', str(tmp_path)] + test_args)
    assert result.exit_code == 0
    assert 'No feeds have been provided' in result.output


@pytest.mark.parametrize('test_args', (
    ['-f', 'https://rss.art19.com/wecrashed'],
))
def test_download_single_feed(test_args: list[str], tmp_path: Path):
    runner = CliRunner()
    result = runner.invoke(cli, ['download', '-vv', str(tmp_path)] + test_args)
    assert result.exit_code == 0

--------------------------------------------------------------------------------
/podcastdownloader/writer.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding=utf-8

import logging
import pathlib

from podcastdownloader.podcast import Podcast

logger = logging.getLogger(__name__)


def _write_m3u(podcast: Podcast):
    if not podcast.episodes:
        logger.warning(f'No episodes in {podcast.name}, skipping playlist')
        return
    podcast_path = podcast.episodes[0].file_path.parent
    podcast_path.mkdir(parents=True, exist_ok=True)
    with open(pathlib.Path(podcast_path, 'episode_playlist.m3u'), 'w') as file:
        file.write('#EXTM3U\n')
        # Feeds list newest episodes first; reverse for chronological order
        for episode in reversed(podcast.episodes):
            try:
                file.write('./' + episode.file_path.name + '\n')
            except AttributeError:
                logger.warning(f'Could not write {episode.title} to playlist')
    logger.debug(f'M3U playlist for {podcast.name} written')


def write_episode_playlist(podcast: Podcast, write_choices: tuple[str, ...]):
    for format_choice in write_choices:
        if format_choice == 'm3u':
            _write_m3u(podcast)
        else:
            logger.error(f'Unknown playlist format type: {format_choice}')

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Serene

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/podcastdownloader/tests/test_utility_functions.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding=utf-8

import pytest

import podcastdownloader.utility_functions as util


@pytest.mark.parametrize(('test_input_string', 'expected'), (
    ('', None),
    ('\n', None),
    (' \n', None),
    ('#test', None),
    ('# test', None),
    (' #test', None),
    (' # test', None),
))
def test_clean_text_line_non_feeds(test_input_string: str, expected: str):
    result = util._clean_text_line(test_input_string)
    assert result == expected


@pytest.mark.parametrize(('test_input_string', 'expected'), (
    ('https://www.example.com/test', 'https://www.example.com/test'),
    (' https://www.example.com/test', 'https://www.example.com/test'),
    ('https://www.example.com/test#random', 'https://www.example.com/test#random'),
    ('https://www.example.com/test/feed.rss # test comment', 'https://www.example.com/test/feed.rss'),
    (' https://www.example.com/test/feed.rss # test comment', 'https://www.example.com/test/feed.rss'),
    ('https://www.example.com/test/feed.rss\t # test comment', 'https://www.example.com/test/feed.rss'),
))
def test_clean_text_line_good(test_input_string: str, expected: str):
    result = util._clean_text_line(test_input_string)
    assert result == expected

--------------------------------------------------------------------------------
/podcastdownloader/podcast.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding=utf-8

import logging
from pathlib import Path
from typing import Optional

import aiohttp
import aiohttp.client_exceptions
import feedparser
import feedparser.exceptions

from podcastdownloader.episode import Episode
from podcastdownloader.exceptions import FeedException

logger = logging.getLogger(__name__)


class Podcast:
    def __init__(self, url: str):
        self.url = url
        self.feed: Optional[feedparser.FeedParserDict] = None
        self.name: Optional[str] = None
        self.location: Optional[Path] = None
        self.episodes: list[Episode] = []

    async def download_feed(self, session: aiohttp.ClientSession):
        try:
            async with session.get(self.url) as response:
                # Check the status before bothering to read the body
                if response.status != 200:
                    raise FeedException(f'Failed to download feed from {self.url}: Response code {response.status}')
                feed_data = await response.content.read()
        except aiohttp.client_exceptions.ClientError as e:
            raise FeedException(f'Failed to download feed from {self.url}: {e}')
        feed = feedparser.parse(feed_data)
        if feed['bozo']:
            raise FeedException(f'Feed from {self.url} was malformed')
        self.feed = feed
        self.name = feed['feed']['title']
        self.episodes = [Episode.parse_dict(entry, self.name) for entry in self.feed['entries']]

--------------------------------------------------------------------------------
/podcastdownloader/utility_functions.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding=utf-8

import logging
import re
import xml.etree.ElementTree as ElementTree
from pathlib import Path
from typing import Optional

from podcastdownloader.exceptions import FeedException

logger = logging.getLogger(__name__)


def _check_required_path(file_path: str) -> Path:
    # Expand the user directory before resolving, otherwise a leading '~'
    # is treated as a literal path segment
    result = Path(file_path).expanduser().resolve()
    return result


def load_feeds_from_text_file(feed_files: tuple[str, ...]) -> list[str]:
    result = []
    feed_files = [_check_required_path(file) for file in feed_files]
    for feed_file in feed_files:
        with open(Path(feed_file), 'r') as feed:
            for line in feed:
                if parsed_line := _clean_text_line(line):
                    result.append(parsed_line)
                    logger.debug(f'Feed {parsed_line} added')
    return result


def _clean_text_line(in_string: str) -> Optional[str]:
    # Blank lines and comment-only lines are not feeds
    non_feed_pattern = re.compile(r'^\s*(#.*)?$')
    if re.match(non_feed_pattern, in_string):
        return None
    # Strip leading whitespace and any trailing comment from the URL
    feed_pattern = re.compile(r'^\s*(.*?)(\s+#.*)?$')
    feed_match = re.match(feed_pattern, in_string)
    if feed_match:
        return feed_match.group(1)
    else:
        raise FeedException(f'Could not extract feed from {in_string.strip()}')


def load_feeds_from_opml(opml_files: tuple[str, ...]) -> list[str]:
    result = []
    opml_files = [_check_required_path(file) for file in opml_files]
    for opml_loc in opml_files:
        opml_tree = ElementTree.parse(Path(opml_loc))
        for opml_feed in opml_tree.getroot().iter('outline'):
            result.append(opml_feed.attrib['xmlUrl'])
            logger.debug(f'Feed {opml_feed.attrib["xmlUrl"]} added')
    return result

--------------------------------------------------------------------------------
/podcastdownloader/tag_engine.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding=utf-8

import logging

import mutagen
import mutagen.id3
import mutagen.mp3
import mutagen.mp4
from mutagen.id3 import PCST, TALB, TDES, TIT2

from podcastdownloader.episode import Episode
from podcastdownloader.exceptions import TagEngineError

logger = logging.getLogger(__name__)


class TagEngine:
    def __init__(self):
        pass

    @staticmethod
    def tag_episode(episode: Episode):
        tag_file = mutagen.File(episode.file_path)
        if tag_file is None:
            raise TagEngineError(f'Could not write tags to {episode.title} in {episode.podcast_name}')
        try:
            tag_file.add_tags()
        except mutagen.MutagenError:
            # The file already has a tag header
            pass
        if isinstance(tag_file.tags, mutagen.id3.ID3):
            TagEngine._write_id3_tags(episode, tag_file)
        elif isinstance(tag_file.tags, mutagen.mp4.MP4Tags):
            TagEngine._write_mp4_tags(episode, tag_file)
        else:
            raise TagEngineError(f'Tagging for type {type(tag_file).__name__} not supported')

    @staticmethod
    def _write_id3_tags(episode: Episode, tag_file: mutagen.FileType):
        tag_file.tags.add(PCST(value=True))  # Podcast flag
        tag_file.tags.add(TALB(encoding=3, text=episode.podcast_name))
        tag_file.tags.add(TDES(encoding=3, text=episode.feed.get('summary', '')))
        tag_file.tags.add(TIT2(encoding=3, text=episode.title))
        tag_file.save()

    @staticmethod
    def _write_mp4_tags(episode: Episode, tag_file: mutagen.FileType):
        tag_file.tags['\xa9nam'] = episode.title  # Episode title
        tag_file.tags['\xa9alb'] = episode.podcast_name  # Podcast name
        tag_file.tags['pcst'] = True  # Podcast bit
        tag_file.tags['desc'] = episode.feed.get('summary', '')
        tag_file.save()
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

--------------------------------------------------------------------------------
/podcastdownloader/tests/test_episode.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding=utf-8
import asyncio

import aiohttp
import pytest

from podcastdownloader.episode import Episode
from podcastdownloader.exceptions import EpisodeException


@pytest.fixture(scope='session')
def client_session() -> aiohttp.ClientSession:
    out = aiohttp.ClientSession()
    return out


@pytest.mark.parametrize(('test_link_dict', 'expected'), (
    ([
        {
            'rel': 'alternate',
            'type': 'text/html',
            'href': 'http://evenmorenewspodcast.com/usa-still-leaving-afghanistan-still-shooting'
                    '-unarmed-people-w-christopher-rivas-ep-142',
        },
        {
            'length': '54468321',
            'type': 'audio/mpeg',
            'href': 'https://dts.podtrac.com/redirect.mp3/chtbl.com/track/242FB3/'
                    'traffic.libsyn.com/secure/evenmorenews/EMN_Ep142.mp3?dest-id=695480',
            'rel': 'enclosure',
        },
    ],
        'https://dts.podtrac.com/redirect.mp3/chtbl.com/track/242FB3/'
        'traffic.libsyn.com/secure/evenmorenews/EMN_Ep142.mp3?dest-id=695480'),
))
def test_episode_find_url(test_link_dict: list[dict], expected: str):
    test_dict = {'links': test_link_dict, }
    result = Episode._find_url(test_dict)
    assert result == expected


@pytest.mark.parametrize('test_link_dict', (
    [{
        'rel': 'alternate',
        'type': 'text/html',
        'href': 'http://evenmorenewspodcast.com/usa-still-leaving-afghanistan-still-shooting'
                '-unarmed-people-w-christopher-rivas-ep-142',
    }],
))
def test_episode_find_url_bad(test_link_dict: list[dict]):
    test_dict = {'links': test_link_dict, }
    with pytest.raises(EpisodeException):
        Episode._find_url(test_dict)


@pytest.mark.parametrize(('test_url', 'expected'), (
    ('https://www.example.com/test.png', '.png'),
    ('https://www.example.com/test.mp3', '.mp3'),
    ('https://www.example.com/random/test.flac', '.flac'),
    ('https://www.example.com/test.mp3?test=value', '.mp3'),
    ('https://www.example.com/test.mp3?test=value#test', '.mp3'),
    ('https://www.example.com/test.aac', '.aac'),
))
def test_determine_file_extension_from_url(test_url: str, expected: str, client_session):
    result = asyncio.run(Episode._get_file_extension(test_url, client_session))
    assert result == expected


@pytest.mark.parametrize(('test_name', 'expected'), (
    ('test', 'test'),
    ('te/st', 'test'),
    ('test/test', 'testtest'),
    ('test\0', 'test'),
))
def test_clean_name(test_name: str, expected: str):
    result = Episode._clean_name(test_name)
    assert result == expected

--------------------------------------------------------------------------------
/podcastdownloader/episode.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding=utf-8

import logging
import mimetypes
import re
import urllib.parse
from pathlib import Path
from typing import Optional

import aiohttp
import aiohttp.client_exceptions
import mutagen
from multidict import CIMultiDictProxy

from podcastdownloader.exceptions import EpisodeException

logger = logging.getLogger(__name__)


class Episode:
    def __init__(self, title_name: str, episode_url: str, podcast_name: str, feed: dict):
        self.title = self._clean_name(title_name)
        self.url = episode_url
        self.podcast_name = podcast_name
        self.file_path: Optional[Path] = None
        self.feed = feed

    @staticmethod
    def parse_dict(feed_dict: dict, podcast_name: str) -> 'Episode':
        episode_url = Episode._find_url(feed_dict)
        result = Episode(
            feed_dict['title'],
            episode_url,
            podcast_name,
            feed_dict,
        )
        return result

    @staticmethod
    def _clean_name(name: str) -> str:
        # Strip characters that are not valid in file names
        name = re.sub(r'([\0/])', '', name)
        return name

    @staticmethod
    def _find_url(feed_dict: dict) -> str:
        mime_type_regex = re.compile(r'^audio.*')
        try:
            valid_urls = list(filter(lambda u: re.match(mime_type_regex, u['type']), feed_dict['links']))
        except KeyError:
            valid_urls = None
        if valid_urls:
            return valid_urls[0].get('href')
        else:
            raise EpisodeException(f'Could not find a valid link for episode {feed_dict["title"]}')

    @staticmethod
    async def _get_file_extension(url: str, session: aiohttp.ClientSession) -> str:
        # Guess from the URL path first to avoid a network round trip
        url_path = urllib.parse.urlsplit(url).path
        mime_type = mimetypes.guess_type(url_path)[0]
        if not mime_type:
            # Fall back to asking the server; the request needs the full URL,
            # not just its path component
            async with session.get(url) as response:
                mime_type = response.headers.get('Content-Type')
            if not mime_type:
                raise EpisodeException(f'Could not determine MIME type for URL {url}')
        result = mimetypes.guess_extension(mime_type)
        if result:
            return result
        else:
            raise EpisodeException(f'Could not determine file extension for download {url}')

    async def calculate_path(self, destination: Path, session: aiohttp.ClientSession):
        try:
            file_extension = await self._get_file_extension(self.url, session)
            file_name = self.title + file_extension
            self.file_path = Path(destination, self.podcast_name, file_name)
        except (aiohttp.client_exceptions.ClientError, EpisodeException) as e:
            raise EpisodeException(f'Failed to determine path for "{self.title}" from "{self.podcast_name}": {e}')

    async def download(self, session: aiohttp.ClientSession):
        if not self.file_path:
            raise EpisodeException('Episode has no calculated path')
        if self.file_path.exists():
            logger.debug(f'File already exists at {self.file_path}')
            return
        try:
            async with session.get(self.url) as response:
                data = await response.content.read()
            self.file_path.parent.mkdir(exist_ok=True, parents=True)
            with open(self.file_path, 'wb') as file:
                file.write(data)
            logger.info(f'Downloaded {self.title} in podcast {self.podcast_name}')
            try:
                # Imported here to avoid a circular import with tag_engine
                from podcastdownloader.tag_engine import TagEngine
                TagEngine.tag_episode(self)
            except mutagen.MutagenError as e:
                logger.error(f'Failed to tag episode {self.title}: {e}')
        except aiohttp.client_exceptions.ClientError as e:
            raise EpisodeException(f'Failed to download "{self.title}" from "{self.podcast_name}": {e}')

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# podcast-downloader

This is a simple tool for downloading all the available episodes in an RSS feed to disk, where they can be listened to offline.

Python 3 must be installed first, followed by the tool's requirements. These are documented in `requirements.txt` and can be installed via the command `python3 -m pip install -r requirements.txt`.

## Arguments

The following arguments can be supplied to the program:

- `destination` is the directory that the folder structure will be created in and the podcasts downloaded to
- `-f, --feed` is the URL for the RSS feed of the podcast
- `-o, --opml` is the location of an OPML file with podcast data
- `--file` is the location of a simple text file with an RSS feed URL on each line
- `-l, --limit` is the maximum number of episodes to try to download from the feed; if left blank, all episodes are attempted, but a small number is fastest for updating a feed
- `-m, --max-downloads` will limit the number of episodes to be downloaded to the specified integer
- `-w, --write-list` will write an ordered list of the episodes in the podcast in one of several formats, as specified:
  - `none`
  - `text`
  - `audacious`
  - `m3u`
- `-t, --threads` is the number of threads to run concurrently; defaults to 10
- `--max-attempts` will specify the number of reattempts for a failed or refused connection; see below for more details

The following arguments alter the functioning of the program in a major way, e.g. they do not download anything:

- `--skip-download` will do everything but download the files; useful for updating episode playlists without a lengthy download
- `--verify` will scan existing files for any with a file size outside a 2% tolerance and list them in `results.txt`
- `--update-tags` will download episode information and write tags to all episodes already downloaded

The following arguments alter the verbosity and logging behaviour:

- `-s, --suppress-progress` will disable all progress bars
- `-v, --verbose` will increase the verbosity of the information output to the console
- `--log` will log all messages at debug level (the equivalent of `-v`) to the specified file, appending if it already exists

The `--feed`, `--file`, and `--opml` flags can all be specified multiple times to aggregate feeds from multiple locations.

Only the destination is required; however, at least one feed, feed file, or OPML file must be provided, or the program will simply complete instantly.

### Maximum Reattempts

In some cases, particularly when downloading one or a few specific podcasts with a lot of episodes at once, the remote server will receive a number of simultaneous or consecutive requests. As this may appear to be atypical behaviour, the server may refuse or close incoming connections as a rate-limiting measure. This is normal behaviour for servers that do not want to be scraped.
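As an illustration of the sleep-and-retry behaviour described in the rest of this section, the pattern amounts to something like the following sketch. This is not the tool's actual retry code, and `fetch_with_backoff` is a hypothetical helper shown only to make the timing scheme concrete:

```python
import asyncio

import aiohttp


async def fetch_with_backoff(session: aiohttp.ClientSession, url: str,
                             max_attempts: int = 10, increment: int = 30) -> bytes:
    # Sleep 30s after the first refusal, 60s after the second, and so on,
    # up to `max_attempts` sleeps before giving up entirely.
    for attempt in range(max_attempts + 1):
        try:
            async with session.get(url) as response:
                response.raise_for_status()
                return await response.read()
        except aiohttp.ClientError:
            if attempt == max_attempts:
                raise
            await asyncio.sleep(increment * (attempt + 1))
```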
There are several countermeasures in the downloader for this behaviour, such as randomising the download list to avoid repeated calls to the same server in a short amount of time, but this may not work when there are only one or a few podcast feeds to download. The method of last resort is therefore a sleep function that waits until the server allows the download to continue. This is done in increasing increments of 30 seconds, with the maximum number of reattempts specified by the `--max-attempts` argument. For example, if left at the default of 10, the program will sleep for 30 seconds the first time the connection is refused. If it is refused again, it will sleep for 60 seconds before reattempting the download, and so on up to the 10th attempt, where it will sleep for 300 seconds, or five minutes. If the connection is refused after this, an error will occur and the download thread will move on to the next podcast episode.

The maximum number of reattempts may need to be changed in several cases. If you wish to download the episode regardless of anything else, you may want to increase the argument, though this may result in longer wait times for the downloads to complete. Conversely, a low value will make the program skip downloads that time out repeatedly, missing content but completing faster.

### Warnings

The `--write-list` option should not be used with the `--limit` option. The limit will be applied to the episode list in whatever format is chosen, and the result will overwrite any past episode list files. For example, if a `--limit` of 5 is chosen with `-w audacious`, the exported Audacious playlist will only be 5 items long. Thus the `-w` option should only be used when no limit is set.

## Tags

The downloader has basic tag-writing support. It will write ID3 tags to MP3 files and iTunes-compatible tags to M4A and MP4 files. The information written is as follows:

- The episode title
- The podcast title
- The publishing date and time of the episode
- The description accompanying the episode
- The episode number (if available)

## Example Command

The following is an example command to download a single feed to a podcasts folder:

`python3 -m podcastdownloader media/podcasts -f 'http://linustechtips.libsyn.com/wanshow' -o podcasts.opml`

## Podcast Feed Files

A feed file, for use with the `--file` option, is a simple text file with one RSS feed URL per line. The downloader ignores all empty lines and all lines beginning with a hash (#), allowing comments and a rudimentary structure if desired. Additionally, comments can be appended to the end of a line containing a feed URL: as long as there is whitespace between the end of the URL and the hash, the comment will be stripped when the file is parsed.
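For instance, a feed file could look like the following (the URLs here are placeholders):

```
# News podcasts
https://www.example.com/news/feed.rss
https://www.example.com/interviews/feed.rss  # updates weekly

# https://www.example.com/retired/feed.rss
```

The heading comment, the blank line, and the commented-out feed are all skipped; the trailing comment on the second feed is stripped, leaving only the URL.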
--------------------------------------------------------------------------------
/podcastdownloader/__main__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import asyncio
import itertools
import logging
import random
import sys
from asyncio.queues import Queue
from pathlib import Path
from typing import Optional

import aiohttp
import click

import podcastdownloader.utility_functions as util
from podcastdownloader.exceptions import EpisodeException, PodcastException
from podcastdownloader.podcast import Podcast
from podcastdownloader.writer import write_episode_playlist

logger = logging.getLogger()


def _setup_logging(verbosity: int):
    logger.setLevel(1)
    stream = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s')
    stream.setFormatter(formatter)
    logger.addHandler(stream)
    if verbosity >= 1:
        stream.setLevel(logging.DEBUG)
    else:
        stream.setLevel(logging.INFO)
    logging.getLogger('asyncio').setLevel(logging.CRITICAL)
    logging.getLogger('chardet').setLevel(logging.CRITICAL)


_common_options = [
    click.argument('destination', type=str),
    click.option('-v', '--verbose', default=0, count=True),
    click.option('-f', '--feed', type=str, multiple=True, default=[]),
    click.option('-F', '--file', type=str, multiple=True, default=[]),
    click.option('--opml', type=str, multiple=True, default=[]),
]


async def fill_individual_feed(in_queue: Queue, out_queue: Queue, destination: Path, session: aiohttp.ClientSession):
    # The queue is filled before any worker starts, so an empty queue means all work is done;
    # get_nowait avoids blocking forever if another worker drains the last item
    while True:
        try:
            podcast = in_queue.get_nowait()
        except asyncio.QueueEmpty:
            break
        logger.debug(f'Beginning retrieval for {podcast.url}')
        try:
            await podcast.download_feed(session)
            for episode in podcast.episodes:
                try:
                    await episode.calculate_path(destination, session)
                except TypeError:
                    logger.error(f'Failed to parse {episode.title} in {episode.podcast_name}')
        except PodcastException as e:
            logger.error(e)
        except Exception:
            logger.critical(f'Error with {podcast.url}')
            raise
        else:
            await out_queue.put(podcast)
            logger.info(f'Retrieved RSS for {podcast.name}')
        in_queue.task_done()


async def download_individual_episode(in_queue: Queue, session: aiohttp.ClientSession):
    while True:
        try:
            episode = in_queue.get_nowait()
        except asyncio.QueueEmpty:
            break
        logger.debug(f'Attempting download of episode {episode.title} in {episode.podcast_name}')
        try:
            await episode.download(session)
        except EpisodeException as e:
            logger.error(e)
        in_queue.task_done()


def add_common_options(func):
    for option in _common_options:
        func = option(func)
    return func


@click.group()
def cli():
    pass


@cli.command('download')
@add_common_options
@click.option('-l', '--limit', type=int, default=None)
@click.option('-t', '--threads', type=int, default=10)
@click.option('-w', '--write-playlist', type=click.Choice(('m3u',)), default=(), multiple=True)
def cli_download(
    destination: str,
    feed: tuple[str, ...],
    file: tuple[str, ...],
    limit: Optional[int],
    opml: tuple[str, ...],
    threads: int,
    verbose: int,
    write_playlist: tuple[str, ...],
):
    _setup_logging(verbose)
    destination = Path(destination).expanduser().resolve()
    if not destination.exists():
        logger.warning(f'Specified destination {destination} does not exist, creating it now')
        destination.mkdir(parents=True)

    all_feeds = set(itertools.chain(feed, util.load_feeds_from_text_file(file), util.load_feeds_from_opml(opml)))
    logger.info(f'{len(all_feeds)} feeds found')
    if all_feeds:
        asyncio.run(download_episodes(all_feeds, destination, threads, write_playlist, limit))
    else:
        logger.error('No feeds have been provided')
    logger.info('Program Complete')


async def download_episodes(
    all_feeds: set[str],
    destination: Path,
    threads: int,
    playlist_formats: tuple[str, ...],
    limit: Optional[int],
):
    unfilled_podcasts = Queue()
    filled_podcasts = Queue()
    episodes = Queue()
    for url in all_feeds:
        await unfilled_podcasts.put(Podcast(url))
    async with aiohttp.ClientSession() as session:
        # One worker task per requested thread
        feed_fillers = [asyncio.create_task(
            fill_individual_feed(unfilled_podcasts, filled_podcasts, destination, session)
        ) for _ in range(threads)]
        await asyncio.gather(*feed_fillers)
        await unfilled_podcasts.join()
        logger.info('All feeds filled')

        podcasts = []
        while not filled_podcasts.empty():
            podcast = filled_podcasts.get_nowait()
            write_episode_playlist(podcast, playlist_formats)
            podcasts.append(podcast)

        if limit:
            logger.info(f'Limiting episodes per podcast to {limit} entries')
            for podcast in podcasts:
                podcast.episodes = podcast.episodes[:limit]

        unfilled_episodes = list(filter(
            lambda e: not e.file_path or not e.file_path.exists(),
            [ep for pod in podcasts for ep in pod.episodes],
        ))
        logger.info(f'{len(unfilled_episodes)} episodes to download')

        # Shuffle so consecutive requests are spread across different servers
        random.shuffle(unfilled_episodes)

        for ep in unfilled_episodes:
            await episodes.put(ep)

        episode_downloaders = [asyncio.create_task(
            download_individual_episode(episodes, session)
        ) for _ in range(threads)]

        await asyncio.gather(*episode_downloaders)
        await episodes.join()


if __name__ == '__main__':
    cli()

--------------------------------------------------------------------------------