├── .dprint.json ├── .github └── workflows │ └── python-tests.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── action.yaml ├── podcast_config.example.yaml ├── requirements.txt ├── rss_generator.py └── tests └── test_rss_generator.py /.dprint.json: -------------------------------------------------------------------------------- 1 | { 2 | "lineWidth": 0, 3 | "markdown": { 4 | "textWrap": "maintain", 5 | "lineWidth": 0 6 | }, 7 | "includes": ["**/*.md"], 8 | "excludes": [ 9 | "**/node_modules", 10 | "**/*-lock.json", 11 | "**/target" 12 | ], 13 | "plugins": [ 14 | "https://plugins.dprint.dev/markdown-0.16.3.wasm" 15 | ] 16 | } -------------------------------------------------------------------------------- /.github/workflows/python-tests.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Python Unit Tests and Linting 3 | 4 | on: 5 | push: 6 | branches: 7 | - master 8 | pull_request: 9 | branches: 10 | - master 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up Python 3.8 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 3.8 24 | 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -r requirements.txt 29 | pip install flake8 30 | pip install yamllint 31 | sudo apt-get update && sudo apt-get install ffmpeg 32 | 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | 38 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 39 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 40 | 41 | 42 | - name: Ensure example file passes YAMLlint 43 | run: | 44 | yamllint -fgithub -d "{rules: {line-length: false}}" podcast_config.example.yaml 45 | 46 | - name: Generate Feed from Example 47 | run: python rss_generator.py --input-file podcast_config.example.yaml --output-file podcast_feed.xml --skip-asset-verification 48 | 49 | - name: Install xq 50 | run: | 51 | wget -q https://github.com/sibprogrammer/xq/releases/download/v1.2.3/xq_1.2.3_linux_amd64.tar.gz 52 | tar xfz xq_1.2.3_linux_amd64.tar.gz 53 | sudo mv xq /usr/local/bin/ 54 | 55 | - name: Validate Feed XML with xq 56 | run: xq . podcast_feed.xml 57 | 58 | - name: Run tests 59 | run: | 60 | python -m unittest discover tests 61 | 62 | - name: Upload Test Feed Artifact 63 | uses: actions/upload-artifact@v4 64 | with: 65 | name: podcast-feed-xml 66 | path: podcast_feed.xml 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | # Exclude output from script 163 | podcast_config.yaml 164 | podcast_feed.xml 165 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | # Install ffmpeg 4 | RUN apt-get update && \ 5 | apt-get install -y ffmpeg && \ 6 | rm -rf /var/lib/apt/lists/* 7 | 8 | # Copy your script and requirements file 9 | COPY rss_generator.py /rss_generator.py 10 | COPY requirements.txt /requirements.txt 11 | 12 | # Install Python dependencies 13 | RUN pip install --no-cache-dir -r /requirements.txt 14 | 15 | # Set the entrypoint to your script 16 | ENTRYPOINT ["python", "/rss_generator.py"] 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Viktor Petersson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Podcast RSS Generator 2 | 3 | [![Python Unit Tests and Linting](https://github.com/vpetersson/podcast-rss-generator/actions/workflows/python-tests.yml/badge.svg)](https://github.com/vpetersson/podcast-rss-generator/actions/workflows/python-tests.yml) 4 | 5 | ## Description 6 | 7 | This an RSS Feed Generator is designed to generate an RSS feed for audio/video podcasts, reading metadata and episode data from a YAML file. 8 | 9 | It assumes that you self-host your video episodes somewhere (e.g. S3/GCS/R2) as well as the output of this script. You can then point YouTube/Spotify/Apple Podcast to this path. 10 | 11 | This tool was written for my podcast [Nerding Out with Viktor](https://vpetersson.com/podcast/) to solve for the fact that Apple's [Podcast Connect](https://podcastsconnect.apple.com) require you to self-host videos in order to publish. 12 | 13 | I also wrote an article on how you can use this tool to automatically turn a video podcast into audio in [this article](https://vpetersson.com/2024/06/27/video-to-audio-podcast.html). 
14 | 15 | ## Features 16 | 17 | - Generates RSS feed for audio/video podcasts 18 | - Reads podcast metadata and episode data from a YAML file 19 | - Converts ISO format dates to RFC 2822 format 20 | - Attempts to follow [The Podcast RSS Standard](https://github.com/Podcast-Standards-Project/PSP-1-Podcast-RSS-Specification) 21 | 22 | ## Known Issues 23 | 24 | - Videos uploaded to YouTube [via RSS](https://support.google.com/youtube/answer/13525207?hl=en#zippy=%2Ccan-i-deliver-an-rss-feed-if-i-already-have-a-podcast-on-youtube) will be uploaded as audio. 25 | - Spotify can't handle videos via RSS yet. You will be able to see the episodes in Podcaster, but they will not be processed and sent to Spotify properly. This is apparently a known issue that they are working on resolving. 26 | 27 | The workaround for the above issues is to manually upload the episodes. 28 | 29 | ## Installation 30 | 31 | ### Prerequisites 32 | 33 | - Python 3.8 or higher 34 | - pip (Python package installer) 35 | - ffmpeg 36 | 37 | ### Setup 38 | 39 | 1. **Clone the Repository** 40 | 41 | ```bash 42 | $ git clone https://github.com/vpetersson/podcast-rss-generator.git 43 | $ cd podcast-rss-generator 44 | ``` 45 | 46 | 2. **Install Dependencies** 47 | 48 | ```bash 49 | $ pip install -r requirements.txt 50 | ``` 51 | 52 | **Optional:** Install `yamllint`, `xq` and `flake8`. 53 | 54 | ## Usage 55 | 56 | ```bash 57 | $ python rss_generator.py --help 58 | usage: rss_generator.py [-h] [--input-file INPUT_FILE] [--output-file OUTPUT_FILE] [--skip-asset-verification] 59 | 60 | Process some parameters. 61 | 62 | options: 63 | -h, --help show this help message and exit 64 | --input-file INPUT_FILE 65 | Input YAML file 66 | --output-file OUTPUT_FILE 67 | Output XML file 68 | --skip-asset-verification 69 | Skip HTTP HEAD and ffprobe checks for asset URLs (use for testing/fake URLs) 70 | ``` 71 | 72 | 1. 
**Prepare Your Data Files** 73 | 74 | Copy `podcast_config.example.yaml` to `podcast_config.yaml` and fill out your podcast metadata and eepisodes. 75 | 76 | The `podcast_config.yaml` file contains two main sections: `metadata` and `episodes`. 77 | 78 | ### Metadata Section 79 | 80 | This section contains general information about your podcast: 81 | 82 | | Key | Description | Notes | 83 | | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 84 | | `title` | The title of your podcast. | Required | 85 | | `description` | A description of your podcast. | Required. Markdown is supported. | 86 | | `link` | The URL of the main website for your podcast. | Required. Also the default link for episodes if not provided per-episode. | 87 | | `rss_feed_url` | The public URL where your generated `podcast_feed.xml` will be hosted. | Required | 88 | | `language` | The language of the podcast (e.g., `en-us`). | Optional. Default: `en-us`. | 89 | | `email` | The contact email for the podcast owner. | Required. Backward compatibility: `itunes_email`. Used for ``. | 90 | | `author` | The author name(s). | Required. Backward compatibility: `itunes_author`. | 91 | | `category` | The primary category for iTunes. | Optional. 
Backward compatibility: `itunes_category`. | 92 | | `image` | The URL for the main podcast cover art (JPEG or PNG, 1400x1400 to 3000x3000 pixels). | Required. Also the default image for episodes if not provided per-episode. | 93 | | `explicit` | Indicates if the podcast contains explicit content. | Optional (`true`/`false`). Default: `false`. Backward compatibility: `itunes_explicit`. Can be overridden per-episode. | 94 | | `use_asset_hash_as_guid` | Use a content hash or ETag from the asset file\'s headers as the episode\'s ``. Prioritizes `x-amz-checksum-sha256`, then `x-goog-hash` (MD5 part), then `ETag`. **Warning:** This can break GUID permanence if asset files change or are re-uploaded, potentially causing re-downloads for subscribers. | Optional (`true`/`false`). Default: `false` (uses `asset_url`). | 95 | | `copyright` | A string containing the copyright notice for the podcast. | Optional. | 96 | | `podcast_locked` | Tells platforms not to import the feed without confirming ownership via the `email` address. | Optional (`yes`/`no`). Default: `no`. Based on [Podcast Standards Project](https://github.com/Podcast-Standards-Project/PSP-1-Podcast-RSS-Specification). | 97 | | `podcast_guid` | A globally unique, permanent identifier (UUID recommended) for the *entire podcast show*. | Optional. If omitted, a stable UUIDv5 is generated from `rss_feed_url`. Strongly recommended to set explicitly. Based on [Podcast Standards Project](https://github.com/Podcast-Standards-Project/PSP-1-Podcast-RSS-Specification). | 98 | 99 | ### Episodes Section 100 | 101 | This section is a list of your podcast episodes. 
Each episode is an object with the following fields: 102 | 103 | | Key | Description | Notes | 104 | | ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | 105 | | `title` | The title of the episode. | Required. | 106 | | `description` | A description of the episode. | Required. Markdown is supported. | 107 | | `publication_date` | The date and time the episode was published. Episodes with future dates will not be included. | Required. ISO 8601 format (e.g., `2023-01-15T10:00:00Z`). | 108 | | `asset_url` | The direct URL to the audio or video file for the episode. | Required. | 109 | | `link` | The URL for a webpage specific to this episode. | Optional. Defaults to the global `link` from `metadata`. | 110 | | `image` | The URL for artwork specific to this episode (same format requirements as the main podcast image). | Optional. Defaults to the global `image` from `metadata`. | 111 | | `episode` | The episode number. | Optional. Integer. | 112 | | `season` | The season number. | Optional. Integer. | 113 | | `episode_type` | Defines the type of content for the episode. | Optional. Can be `full` (default), `trailer`, or `bonus`. | 114 | | `explicit` | Indicates if this specific episode contains explicit content. | Optional (`true`/`false`). Overrides the global `explicit` setting for this episode. Backward compatibility: `itunes_explicit`. | 115 | | `transcripts` | A list of transcript files associated with the episode. Each item is an object with `url` (required), `type` (required), `language` (opt), `rel` (opt). | Optional. See example config for structure. | 116 | 117 | 2. 
**Generate the RSS Feed** 118 | 119 | Make sure your YAML is valid: 120 | 121 | ```bash 122 | $ yamllint podcast_config.yaml 123 | ``` 124 | 125 | Generate your `podcast_feed.xml` file: 126 | 127 | ```bash 128 | $ python rss_generator.py 129 | ``` 130 | 131 | Now copy your `podcast_feed.xml` to S3/GCS/R2 using a tool like `s3cmd`, `aws` or `mc` (from Minio). 132 | 133 | You can verify your RSS feed using a tool like [Podbase](https://podba.se/validate/). 134 | 135 | ## **Optional:** Optimize video 136 | 137 | If you're dealing with video podcasts, the file size matters for obvious reasons. What I do is to export the video as h624 from my video editor (which I upload to YouTube and Spotify). 138 | 139 | I then re-encode the h264 video to h265 for other platforms using `ffmpeg` with the following command (on macOS): 140 | 141 | ```bash 142 | $ ffmpeg -i input.mp4 \ 143 | -tag:v hvc1 \ 144 | -c:v hevc_videotoolbox \ 145 | -crf 28 \ 146 | -preset slowest \ 147 | -c:a aac \ 148 | -b:a 128k \ 149 | -movflags faststart \ 150 | output.mp4 151 | ``` 152 | 153 | ## Usage with GitHub Actions 154 | 155 | To incorporate this action into your workflow, follow these steps: 156 | 157 | 1. **Create a Workflow File**: 158 | - In your repository, create a new file under `.github/workflows`, for example, `rss_workflow.yml`. 159 | 160 | 2. 
**Set Up the Workflow**: 161 | - Use the following configuration as a starting point: 162 | 163 | ```yaml 164 | name: Generate Podcast RSS Feed 165 | 166 | on: [push, pull_request] 167 | 168 | env: 169 | R2_BUCKET: 'foobar' 170 | jobs: 171 | generate-rss: 172 | runs-on: ubuntu-latest 173 | 174 | steps: 175 | - name: Checkout repository 176 | uses: actions/checkout@v2 177 | 178 | - name: Install yamllint 179 | run: | 180 | sudo apt-get update 181 | sudo apt-get install yamllint 182 | 183 | - name: Lint YAML file 184 | run: yamllint podcast_config.yaml 185 | 186 | - name: Run Podcast RSS Generator 187 | uses: vpetersson/podcast-rss-generator@master 188 | with: 189 | input_file: 'podcast_config.yaml' 190 | output_file: 'podcast_feed.xml' 191 | 192 | - name: Validate output with xq 193 | run: | 194 | wget -q https://github.com/sibprogrammer/xq/releases/download/v1.2.3/xq_1.2.3_linux_amd64.tar.gz 195 | tar xfz xq_1.2.3_linux_amd64.tar.gz 196 | cat podcast_feed.xml | ./xq 197 | 198 | - uses: actions/upload-artifact@v2 199 | with: 200 | name: podcast_feed.xml 201 | path: podcast_feed.xml 202 | 203 | deploy: 204 | runs-on: ubuntu-latest 205 | needs: generate-rss 206 | if: github.ref == 'refs/heads/master' 207 | steps: 208 | - uses: actions/download-artifact@v2 209 | with: 210 | name: podcast_feed.xml 211 | 212 | - name: Install mc 213 | run: | 214 | wget -q https://dl.min.io/client/mc/release/linux-amd64/mc 215 | chmod +x mc 216 | 217 | - name: Set up mc 218 | env: 219 | R2_ENDPOINT: ${{ secrets.R2_ENDPOINT }} 220 | R2_KEY_ID: ${{ secrets.R2_KEY_ID }} 221 | R2_KEY_SECRET: ${{ secrets.R2_KEY_SECRET }} 222 | run: ./mc alias set r2-storage ${R2_ENDPOINT} ${R2_KEY_ID} ${R2_KEY_SECRET} 223 | 224 | - name: Copy file 225 | run: ./mc cp podcast_feed.xml r2-storage/${R2_BUCKET}/ 226 | ``` 227 | 228 | 3. **Customize Your Workflow**: 229 | - Adjust paths to the YAML configuration and the output XML files as per your repository structure. 
230 | - Ensure the `uses` field points to `vpetersson/podcast-rss-generator@master` (or specify a specific release tag/version instead of `master`). 231 | 232 | 4. **Commit and Push Your Workflow**: 233 | - Once you commit this workflow file to your repository, the action will be triggered based on the defined events (e.g., on push or pull request). 234 | 235 | ### Inputs 236 | 237 | - `input_file`: Path to the input YAML file. Default: `podcast_config.yaml`. 238 | - `output_file`: Path for the generated RSS feed XML file. Default: `podcast_feed.xml`. 239 | 240 | ## Running Tests 241 | 242 | To run unit tests, use: 243 | 244 | ```bash 245 | $ python -m unittest discover tests 246 | ``` 247 | 248 | ## Contributing 249 | 250 | Contributions to this project are welcome! Please follow these steps: 251 | 252 | 1. Fork the repository. 253 | 2. Create a new branch for your feature. 254 | 3. Commit your changes. 255 | 4. Push to the branch. 256 | 5. Submit a pull request. 257 | 258 | ## License 259 | 260 | [MIT License](LICENSE) 261 | -------------------------------------------------------------------------------- /action.yaml: -------------------------------------------------------------------------------- 1 | name: 'Podcast RSS Generator' 2 | description: 'Generates a podcast RSS feed from a YAML configuration' 3 | author: 'Viktor Petersson' 4 | branding: 5 | icon: 'rss' 6 | color: 'purple' 7 | 8 | runs: 9 | using: 'docker' 10 | image: 'Dockerfile' 11 | args: 12 | - "--input-file" 13 | - ${{ inputs.input_file }} 14 | - "--output-file" 15 | - ${{ inputs.output_file }} 16 | 17 | inputs: 18 | input_file: 19 | description: 'Input YAML file' 20 | required: true 21 | default: 'podcast_config.yaml' 22 | output_file: 23 | description: 'Output XML file' 24 | required: true 25 | default: 'podcast_feed.xml' 26 | -------------------------------------------------------------------------------- /podcast_config.example.yaml: 
-------------------------------------------------------------------------------- 1 | --- 2 | metadata: 3 | title: "My Awesome Podcast" 4 | description: "A podcast about technology and programming." 5 | link: "https://example.com/podcast" # Fallback link 6 | rss_feed_url: "https://example.com/podcast/feed.xml" 7 | language: "en-us" 8 | email: "podcast@example.com" 9 | author: "Podcast Host" 10 | category: "Technology" 11 | image: "https://example.com/podcast/images/podcast_cover.png" # Fallback image 12 | explicit: false 13 | use_asset_hash_as_guid: false # Default: Use asset_url. Set to true to use content hash (SHA256/MD5/ETag). 14 | copyright: "© 2024 Your Name/Company" # Optional copyright notice 15 | # --- Podcast Standards Project Recommended Tags --- # 16 | podcast_locked: "no" # "yes" or "no". Prevents feed import by other platforms without owner confirmation (uses email above). 17 | # podcast_guid: "YOUR-PODCAST-GUID-HERE" # A unique, permanent ID for the entire podcast (UUID recommended). 18 | # If omitted, a GUID will be generated based on rss_feed_url. 19 | # Generate one using: python -c "import uuid; print(uuid.uuid4())" or an online generator. 20 | # Example: 9b024349-ccf0-5f69-a609-6b82873eab3c 21 | 22 | episodes: 23 | - title: "Episode 1: The Beginning" 24 | description: "Introduction to the podcast." 25 | publication_date: "2023-01-15T10:00:00Z" 26 | asset_url: "https://example.com/podcast/episodes/episode1.mp3" 27 | episode: 1 28 | season: 1 29 | episode_type: "full" 30 | # This episode will use the global link and image from metadata 31 | transcripts: 32 | - url: "https://example.com/podcast/transcripts/episode1.srt" 33 | type: "application/x-subrip" 34 | - url: "https://example.com/podcast/transcripts/episode1.vtt" 35 | type: "text/vtt" 36 | language: "en" # Optional: Language code (e.g., "en", "en-US", "es"). Follows RFC 5646. 37 | 38 | - title: "Episode 2: Deep Dive into Python" 39 | description: "Exploring advanced Python features." 
40 | publication_date: "2023-01-22T10:00:00Z" 41 | asset_url: "https://example.com/podcast/episodes/episode2.mp3" 42 | link: "https://example.com/podcast/episodes/2" # Episode-specific link 43 | image: "https://example.com/podcast/images/episode2_cover.png" # Episode-specific image 44 | episode: 2 45 | season: 1 46 | episode_type: "full" 47 | # This episode has specific link/image but no transcripts 48 | 49 | - title: "Special Bonus Episode" 50 | description: "An interview with a special guest." 51 | publication_date: "2023-01-25T12:00:00Z" 52 | asset_url: "https://example.com/podcast/episodes/bonus1.mp3" 53 | link: "https://example.com/podcast/episodes/bonus" # Episode-specific link 54 | # This episode will use the global image from metadata, but has its own link 55 | episode_type: "bonus" 56 | transcripts: 57 | - url: "https://example.com/podcast/transcripts/bonus1.json" 58 | type: "application/json" 59 | # language: "es" # Optional: Language code (e.g., "en", "en-US", "es"). Follows RFC 5646. 
60 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==6.0.1 2 | requests==2.31.0 3 | Markdown==3.5.1 4 | sh==2.0.6 5 | yamllint==1.33.0 6 | retry==0.9.2 7 | -------------------------------------------------------------------------------- /rss_generator.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | from datetime import datetime, timezone 3 | from email.utils import format_datetime 4 | import argparse 5 | import time 6 | import os 7 | import re 8 | import uuid 9 | 10 | import markdown 11 | import requests 12 | import yaml 13 | from sh import ffprobe, ErrorReturnCode 14 | from retry import retry 15 | 16 | # Flag to indicate if we're in test mode 17 | TEST_MODE = os.environ.get("RSS_GENERATOR_TEST_MODE", "false").lower() == "true" 18 | 19 | # Mock ffprobe output for testing 20 | MOCK_FFPROBE_OUTPUT = """streams.stream.0.index=0 21 | streams.stream.0.codec_name="aac" 22 | streams.stream.0.codec_long_name="AAC (Advanced Audio Coding)" 23 | streams.stream.0.profile="LC" 24 | streams.stream.0.codec_type="audio" 25 | streams.stream.0.codec_tag_string="mp4a" 26 | streams.stream.0.codec_tag="0x6134706d" 27 | streams.stream.0.sample_fmt="fltp" 28 | streams.stream.0.sample_rate="44100" 29 | streams.stream.0.channels=2 30 | streams.stream.0.channel_layout="stereo" 31 | streams.stream.0.bits_per_sample=0 32 | streams.stream.0.initial_padding=0 33 | streams.stream.0.id="0x1" 34 | streams.stream.0.r_frame_rate="0/0" 35 | streams.stream.0.avg_frame_rate="0/0" 36 | streams.stream.0.time_base="1/44100" 37 | streams.stream.0.start_pts=0 38 | streams.stream.0.start_time="0.000000" 39 | streams.stream.0.duration_ts=156170240 40 | streams.stream.0.duration="3541.275283" 41 | streams.stream.0.bit_rate="107301" 42 | streams.stream.0.max_bit_rate="N/A" 43 | 
streams.stream.0.bits_per_raw_sample="N/A" 44 | streams.stream.0.nb_frames="152510" 45 | streams.stream.0.nb_read_frames="N/A" 46 | streams.stream.0.nb_read_packets="N/A" 47 | streams.stream.0.extradata_size=2 48 | streams.stream.0.disposition.default=1""" 49 | 50 | 51 | # Mock HTTP response for testing 52 | class MockResponse: 53 | def __init__(self, url): 54 | self.url = url 55 | self.headers = { 56 | "content-length": "12345678", 57 | "content-type": "audio/mpeg", 58 | # Example headers for testing hash extraction 59 | "ETag": '"d41d8cd98f00b204e9800998ecf8427e"', # MD5 hash 60 | # 'ETag': '"abc-1"', # Multipart ETag 61 | # 'x-amz-checksum-sha256': 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', 62 | # 'x-goog-hash': 'crc32c=AAAAAA==,md5=1B2M2Y8AsgTpgAmY7PhCfg==' # Base64 MD5 63 | } 64 | 65 | 66 | def read_podcast_config(yaml_file_path): 67 | with open(yaml_file_path, "r", encoding="utf-8") as file: 68 | return yaml.safe_load(file) 69 | 70 | 71 | def convert_iso_to_rfc2822(iso_date): 72 | # Replace 'Z' with '+00:00' for Python < 3.11 compatibility 73 | compatible_iso_date = iso_date.replace("Z", "+00:00") 74 | date_obj = datetime.fromisoformat(compatible_iso_date) 75 | return format_datetime(date_obj) 76 | 77 | 78 | @retry(tries=5, delay=2, backoff=2, logger=None) 79 | def _make_http_request(url): 80 | """Make HTTP request with retry logic""" 81 | if TEST_MODE: 82 | return MockResponse(url) 83 | return requests.head(url, allow_redirects=True) 84 | 85 | 86 | def _run_ffprobe_with_retry(url, max_retries=5, delay=2): 87 | """ 88 | Run ffprobe with manual retry logic to handle ErrorReturnCode exceptions 89 | """ 90 | if TEST_MODE: 91 | return MOCK_FFPROBE_OUTPUT 92 | 93 | retries = 0 94 | while retries < max_retries: 95 | try: 96 | return ffprobe( 97 | "-hide_banner", 98 | "-v", 99 | "quiet", 100 | "-show_streams", 101 | "-print_format", 102 | "flat", 103 | url, 104 | ) 105 | except ErrorReturnCode: 106 | retries += 1 107 | if retries >= 
max_retries: 108 | print( 109 | f"Failed to run ffprobe after {max_retries} attempts for URL: {url}" 110 | ) 111 | # Return empty string if all retries fail 112 | return "" 113 | print( 114 | f"ffprobe failed (attempt {retries}/{max_retries}), retrying in {delay} seconds..." 115 | ) 116 | time.sleep(delay) 117 | delay *= 2 # Exponential backoff 118 | 119 | 120 | def get_file_info(url): 121 | # Make HTTP request with retry logic 122 | response = _make_http_request(url) 123 | 124 | # Get duration of audio/video file 125 | # We're using the response.url here in order to 126 | # follow redirects and get the actual file 127 | 128 | # Run ffprobe with retry logic 129 | probe = _run_ffprobe_with_retry(response.url) 130 | 131 | # If probe is empty (all retries failed), set duration to None 132 | if not probe: 133 | return { 134 | "content-length": response.headers.get("content-length"), 135 | "content-type": response.headers.get("content-type"), 136 | "duration": None, 137 | } 138 | 139 | lines = probe.split("\n") 140 | 141 | # Filtering out the line that contains 'streams.stream.0.duration' 142 | duration_line = next( 143 | (line for line in lines if line.startswith("streams.stream.0.duration=")), None 144 | ) 145 | 146 | if duration_line: 147 | # Extracting the numeric value and converting it to an integer 148 | duration = int(float(duration_line.split("=")[1].strip('"'))) 149 | else: 150 | duration = None 151 | 152 | # --- Extract content hash from headers --- 153 | content_hash = None 154 | headers = response.headers 155 | 156 | # 1. Check for x-amz-checksum-sha256 157 | sha256_hash = headers.get("x-amz-checksum-sha256") 158 | if sha256_hash: 159 | content_hash = f"sha256:{sha256_hash}" 160 | 161 | # 2. 
Check for GCS MD5 (if SHA256 not found) 162 | if not content_hash: 163 | gcs_hash = headers.get("x-goog-hash") 164 | if gcs_hash: 165 | # Extract base64 md5 value - look for md5= and capture until next comma or end of string 166 | match = re.search(r"md5=([^,]+)", gcs_hash) 167 | if match: 168 | # Note: GCS MD5 is base64 encoded, needs decoding if we wanted raw bytes, 169 | # but for a GUID string, the base64 representation is fine and unique. 170 | content_hash = f"md5:{match.group(1)}" 171 | 172 | # 3. Check ETag (if other hashes not found) 173 | if not content_hash: 174 | etag = headers.get("ETag", "").strip('" ') # Remove quotes and whitespace 175 | if etag: # Use any non-empty ETag as a fallback hash 176 | content_hash = f"etag:{etag}" 177 | 178 | return { 179 | "content-length": headers.get("content-length"), 180 | "content-type": headers.get("content-type"), 181 | "duration": duration, 182 | "content_hash": content_hash, # Add the extracted hash to the result 183 | } 184 | 185 | 186 | def format_description(description): 187 | """ 188 | Convert Markdown description to HTML 189 | """ 190 | html_description = markdown.markdown(description) 191 | wrapped_description = f"" 192 | 193 | # Ensure byte limit for the channel description 194 | byte_limit = 4000 195 | if len(wrapped_description.encode("utf-8")) > byte_limit: 196 | # Truncate the description if it exceeds the limit 197 | # Note: Truncation logic might need to be more sophisticated to handle HTML correctly 198 | wrapped_description = wrapped_description[:byte_limit] 199 | 200 | return wrapped_description 201 | 202 | 203 | def generate_rss(config, output_file_path, skip_asset_verification=False): 204 | # --- Namespace Registration --- (Ensure podcast namespace is included) 205 | ET.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd") 206 | ET.register_namespace("atom", "http://www.w3.org/2005/Atom") 207 | ET.register_namespace("podcast", "https://podcastindex.org/namespace/1.0") # 
Add podcast namespace 208 | 209 | # --- Root Element Setup --- (Add podcast namespace attribute) 210 | rss = ET.Element( 211 | "rss", 212 | version="2.0", 213 | attrib={ 214 | "xmlns:itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd", 215 | "xmlns:atom": "http://www.w3.org/2005/Atom", 216 | "xmlns:podcast": "https://podcastindex.org/namespace/1.0" # Add podcast namespace 217 | }, 218 | ) 219 | 220 | # Global itunes:explicit setting 221 | global_explicit = ( 222 | "yes" if config["metadata"].get("itunes_explicit", False) else "no" 223 | ) 224 | 225 | # --- Metadata Section --- (Add copyright) 226 | channel = ET.SubElement(rss, "channel") 227 | metadata = config["metadata"] 228 | 229 | # Helper function to get metadata with backward compatibility 230 | def get_meta(key, old_key=None, required=False, default=None): 231 | # If old_key is not provided, use key itself for checking 232 | check_keys = [key] 233 | if old_key: 234 | check_keys.append(old_key) 235 | 236 | value = None 237 | for k in check_keys: 238 | value = metadata.get(k) 239 | if value is not None: 240 | break # Found a value 241 | 242 | if required and value is None: 243 | key_str = f"'{key}'" 244 | if old_key: 245 | key_str += f" or '{old_key}'" 246 | raise ValueError(f"Missing required metadata key: {key_str}") 247 | 248 | return value if value is not None else default 249 | 250 | ET.SubElement(channel, "title").text = metadata[ 251 | "title" 252 | ] # Title is fundamental, no old key needed 253 | ET.SubElement(channel, "description").text = format_description( 254 | metadata["description"] 255 | ) 256 | ET.SubElement(channel, "language").text = metadata.get("language", "en-us") 257 | ET.SubElement(channel, "link").text = metadata["link"] 258 | ET.SubElement( 259 | channel, "generator" 260 | ).text = ( 261 | "Podcast RSS Generator (https://github.com/vpetersson/podcast-rss-generator)" 262 | ) 263 | ET.SubElement( 264 | channel, 265 | "atom:link", 266 | href=get_meta( 267 | "rss_feed_url", 
"rss_feed_url", required=True 268 | ), # Use helper, though no old key needed 269 | rel="self", 270 | type="application/rss+xml", 271 | ) 272 | 273 | # Explicit tag (backward compatibility) 274 | explicit_val = get_meta("explicit", "itunes_explicit", default=False) 275 | explicit_text = "yes" if explicit_val else "no" 276 | ET.SubElement(channel, "itunes:explicit").text = explicit_text 277 | 278 | # Owner/Email (backward compatibility) 279 | email_val = get_meta("email", "itunes_email", required=True) 280 | itunes_owner = ET.SubElement(channel, "itunes:owner") 281 | ET.SubElement(itunes_owner, "itunes:email").text = email_val 282 | 283 | # Author (backward compatibility) 284 | author_val = get_meta("author", "itunes_author", required=True) 285 | ET.SubElement(channel, "itunes:author").text = author_val 286 | 287 | # Summary (use description) 288 | itunes_summary = ET.SubElement(channel, "itunes:summary") 289 | itunes_summary.text = metadata["description"] 290 | 291 | # Category (backward compatibility) 292 | category_val = get_meta("category", "itunes_category") 293 | if category_val: 294 | ET.SubElement(channel, "itunes:category", text=category_val) 295 | 296 | # Image (backward compatibility, already handled) 297 | image_val = get_meta( 298 | "image", "image" 299 | ) # Uses 'image' as both new and old effective key here 300 | if image_val: 301 | itunes_image = ET.SubElement(channel, "itunes:image") 302 | itunes_image.set("href", image_val) 303 | 304 | # Copyright (Optional) 305 | copyright_val = metadata.get("copyright") 306 | if copyright_val: 307 | ET.SubElement(channel, "copyright").text = copyright_val 308 | 309 | # Recommended Channel Elements (Podcast Standards Project) 310 | # podcast:locked 311 | locked_val = get_meta("podcast_locked", default="no") # Default to 'no' (false) 312 | # Ensure the value is either 'yes' or 'no' 313 | locked_text = "yes" if str(locked_val).lower() == "true" or str(locked_val).lower() == "yes" else "no" 314 | 
ET.SubElement(channel, "podcast:locked", owner=email_val).text = locked_text # Requires owner email 315 | 316 | # podcast:guid 317 | # Prefer explicitly defined GUID in config, otherwise generate based on feed URL 318 | guid_val = get_meta("podcast_guid") 319 | if not guid_val: 320 | feed_url_val = get_meta("rss_feed_url", required=True) # Feed URL is required anyway 321 | # Generate UUID v5 based on the feed URL namespace 322 | guid_val = str(uuid.uuid5(uuid.NAMESPACE_URL, feed_url_val)) 323 | print(f"Warning: podcast_guid not found in metadata. Generated GUID: {guid_val}") 324 | print("It is recommended to explicitly set podcast_guid in your config file.") 325 | ET.SubElement(channel, "podcast:guid").text = guid_val 326 | 327 | # --- Episode Processing --- (Add transcript logic) 328 | use_hash_guid = metadata.get("use_asset_hash_as_guid", False) 329 | 330 | for episode in config["episodes"]: 331 | print(f"Processing episode {episode['title']}...") 332 | 333 | # Replace \'Z\' with \'+00:00\' for Python < 3.11 compatibility with fromisoformat 334 | pub_date_str = episode["publication_date"].replace("Z", "+00:00") 335 | # Parse the date string 336 | pub_date = datetime.fromisoformat(pub_date_str) 337 | # If the parsed date is naive (no timezone info), assume it's UTC 338 | if pub_date.tzinfo is None: 339 | pub_date = pub_date.replace(tzinfo=timezone.utc) 340 | 341 | # Now compare the timezone-aware publication date with the current UTC time 342 | if not pub_date < datetime.now(timezone.utc): 343 | print( 344 | f"Skipping episode {episode['title']} as it's not scheduled to be released until {episode['publication_date']}." 
345 | ) 346 | continue 347 | 348 | if skip_asset_verification: 349 | print(f" Skipping asset verification for {episode['asset_url']}") 350 | # Provide default/placeholder values 351 | file_info = { 352 | "content-length": "0", # Required by enclosure 353 | "content-type": "application/octet-stream", # Generic fallback type 354 | "duration": None, 355 | "content_hash": None, 356 | } 357 | else: 358 | file_info = get_file_info(episode["asset_url"]) 359 | 360 | item = ET.SubElement(channel, "item") 361 | ET.SubElement(item, "pubDate").text = convert_iso_to_rfc2822( 362 | pub_date_str 363 | ) 364 | ET.SubElement(item, "title").text = episode["title"] 365 | ET.SubElement(item, "description").text = format_description( 366 | episode["description"] 367 | ) 368 | 369 | # Determine GUID: Use hash if requested and available, else use asset_url 370 | guid_text = episode["asset_url"] # Default 371 | if use_hash_guid and file_info.get("content_hash"): 372 | guid_text = file_info["content_hash"] 373 | print(f" Using content hash for GUID: {guid_text}") 374 | else: 375 | print(f" Using asset URL for GUID: {guid_text}") 376 | 377 | ET.SubElement(item, "guid").text = guid_text 378 | ET.SubElement( 379 | item, 380 | "enclosure", 381 | url=episode["asset_url"], 382 | # Use fetched or default values 383 | type=file_info.get("content-type", "application/octet-stream"), 384 | length=str(file_info.get("content-length", "0")), 385 | ) 386 | 387 | # Apply itunes:explicit setting (check episode first, then global) 388 | episode_explicit_val = episode.get("explicit", episode.get("itunes_explicit")) 389 | if episode_explicit_val is not None: 390 | # Use episode-specific value if present 391 | explicit_text_item = "yes" if episode_explicit_val else "no" 392 | else: 393 | # Fallback to global setting 394 | explicit_text_item = global_explicit 395 | ET.SubElement(item, "itunes:explicit").text = explicit_text_item 396 | 397 | # Add itunes:duration tag if available 398 | if 
file_info.get("duration") is not None: 399 | itunes_duration = ET.SubElement(item, "itunes:duration") 400 | itunes_duration.text = str(file_info["duration"]) 401 | 402 | # iTunes-specific tags 403 | if episode.get("episode") is not None: 404 | itunes_episode = ET.SubElement(item, "itunes:episode") 405 | itunes_episode.text = str(episode["episode"]) 406 | 407 | if episode.get("season") is not None: 408 | itunes_season = ET.SubElement(item, "itunes:season") 409 | itunes_season.text = str(episode["season"]) 410 | 411 | if episode.get("episode_type") is not None: 412 | itunes_episode_type = ET.SubElement(item, "itunes:episodeType") 413 | itunes_episode_type.text = episode["episode_type"] 414 | 415 | # Add link if available, if not, use global 416 | link = ET.SubElement(item, "link") 417 | link.text = episode.get("link", metadata["link"]) 418 | 419 | # Determine the correct image URL (episode-specific or channel default) 420 | # Use episode specific artwork if available, falling back to channel image 421 | image_url = episode.get("image", metadata.get("image")) 422 | 423 | # Creating the 'itunes:image' element if an image URL is available 424 | if image_url: 425 | itunes_image = ET.SubElement(item, "itunes:image") 426 | itunes_image.set("href", image_url) 427 | 428 | # Add transcript links if available 429 | if "transcripts" in episode and isinstance(episode["transcripts"], list): 430 | for transcript_info in episode["transcripts"]: 431 | if "url" in transcript_info and "type" in transcript_info: 432 | # Basic required attributes 433 | transcript_attrs = { 434 | "url": transcript_info["url"], 435 | "type": transcript_info["type"], 436 | } 437 | # Add optional attributes if they exist 438 | if "language" in transcript_info: 439 | transcript_attrs["language"] = transcript_info["language"] 440 | if "rel" in transcript_info: 441 | transcript_attrs["rel"] = transcript_info["rel"] 442 | 443 | ET.SubElement(item, "podcast:transcript", attrib=transcript_attrs) 444 | else: 445 | 
print(f" Skipping invalid transcript entry for episode {episode['title']}: {transcript_info}") 446 | 447 | tree = ET.ElementTree(rss) 448 | tree.write(output_file_path, encoding="UTF-8", xml_declaration=True) 449 | 450 | 451 | def main(): 452 | parser = argparse.ArgumentParser(description="Process some parameters.") 453 | 454 | parser.add_argument( 455 | "--input-file", type=str, default="podcast_config.yaml", help="Input YAML file" 456 | ) 457 | parser.add_argument( 458 | "--output-file", type=str, default="podcast_feed.xml", help="Output XML file" 459 | ) 460 | parser.add_argument( 461 | "--skip-asset-verification", 462 | action="store_true", # Makes it a boolean flag 463 | help="Skip HTTP HEAD and ffprobe checks for asset URLs (use for testing/fake URLs)" 464 | ) 465 | 466 | # Parse arguments from the command line 467 | args = parser.parse_args() 468 | 469 | print(f"Input file: {args.input_file}, Output file: {args.output_file}") 470 | if args.skip_asset_verification: 471 | print("Skipping asset verification.") 472 | 473 | config = read_podcast_config(args.input_file) 474 | generate_rss(config, args.output_file, skip_asset_verification=args.skip_asset_verification) 475 | 476 | 477 | if __name__ == "__main__": 478 | main() 479 | -------------------------------------------------------------------------------- /tests/test_rss_generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from xml.etree import ElementTree as ET 4 | from unittest.mock import patch, MagicMock 5 | from datetime import datetime, timezone, timedelta 6 | 7 | # Set test mode before importing the module 8 | os.environ["RSS_GENERATOR_TEST_MODE"] = "true" 9 | 10 | from rss_generator import ( 11 | convert_iso_to_rfc2822, 12 | generate_rss, 13 | get_file_info, 14 | read_podcast_config, 15 | ) 16 | 17 | CONFIG_FILE = "podcast_config.example.yaml" 18 | 19 | 20 | class TestRSSGenerator(unittest.TestCase): 21 | @classmethod 22 | 
    def setUpClass(cls):
        """Generate both feed variants (new keys and legacy keys) once for all tests."""
        # Read the configuration and generate the RSS feed once for all tests
        # Use the updated example config with non-prefixed keys
        cls.config = read_podcast_config(CONFIG_FILE)

        # --- Generate feed based on the example config (using new keys) ---
        generate_rss(cls.config, "test_podcast_feed_new_keys.xml")
        cls.tree_new = ET.parse("test_podcast_feed_new_keys.xml")
        cls.root_new = cls.tree_new.getroot()
        cls.channel_new = cls.root_new.find("channel")

        # --- Generate feed using old keys for backward compatibility testing ---
        cls.config_old = read_podcast_config(CONFIG_FILE)  # Read again
        # Rename keys back to old format for this test config
        metadata_old = cls.config_old["metadata"]
        metadata_old["itunes_email"] = metadata_old.pop("email")
        metadata_old["itunes_author"] = metadata_old.pop("author")
        metadata_old["itunes_category"] = metadata_old.pop("category")
        metadata_old["itunes_explicit"] = metadata_old.pop("explicit")
        # image key remains 'image'
        generate_rss(cls.config_old, "test_podcast_feed_old_keys.xml")
        cls.tree_old = ET.parse("test_podcast_feed_old_keys.xml")
        cls.root_old = cls.tree_old.getroot()
        cls.channel_old = cls.root_old.find("channel")

        # Add podcast namespace for transcript testing
        cls.ns = {
            "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
            "podcast": "https://podcastindex.org/namespace/1.0",
        }

    def test_config_structure(self):
        """The example config exposes the new (non-prefixed) metadata keys."""
        # Test structure based on the primary config (new keys)
        self.assertIn("metadata", self.config)
        self.assertIn("episodes", self.config)
        self.assertIn("image", self.config["metadata"])
        self.assertIn("email", self.config["metadata"])  # Check new key presence
        self.assertIn("author", self.config["metadata"])  # Check new key presence
        self.assertIn("category", self.config["metadata"])  # Check new key presence
        self.assertIn("explicit", self.config["metadata"])  # Check new key presence

    def test_rss_structure(self):
        """Both feeds have an <rss> root with a <channel> child."""
        # Test general structure on both generated feeds
        self.assertEqual(self.root_new.tag, "rss")
        self.assertIsNotNone(self.channel_new)
        self.assertEqual(self.root_old.tag, "rss")
        self.assertIsNotNone(self.channel_old)

    def test_channel_structure(self):
        """Required channel tags (and optional copyright) exist in both feeds."""
        # Test basic channel tags on both feeds
        required_tags = ["title", "description", "language", "link"]
        # Check for optional copyright tag if present in config
        if "copyright" in self.config["metadata"]:
            required_tags.append("copyright")

        for tag in required_tags:
            self.assertIsNotNone(self.channel_new.find(tag), f"[New Keys] Missing tag: {tag}")
            self.assertIsNotNone(self.channel_old.find(tag), f"[Old Keys] Missing tag: {tag}")

    def test_itunes_tags_in_channel(self):
        """iTunes channel tags exist and carry the configured values in both feeds."""
        # Test iTunes tags presence in channel for both feeds
        itunes_tags = [
            "itunes:explicit",
            "itunes:owner",  # Contains itunes:email
            "itunes:author",
            "itunes:image",
            "itunes:category",
        ]
        for tag in itunes_tags:
            self.assertIsNotNone(
                self.channel_new.find(tag, self.ns),
                f"[New Keys] Missing iTunes tag in channel: {tag}",
            )
            self.assertIsNotNone(
                self.channel_old.find(tag, self.ns),
                f"[Old Keys] Missing iTunes tag in channel: {tag}",
            )

        # Check specific values to ensure correctness
        self.assertEqual(
            self.channel_new.find("itunes:author", self.ns).text,
            self.config["metadata"]["author"],
        )
        self.assertEqual(
            self.channel_old.find("itunes:author", self.ns).text,
            self.config_old["metadata"]["itunes_author"],
        )

        self.assertEqual(
            self.channel_new.find("itunes:owner/itunes:email", self.ns).text,
            self.config["metadata"]["email"],
        )
        self.assertEqual(
            self.channel_old.find("itunes:owner/itunes:email", self.ns).text,
            self.config_old["metadata"]["itunes_email"],
        )

        explicit_new = self.channel_new.find("itunes:explicit", self.ns).text
        explicit_old = self.channel_old.find("itunes:explicit", self.ns).text
        self.assertEqual(
            explicit_new, "no" if not self.config["metadata"]["explicit"] else "yes"
        )
        self.assertEqual(
            explicit_old,
            "no" if not self.config_old["metadata"]["itunes_explicit"] else "yes",
        )

    def test_episode_structure(self):
        """Each configured episode appears as an <item> with enclosure and transcripts."""
        # Check episode structure based on new keys config (should be same for old)
        for episode in self.config["episodes"]:
            title = episode["title"]
            item_new = self.channel_new.find(f"item[title='{title}']")
            self.assertIsNotNone(
                item_new, f"[New Keys] Missing item for episode: {title}"
            )
            self.assertIsNotNone(
                item_new.find("enclosure"),
                f"[New Keys] Missing enclosure tag for episode: {title}",
            )
            # Optionally check on old keys feed too, assuming structure is identical
            item_old = self.channel_old.find(f"item[title='{title}']")
            self.assertIsNotNone(
                item_old, f"[Old Keys] Missing item for episode: {title}"
            )
            self.assertIsNotNone(
                item_old.find("enclosure"),
                f"[Old Keys] Missing enclosure tag for episode: {title}",
            )

            # Check for transcript tags if present in episode config
            if "transcripts" in episode and isinstance(episode["transcripts"], list):
                transcript_tags_new = item_new.findall("podcast:transcript", self.ns)
                transcript_tags_old = item_old.findall("podcast:transcript", self.ns)
                self.assertEqual(len(transcript_tags_new), len(episode["transcripts"]),
                                 f"[New Keys] Episode '{title}' transcript tag count mismatch")
                self.assertEqual(len(transcript_tags_old), len(episode["transcripts"]),
                                 f"[Old Keys] Episode '{title}' transcript tag count mismatch")

                # Verify attributes for *all* transcripts
                for i, transcript_config in enumerate(episode["transcripts"]):
                    tag_new = transcript_tags_new[i]
                    tag_old = transcript_tags_old[i]  # Assuming order is preserved

                    # Check New Keys Feed
                    self.assertEqual(tag_new.get("url"), transcript_config["url"], f"[New Keys] Episode '{title}' transcript {i+1} URL mismatch")
                    self.assertEqual(tag_new.get("type"), transcript_config["type"], f"[New Keys] Episode '{title}' transcript {i+1} type mismatch")
                    if "language" in transcript_config:
                        self.assertEqual(tag_new.get("language"), transcript_config["language"], f"[New Keys] Episode '{title}' transcript {i+1} language mismatch")
                    else:
                        self.assertIsNone(tag_new.get("language"), f"[New Keys] Episode '{title}' transcript {i+1} should not have language")

                    # Check Old Keys Feed (assuming transcript logic remains the same)
                    self.assertEqual(tag_old.get("url"), transcript_config["url"], f"[Old Keys] Episode '{title}' transcript {i+1} URL mismatch")
                    self.assertEqual(tag_old.get("type"), transcript_config["type"], f"[Old Keys] Episode '{title}' transcript {i+1} type mismatch")
                    if "language" in transcript_config:
                        self.assertEqual(tag_old.get("language"), transcript_config["language"], f"[Old Keys] Episode '{title}' transcript {i+1} language mismatch")
                    else:
                        self.assertIsNone(tag_old.get("language"), f"[Old Keys] Episode '{title}' transcript {i+1} should not have language")

    def test_episode_itunes_tags(self):
        """Optional per-episode iTunes tags mirror the config; image tag always exists."""
        # Check episode tags based on new keys config
        for i, item in enumerate(self.channel_new.findall("item")):
            episode_config = self.config["episodes"][i]
            # Check for optional tags only if they exist in the config
            if "episode" in episode_config:
                itunes_episode = item.find("itunes:episode", self.ns)
                self.assertIsNotNone(
                    itunes_episode,
                    f"[New Keys] Missing itunes:episode tag in episode {i+1} when config has 'episode' key",
                )
                self.assertEqual(str(episode_config["episode"]), itunes_episode.text)

            if "season" in episode_config:
                itunes_season = item.find("itunes:season", self.ns)
                self.assertIsNotNone(
                    itunes_season,
                    f"[New Keys] Missing itunes:season tag in episode {i+1} when config has 'season' key",
                )
                self.assertEqual(str(episode_config["season"]), itunes_season.text)

            if "episode_type" in episode_config:
                itunes_episode_type = item.find("itunes:episodeType", self.ns)
                self.assertIsNotNone(
                    itunes_episode_type,
                    f"[New Keys] Missing itunes:episodeType tag in episode {i+1} when config has 'episode_type' key",
                )
                self.assertEqual(
                    episode_config["episode_type"], itunes_episode_type.text
                )

            # Test for episode-specific itunes:image tag (this should always exist due to fallback)
            itunes_image_tag = item.find("itunes:image", self.ns)
            self.assertIsNotNone(
                itunes_image_tag, "[New Keys] Missing itunes:image tag in episode"
            )
            # Could add similar loop for self.channel_old if needed, but logic is channel-level

    def test_episode_image_fallback(self):
        """Test image fallback on both new and old key feeds."""
        # Test with New Keys feed
        channel_image_url_new = self.config["metadata"]["image"]
        for i, item in enumerate(self.channel_new.findall("item")):
            episode_config = self.config["episodes"][i]
            item_image = item.find("itunes:image", self.ns)
            self.assertIsNotNone(
                item_image, f"[New Keys] Episode {i+1} missing itunes:image tag"
            )
            item_image_url = item_image.get("href")
            if "image" in episode_config:
                self.assertEqual(
                    item_image_url,
                    episode_config["image"],
                    f"[New Keys] Episode {i+1} specific image URL mismatch",
                )
            else:
                self.assertEqual(
                    item_image_url,
                    channel_image_url_new,
                    f"[New Keys] Episode {i+1} fallback image URL mismatch",
                )

        # Test with Old Keys feed
        channel_image_url_old = self.config_old["metadata"][
            "image"
        ]  # Still 'image' key here
        for i, item in enumerate(self.channel_old.findall("item")):
            episode_config = self.config_old["episodes"][
                i
            ]  # Use old config for checking episode key
            item_image = item.find("itunes:image", self.ns)
            self.assertIsNotNone(
                item_image, f"[Old Keys] Episode {i+1} missing itunes:image tag"
            )
            item_image_url = item_image.get("href")
            if (
                "image" in episode_config
            ):  # Episode image key is 'image' in both configs
                self.assertEqual(
                    item_image_url,
                    episode_config["image"],
                    f"[Old Keys] Episode {i+1} specific image URL mismatch",
                )
            else:
                self.assertEqual(
                    item_image_url,
                    channel_image_url_old,
                    f"[Old Keys] Episode {i+1} fallback image URL mismatch",
                )

    def test_date_conversion(self):
        """ISO 8601 dates from the config are converted to RFC 2822 format."""
        # Use a date from the example config for a more reliable test
        test_date = self.config["episodes"][0]["publication_date"]
        rfc_date = convert_iso_to_rfc2822(test_date)
        # Expected format based on "2023-01-15T10:00:00Z"
        self.assertTrue(rfc_date.startswith("Sun, 15 Jan 2023 10:00:00"))

    def test_file_info_retrieval(self):
        """get_file_info returns string content-length/content-type for each asset."""
        # Test on new keys config (should be same for old)
        for episode in self.config["episodes"]:
            file_info = get_file_info(episode["asset_url"])
            self.assertIsInstance(file_info["content-length"], str)
            self.assertIsInstance(file_info["content-type"], str)

    def test_guid_logic(self):
        """Test GUID generation with and without use_asset_hash_as_guid flag."""

        base_config = read_podcast_config(CONFIG_FILE)
        test_url = base_config["episodes"][0]["asset_url"]
        expected_sha256_guid = "sha256:test-sha256-hash"
        expected_gcs_md5_guid = "md5:test-gcs-md5-base64"
        expected_etag_guid_md5 = "etag:d41d8cd98f00b204e9800998ecf8427e"
        expected_etag_guid_multi = "etag:multipart-etag-abc-1"

        # Each scenario: (metadata override, mocked response headers,
        # expected GUID text, human-readable description).
        scenarios = [
            # Default behavior (flag false or missing)
            ({"use_asset_hash_as_guid": False}, {}, test_url, "Default (flag false)"),
            ({}, {}, test_url, "Default (flag missing)"),
            # Flag true, testing header priority and fallback
            (
                {"use_asset_hash_as_guid": True},
                {"x-amz-checksum-sha256": "test-sha256-hash"},
                expected_sha256_guid,
                "Flag true, SHA256 header",
            ),
            ({"use_asset_hash_as_guid": True}, {"x-goog-hash": "crc32c=AAA,md5=test-gcs-md5-base64"}, expected_gcs_md5_guid, "Flag true, GCS MD5 header"),
            # ETag scenarios (now prefixed with etag:)
            ({"use_asset_hash_as_guid": True}, {"ETag": '"d41d8cd98f00b204e9800998ecf8427e"'}, expected_etag_guid_md5, "Flag true, ETag (MD5-like)"),
            ({"use_asset_hash_as_guid": True}, {"ETag": '"multipart-etag-abc-1"'}, expected_etag_guid_multi, "Flag true, ETag (Multipart)"),
            # Priority: SHA256 > GCS MD5 > ETag
            ({"use_asset_hash_as_guid": True}, {"x-amz-checksum-sha256": "test-sha256-hash", "ETag": '"any-etag"'}, expected_sha256_guid, "Flag true, SHA256 takes priority over ETag"),
            ({"use_asset_hash_as_guid": True}, {"x-goog-hash": "crc32c=AAA,md5=test-gcs-md5-base64", "ETag": '"any-etag"'}, expected_gcs_md5_guid, "Flag true, GCS MD5 takes priority over ETag"),
            # Fallback if no headers found
            ({"use_asset_hash_as_guid": True}, {}, test_url, "Flag true, No hash headers fallback"),
        ]

        for meta_override, mock_headers, expected_guid, description in scenarios:
            with self.subTest(description=description):
                test_config = read_podcast_config(CONFIG_FILE)  # Reset config
                test_config["metadata"].update(meta_override)

                # Mock the requests.head call within the _make_http_request scope
                mock_response = MagicMock()
                mock_response.headers = {
                    "content-length": "1000",  # Need basic headers for get_file_info
                    "content-type": "audio/mpeg",
                    **mock_headers,  # Add scenario-specific headers
                }
                mock_response.url = test_url  # Needed for ffprobe call

                # We patch _make_http_request which is called by get_file_info
                # We also need to patch _run_ffprobe_with_retry to avoid external calls
                with patch(
                    "rss_generator._make_http_request", return_value=mock_response
                ), patch(
                    "rss_generator._run_ffprobe_with_retry",
                    return_value='streams.stream.0.duration="123"',
                ):
                    output_filename = f"test_guid_{description.replace(' ', '_')}.xml"
                    generate_rss(test_config, output_filename)

                    tree = ET.parse(output_filename)
                    root = tree.getroot()
                    channel = root.find("channel")
                    # Check the GUID of the first item
                    item = channel.find("item")
                    self.assertIsNotNone(item)
                    guid_tag = item.find("guid")
                    self.assertIsNotNone(guid_tag)
                    self.assertEqual(guid_tag.text, expected_guid)

                    if os.path.exists(output_filename):
                        os.remove(output_filename)

    def test_date_comparison_with_naive_datetime(self):
        """Test that future-dated episodes with naive datetime strings are skipped."""
        # Create a config with a future date without timezone info
        future_naive_date = (datetime.now(timezone.utc) + timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%S")
        test_config = {
            "metadata": self.config["metadata"].copy(),  # Use existing valid metadata
            "episodes": [
                {
                    "title": "Future Episode (Naive)",
                    "description": "Test description",
                    "publication_date": future_naive_date,
                    "asset_url": "http://example.com/future_naive.mp3",
                }
            ]
        }
        # Mock get_file_info to avoid network calls
        mock_file_info = {
            "content-length": "1000",
            "content-type": "audio/mpeg",
            "duration": 120,
            "content_hash": None,
        }
        with patch("rss_generator.get_file_info", return_value=mock_file_info):
            generate_rss(test_config, "test_naive_date_feed.xml")

        # Assert the feed was generated but contains no items (because the episode was skipped)
        tree = ET.parse("test_naive_date_feed.xml")
        root = tree.getroot()
        channel = root.find("channel")
        items = channel.findall("item")
        self.assertEqual(len(items), 0, "Future episode with naive datetime should have been skipped")

        if os.path.exists("test_naive_date_feed.xml"):
            os.remove("test_naive_date_feed.xml")

    @classmethod
    def tearDownClass(cls):
        """Remove the feed files generated in setUpClass."""
        # Clean up both generated files
        if os.path.exists("test_podcast_feed_new_keys.xml"):
            os.remove("test_podcast_feed_new_keys.xml")
        if os.path.exists("test_podcast_feed_old_keys.xml"):
            os.remove("test_podcast_feed_old_keys.xml")


if __name__ == "__main__":
    unittest.main()