├── .dprint.json ├── .github └── workflows │ └── python-tests.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── action.yaml ├── podcast_config.example.yaml ├── requirements.txt ├── rss_generator.py └── tests └── test_rss_generator.py /.dprint.json: -------------------------------------------------------------------------------- 1 | { 2 | "lineWidth": 0, 3 | "markdown": { 4 | "textWrap": "maintain", 5 | "lineWidth": 0 6 | }, 7 | "includes": ["**/*.md"], 8 | "excludes": [ 9 | "**/node_modules", 10 | "**/*-lock.json", 11 | "**/target" 12 | ], 13 | "plugins": [ 14 | "https://plugins.dprint.dev/markdown-0.16.3.wasm" 15 | ] 16 | } -------------------------------------------------------------------------------- /.github/workflows/python-tests.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Python Unit Tests and Linting 3 | 4 | on: 5 | push: 6 | branches: 7 | - master 8 | pull_request: 9 | branches: 10 | - master 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up Python 3.8 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 3.8 24 | 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -r requirements.txt 29 | pip install flake8 30 | pip install yamllint 31 | sudo apt-get update && sudo apt-get install ffmpeg 32 | 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | 38 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 39 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 40 | 41 | 42 | - name: Ensure example file passes YAMLlint 43 | run: | 44 | yamllint -fgithub -d "{rules: {line-length: false}}" podcast_config.example.yaml 45 | 46 | - name: Generate Feed from Example 47 | run: python rss_generator.py --input-file podcast_config.example.yaml --output-file podcast_feed.xml --skip-asset-verification 48 | 49 | - name: Install xq 50 | run: | 51 | wget -q https://github.com/sibprogrammer/xq/releases/download/v1.2.3/xq_1.2.3_linux_amd64.tar.gz 52 | tar xfz xq_1.2.3_linux_amd64.tar.gz 53 | sudo mv xq /usr/local/bin/ 54 | 55 | - name: Validate Feed XML with xq 56 | run: xq . podcast_feed.xml 57 | 58 | - name: Run tests 59 | run: | 60 | python -m unittest discover tests 61 | 62 | - name: Upload Test Feed Artifact 63 | uses: actions/upload-artifact@v4 64 | with: 65 | name: podcast-feed-xml 66 | path: podcast_feed.xml 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | # Exclude output from script 163 | podcast_config.yaml 164 | podcast_feed.xml 165 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | # Install ffmpeg 4 | RUN apt-get update && \ 5 | apt-get install -y ffmpeg && \ 6 | rm -rf /var/lib/apt/lists/* 7 | 8 | # Copy your script and requirements file 9 | COPY rss_generator.py /rss_generator.py 10 | COPY requirements.txt /requirements.txt 11 | 12 | # Install Python dependencies 13 | RUN pip install --no-cache-dir -r /requirements.txt 14 | 15 | # Set the entrypoint to your script 16 | ENTRYPOINT ["python", "/rss_generator.py"] 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Viktor Petersson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Podcast RSS Generator 2 | 3 | [![Python Unit Tests and Linting](https://github.com/vpetersson/podcast-rss-generator/actions/workflows/python-tests.yml/badge.svg)](https://github.com/vpetersson/podcast-rss-generator/actions/workflows/python-tests.yml) 4 | 5 | ## Description 6 | 7 | This an RSS Feed Generator is designed to generate an RSS feed for audio/video podcasts, reading metadata and episode data from a YAML file. 8 | 9 | It assumes that you self-host your video episodes somewhere (e.g. S3/GCS/R2) as well as the output of this script. You can then point YouTube/Spotify/Apple Podcast to this path. 10 | 11 | This tool was written for my podcast [Nerding Out with Viktor](https://vpetersson.com/podcast/) to solve for the fact that Apple's [Podcast Connect](https://podcastsconnect.apple.com) require you to self-host videos in order to publish. 12 | 13 | I also wrote an article on how you can use this tool to automatically turn a video podcast into audio in [this article](https://vpetersson.com/2024/06/27/video-to-audio-podcast.html). 
14 | 15 | ## Features 16 | 17 | - Generates RSS feed for audio/video podcasts 18 | - Reads podcast metadata and episode data from a YAML file 19 | - Converts ISO format dates to RFC 2822 format 20 | - Attempts to follow [The Podcast RSS Standard](https://github.com/Podcast-Standards-Project/PSP-1-Podcast-RSS-Specification) 21 | 22 | ## Known Issues 23 | 24 | - Videos uploaded to YouTube [via RSS](https://support.google.com/youtube/answer/13525207?hl=en#zippy=%2Ccan-i-deliver-an-rss-feed-if-i-already-have-a-podcast-on-youtube) will be uploaded as audio. 25 | - Spotify can't handle videos via RSS yet. You will be able to see the episodes in Podcaster, but they will not be processed and sent to Spotify properly. This is apparently a known issue that they are working on resolving. 26 | 27 | The workaround for the above issues is to manually upload the episodes. 28 | 29 | ## Installation 30 | 31 | ### Prerequisites 32 | 33 | - Python 3.8 or higher 34 | - pip (Python package installer) 35 | - ffmpeg 36 | 37 | ### Setup 38 | 39 | 1. **Clone the Repository** 40 | 41 | ```bash 42 | $ git clone https://github.com/vpetersson/podcast-rss-generator.git 43 | $ cd podcast-rss-generator 44 | ``` 45 | 46 | 2. **Install Dependencies** 47 | 48 | ```bash 49 | $ pip install -r requirements.txt 50 | ``` 51 | 52 | **Optional:** Install `yamllint`, `xq` and `flake8`. 53 | 54 | ## Usage 55 | 56 | ```bash 57 | $ python rss_generator.py --help 58 | usage: rss_generator.py [-h] [--input-file INPUT_FILE] [--output-file OUTPUT_FILE] [--skip-asset-verification] 59 | 60 | Process some parameters. 61 | 62 | options: 63 | -h, --help show this help message and exit 64 | --input-file INPUT_FILE 65 | Input YAML file 66 | --output-file OUTPUT_FILE 67 | Output XML file 68 | --skip-asset-verification 69 | Skip HTTP HEAD and ffprobe checks for asset URLs (use for testing/fake URLs) 70 | ``` 71 | 72 | 1. 
**Prepare Your Data Files** 73 | 74 | Copy `podcast_config.example.yaml` to `podcast_config.yaml` and fill out your podcast metadata and eepisodes. 75 | 76 | The `podcast_config.yaml` file contains two main sections: `metadata` and `episodes`. 77 | 78 | ### Metadata Section 79 | 80 | This section contains general information about your podcast: 81 | 82 | | Key | Description | Notes | 83 | | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 84 | | `title` | The title of your podcast. | Required | 85 | | `description` | A description of your podcast. | Required. Markdown is supported. | 86 | | `link` | The URL of the main website for your podcast. | Required. Also the default link for episodes if not provided per-episode. | 87 | | `rss_feed_url` | The public URL where your generated `podcast_feed.xml` will be hosted. | Required | 88 | | `language` | The language of the podcast (e.g., `en-us`). | Optional. Default: `en-us`. | 89 | | `email` | The contact email for the podcast owner. | Required. Backward compatibility: `itunes_email`. Used for ``. | 90 | | `author` | The author name(s). | Required. Backward compatibility: `itunes_author`. | 91 | | `category` | The primary category for iTunes. | Optional. 
Backward compatibility: `itunes_category`. | 92 | | `image` | The URL for the main podcast cover art (JPEG or PNG, 1400x1400 to 3000x3000 pixels). | Required. Also the default image for episodes if not provided per-episode. | 93 | | `explicit` | Indicates if the podcast contains explicit content. | Optional (`true`/`false`). Default: `false`. Backward compatibility: `itunes_explicit`. Can be overridden per-episode. | 94 | | `use_asset_hash_as_guid` | Use a content hash or ETag from the asset file\'s headers as the episode\'s ``. Prioritizes `x-amz-checksum-sha256`, then `x-goog-hash` (MD5 part), then `ETag`. **Warning:** This can break GUID permanence if asset files change or are re-uploaded, potentially causing re-downloads for subscribers. | Optional (`true`/`false`). Default: `false` (uses `asset_url`). | 95 | | `copyright` | A string containing the copyright notice for the podcast. | Optional. | 96 | | `podcast_locked` | Tells platforms not to import the feed without confirming ownership via the `email` address. | Optional (`yes`/`no`). Default: `no`. Based on [Podcast Standards Project](https://github.com/Podcast-Standards-Project/PSP-1-Podcast-RSS-Specification). | 97 | | `podcast_guid` | A globally unique, permanent identifier (UUID recommended) for the *entire podcast show*. | Optional. If omitted, a stable UUIDv5 is generated from `rss_feed_url`. Strongly recommended to set explicitly. Based on [Podcast Standards Project](https://github.com/Podcast-Standards-Project/PSP-1-Podcast-RSS-Specification). | 98 | 99 | ### Episodes Section 100 | 101 | This section is a list of your podcast episodes. 
Each episode is an object with the following fields: 102 | 103 | | Key | Description | Notes | 104 | | ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | 105 | | `title` | The title of the episode. | Required. | 106 | | `description` | A description of the episode. | Required. Markdown is supported. | 107 | | `publication_date` | The date and time the episode was published. Episodes with future dates will not be included. | Required. ISO 8601 format (e.g., `2023-01-15T10:00:00Z`). | 108 | | `asset_url` | The direct URL to the audio or video file for the episode. | Required. | 109 | | `link` | The URL for a webpage specific to this episode. | Optional. Defaults to the global `link` from `metadata`. | 110 | | `image` | The URL for artwork specific to this episode (same format requirements as the main podcast image). | Optional. Defaults to the global `image` from `metadata`. | 111 | | `episode` | The episode number. | Optional. Integer. | 112 | | `season` | The season number. | Optional. Integer. | 113 | | `episode_type` | Defines the type of content for the episode. | Optional. Can be `full` (default), `trailer`, or `bonus`. | 114 | | `explicit` | Indicates if this specific episode contains explicit content. | Optional (`true`/`false`). Overrides the global `explicit` setting for this episode. Backward compatibility: `itunes_explicit`. | 115 | | `transcripts` | A list of transcript files associated with the episode. Each item is an object with `url` (required), `type` (required), `language` (opt), `rel` (opt). | Optional. See example config for structure. | 116 | 117 | 2. 
**Generate the RSS Feed** 118 | 119 | Make sure your YAML is valid: 120 | 121 | ```bash 122 | $ yamllint podcast_config.yaml 123 | ``` 124 | 125 | Generate your `podcast_feed.xml` file: 126 | 127 | ```bash 128 | $ python rss_generator.py 129 | ``` 130 | 131 | Now copy your `podcast_feed.xml` to S3/GCS/R2 using a tool like `s3cmd`, `aws` or `mc` (from Minio). 132 | 133 | You can verify your RSS feed using a tool like [Podbase](https://podba.se/validate/). 134 | 135 | ## **Optional:** Optimize video 136 | 137 | If you're dealing with video podcasts, the file size matters for obvious reasons. What I do is to export the video as h624 from my video editor (which I upload to YouTube and Spotify). 138 | 139 | I then re-encode the h264 video to h265 for other platforms using `ffmpeg` with the following command (on macOS): 140 | 141 | ```bash 142 | $ ffmpeg -i input.mp4 \ 143 | -tag:v hvc1 \ 144 | -c:v hevc_videotoolbox \ 145 | -crf 28 \ 146 | -preset slowest \ 147 | -c:a aac \ 148 | -b:a 128k \ 149 | -movflags faststart \ 150 | output.mp4 151 | ``` 152 | 153 | ## Usage with GitHub Actions 154 | 155 | To incorporate this action into your workflow, follow these steps: 156 | 157 | 1. **Create a Workflow File**: 158 | - In your repository, create a new file under `.github/workflows`, for example, `rss_workflow.yml`. 159 | 160 | 2. 
**Set Up the Workflow**: 161 | - Use the following configuration as a starting point: 162 | 163 | ```yaml 164 | name: Generate Podcast RSS Feed 165 | 166 | on: [push, pull_request] 167 | 168 | env: 169 | R2_BUCKET: 'foobar' 170 | jobs: 171 | generate-rss: 172 | runs-on: ubuntu-latest 173 | 174 | steps: 175 | - name: Checkout repository 176 | uses: actions/checkout@v2 177 | 178 | - name: Install yamllint 179 | run: | 180 | sudo apt-get update 181 | sudo apt-get install yamllint 182 | 183 | - name: Lint YAML file 184 | run: yamllint podcast_config.yaml 185 | 186 | - name: Run Podcast RSS Generator 187 | uses: vpetersson/podcast-rss-generator@master 188 | with: 189 | input_file: 'podcast_config.yaml' 190 | output_file: 'podcast_feed.xml' 191 | 192 | - name: Validate output with xq 193 | run: | 194 | wget -q https://github.com/sibprogrammer/xq/releases/download/v1.2.3/xq_1.2.3_linux_amd64.tar.gz 195 | tar xfz xq_1.2.3_linux_amd64.tar.gz 196 | cat podcast_feed.xml | ./xq 197 | 198 | - uses: actions/upload-artifact@v2 199 | with: 200 | name: podcast_feed.xml 201 | path: podcast_feed.xml 202 | 203 | deploy: 204 | runs-on: ubuntu-latest 205 | needs: generate-rss 206 | if: github.ref == 'refs/heads/master' 207 | steps: 208 | - uses: actions/download-artifact@v2 209 | with: 210 | name: podcast_feed.xml 211 | 212 | - name: Install mc 213 | run: | 214 | wget -q https://dl.min.io/client/mc/release/linux-amd64/mc 215 | chmod +x mc 216 | 217 | - name: Set up mc 218 | env: 219 | R2_ENDPOINT: ${{ secrets.R2_ENDPOINT }} 220 | R2_KEY_ID: ${{ secrets.R2_KEY_ID }} 221 | R2_KEY_SECRET: ${{ secrets.R2_KEY_SECRET }} 222 | run: ./mc alias set r2-storage ${R2_ENDPOINT} ${R2_KEY_ID} ${R2_KEY_SECRET} 223 | 224 | - name: Copy file 225 | run: ./mc cp podcast_feed.xml r2-storage/${R2_BUCKET}/ 226 | ``` 227 | 228 | 3. **Customize Your Workflow**: 229 | - Adjust paths to the YAML configuration and the output XML files as per your repository structure. 
230 | - Ensure the `uses` field points to `vpetersson/podcast-rss-generator@master` (or specify a specific release tag/version instead of `master`). 231 | 232 | 4. **Commit and Push Your Workflow**: 233 | - Once you commit this workflow file to your repository, the action will be triggered based on the defined events (e.g., on push or pull request). 234 | 235 | ### Inputs 236 | 237 | - `input_file`: Path to the input YAML file. Default: `podcast_config.yaml`. 238 | - `output_file`: Path for the generated RSS feed XML file. Default: `podcast_feed.xml`. 239 | 240 | ## Running Tests 241 | 242 | To run unit tests, use: 243 | 244 | ```bash 245 | $ python -m unittest discover tests 246 | ``` 247 | 248 | ## Contributing 249 | 250 | Contributions to this project are welcome! Please follow these steps: 251 | 252 | 1. Fork the repository. 253 | 2. Create a new branch for your feature. 254 | 3. Commit your changes. 255 | 4. Push to the branch. 256 | 5. Submit a pull request. 257 | 258 | ## License 259 | 260 | [MIT License](LICENSE) 261 | -------------------------------------------------------------------------------- /action.yaml: -------------------------------------------------------------------------------- 1 | name: 'Podcast RSS Generator' 2 | description: 'Generates a podcast RSS feed from a YAML configuration' 3 | author: 'Viktor Petersson' 4 | branding: 5 | icon: 'rss' 6 | color: 'purple' 7 | 8 | runs: 9 | using: 'docker' 10 | image: 'Dockerfile' 11 | args: 12 | - "--input-file" 13 | - ${{ inputs.input_file }} 14 | - "--output-file" 15 | - ${{ inputs.output_file }} 16 | 17 | inputs: 18 | input_file: 19 | description: 'Input YAML file' 20 | required: true 21 | default: 'podcast_config.yaml' 22 | output_file: 23 | description: 'Output XML file' 24 | required: true 25 | default: 'podcast_feed.xml' 26 | -------------------------------------------------------------------------------- /podcast_config.example.yaml: 
-------------------------------------------------------------------------------- 1 | --- 2 | metadata: 3 | title: "My Awesome Podcast" 4 | description: "A podcast about technology and programming." 5 | link: "https://example.com/podcast" # Fallback link 6 | rss_feed_url: "https://example.com/podcast/feed.xml" 7 | language: "en-us" 8 | email: "podcast@example.com" 9 | author: "Podcast Host" 10 | category: "Technology" 11 | image: "https://example.com/podcast/images/podcast_cover.png" # Fallback image 12 | explicit: false 13 | use_asset_hash_as_guid: false # Default: Use asset_url. Set to true to use content hash (SHA256/MD5/ETag). 14 | copyright: "© 2024 Your Name/Company" # Optional copyright notice 15 | # --- Podcast Standards Project Recommended Tags --- # 16 | podcast_locked: "no" # "yes" or "no". Prevents feed import by other platforms without owner confirmation (uses email above). 17 | # podcast_guid: "YOUR-PODCAST-GUID-HERE" # A unique, permanent ID for the entire podcast (UUID recommended). 18 | # If omitted, a GUID will be generated based on rss_feed_url. 19 | # Generate one using: python -c "import uuid; print(uuid.uuid4())" or an online generator. 20 | # Example: 9b024349-ccf0-5f69-a609-6b82873eab3c 21 | 22 | episodes: 23 | - title: "Episode 1: The Beginning" 24 | description: "Introduction to the podcast." 25 | publication_date: "2023-01-15T10:00:00Z" 26 | asset_url: "https://example.com/podcast/episodes/episode1.mp3" 27 | episode: 1 28 | season: 1 29 | episode_type: "full" 30 | # This episode will use the global link and image from metadata 31 | transcripts: 32 | - url: "https://example.com/podcast/transcripts/episode1.srt" 33 | type: "application/x-subrip" 34 | - url: "https://example.com/podcast/transcripts/episode1.vtt" 35 | type: "text/vtt" 36 | language: "en" # Optional: Language code (e.g., "en", "en-US", "es"). Follows RFC 5646. 37 | 38 | - title: "Episode 2: Deep Dive into Python" 39 | description: "Exploring advanced Python features." 
40 | publication_date: "2023-01-22T10:00:00Z" 41 | asset_url: "https://example.com/podcast/episodes/episode2.mp3" 42 | link: "https://example.com/podcast/episodes/2" # Episode-specific link 43 | image: "https://example.com/podcast/images/episode2_cover.png" # Episode-specific image 44 | episode: 2 45 | season: 1 46 | episode_type: "full" 47 | # This episode has specific link/image but no transcripts 48 | 49 | - title: "Special Bonus Episode" 50 | description: "An interview with a special guest." 51 | publication_date: "2023-01-25T12:00:00Z" 52 | asset_url: "https://example.com/podcast/episodes/bonus1.mp3" 53 | link: "https://example.com/podcast/episodes/bonus" # Episode-specific link 54 | # This episode will use the global image from metadata, but has its own link 55 | episode_type: "bonus" 56 | transcripts: 57 | - url: "https://example.com/podcast/transcripts/bonus1.json" 58 | type: "application/json" 59 | # language: "es" # Optional: Language code (e.g., "en", "en-US", "es"). Follows RFC 5646. 
60 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==6.0.1 2 | requests==2.31.0 3 | Markdown==3.5.1 4 | sh==2.0.6 5 | yamllint==1.33.0 6 | retry==0.9.2 7 | -------------------------------------------------------------------------------- /rss_generator.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | from datetime import datetime, timezone 3 | from email.utils import format_datetime 4 | import argparse 5 | import time 6 | import os 7 | import re 8 | import uuid 9 | 10 | import markdown 11 | import requests 12 | import yaml 13 | from sh import ffprobe, ErrorReturnCode 14 | from retry import retry 15 | 16 | # Flag to indicate if we're in test mode 17 | TEST_MODE = os.environ.get("RSS_GENERATOR_TEST_MODE", "false").lower() == "true" 18 | 19 | # Mock ffprobe output for testing 20 | MOCK_FFPROBE_OUTPUT = """streams.stream.0.index=0 21 | streams.stream.0.codec_name="aac" 22 | streams.stream.0.codec_long_name="AAC (Advanced Audio Coding)" 23 | streams.stream.0.profile="LC" 24 | streams.stream.0.codec_type="audio" 25 | streams.stream.0.codec_tag_string="mp4a" 26 | streams.stream.0.codec_tag="0x6134706d" 27 | streams.stream.0.sample_fmt="fltp" 28 | streams.stream.0.sample_rate="44100" 29 | streams.stream.0.channels=2 30 | streams.stream.0.channel_layout="stereo" 31 | streams.stream.0.bits_per_sample=0 32 | streams.stream.0.initial_padding=0 33 | streams.stream.0.id="0x1" 34 | streams.stream.0.r_frame_rate="0/0" 35 | streams.stream.0.avg_frame_rate="0/0" 36 | streams.stream.0.time_base="1/44100" 37 | streams.stream.0.start_pts=0 38 | streams.stream.0.start_time="0.000000" 39 | streams.stream.0.duration_ts=156170240 40 | streams.stream.0.duration="3541.275283" 41 | streams.stream.0.bit_rate="107301" 42 | streams.stream.0.max_bit_rate="N/A" 43 | 
streams.stream.0.bits_per_raw_sample="N/A" 44 | streams.stream.0.nb_frames="152510" 45 | streams.stream.0.nb_read_frames="N/A" 46 | streams.stream.0.nb_read_packets="N/A" 47 | streams.stream.0.extradata_size=2 48 | streams.stream.0.disposition.default=1""" 49 | 50 | 51 | # Mock HTTP response for testing 52 | class MockResponse: 53 | def __init__(self, url): 54 | self.url = url 55 | self.headers = { 56 | "content-length": "12345678", 57 | "content-type": "audio/mpeg", 58 | # Example headers for testing hash extraction 59 | "ETag": '"d41d8cd98f00b204e9800998ecf8427e"', # MD5 hash 60 | # 'ETag': '"abc-1"', # Multipart ETag 61 | # 'x-amz-checksum-sha256': 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', 62 | # 'x-goog-hash': 'crc32c=AAAAAA==,md5=1B2M2Y8AsgTpgAmY7PhCfg==' # Base64 MD5 63 | } 64 | 65 | 66 | def read_podcast_config(yaml_file_path): 67 | with open(yaml_file_path, "r", encoding="utf-8") as file: 68 | return yaml.safe_load(file) 69 | 70 | 71 | def convert_iso_to_rfc2822(iso_date): 72 | # Replace 'Z' with '+00:00' for Python < 3.11 compatibility 73 | compatible_iso_date = iso_date.replace("Z", "+00:00") 74 | date_obj = datetime.fromisoformat(compatible_iso_date) 75 | return format_datetime(date_obj) 76 | 77 | 78 | @retry(tries=5, delay=2, backoff=2, logger=None) 79 | def _make_http_request(url): 80 | """Make HTTP request with retry logic""" 81 | if TEST_MODE: 82 | return MockResponse(url) 83 | return requests.head(url, allow_redirects=True) 84 | 85 | 86 | def _run_ffprobe_with_retry(url, max_retries=5, delay=2): 87 | """ 88 | Run ffprobe with manual retry logic to handle ErrorReturnCode exceptions 89 | """ 90 | if TEST_MODE: 91 | return MOCK_FFPROBE_OUTPUT 92 | 93 | retries = 0 94 | while retries < max_retries: 95 | try: 96 | return ffprobe( 97 | "-hide_banner", 98 | "-v", 99 | "quiet", 100 | "-show_streams", 101 | "-print_format", 102 | "flat", 103 | url, 104 | ) 105 | except ErrorReturnCode: 106 | retries += 1 107 | if retries >= 
max_retries: 108 | print( 109 | f"Failed to run ffprobe after {max_retries} attempts for URL: {url}" 110 | ) 111 | # Return empty string if all retries fail 112 | return "" 113 | print( 114 | f"ffprobe failed (attempt {retries}/{max_retries}), retrying in {delay} seconds..." 115 | ) 116 | time.sleep(delay) 117 | delay *= 2 # Exponential backoff 118 | 119 | 120 | def get_file_info(url): 121 | # Make HTTP request with retry logic 122 | response = _make_http_request(url) 123 | 124 | # Get duration of audio/video file 125 | # We're using the response.url here in order to 126 | # follow redirects and get the actual file 127 | 128 | # Run ffprobe with retry logic 129 | probe = _run_ffprobe_with_retry(response.url) 130 | 131 | # If probe is empty (all retries failed), set duration to None 132 | if not probe: 133 | return { 134 | "content-length": response.headers.get("content-length"), 135 | "content-type": response.headers.get("content-type"), 136 | "duration": None, 137 | } 138 | 139 | lines = probe.split("\n") 140 | 141 | # Filtering out the line that contains 'streams.stream.0.duration' 142 | duration_line = next( 143 | (line for line in lines if line.startswith("streams.stream.0.duration=")), None 144 | ) 145 | 146 | if duration_line: 147 | # Extracting the numeric value and converting it to an integer 148 | duration = int(float(duration_line.split("=")[1].strip('"'))) 149 | else: 150 | duration = None 151 | 152 | # --- Extract content hash from headers --- 153 | content_hash = None 154 | headers = response.headers 155 | 156 | # 1. Check for x-amz-checksum-sha256 157 | sha256_hash = headers.get("x-amz-checksum-sha256") 158 | if sha256_hash: 159 | content_hash = f"sha256:{sha256_hash}" 160 | 161 | # 2. 
Check for GCS MD5 (if SHA256 not found) 162 | if not content_hash: 163 | gcs_hash = headers.get("x-goog-hash") 164 | if gcs_hash: 165 | # Extract base64 md5 value - look for md5= and capture until next comma or end of string 166 | match = re.search(r"md5=([^,]+)", gcs_hash) 167 | if match: 168 | # Note: GCS MD5 is base64 encoded, needs decoding if we wanted raw bytes, 169 | # but for a GUID string, the base64 representation is fine and unique. 170 | content_hash = f"md5:{match.group(1)}" 171 | 172 | # 3. Check ETag (if other hashes not found) 173 | if not content_hash: 174 | etag = headers.get("ETag", "").strip('" ') # Remove quotes and whitespace 175 | if etag: # Use any non-empty ETag as a fallback hash 176 | content_hash = f"etag:{etag}" 177 | 178 | return { 179 | "content-length": headers.get("content-length"), 180 | "content-type": headers.get("content-type"), 181 | "duration": duration, 182 | "content_hash": content_hash, # Add the extracted hash to the result 183 | } 184 | 185 | 186 | def format_description(description): 187 | """ 188 | Convert Markdown description to HTML 189 | """ 190 | html_description = markdown.markdown(description) 191 | wrapped_description = f"" 192 | 193 | # Ensure byte limit for the channel description 194 | byte_limit = 4000 195 | if len(wrapped_description.encode("utf-8")) > byte_limit: 196 | # Truncate the description if it exceeds the limit 197 | # Note: Truncation logic might need to be more sophisticated to handle HTML correctly 198 | wrapped_description = wrapped_description[:byte_limit] 199 | 200 | return wrapped_description 201 | 202 | 203 | def generate_rss(config, output_file_path, skip_asset_verification=False): 204 | # --- Namespace Registration --- (Ensure podcast namespace is included) 205 | ET.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd") 206 | ET.register_namespace("atom", "http://www.w3.org/2005/Atom") 207 | ET.register_namespace("podcast", "https://podcastindex.org/namespace/1.0") # 
Add podcast namespace 208 | 209 | # --- Root Element Setup --- (Add podcast namespace attribute) 210 | rss = ET.Element( 211 | "rss", 212 | version="2.0", 213 | attrib={ 214 | "xmlns:itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd", 215 | "xmlns:atom": "http://www.w3.org/2005/Atom", 216 | "xmlns:podcast": "https://podcastindex.org/namespace/1.0" # Add podcast namespace 217 | }, 218 | ) 219 | 220 | # Global itunes:explicit setting 221 | global_explicit = ( 222 | "yes" if config["metadata"].get("itunes_explicit", False) else "no" 223 | ) 224 | 225 | # --- Metadata Section --- (Add copyright) 226 | channel = ET.SubElement(rss, "channel") 227 | metadata = config["metadata"] 228 | 229 | # Helper function to get metadata with backward compatibility 230 | def get_meta(key, old_key=None, required=False, default=None): 231 | # If old_key is not provided, use key itself for checking 232 | check_keys = [key] 233 | if old_key: 234 | check_keys.append(old_key) 235 | 236 | value = None 237 | for k in check_keys: 238 | value = metadata.get(k) 239 | if value is not None: 240 | break # Found a value 241 | 242 | if required and value is None: 243 | key_str = f"'{key}'" 244 | if old_key: 245 | key_str += f" or '{old_key}'" 246 | raise ValueError(f"Missing required metadata key: {key_str}") 247 | 248 | return value if value is not None else default 249 | 250 | ET.SubElement(channel, "title").text = metadata[ 251 | "title" 252 | ] # Title is fundamental, no old key needed 253 | ET.SubElement(channel, "description").text = format_description( 254 | metadata["description"] 255 | ) 256 | ET.SubElement(channel, "language").text = metadata.get("language", "en-us") 257 | ET.SubElement(channel, "link").text = metadata["link"] 258 | ET.SubElement( 259 | channel, "generator" 260 | ).text = ( 261 | "Podcast RSS Generator (https://github.com/vpetersson/podcast-rss-generator)" 262 | ) 263 | ET.SubElement( 264 | channel, 265 | "atom:link", 266 | href=get_meta( 267 | "rss_feed_url", 
"rss_feed_url", required=True 268 | ), # Use helper, though no old key needed 269 | rel="self", 270 | type="application/rss+xml", 271 | ) 272 | 273 | # Explicit tag (backward compatibility) 274 | explicit_val = get_meta("explicit", "itunes_explicit", default=False) 275 | explicit_text = "yes" if explicit_val else "no" 276 | ET.SubElement(channel, "itunes:explicit").text = explicit_text 277 | 278 | # Owner/Email (backward compatibility) 279 | email_val = get_meta("email", "itunes_email", required=True) 280 | itunes_owner = ET.SubElement(channel, "itunes:owner") 281 | ET.SubElement(itunes_owner, "itunes:email").text = email_val 282 | 283 | # Author (backward compatibility) 284 | author_val = get_meta("author", "itunes_author", required=True) 285 | ET.SubElement(channel, "itunes:author").text = author_val 286 | 287 | # Summary (use description) 288 | itunes_summary = ET.SubElement(channel, "itunes:summary") 289 | itunes_summary.text = metadata["description"] 290 | 291 | # Category (backward compatibility) 292 | category_val = get_meta("category", "itunes_category") 293 | if category_val: 294 | ET.SubElement(channel, "itunes:category", text=category_val) 295 | 296 | # Image (backward compatibility, already handled) 297 | image_val = get_meta( 298 | "image", "image" 299 | ) # Uses 'image' as both new and old effective key here 300 | if image_val: 301 | itunes_image = ET.SubElement(channel, "itunes:image") 302 | itunes_image.set("href", image_val) 303 | 304 | # Copyright (Optional) 305 | copyright_val = metadata.get("copyright") 306 | if copyright_val: 307 | ET.SubElement(channel, "copyright").text = copyright_val 308 | 309 | # Recommended Channel Elements (Podcast Standards Project) 310 | # podcast:locked 311 | locked_val = get_meta("podcast_locked", default="no") # Default to 'no' (false) 312 | # Ensure the value is either 'yes' or 'no' 313 | locked_text = "yes" if str(locked_val).lower() == "true" or str(locked_val).lower() == "yes" else "no" 314 | 
ET.SubElement(channel, "podcast:locked", owner=email_val).text = locked_text # Requires owner email 315 | 316 | # podcast:guid 317 | # Prefer explicitly defined GUID in config, otherwise generate based on feed URL 318 | guid_val = get_meta("podcast_guid") 319 | if not guid_val: 320 | feed_url_val = get_meta("rss_feed_url", required=True) # Feed URL is required anyway 321 | # Generate UUID v5 based on the feed URL namespace 322 | guid_val = str(uuid.uuid5(uuid.NAMESPACE_URL, feed_url_val)) 323 | print(f"Warning: podcast_guid not found in metadata. Generated GUID: {guid_val}") 324 | print("It is recommended to explicitly set podcast_guid in your config file.") 325 | ET.SubElement(channel, "podcast:guid").text = guid_val 326 | 327 | # --- Episode Processing --- (Add transcript logic) 328 | use_hash_guid = metadata.get("use_asset_hash_as_guid", False) 329 | 330 | for episode in config["episodes"]: 331 | print(f"Processing episode {episode['title']}...") 332 | 333 | # Replace \'Z\' with \'+00:00\' for Python < 3.11 compatibility with fromisoformat 334 | pub_date_str = episode["publication_date"].replace("Z", "+00:00") 335 | # Parse the date string 336 | pub_date = datetime.fromisoformat(pub_date_str) 337 | # If the parsed date is naive (no timezone info), assume it's UTC 338 | if pub_date.tzinfo is None: 339 | pub_date = pub_date.replace(tzinfo=timezone.utc) 340 | 341 | # Now compare the timezone-aware publication date with the current UTC time 342 | if not pub_date < datetime.now(timezone.utc): 343 | print( 344 | f"Skipping episode {episode['title']} as it's not scheduled to be released until {episode['publication_date']}." 
345 | ) 346 | continue 347 | 348 | if skip_asset_verification: 349 | print(f" Skipping asset verification for {episode['asset_url']}") 350 | # Provide default/placeholder values 351 | file_info = { 352 | "content-length": "0", # Required by enclosure 353 | "content-type": "application/octet-stream", # Generic fallback type 354 | "duration": None, 355 | "content_hash": None, 356 | } 357 | else: 358 | file_info = get_file_info(episode["asset_url"]) 359 | 360 | item = ET.SubElement(channel, "item") 361 | ET.SubElement(item, "pubDate").text = convert_iso_to_rfc2822( 362 | pub_date_str 363 | ) 364 | ET.SubElement(item, "title").text = episode["title"] 365 | ET.SubElement(item, "description").text = format_description( 366 | episode["description"] 367 | ) 368 | 369 | # Determine GUID: Use hash if requested and available, else use asset_url 370 | guid_text = episode["asset_url"] # Default 371 | if use_hash_guid and file_info.get("content_hash"): 372 | guid_text = file_info["content_hash"] 373 | print(f" Using content hash for GUID: {guid_text}") 374 | else: 375 | print(f" Using asset URL for GUID: {guid_text}") 376 | 377 | ET.SubElement(item, "guid").text = guid_text 378 | ET.SubElement( 379 | item, 380 | "enclosure", 381 | url=episode["asset_url"], 382 | # Use fetched or default values 383 | type=file_info.get("content-type", "application/octet-stream"), 384 | length=str(file_info.get("content-length", "0")), 385 | ) 386 | 387 | # Apply itunes:explicit setting (check episode first, then global) 388 | episode_explicit_val = episode.get("explicit", episode.get("itunes_explicit")) 389 | if episode_explicit_val is not None: 390 | # Use episode-specific value if present 391 | explicit_text_item = "yes" if episode_explicit_val else "no" 392 | else: 393 | # Fallback to global setting 394 | explicit_text_item = global_explicit 395 | ET.SubElement(item, "itunes:explicit").text = explicit_text_item 396 | 397 | # Add itunes:duration tag if available 398 | if 
file_info.get("duration") is not None: 399 | itunes_duration = ET.SubElement(item, "itunes:duration") 400 | itunes_duration.text = str(file_info["duration"]) 401 | 402 | # iTunes-specific tags 403 | if episode.get("episode") is not None: 404 | itunes_episode = ET.SubElement(item, "itunes:episode") 405 | itunes_episode.text = str(episode["episode"]) 406 | 407 | if episode.get("season") is not None: 408 | itunes_season = ET.SubElement(item, "itunes:season") 409 | itunes_season.text = str(episode["season"]) 410 | 411 | if episode.get("episode_type") is not None: 412 | itunes_episode_type = ET.SubElement(item, "itunes:episodeType") 413 | itunes_episode_type.text = episode["episode_type"] 414 | 415 | # Add link if available, if not, use global 416 | link = ET.SubElement(item, "link") 417 | link.text = episode.get("link", metadata["link"]) 418 | 419 | # Determine the correct image URL (episode-specific or channel default) 420 | # Use episode specific artwork if available, falling back to channel image 421 | image_url = episode.get("image", metadata.get("image")) 422 | 423 | # Creating the 'itunes:image' element if an image URL is available 424 | if image_url: 425 | itunes_image = ET.SubElement(item, "itunes:image") 426 | itunes_image.set("href", image_url) 427 | 428 | # Add transcript links if available 429 | if "transcripts" in episode and isinstance(episode["transcripts"], list): 430 | for transcript_info in episode["transcripts"]: 431 | if "url" in transcript_info and "type" in transcript_info: 432 | # Basic required attributes 433 | transcript_attrs = { 434 | "url": transcript_info["url"], 435 | "type": transcript_info["type"], 436 | } 437 | # Add optional attributes if they exist 438 | if "language" in transcript_info: 439 | transcript_attrs["language"] = transcript_info["language"] 440 | if "rel" in transcript_info: 441 | transcript_attrs["rel"] = transcript_info["rel"] 442 | 443 | ET.SubElement(item, "podcast:transcript", attrib=transcript_attrs) 444 | else: 445 | 
print(f" Skipping invalid transcript entry for episode {episode['title']}: {transcript_info}") 446 | 447 | tree = ET.ElementTree(rss) 448 | tree.write(output_file_path, encoding="UTF-8", xml_declaration=True) 449 | 450 | 451 | def main(): 452 | parser = argparse.ArgumentParser(description="Process some parameters.") 453 | 454 | parser.add_argument( 455 | "--input-file", type=str, default="podcast_config.yaml", help="Input YAML file" 456 | ) 457 | parser.add_argument( 458 | "--output-file", type=str, default="podcast_feed.xml", help="Output XML file" 459 | ) 460 | parser.add_argument( 461 | "--skip-asset-verification", 462 | action="store_true", # Makes it a boolean flag 463 | help="Skip HTTP HEAD and ffprobe checks for asset URLs (use for testing/fake URLs)" 464 | ) 465 | 466 | # Parse arguments from the command line 467 | args = parser.parse_args() 468 | 469 | print(f"Input file: {args.input_file}, Output file: {args.output_file}") 470 | if args.skip_asset_verification: 471 | print("Skipping asset verification.") 472 | 473 | config = read_podcast_config(args.input_file) 474 | generate_rss(config, args.output_file, skip_asset_verification=args.skip_asset_verification) 475 | 476 | 477 | if __name__ == "__main__": 478 | main() 479 | -------------------------------------------------------------------------------- /tests/test_rss_generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from xml.etree import ElementTree as ET 4 | from unittest.mock import patch, MagicMock 5 | from datetime import datetime, timezone, timedelta 6 | 7 | # Set test mode before importing the module 8 | os.environ["RSS_GENERATOR_TEST_MODE"] = "true" 9 | 10 | from rss_generator import ( 11 | convert_iso_to_rfc2822, 12 | generate_rss, 13 | get_file_info, 14 | read_podcast_config, 15 | ) 16 | 17 | CONFIG_FILE = "podcast_config.example.yaml" 18 | 19 | 20 | class TestRSSGenerator(unittest.TestCase): 21 | @classmethod 22 | 
    def setUpClass(cls):
        """Generate both feed variants (new keys and legacy keys) once for all tests."""
        # Read the configuration and generate the RSS feed once for all tests
        # Use the updated example config with non-prefixed keys
        cls.config = read_podcast_config(CONFIG_FILE)

        # --- Generate feed based on the example config (using new keys) ---
        generate_rss(cls.config, "test_podcast_feed_new_keys.xml")
        cls.tree_new = ET.parse("test_podcast_feed_new_keys.xml")
        cls.root_new = cls.tree_new.getroot()
        cls.channel_new = cls.root_new.find("channel")

        # --- Generate feed using old keys for backward compatibility testing ---
        cls.config_old = read_podcast_config(CONFIG_FILE)  # Read again
        # Rename keys back to old format for this test config
        metadata_old = cls.config_old["metadata"]
        metadata_old["itunes_email"] = metadata_old.pop("email")
        metadata_old["itunes_author"] = metadata_old.pop("author")
        metadata_old["itunes_category"] = metadata_old.pop("category")
        metadata_old["itunes_explicit"] = metadata_old.pop("explicit")
        # image key remains 'image'
        generate_rss(cls.config_old, "test_podcast_feed_old_keys.xml")
        cls.tree_old = ET.parse("test_podcast_feed_old_keys.xml")
        cls.root_old = cls.tree_old.getroot()
        cls.channel_old = cls.root_old.find("channel")

        # Add podcast namespace for transcript testing
        cls.ns = {
            "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
            "podcast": "https://podcastindex.org/namespace/1.0",
        }

    def test_config_structure(self):
        """The example config exposes the new (non-prefixed) metadata keys."""
        # Test structure based on the primary config (new keys)
        self.assertIn("metadata", self.config)
        self.assertIn("episodes", self.config)
        self.assertIn("image", self.config["metadata"])
        self.assertIn("email", self.config["metadata"])  # Check new key presence
        self.assertIn("author", self.config["metadata"])  # Check new key presence
        self.assertIn("category", self.config["metadata"])  # Check new key presence
        self.assertIn("explicit", self.config["metadata"])  # Check new key presence

    def test_rss_structure(self):
        """Both feeds have an <rss> root with a <channel> child."""
        # Test general structure on both generated feeds
        self.assertEqual(self.root_new.tag, "rss")
        self.assertIsNotNone(self.channel_new)
        self.assertEqual(self.root_old.tag, "rss")
        self.assertIsNotNone(self.channel_old)

    def test_channel_structure(self):
        """Required channel tags (and optional copyright) exist in both feeds."""
        # Test basic channel tags on both feeds
        required_tags = ["title", "description", "language", "link"]
        # Check for optional copyright tag if present in config
        if "copyright" in self.config["metadata"]:
            required_tags.append("copyright")

        for tag in required_tags:
            self.assertIsNotNone(self.channel_new.find(tag), f"[New Keys] Missing tag: {tag}")
            self.assertIsNotNone(self.channel_old.find(tag), f"[Old Keys] Missing tag: {tag}")

    def test_itunes_tags_in_channel(self):
        """iTunes channel tags exist and carry the configured values in both feeds."""
        # Test iTunes tags presence in channel for both feeds
        itunes_tags = [
            "itunes:explicit",
            "itunes:owner",  # Contains itunes:email
            "itunes:author",
            "itunes:image",
            "itunes:category",
        ]
        for tag in itunes_tags:
            self.assertIsNotNone(
                self.channel_new.find(tag, self.ns),
                f"[New Keys] Missing iTunes tag in channel: {tag}",
            )
            self.assertIsNotNone(
                self.channel_old.find(tag, self.ns),
                f"[Old Keys] Missing iTunes tag in channel: {tag}",
            )

        # Check specific values to ensure correctness
        self.assertEqual(
            self.channel_new.find("itunes:author", self.ns).text,
            self.config["metadata"]["author"],
        )
        self.assertEqual(
            self.channel_old.find("itunes:author", self.ns).text,
            self.config_old["metadata"]["itunes_author"],
        )

        self.assertEqual(
            self.channel_new.find("itunes:owner/itunes:email", self.ns).text,
            self.config["metadata"]["email"],
        )
        self.assertEqual(
            self.channel_old.find("itunes:owner/itunes:email", self.ns).text,
            self.config_old["metadata"]["itunes_email"],
        )

        explicit_new = self.channel_new.find("itunes:explicit", self.ns).text
        explicit_old = self.channel_old.find("itunes:explicit", self.ns).text
        self.assertEqual(
            explicit_new, "no" if not self.config["metadata"]["explicit"] else "yes"
        )
        self.assertEqual(
            explicit_old,
            "no" if not self.config_old["metadata"]["itunes_explicit"] else "yes",
        )

    def test_episode_structure(self):
        """Each configured episode appears as an <item> with enclosure and transcripts."""
        # Check episode structure based on new keys config (should be same for old)
        for episode in self.config["episodes"]:
            title = episode["title"]
            item_new = self.channel_new.find(f"item[title='{title}']")
            self.assertIsNotNone(
                item_new, f"[New Keys] Missing item for episode: {title}"
            )
            self.assertIsNotNone(
                item_new.find("enclosure"),
                f"[New Keys] Missing enclosure tag for episode: {title}",
            )
            # Optionally check on old keys feed too, assuming structure is identical
            item_old = self.channel_old.find(f"item[title='{title}']")
            self.assertIsNotNone(
                item_old, f"[Old Keys] Missing item for episode: {title}"
            )
            self.assertIsNotNone(
                item_old.find("enclosure"),
                f"[Old Keys] Missing enclosure tag for episode: {title}",
            )

            # Check for transcript tags if present in episode config
            if "transcripts" in episode and isinstance(episode["transcripts"], list):
                transcript_tags_new = item_new.findall("podcast:transcript", self.ns)
                transcript_tags_old = item_old.findall("podcast:transcript", self.ns)
                self.assertEqual(len(transcript_tags_new), len(episode["transcripts"]),
                                 f"[New Keys] Episode '{title}' transcript tag count mismatch")
                self.assertEqual(len(transcript_tags_old), len(episode["transcripts"]),
                                 f"[Old Keys] Episode '{title}' transcript tag count mismatch")

                # Verify attributes for *all* transcripts
                for i, transcript_config in enumerate(episode["transcripts"]):
                    tag_new = transcript_tags_new[i]
                    tag_old = transcript_tags_old[i]  # Assuming order is preserved

                    # Check New Keys Feed
                    self.assertEqual(tag_new.get("url"), transcript_config["url"], f"[New Keys] Episode '{title}' transcript {i+1} URL mismatch")
                    self.assertEqual(tag_new.get("type"), transcript_config["type"], f"[New Keys] Episode '{title}' transcript {i+1} type mismatch")
                    if "language" in transcript_config:
                        self.assertEqual(tag_new.get("language"), transcript_config["language"], f"[New Keys] Episode '{title}' transcript {i+1} language mismatch")
                    else:
                        self.assertIsNone(tag_new.get("language"), f"[New Keys] Episode '{title}' transcript {i+1} should not have language")

                    # Check Old Keys Feed (assuming transcript logic remains the same)
                    self.assertEqual(tag_old.get("url"), transcript_config["url"], f"[Old Keys] Episode '{title}' transcript {i+1} URL mismatch")
                    self.assertEqual(tag_old.get("type"), transcript_config["type"], f"[Old Keys] Episode '{title}' transcript {i+1} type mismatch")
                    if "language" in transcript_config:
                        self.assertEqual(tag_old.get("language"), transcript_config["language"], f"[Old Keys] Episode '{title}' transcript {i+1} language mismatch")
                    else:
                        self.assertIsNone(tag_old.get("language"), f"[Old Keys] Episode '{title}' transcript {i+1} should not have language")

    def test_episode_itunes_tags(self):
        """Optional per-episode iTunes tags mirror the config; image tag always exists."""
        # Check episode tags based on new keys config
        for i, item in enumerate(self.channel_new.findall("item")):
            episode_config = self.config["episodes"][i]
            # Check for optional tags only if they exist in the config
            if "episode" in episode_config:
                itunes_episode = item.find("itunes:episode", self.ns)
                self.assertIsNotNone(
                    itunes_episode,
                    f"[New Keys] Missing itunes:episode tag in episode {i+1} when config has 'episode' key",
                )
                self.assertEqual(str(episode_config["episode"]), itunes_episode.text)

            if "season" in episode_config:
                itunes_season = item.find("itunes:season", self.ns)
                self.assertIsNotNone(
                    itunes_season,
                    f"[New Keys] Missing itunes:season tag in episode {i+1} when config has 'season' key",
                )
                self.assertEqual(str(episode_config["season"]), itunes_season.text)

            if "episode_type" in episode_config:
                itunes_episode_type = item.find("itunes:episodeType", self.ns)
                self.assertIsNotNone(
                    itunes_episode_type,
                    f"[New Keys] Missing itunes:episodeType tag in episode {i+1} when config has 'episode_type' key",
                )
                self.assertEqual(
                    episode_config["episode_type"], itunes_episode_type.text
                )

            # Test for episode-specific itunes:image tag (this should always exist due to fallback)
            itunes_image_tag = item.find("itunes:image", self.ns)
            self.assertIsNotNone(
                itunes_image_tag, "[New Keys] Missing itunes:image tag in episode"
            )
            # Could add similar loop for self.channel_old if needed, but logic is channel-level

    def test_episode_image_fallback(self):
        """Test image fallback on both new and old key feeds."""
        # Test with New Keys feed
        channel_image_url_new = self.config["metadata"]["image"]
        for i, item in enumerate(self.channel_new.findall("item")):
            episode_config = self.config["episodes"][i]
            item_image = item.find("itunes:image", self.ns)
            self.assertIsNotNone(
                item_image, f"[New Keys] Episode {i+1} missing itunes:image tag"
            )
            item_image_url = item_image.get("href")
            if "image" in episode_config:
                self.assertEqual(
                    item_image_url,
                    episode_config["image"],
                    f"[New Keys] Episode {i+1} specific image URL mismatch",
                )
            else:
                self.assertEqual(
                    item_image_url,
                    channel_image_url_new,
                    f"[New Keys] Episode {i+1} fallback image URL mismatch",
                )

        # Test with Old Keys feed
        channel_image_url_old = self.config_old["metadata"][
            "image"
        ]  # Still 'image' key here
        for i, item in enumerate(self.channel_old.findall("item")):
            episode_config = self.config_old["episodes"][
                i
            ]  # Use old config for checking episode key
            item_image = item.find("itunes:image", self.ns)
            self.assertIsNotNone(
                item_image, f"[Old Keys] Episode {i+1} missing itunes:image tag"
            )
            item_image_url = item_image.get("href")
            if (
                "image" in episode_config
            ):  # Episode image key is 'image' in both configs
                self.assertEqual(
                    item_image_url,
                    episode_config["image"],
                    f"[Old Keys] Episode {i+1} specific image URL mismatch",
                )
            else:
                self.assertEqual(
                    item_image_url,
                    channel_image_url_old,
                    f"[Old Keys] Episode {i+1} fallback image URL mismatch",
                )

    def test_date_conversion(self):
        """ISO 8601 dates from the config are converted to RFC 2822 format."""
        # Use a date from the example config for a more reliable test
        test_date = self.config["episodes"][0]["publication_date"]
        rfc_date = convert_iso_to_rfc2822(test_date)
        # Expected format based on "2023-01-15T10:00:00Z"
        self.assertTrue(rfc_date.startswith("Sun, 15 Jan 2023 10:00:00"))

    def test_file_info_retrieval(self):
        """get_file_info returns string content-length/content-type for each asset."""
        # Test on new keys config (should be same for old)
        for episode in self.config["episodes"]:
            file_info = get_file_info(episode["asset_url"])
            self.assertIsInstance(file_info["content-length"], str)
            self.assertIsInstance(file_info["content-type"], str)

    def test_guid_logic(self):
        """Test GUID generation with and without use_asset_hash_as_guid flag."""

        base_config = read_podcast_config(CONFIG_FILE)
        test_url = base_config["episodes"][0]["asset_url"]
        expected_sha256_guid = "sha256:test-sha256-hash"
        expected_gcs_md5_guid = "md5:test-gcs-md5-base64"
        expected_etag_guid_md5 = "etag:d41d8cd98f00b204e9800998ecf8427e"
        expected_etag_guid_multi = "etag:multipart-etag-abc-1"

        # Each scenario: (metadata override, mocked response headers,
        # expected GUID text, human-readable description).
        scenarios = [
            # Default behavior (flag false or missing)
            ({"use_asset_hash_as_guid": False}, {}, test_url, "Default (flag false)"),
            ({}, {}, test_url, "Default (flag missing)"),
            # Flag true, testing header priority and fallback
            (
                {"use_asset_hash_as_guid": True},
                {"x-amz-checksum-sha256": "test-sha256-hash"},
                expected_sha256_guid,
                "Flag true, SHA256 header",
            ),
            ({"use_asset_hash_as_guid": True}, {"x-goog-hash": "crc32c=AAA,md5=test-gcs-md5-base64"}, expected_gcs_md5_guid, "Flag true, GCS MD5 header"),
            # ETag scenarios (now prefixed with etag:)
            ({"use_asset_hash_as_guid": True}, {"ETag": '"d41d8cd98f00b204e9800998ecf8427e"'}, expected_etag_guid_md5, "Flag true, ETag (MD5-like)"),
            ({"use_asset_hash_as_guid": True}, {"ETag": '"multipart-etag-abc-1"'}, expected_etag_guid_multi, "Flag true, ETag (Multipart)"),
            # Priority: SHA256 > GCS MD5 > ETag
            ({"use_asset_hash_as_guid": True}, {"x-amz-checksum-sha256": "test-sha256-hash", "ETag": '"any-etag"'}, expected_sha256_guid, "Flag true, SHA256 takes priority over ETag"),
            ({"use_asset_hash_as_guid": True}, {"x-goog-hash": "crc32c=AAA,md5=test-gcs-md5-base64", "ETag": '"any-etag"'}, expected_gcs_md5_guid, "Flag true, GCS MD5 takes priority over ETag"),
            # Fallback if no headers found
            ({"use_asset_hash_as_guid": True}, {}, test_url, "Flag true, No hash headers fallback"),
        ]

        for meta_override, mock_headers, expected_guid, description in scenarios:
            with self.subTest(description=description):
                test_config = read_podcast_config(CONFIG_FILE)  # Reset config
                test_config["metadata"].update(meta_override)

                # Mock the requests.head call within the _make_http_request scope
                mock_response = MagicMock()
                mock_response.headers = {
                    "content-length": "1000",  # Need basic headers for get_file_info
                    "content-type": "audio/mpeg",
                    **mock_headers,  # Add scenario-specific headers
                }
                mock_response.url = test_url  # Needed for ffprobe call

                # We patch _make_http_request which is called by get_file_info
                # We also need to patch _run_ffprobe_with_retry to avoid external calls
                with patch(
                    "rss_generator._make_http_request", return_value=mock_response
                ), patch(
                    "rss_generator._run_ffprobe_with_retry",
                    return_value='streams.stream.0.duration="123"',
                ):
                    output_filename = f"test_guid_{description.replace(' ', '_')}.xml"
                    generate_rss(test_config, output_filename)

                    tree = ET.parse(output_filename)
                    root = tree.getroot()
                    channel = root.find("channel")
                    # Check the GUID of the first item
                    item = channel.find("item")
                    self.assertIsNotNone(item)
                    guid_tag = item.find("guid")
                    self.assertIsNotNone(guid_tag)
                    self.assertEqual(guid_tag.text, expected_guid)

                    if os.path.exists(output_filename):
                        os.remove(output_filename)

    def test_date_comparison_with_naive_datetime(self):
        """Test that future-dated episodes with naive datetime strings are skipped."""
        # Create a config with a future date without timezone info
        future_naive_date = (datetime.now(timezone.utc) + timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%S")
        test_config = {
            "metadata": self.config["metadata"].copy(),  # Use existing valid metadata
            "episodes": [
                {
                    "title": "Future Episode (Naive)",
                    "description": "Test description",
                    "publication_date": future_naive_date,
                    "asset_url": "http://example.com/future_naive.mp3",
                }
            ]
        }
        # Mock get_file_info to avoid network calls
        mock_file_info = {
            "content-length": "1000",
            "content-type": "audio/mpeg",
            "duration": 120,
            "content_hash": None,
        }
        with patch("rss_generator.get_file_info", return_value=mock_file_info):
            generate_rss(test_config, "test_naive_date_feed.xml")

        # Assert the feed was generated but contains no items (because the episode was skipped)
        tree = ET.parse("test_naive_date_feed.xml")
        root = tree.getroot()
        channel = root.find("channel")
        items = channel.findall("item")
        self.assertEqual(len(items), 0, "Future episode with naive datetime should have been skipped")

        if os.path.exists("test_naive_date_feed.xml"):
            os.remove("test_naive_date_feed.xml")

    @classmethod
    def tearDownClass(cls):
        """Remove the feed files generated in setUpClass."""
        # Clean up both generated files
        if os.path.exists("test_podcast_feed_new_keys.xml"):
            os.remove("test_podcast_feed_new_keys.xml")
        if os.path.exists("test_podcast_feed_old_keys.xml"):
            os.remove("test_podcast_feed_old_keys.xml")


if __name__ == "__main__":
    unittest.main()