├── assets
│   ├── replace.mp3
│   ├── replace_old.mp3
│   └── README.md
├── .gitignore
├── config
│   └── feeds-example.json
├── requirements.txt
├── docker-compose.cpu.yml
├── docker-compose.yml
├── Dockerfile.cpu
├── LICENSE
├── Dockerfile
├── README.md
└── src
    ├── storage.py
    ├── rss_parser.py
    ├── audio_processor.py
    ├── ad_detector.py
    ├── transcriber.py
    └── main.py
/assets/replace.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hemant6488/podcast-server/HEAD/assets/replace.mp3
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | .env
3 | config/feeds.json
4 | planning/
5 | __pycache__/
6 | *.pyc
7 | .DS_Store
--------------------------------------------------------------------------------
/assets/replace_old.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hemant6488/podcast-server/HEAD/assets/replace_old.mp3
--------------------------------------------------------------------------------
/config/feeds-example.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "in": "https://example.com/podcast/feed.rss",
4 | "out": "/mypodcast"
5 | }
6 | ]
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | faster-whisper>=1.1.0
2 | ctranslate2==4.4.0
3 | anthropic==0.49.0
4 | feedparser==6.0.11
5 | flask==3.0.3
6 | requests==2.32.3
7 | python-slugify==8.0.4
--------------------------------------------------------------------------------
/assets/README.md:
--------------------------------------------------------------------------------
1 | # Assets Directory
2 |
3 | Place your `replace.mp3` file here. This should be a 1-second beep/tone audio file that will replace advertisement segments in podcasts.
4 |
5 | The file should be named exactly: `replace.mp3`
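
If you don't have a suitable tone handy, one simple option (an example, not a requirement) is to generate a 1-second sine beep with ffmpeg, adjusting frequency and bitrate to taste:

```bash
# Generate a 1-second 800 Hz tone and encode it as MP3
ffmpeg -f lavfi -i "sine=frequency=800:duration=1" -ar 44100 -b:a 128k replace.mp3
```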
--------------------------------------------------------------------------------
/docker-compose.cpu.yml:
--------------------------------------------------------------------------------
1 | version: '3.8'
2 |
3 | services:
4 | podcast-server:
5 | build:
6 | context: .
7 | dockerfile: Dockerfile.cpu
8 | ports:
9 | - "8000:8000"
10 | volumes:
11 | - ./data:/app/data
12 | - ./config:/app/config:ro
13 | - ./assets:/app/assets:ro
14 | environment:
15 | - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
16 | - WHISPER_MODEL=small
17 | - BASE_URL=${BASE_URL:-http://localhost:8000}
18 | - WHISPER_DEVICE=cpu
19 | restart: unless-stopped
20 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.8'
2 |
3 | services:
4 | podcast-server:
5 | build: .
6 | ports:
7 | - "8000:8000"
8 | volumes:
9 | - ./data:/app/data
10 | - ./config:/app/config:ro
11 | - ./assets:/app/assets:ro
12 | environment:
13 | - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
14 | - WHISPER_MODEL=small
15 | - BASE_URL=${BASE_URL:-http://localhost:8000}
16 | - WHISPER_DEVICE=cuda
17 | deploy:
18 | resources:
19 | reservations:
20 | devices:
21 | - driver: nvidia
22 | count: 1
23 | capabilities: [gpu]
24 | restart: unless-stopped
--------------------------------------------------------------------------------
/Dockerfile.cpu:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim
2 |
3 | # Install system dependencies
4 | RUN apt-get update && apt-get install -y \
5 | ffmpeg \
6 | && rm -rf /var/lib/apt/lists/*
7 |
8 | # Set working directory
9 | WORKDIR /app
10 |
11 | # Copy requirements first for better caching
12 | COPY requirements.txt .
13 |
14 | # Install Python dependencies
15 | RUN pip install --no-cache-dir -r requirements.txt
16 |
17 | # Pre-download Faster Whisper model
18 | ENV WHISPER_MODEL=small
19 | RUN python3 -c "import os; from faster_whisper import download_model; download_model(os.getenv('WHISPER_MODEL', 'small'))"
20 |
21 | # Copy application code
22 | COPY src/ ./src/
23 | COPY config/ ./config/
24 | COPY assets/ ./assets/
25 |
26 | # Create data directory
27 | RUN mkdir -p /app/data
28 |
29 | # Expose port
30 | EXPOSE 8000
31 |
32 | # Run the application
33 | CMD ["python", "src/main.py"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Hemant Kumar
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
2 |
3 | # Install Python 3.11 and system dependencies
4 | RUN apt-get update && apt-get install -y \
5 | python3.11 \
6 | python3.11-dev \
7 | python3-pip \
8 | ffmpeg \
9 | wget \
10 | && rm -rf /var/lib/apt/lists/*
11 |
12 | # Set python3.11 as default python3
13 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
14 | RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
15 |
16 | # Set working directory
17 | WORKDIR /app
18 |
19 | # Copy requirements first for better caching
20 | COPY requirements.txt .
21 |
22 | # Install Python dependencies
23 | RUN pip install --no-cache-dir -r requirements.txt
24 |
25 | # Pre-download Faster Whisper model
26 | ENV WHISPER_MODEL=small
27 | RUN python3 -c "import os; from faster_whisper import download_model; download_model(os.getenv('WHISPER_MODEL', 'small'))"
28 |
29 | # Copy application code
30 | COPY src/ ./src/
31 | COPY config/ ./config/
32 | COPY assets/ ./assets/
33 |
34 | # Create data directory
35 | RUN mkdir -p /app/data
36 |
37 | # Expose port
38 | EXPOSE 8000
39 |
40 | # Run the application
41 | CMD ["python", "src/main.py"]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Podcast Ad Removal Server
2 |
3 | Removes ads from podcasts using Whisper transcription and Claude-based ad detection. Serves modified RSS feeds that work with any podcast app.
4 |
5 | > **Disclaimer:** This tool is for personal use only. Only use it with podcasts you have permission to modify or where such modification is permitted under applicable laws. Respect content creators and their terms of service.
6 |
7 | ## How It Works
8 |
9 | 1. **Transcription** - Whisper converts audio to text with timestamps
10 | 2. **Ad Detection** - Claude API analyzes transcript to identify ad segments
11 | 3. **Audio Processing** - FFmpeg removes detected ads and inserts short audio markers
12 | 4. **Serving** - Flask serves modified RSS feeds and processed audio files
13 |
14 | Processing happens on demand when you play an episode. The first play takes a few minutes; subsequent plays are served instantly from the cache. The sketch below shows roughly how the pieces fit together.
15 |
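Under the hood, `src/main.py` chains the modules in `src/` together. The following is a simplified sketch of that pipeline (the real `process_episode` also handles caching, status tracking in `data.json`, and temp-file cleanup); the episode URL and titles are placeholders:

```python
from transcriber import Transcriber
from ad_detector import AdDetector
from audio_processor import AudioProcessor

# Placeholder inputs; in the server these come from the podcast's RSS feed
episode_url = "https://example.com/episode.mp3"
podcast_name = "Example Podcast"
episode_title = "Example Episode"

transcriber = Transcriber()
detector = AdDetector()
processor = AudioProcessor()

# 1. Download the original audio and transcribe it with Faster Whisper
audio_path = transcriber.download_audio(episode_url)
segments = transcriber.transcribe(audio_path)

# 2. Ask Claude to mark ad segments; returns a dict like {"ads": [{"start": ..., "end": ..., "reason": ...}]}
result = detector.process_transcript(segments, podcast_name, episode_title)

# 3. Cut the detected ads out with FFmpeg, inserting the replacement tone
processed_path = processor.process_episode(audio_path, result.get("ads", []))
print(f"Processed file written to: {processed_path}")
```
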
16 | ## Requirements
17 |
18 | - Docker (NVIDIA GPU strongly recommended for Whisper; a slower CPU-only mode is also provided)
19 | - Anthropic API key
20 |
21 | ## Setup
22 |
23 | ```bash
24 | # 1. Create environment file
25 | echo "ANTHROPIC_API_KEY=your-key-here" > .env
26 |
27 | # 2. Configure feeds
28 | cp config/feeds-example.json config/feeds.json
29 | # Edit config/feeds.json with your podcast RSS URLs
30 |
31 | # 3. Run (GPU)
32 | docker-compose up --build
33 |
34 | # Or for CPU-only mode (no NVIDIA GPU required)
35 | docker-compose -f docker-compose.cpu.yml up --build
36 | ```
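
If you prefer to run the stack in the background, the standard Docker Compose workflow applies; the service is named `podcast-server` in both compose files:

```bash
# Start detached, then follow the server logs to watch processing progress
docker-compose up -d --build
docker-compose logs -f podcast-server
```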
37 |
38 | ### CPU-Only Mode
39 |
40 | CPU transcription is significantly slower; processing can take longer than the episode itself. Since episodes are processed on demand when you play them, your podcast app will likely time out waiting for the first request. To work around this:
41 | 
42 | 1. Tap download/play on an episode to trigger processing (or pre-trigger episodes from the command line, as sketched below)
43 | 2. The request will time out, but processing continues in the background
44 | 3. Wait a few minutes (check `docker logs` for progress) while the episode is processed
45 | 4. Play the episode again; the processed file will now be served from the cache
46 |
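As a rough sketch, you can pre-trigger processing from the command line by requesting each enclosure URL in the modified feed once (the host, port, and `/mypodcast` path below are placeholders matching the defaults above):

```bash
# Pull the /episodes/... enclosure URLs out of the modified feed and hit each one.
# On CPU each request may take minutes; letting curl time out is fine, processing continues server-side.
curl -s http://localhost:8000/mypodcast \
  | grep -oE 'https?://[^"]+/episodes/[^"]+\.mp3' \
  | while read -r url; do
      curl -s -o /dev/null --max-time 10 "$url" || true
    done
```
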
47 | ## Configuration
48 |
49 | Edit `config/feeds.json`:
50 | ```json
51 | [
52 | {
53 | "in": "https://example.com/podcast/feed.rss",
54 | "out": "/mypodcast"
55 | }
56 | ]
57 | ```
58 |
59 | - `in` - Original podcast RSS feed URL
60 | - `out` - URL path for your modified feed (e.g., `/mypodcast` → `http://localhost:8000/mypodcast`)
61 |
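Multiple podcasts can be listed in the same file, each with its own output path (the URLs below are placeholders):

```json
[
  { "in": "https://example.com/podcast-one/feed.rss", "out": "/podcast-one" },
  { "in": "https://example.com/podcast-two/feed.rss", "out": "/podcast-two" }
]
```

With this configuration the server exposes modified feeds at `http://localhost:8000/podcast-one` and `http://localhost:8000/podcast-two`.
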
62 | ## Finding Podcast RSS Feeds
63 |
64 | Most podcasts publish RSS feeds. Common ways to find them:
65 |
66 | 1. **Podcast website** - Look for "RSS" link in footer or subscription options
67 | 2. **Apple Podcasts** - Search on [podcastindex.org](https://podcastindex.org) using the Apple Podcasts URL
68 | 3. **Spotify-exclusive** - Not available (Spotify doesn't expose RSS feeds)
69 | 4. **Hosting platforms** - Common patterns:
70 | - Libsyn: `https://showname.libsyn.com/rss`
71 | - Spreaker: `https://www.spreaker.com/show/{id}/episodes/feed`
72 | - Omny: Check page source for `omnycontent.com` URLs
73 |
74 | ## Usage
75 |
76 | Add your modified feed URL to any podcast app:
77 | ```
78 | http://your-server:8000/mypodcast
79 | ```
80 |
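To confirm the server is up and a feed is being generated, you can query the built-in health endpoint and fetch the feed directly (host and path are placeholders):

```bash
# Health check: returns JSON with the number of configured feeds
curl http://your-server:8000/health

# Inspect the first lines of the modified RSS feed
curl -s http://your-server:8000/mypodcast | head -n 20
```
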
81 | ## Environment Variables
82 |
83 | | Variable | Default | Description |
84 | |----------|---------|-------------|
85 | | `ANTHROPIC_API_KEY` | required | Claude API key |
86 | | `BASE_URL` | `http://localhost:8000` | Public URL for generated feed links |
87 | | `WHISPER_MODEL` | `small` | Whisper model size (tiny/base/small/medium/large) |
88 | | `WHISPER_DEVICE` | `cuda` | Device for Whisper (cuda/cpu) |
89 |
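For reference, a minimal `.env` for the provided compose files might look like the following (placeholder values). Only `ANTHROPIC_API_KEY` and `BASE_URL` are substituted from the environment; `WHISPER_MODEL` and `WHISPER_DEVICE` are set directly in `docker-compose.yml` / `docker-compose.cpu.yml`:

```bash
ANTHROPIC_API_KEY=sk-ant-your-key-here
BASE_URL=http://192.168.1.50:8000
```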
--------------------------------------------------------------------------------
/src/storage.py:
--------------------------------------------------------------------------------
1 | """Storage management with dynamic directory creation."""
2 | import os
3 | import json
4 | import logging
5 | from pathlib import Path
6 | from typing import Dict, Any, Optional
7 | import tempfile
8 | import shutil
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 | class Storage:
13 | def __init__(self, data_dir: str = "/app/data"):
14 | self.data_dir = Path(data_dir)
15 | # Ensure base data directory exists
16 | self.data_dir.mkdir(exist_ok=True)
17 | logger.info(f"Storage initialized with data_dir: {self.data_dir}")
18 |
19 | def get_podcast_dir(self, slug: str) -> Path:
20 | """Get podcast directory, creating if necessary."""
21 | podcast_dir = self.data_dir / slug
22 | podcast_dir.mkdir(exist_ok=True)
23 |
24 | # Ensure episodes directory exists
25 | episodes_dir = podcast_dir / "episodes"
26 | episodes_dir.mkdir(exist_ok=True)
27 |
28 | logger.info(f"[{slug}] Podcast directory ready: {podcast_dir}")
29 | return podcast_dir
30 |
31 | def load_data_json(self, slug: str) -> Dict[str, Any]:
32 | """Load data.json for a podcast, creating if necessary."""
33 | podcast_dir = self.get_podcast_dir(slug)
34 | data_file = podcast_dir / "data.json"
35 |
36 | if data_file.exists():
37 | try:
38 | with open(data_file, 'r') as f:
39 | data = json.load(f)
40 | logger.info(f"[{slug}] Loaded data.json with {len(data.get('episodes', {}))} episodes")
41 | return data
42 | except json.JSONDecodeError as e:
43 | logger.error(f"[{slug}] Invalid data.json, creating new: {e}")
44 |
45 | # Create default structure
46 | data = {
47 | "episodes": {},
48 | "last_checked": None
49 | }
50 | self.save_data_json(slug, data)
51 | return data
52 |
53 | def save_data_json(self, slug: str, data: Dict[str, Any]) -> None:
54 | """Save data.json atomically."""
55 | podcast_dir = self.get_podcast_dir(slug)
56 | data_file = podcast_dir / "data.json"
57 |
58 | # Atomic write: write to temp, then rename
59 | with tempfile.NamedTemporaryFile(mode='w', delete=False, dir=podcast_dir, suffix='.tmp') as tmp:
60 | json.dump(data, tmp, indent=2)
61 | tmp_path = tmp.name
62 |
63 | shutil.move(tmp_path, data_file)
64 | logger.info(f"[{slug}] Saved data.json")
65 |
66 | def get_episode_path(self, slug: str, episode_id: str, extension: str = ".mp3") -> Path:
67 | """Get path for episode file."""
68 | podcast_dir = self.get_podcast_dir(slug)
69 | return podcast_dir / "episodes" / f"{episode_id}{extension}"
70 |
71 | def save_rss(self, slug: str, content: str) -> None:
72 | """Save modified RSS feed."""
73 | podcast_dir = self.get_podcast_dir(slug)
74 | rss_file = podcast_dir / "modified-rss.xml"
75 |
76 | # Atomic write
77 | with tempfile.NamedTemporaryFile(mode='w', delete=False, dir=podcast_dir, suffix='.tmp') as tmp:
78 | tmp.write(content)
79 | tmp_path = tmp.name
80 |
81 | shutil.move(tmp_path, rss_file)
82 | logger.info(f"[{slug}] Saved modified RSS feed")
83 |
84 | def get_rss(self, slug: str) -> Optional[str]:
85 | """Get cached RSS feed."""
86 | podcast_dir = self.get_podcast_dir(slug)
87 | rss_file = podcast_dir / "modified-rss.xml"
88 |
89 | if rss_file.exists():
90 | with open(rss_file, 'r') as f:
91 | return f.read()
92 | return None
93 |
94 | def save_transcript(self, slug: str, episode_id: str, transcript: str) -> None:
95 | """Save episode transcript."""
96 | path = self.get_episode_path(slug, episode_id, "-transcript.txt")
97 | with open(path, 'w') as f:
98 | f.write(transcript)
99 | logger.info(f"[{slug}:{episode_id}] Saved transcript")
100 |
101 | def save_ads_json(self, slug: str, episode_id: str, ads_data: Any) -> None:
102 | """Save Claude's ad detection response."""
103 | path = self.get_episode_path(slug, episode_id, "-ads.json")
104 | with open(path, 'w') as f:
105 | json.dump(ads_data, f, indent=2)
106 | logger.info(f"[{slug}:{episode_id}] Saved ads detection data")
107 |
108 | def save_prompt(self, slug: str, episode_id: str, prompt: str) -> None:
109 | """Save Claude prompt for debugging."""
110 | path = self.get_episode_path(slug, episode_id, "-prompt.txt")
111 | with open(path, 'w') as f:
112 | f.write(prompt)
113 | logger.info(f"[{slug}:{episode_id}] Saved Claude prompt")
--------------------------------------------------------------------------------
/src/rss_parser.py:
--------------------------------------------------------------------------------
1 | """RSS feed parsing and management."""
2 | import feedparser
3 | import logging
4 | import hashlib
5 | import os
6 | from datetime import datetime
7 | from typing import Dict, List, Optional
8 | import requests
9 | from slugify import slugify
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 | class RSSParser:
14 | def __init__(self, base_url: str = None):
15 | self.base_url = base_url or os.getenv('BASE_URL', 'http://localhost:8000')
16 |
17 | def fetch_feed(self, url: str, timeout: int = 30) -> Optional[str]:
18 | """Fetch RSS feed from URL."""
19 | try:
20 | logger.info(f"Fetching RSS feed from: {url}")
21 | response = requests.get(url, timeout=timeout)
22 | response.raise_for_status()
23 | logger.info(f"Successfully fetched RSS feed, size: {len(response.content)} bytes")
24 | return response.text
25 | except requests.RequestException as e:
26 | logger.error(f"Failed to fetch RSS feed: {e}")
27 | return None
28 |
29 | def parse_feed(self, feed_content: str) -> Dict:
30 | """Parse RSS feed content."""
31 | try:
32 | feed = feedparser.parse(feed_content)
33 | if feed.bozo:
34 | logger.warning(f"RSS parse warning: {feed.bozo_exception}")
35 |
36 | logger.info(f"Parsed RSS feed: {feed.feed.get('title', 'Unknown')} with {len(feed.entries)} entries")
37 | return feed
38 | except Exception as e:
39 | logger.error(f"Failed to parse RSS feed: {e}")
40 | return None
41 |
42 | def generate_episode_id(self, episode_url: str) -> str:
43 | """Generate consistent episode ID from URL."""
44 | # Use MD5 hash of URL for consistent ID
45 | return hashlib.md5(episode_url.encode()).hexdigest()[:12]
46 |
47 | def modify_feed(self, feed_content: str, slug: str) -> str:
48 | """Modify RSS feed to use our server URLs."""
49 | feed = self.parse_feed(feed_content)
50 | if not feed:
51 | return feed_content
52 |
53 |         # Build modified RSS
54 |         lines = []
55 |         lines.append('<?xml version="1.0" encoding="UTF-8"?>')
56 |         lines.append('<rss version="2.0" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">')
57 |         lines.append('<channel>')
58 | 
59 |         # Copy channel metadata
60 |         channel = feed.feed
61 |         lines.append(f'  <title>{self._escape_xml(channel.get("title", ""))}</title>')
62 |         lines.append(f'  <link>{self._escape_xml(channel.get("link", ""))}</link>')
63 |         lines.append(f'  <description>{self._escape_xml(channel.get("description", ""))}</description>')
64 |         lines.append(f'  <language>{self._escape_xml(channel.get("language", "en"))}</language>')
65 | 
66 |         # Mark as private feed for personal use only
67 |         lines.append('  <itunes:block>Yes</itunes:block>')
68 | 
69 |         if 'image' in channel:
70 |             lines.append('  <image>')
71 |             lines.append(f'    <url>{self._escape_xml(channel.image.get("href", ""))}</url>')
72 |             lines.append(f'    <title>{self._escape_xml(channel.image.get("title", ""))}</title>')
73 |             lines.append(f'    <link>{self._escape_xml(channel.image.get("link", ""))}</link>')
74 |             lines.append('  </image>')
75 | 
76 |         # Process each episode
77 |         for entry in feed.entries:
78 |             episode_url = None
79 |             # Find audio URL in enclosures
80 |             for enclosure in entry.get('enclosures', []):
81 |                 if 'audio' in enclosure.get('type', ''):
82 |                     episode_url = enclosure.get('href', '')
83 |                     break
84 | 
85 |             if not episode_url:
86 |                 # Skip entries without audio
87 |                 logger.warning(f"Skipping entry without audio: {entry.get('title', 'Unknown')}")
88 |                 continue
89 | 
90 |             episode_id = self.generate_episode_id(episode_url)
91 |             modified_url = f"{self.base_url}/episodes/{slug}/{episode_id}.mp3"
92 | 
93 |             lines.append('  <item>')
94 |             lines.append(f'    <title>{self._escape_xml(entry.get("title", ""))}</title>')
95 |             lines.append(f'    <description>{self._escape_xml(entry.get("description", ""))}</description>')
96 |             lines.append(f'    <link>{self._escape_xml(entry.get("link", ""))}</link>')
97 |             lines.append(f'    <guid>{self._escape_xml(entry.get("id", episode_url))}</guid>')
98 |             lines.append(f'    <pubDate>{self._escape_xml(entry.get("published", ""))}</pubDate>')
99 | 
100 |             # Modified enclosure URL pointing at this server
101 |             lines.append(f'    <enclosure url="{modified_url}" type="audio/mpeg"/>')
102 | 
103 |             # iTunes specific tags
104 |             if 'itunes_duration' in entry:
105 |                 lines.append(f'    <itunes:duration>{entry.itunes_duration}</itunes:duration>')
106 |             if 'itunes_explicit' in entry:
107 |                 lines.append(f'    <itunes:explicit>{entry.itunes_explicit}</itunes:explicit>')
108 | 
109 |             lines.append('  </item>')
110 | 
111 |         lines.append('</channel>')
112 |         lines.append('</rss>')
113 |
114 | modified_rss = '\n'.join(lines)
115 | logger.info(f"[{slug}] Modified RSS feed with {len(feed.entries)} episodes")
116 | return modified_rss
117 |
118 | def _escape_xml(self, text: str) -> str:
119 | """Escape XML special characters."""
120 | if not text:
121 | return ""
122 |         return (text
123 |                 .replace('&', '&amp;')
124 |                 .replace('<', '&lt;')
125 |                 .replace('>', '&gt;')
126 |                 .replace('"', '&quot;')
127 |                 .replace("'", '&apos;'))
128 |
129 | def extract_episodes(self, feed_content: str) -> List[Dict]:
130 | """Extract episode information from feed."""
131 | feed = self.parse_feed(feed_content)
132 | if not feed:
133 | return []
134 |
135 | episodes = []
136 | for entry in feed.entries:
137 | episode_url = None
138 | for enclosure in entry.get('enclosures', []):
139 | if 'audio' in enclosure.get('type', ''):
140 | episode_url = enclosure.get('href', '')
141 | break
142 |
143 | if episode_url:
144 | episodes.append({
145 | 'id': self.generate_episode_id(episode_url),
146 | 'url': episode_url,
147 | 'title': entry.get('title', 'Unknown'),
148 | 'published': entry.get('published', ''),
149 | })
150 |
151 | return episodes
--------------------------------------------------------------------------------
/src/audio_processor.py:
--------------------------------------------------------------------------------
1 | """Audio processing with FFMPEG."""
2 | import logging
3 | import subprocess
4 | import tempfile
5 | import os
6 | import shutil
7 | from typing import List, Dict, Optional
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 | class AudioProcessor:
12 | def __init__(self, replace_audio_path: str = "./assets/replace.mp3"):
13 | self.replace_audio_path = replace_audio_path
14 |
15 | def check_ffmpeg(self) -> bool:
16 | """Check if FFMPEG is available."""
17 | try:
18 | subprocess.run(['ffmpeg', '-version'],
19 | capture_output=True, check=True, timeout=5)
20 | return True
21 | except (subprocess.SubprocessError, FileNotFoundError):
22 | logger.error("FFMPEG not found or not working")
23 | return False
24 |
25 | def get_audio_duration(self, audio_path: str) -> Optional[float]:
26 | """Get duration of audio file in seconds."""
27 | try:
28 | cmd = [
29 | 'ffprobe', '-v', 'error',
30 | '-show_entries', 'format=duration',
31 | '-of', 'default=noprint_wrappers=1:nokey=1',
32 | audio_path
33 | ]
34 | result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
35 | if result.returncode == 0:
36 | return float(result.stdout.strip())
37 | except Exception as e:
38 | logger.error(f"Failed to get audio duration: {e}")
39 | return None
40 |
41 | def remove_ads(self, input_path: str, ad_segments: List[Dict], output_path: str) -> bool:
42 | """Remove ad segments from audio file."""
43 | if not ad_segments:
44 | # No ads to remove, just copy file
45 | logger.info("No ads to remove, copying original file")
46 | shutil.copy2(input_path, output_path)
47 | return True
48 |
49 | if not os.path.exists(self.replace_audio_path):
50 | logger.error(f"Replace audio not found: {self.replace_audio_path}")
51 | return False
52 |
53 | try:
54 | # Get total duration
55 | total_duration = self.get_audio_duration(input_path)
56 | if not total_duration:
57 | logger.error("Could not get audio duration")
58 | return False
59 |
60 | logger.info(f"Processing audio: {total_duration:.1f}s total, {len(ad_segments)} ad segments")
61 |
62 | # Sort ad segments by start time
63 | sorted_segments = sorted(ad_segments, key=lambda x: x['start'])
64 |
65 | # Merge segments with < 1 second gaps
66 | merged_ads = []
67 | current_segment = None
68 |
69 | for ad in sorted_segments:
70 | if current_segment and ad['start'] - current_segment['end'] < 1.0:
71 | # Extend current segment
72 | current_segment['end'] = ad['end']
73 | if 'reason' in ad:
74 | current_segment['reason'] = current_segment.get('reason', '') + '; ' + ad['reason']
75 | else:
76 | if current_segment:
77 | merged_ads.append(current_segment)
78 | current_segment = {'start': ad['start'], 'end': ad['end']}
79 | if 'reason' in ad:
80 | current_segment['reason'] = ad['reason']
81 |
82 | if current_segment:
83 | merged_ads.append(current_segment)
84 |
85 | ads = merged_ads
86 | logger.info(f"After merging: {len(ads)} ad segments")
87 |
88 | # Build complex filter for FFMPEG
89 | # Strategy: Split audio into segments, replace ad segments with beep
90 | filter_parts = []
91 | concat_parts = []
92 | current_time = 0
93 | segment_idx = 0
94 |
95 | for ad in ads:
96 | ad_start = ad['start']
97 | ad_end = ad['end']
98 |
99 | # Add content before ad
100 | if ad_start > current_time:
101 | filter_parts.append(f"[0:a]atrim={current_time}:{ad_start}[s{segment_idx}]")
102 | concat_parts.append(f"[s{segment_idx}]")
103 | segment_idx += 1
104 |
105 | # Add single replacement audio with volume reduction to 40%
106 | filter_parts.append(f"[1:a]volume=0.4[beep{segment_idx}]")
107 | concat_parts.append(f"[beep{segment_idx}]")
108 |
109 | current_time = ad_end
110 |
111 | # Add remaining content after last ad
112 | if current_time < total_duration:
113 | filter_parts.append(f"[0:a]atrim={current_time}:{total_duration}[s{segment_idx}]")
114 | concat_parts.append(f"[s{segment_idx}]")
115 |
116 | # Concatenate all parts
117 | filter_str = ';'.join(filter_parts)
118 | if filter_str:
119 | filter_str += ';'
120 | filter_str += ''.join(concat_parts) + f"concat=n={len(concat_parts)}:v=0:a=1[out]"
121 |
122 | # Run FFMPEG
123 | cmd = [
124 | 'ffmpeg', '-y',
125 | '-i', input_path,
126 | '-i', self.replace_audio_path,
127 | '-filter_complex', filter_str,
128 | '-map', '[out]',
129 | '-acodec', 'libmp3lame',
130 | '-ab', '128k',
131 | output_path
132 | ]
133 |
134 | logger.info(f"Running FFMPEG to remove ads")
135 | result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
136 |
137 | if result.returncode != 0:
138 | logger.error(f"FFMPEG failed: {result.stderr}")
139 | return False
140 |
141 | # Verify output
142 | new_duration = self.get_audio_duration(output_path)
143 | if new_duration:
144 | removed_time = total_duration - new_duration
145 | logger.info(f"FFMPEG processing complete: {total_duration:.1f}s → {new_duration:.1f}s (removed {removed_time:.1f}s)")
146 | return True
147 | else:
148 | logger.error("Could not verify output file")
149 | return False
150 |
151 | except subprocess.TimeoutExpired:
152 | logger.error("FFMPEG processing timed out")
153 | return False
154 | except Exception as e:
155 | logger.error(f"Audio processing failed: {e}")
156 | return False
157 |
158 | def process_episode(self, input_path: str, ad_segments: List[Dict]) -> Optional[str]:
159 | """Process episode audio to remove ads."""
160 | with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as tmp:
161 | temp_output = tmp.name
162 |
163 | try:
164 | if self.remove_ads(input_path, ad_segments, temp_output):
165 | return temp_output
166 | else:
167 | # Clean up on failure
168 | if os.path.exists(temp_output):
169 | os.unlink(temp_output)
170 | return None
171 | except Exception as e:
172 | logger.error(f"Episode processing failed: {e}")
173 | if os.path.exists(temp_output):
174 | os.unlink(temp_output)
175 | return None
--------------------------------------------------------------------------------
/src/ad_detector.py:
--------------------------------------------------------------------------------
1 | """Ad detection using Claude API."""
2 | import logging
3 | import json
4 | import os
5 | from typing import List, Dict, Optional
6 | from anthropic import Anthropic
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 | class AdDetector:
11 | def __init__(self, api_key: Optional[str] = None):
12 | self.api_key = api_key or os.environ.get('ANTHROPIC_API_KEY')
13 | if not self.api_key:
14 | logger.warning("No Anthropic API key found")
15 | self.client = None
16 |
17 | def initialize_client(self):
18 | """Initialize Anthropic client."""
19 | if self.client is None and self.api_key:
20 | try:
21 | from anthropic import Anthropic
22 | self.client = Anthropic(api_key=self.api_key)
23 | logger.info("Anthropic client initialized")
24 | except Exception as e:
25 | logger.error(f"Failed to initialize Anthropic client: {e}")
26 | raise
27 |
28 |     def detect_ads(self, segments: List[Dict], podcast_name: str = "Unknown", episode_title: str = "Unknown", slug: str = None, episode_id: str = None) -> Dict:
29 | """Detect ad segments using Claude API."""
30 | if not self.api_key:
31 | logger.warning("Skipping ad detection - no API key")
32 |             return {"ads": []}
33 |
34 | try:
35 | self.initialize_client()
36 |
37 | # Prepare transcript with timestamps for Claude
38 | transcript_lines = []
39 | for segment in segments:
40 | start = segment['start']
41 | end = segment['end']
42 | text = segment['text']
43 | transcript_lines.append(f"[{start:.1f}s - {end:.1f}s] {text}")
44 |
45 | transcript = "\n".join(transcript_lines)
46 |
47 | # Call Claude API
48 | logger.info(f"Sending transcript to Claude for ad detection: {podcast_name} - {episode_title}")
49 |
50 | prompt = f"""Podcast: {podcast_name}
51 | Episode: {episode_title}
52 |
53 | Transcript:
54 | {transcript}
55 |
56 | INSTRUCTIONS:
57 | Analyze this podcast transcript and identify ALL advertisement segments. Look for:
58 | - Product endorsements, sponsored content, or promotional messages
59 | - Promo codes, special offers, or calls to action
60 | - Clear transitions to/from ads (e.g., "This episode is brought to you by...")
61 | - Host-read advertisements
62 | - Pre-roll, mid-roll, or post-roll ads
63 | - Long intro sections filled with multiple ads before actual content begins
64 | - Mentions of other podcasts/shows from the network (cross-promotion)
65 | - Sponsor messages about credit cards, apps, products, or services
66 | - ANY podcast promos (e.g., "Listen to X on iHeart Radio app")
67 |
68 | CRITICAL MERGING RULES:
69 | 1. Analyze the FULL transcript before deciding segment boundaries - don't stop at gaps
70 | 2. Multiple ads separated by gaps of 15 seconds or less should be treated as ONE CONTINUOUS SEGMENT
71 | 3. Brief transitions, silence, or gaps between ads do NOT count as content - they're part of the same ad block
72 | 4. Only split ads if there's REAL SHOW CONTENT (actual discussion, interview, topic content) for at least 30 seconds between them
73 | 5. Consider the entire context: if ads at 1500s, 1520s, 1540s are all promotional content, return ONE segment from 1500-1560, not three separate ones
74 | 6. When in doubt, merge the segments - better to remove too much than leave ads in
75 | 7. If there's a gap followed by content that doesn't continue the previous discussion but instead introduces a completely new topic/person/show, it's likely still part of the ad block
76 |
77 | Return ONLY a JSON array of ad segments with start/end times in seconds. Be aggressive in detecting ads.
78 |
79 | Format:
80 | [{{"start": 0.0, "end": 240.0, "reason": "Continuous ad block: multiple sponsors"}}, ...]
81 |
82 | If no ads are found, return an empty array: []"""
83 |
84 | # Save the prompt for debugging
85 | if slug and episode_id:
86 | try:
87 | from storage import Storage
88 | storage = Storage()
89 | storage.save_prompt(slug, episode_id, prompt)
90 | except Exception as e:
91 | logger.warning(f"Could not save prompt: {e}")
92 |
93 | response = self.client.messages.create(
94 | model="claude-opus-4-1-20250805", # Use Claude Opus 4.1 for better ad detection
95 | max_tokens=2000,
96 | temperature=0.2,
97 | system="You are an ad detection specialist with extensive experience in identifying all forms of advertisements, sponsorships, and promotional content in podcasts. Your users absolutely cannot tolerate ads - they find them disruptive and want them completely removed. Be extremely aggressive in detecting ads. When in doubt, mark it as an ad. It's better to remove a few seconds of content than to leave any advertisement in the podcast.",
98 | messages=[{
99 | "role": "user",
100 | "content": prompt
101 | }]
102 | )
103 |
104 | # Extract JSON from response
105 | response_text = response.content[0].text if response.content else ""
106 | logger.info(f"Claude response received: {len(response_text)} chars")
107 |
108 | # Try to parse JSON from response
109 | try:
110 | # Look for JSON array in response
111 | start_idx = response_text.find('[')
112 | end_idx = response_text.rfind(']') + 1
113 | if start_idx >= 0 and end_idx > start_idx:
114 | json_str = response_text[start_idx:end_idx]
115 | ads = json.loads(json_str)
116 |
117 | # Validate structure
118 | if isinstance(ads, list):
119 | valid_ads = []
120 | for ad in ads:
121 | if isinstance(ad, dict) and 'start' in ad and 'end' in ad:
122 | valid_ads.append({
123 | 'start': float(ad['start']),
124 | 'end': float(ad['end']),
125 | 'reason': ad.get('reason', 'Advertisement detected')
126 | })
127 |
128 | total_ad_time = sum(ad['end'] - ad['start'] for ad in valid_ads)
129 | logger.info(f"Claude detected {len(valid_ads)} ad segments (total {total_ad_time/60:.1f} minutes)")
130 |
131 | # Store full response for debugging
132 | return {
133 | "ads": valid_ads,
134 | "raw_response": response_text,
135 |                         "model": "claude-opus-4-1-20250805"  # same model passed to messages.create above
136 | }
137 | else:
138 | logger.warning("No JSON array found in Claude response")
139 | return {"ads": [], "raw_response": response_text, "error": "No JSON found"}
140 |
141 | except json.JSONDecodeError as e:
142 | logger.error(f"Failed to parse JSON from Claude response: {e}")
143 | return {"ads": [], "raw_response": response_text, "error": str(e)}
144 |
145 | except Exception as e:
146 | logger.error(f"Ad detection failed: {e}")
147 | return {"ads": [], "error": str(e)}
148 |
149 | def process_transcript(self, segments: List[Dict], podcast_name: str = "Unknown", episode_title: str = "Unknown", slug: str = None, episode_id: str = None) -> Dict:
150 | """Process transcript for ad detection."""
151 | result = self.detect_ads(segments, podcast_name, episode_title, slug, episode_id)
152 | if result is None:
153 | return {"ads": [], "error": "Detection failed"}
154 | return result
--------------------------------------------------------------------------------
/src/transcriber.py:
--------------------------------------------------------------------------------
1 | """Transcription using Faster Whisper."""
2 | import logging
3 | import tempfile
4 | import os
5 | import requests
6 | from typing import List, Dict, Optional, Tuple
7 | from pathlib import Path
8 | from faster_whisper import WhisperModel, BatchedInferencePipeline
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 | class WhisperModelSingleton:
13 | _instance = None
14 | _base_model = None
15 |
16 | @classmethod
17 | def get_instance(cls) -> Tuple[WhisperModel, BatchedInferencePipeline]:
18 | """
19 | Get both the base model and batched pipeline instance
20 | Returns:
21 | Tuple[WhisperModel, BatchedInferencePipeline]: Base model for operations like language detection,
22 | and batched pipeline for transcription
23 | """
24 | if cls._instance is None:
25 | model_size = os.getenv("WHISPER_MODEL", "small")
26 | device = os.getenv("WHISPER_DEVICE", "cpu")
27 |
28 | # Set compute type based on device
29 | if device == "cuda":
30 | compute_type = "float16" # Use FP16 for GPU
31 | logger.info(f"Initializing Whisper model: {model_size} on CUDA with float16")
32 | else:
33 | compute_type = "int8" # Use INT8 for CPU
34 | logger.info(f"Initializing Whisper model: {model_size} on CPU with int8")
35 |
36 | # Initialize base model
37 | cls._base_model = WhisperModel(
38 | model_size,
39 | device=device,
40 | compute_type=compute_type,
41 | )
42 |
43 | # Initialize batched pipeline
44 | cls._instance = BatchedInferencePipeline(
45 | cls._base_model
46 | )
47 | logger.info("Whisper model and batched pipeline initialized")
48 |
49 | return cls._base_model, cls._instance
50 |
51 | @classmethod
52 | def get_base_model(cls) -> WhisperModel:
53 | """
54 | Get just the base model for operations like language detection
55 | Returns:
56 | WhisperModel: Base Whisper model
57 | """
58 | if cls._base_model is None:
59 | cls.get_instance()
60 | return cls._base_model
61 |
62 | @classmethod
63 | def get_batched_pipeline(cls) -> BatchedInferencePipeline:
64 | """
65 | Get just the batched pipeline for transcription
66 | Returns:
67 | BatchedInferencePipeline: Batched pipeline for efficient transcription
68 | """
69 | if cls._instance is None:
70 | cls.get_instance()
71 | return cls._instance
72 |
73 | class Transcriber:
74 | def __init__(self):
75 | # Model is now managed by singleton
76 | pass
77 |
78 | def download_audio(self, url: str, timeout: int = 600) -> Optional[str]:
79 | """Download audio file from URL."""
80 | try:
81 | logger.info(f"Downloading audio from: {url}")
82 | response = requests.get(url, stream=True, timeout=timeout)
83 | response.raise_for_status()
84 |
85 | # Check file size
86 | content_length = response.headers.get('Content-Length')
87 | if content_length:
88 | size_mb = int(content_length) / (1024 * 1024)
89 | if size_mb > 500:
90 | logger.error(f"Audio file too large: {size_mb:.1f}MB (max 500MB)")
91 | return None
92 | logger.info(f"Audio file size: {size_mb:.1f}MB")
93 |
94 | # Save to temp file
95 | with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as tmp:
96 | for chunk in response.iter_content(chunk_size=8192):
97 | tmp.write(chunk)
98 | temp_path = tmp.name
99 |
100 | logger.info(f"Downloaded audio to: {temp_path}")
101 | return temp_path
102 | except Exception as e:
103 | logger.error(f"Failed to download audio: {e}")
104 | return None
105 |
106 |     def transcribe(self, audio_path: str) -> Optional[List[Dict]]:
107 | """Transcribe audio file using Faster Whisper with batched pipeline."""
108 | try:
109 | # Get the batched pipeline for efficient transcription
110 | model = WhisperModelSingleton.get_batched_pipeline()
111 |
112 | logger.info(f"Starting transcription of: {audio_path}")
113 |
114 | # Create a simple prompt for podcast context
115 | initial_prompt = "This is a podcast episode."
116 |
117 | # Adjust batch size based on device
118 | device = os.getenv("WHISPER_DEVICE", "cpu")
119 | if device == "cuda":
120 | batch_size = 16 # Larger batch for GPU
121 | logger.info("Using GPU-optimized batch size: 16")
122 | else:
123 | batch_size = 8 # Smaller batch for CPU
124 |
125 | # Use the batched pipeline for transcription
126 | segments_generator, info = model.transcribe(
127 | audio_path,
128 | language="en",
129 | initial_prompt=initial_prompt,
130 | beam_size=5,
131 | batch_size=batch_size,
132 | vad_filter=True, # Enable VAD filter to skip silent parts
133 | vad_parameters=dict(
134 | min_silence_duration_ms=500,
135 | speech_pad_ms=400
136 | )
137 | )
138 |
139 | # Collect segments with real-time progress logging
140 | result = []
141 | segment_count = 0
142 | last_log_time = 0
143 |
144 | for segment in segments_generator:
145 | segment_count += 1
146 | segment_dict = {
147 | "start": segment.start,
148 | "end": segment.end,
149 | "text": segment.text.strip()
150 | }
151 | result.append(segment_dict)
152 |
153 | # Log progress every 10 segments
154 | if segment_count % 10 == 0:
155 | progress_min = segment.end / 60
156 | logger.info(f"Transcription progress: {segment_count} segments, {progress_min:.1f} minutes processed")
157 |
158 | # Log every 30 seconds of audio processed
159 | if segment.end - last_log_time > 30:
160 | last_log_time = segment.end
161 | # Log the last segment's text (truncated)
162 | text_preview = segment.text.strip()[:100] + "..." if len(segment.text.strip()) > 100 else segment.text.strip()
163 | logger.info(f"[{self.format_timestamp(segment.start)}] {text_preview}")
164 |
165 | duration_min = result[-1]['end'] / 60 if result else 0
166 | logger.info(f"Transcription completed: {len(result)} segments, {duration_min:.1f} minutes")
167 | return result
168 | except Exception as e:
169 | logger.error(f"Transcription failed: {e}")
170 | return None
171 |
172 | def format_timestamp(self, seconds: float) -> str:
173 | """Convert seconds to timestamp format."""
174 | hours = int(seconds // 3600)
175 | minutes = int((seconds % 3600) // 60)
176 | secs = seconds % 60
177 | return f"{hours:02d}:{minutes:02d}:{secs:06.3f}"
178 |
179 | def segments_to_text(self, segments: List[Dict]) -> str:
180 | """Convert segments to readable text format."""
181 | lines = []
182 | for segment in segments:
183 | start_ts = self.format_timestamp(segment['start'])
184 | end_ts = self.format_timestamp(segment['end'])
185 | lines.append(f"[{start_ts} --> {end_ts}] {segment['text']}")
186 | return '\n'.join(lines)
187 |
188 | def process_episode(self, episode_url: str) -> Optional[Dict]:
189 | """Complete transcription pipeline for an episode."""
190 | audio_path = None
191 | try:
192 | # Download audio
193 | audio_path = self.download_audio(episode_url)
194 | if not audio_path:
195 | return None
196 |
197 | # Transcribe
198 | segments = self.transcribe(audio_path)
199 | if not segments:
200 | return None
201 |
202 | # Format transcript
203 | transcript_text = self.segments_to_text(segments)
204 |
205 | return {
206 | "segments": segments,
207 | "transcript": transcript_text,
208 | "segment_count": len(segments),
209 | "duration": segments[-1]['end'] if segments else 0
210 | }
211 | finally:
212 | # Clean up temp file
213 | if audio_path and os.path.exists(audio_path):
214 | try:
215 | os.unlink(audio_path)
216 | logger.info(f"Cleaned up temp file: {audio_path}")
217 | except:
218 | pass
--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
1 | """Main Flask web server for podcast ad removal."""
2 | import logging
3 | import json
4 | import os
5 | import threading
6 | import time
7 | from datetime import datetime
8 | from pathlib import Path
9 | from flask import Flask, Response, send_file, abort
10 | from slugify import slugify
11 | import shutil
12 |
13 | from storage import Storage
14 | from rss_parser import RSSParser
15 | from transcriber import Transcriber
16 | from ad_detector import AdDetector
17 | from audio_processor import AudioProcessor
18 |
19 | # Configure logging to both file and console
20 | logging.basicConfig(
21 | level=logging.INFO,
22 | format='[%(asctime)s] [%(levelname)s] %(message)s',
23 | datefmt='%Y-%m-%d %H:%M:%S',
24 | handlers=[
25 | logging.FileHandler('/app/data/server.log'),
26 | logging.StreamHandler() # Keep console output for Docker logs
27 | ]
28 | )
29 | logger = logging.getLogger(__name__)
30 |
31 | # Initialize Flask app
32 | app = Flask(__name__)
33 |
34 | # Initialize components
35 | storage = Storage()
36 | rss_parser = RSSParser()
37 | transcriber = Transcriber()
38 | ad_detector = AdDetector()
39 | audio_processor = AudioProcessor()
40 |
41 | # Load feed configuration
42 | def load_feeds():
43 | """Load feed configuration from JSON."""
44 | config_path = Path("./config/feeds.json")
45 | if not config_path.exists():
46 | logger.error("feeds.json not found")
47 | return []
48 |
49 | try:
50 | with open(config_path, 'r') as f:
51 | feeds = json.load(f)
52 | logger.info(f"Loaded {len(feeds)} feed configurations")
53 | return feeds
54 | except Exception as e:
55 | logger.error(f"Failed to load feeds.json: {e}")
56 | return []
57 |
58 | def reload_feeds():
59 | """Reload feed configuration and update global FEED_MAP."""
60 | global FEEDS, FEED_MAP
61 | FEEDS = load_feeds()
62 | FEED_MAP = {slugify(feed['out'].strip('/')): feed for feed in FEEDS}
63 | logger.info(f"Reloaded feeds: {list(FEED_MAP.keys())}")
64 | return FEED_MAP
65 |
66 | # Initial load of feed configuration
67 | FEEDS = load_feeds()
68 | FEED_MAP = {slugify(feed['out'].strip('/')): feed for feed in FEEDS}
69 |
70 | def refresh_rss_feed(slug: str, feed_url: str):
71 | """Refresh RSS feed for a podcast."""
72 | try:
73 | logger.info(f"[{slug}] Starting RSS refresh from: {feed_url}")
74 |
75 | # Fetch original RSS
76 | feed_content = rss_parser.fetch_feed(feed_url)
77 | if not feed_content:
78 | logger.error(f"[{slug}] Failed to fetch RSS feed")
79 | return False
80 |
81 | # Modify feed URLs
82 | modified_rss = rss_parser.modify_feed(feed_content, slug)
83 |
84 | # Save modified RSS
85 | storage.save_rss(slug, modified_rss)
86 |
87 | # Update last_checked timestamp
88 | data = storage.load_data_json(slug)
89 | data['last_checked'] = datetime.utcnow().isoformat() + 'Z'
90 | storage.save_data_json(slug, data)
91 |
92 | logger.info(f"[{slug}] RSS refresh complete")
93 | return True
94 | except Exception as e:
95 | logger.error(f"[{slug}] RSS refresh failed: {e}")
96 | return False
97 |
98 | def refresh_all_feeds():
99 | """Refresh all RSS feeds once (no loop)."""
100 | try:
101 | logger.info("Refreshing all RSS feeds")
102 | # Reload feeds.json to pick up any changes
103 | reload_feeds()
104 |
105 | for slug, feed_info in FEED_MAP.items():
106 | refresh_rss_feed(slug, feed_info['in'])
107 | logger.info("RSS refresh complete")
108 | return True
109 | except Exception as e:
110 | logger.error(f"RSS refresh failed: {e}")
111 | return False
112 |
113 | def background_rss_refresh():
114 | """Background task to refresh RSS feeds every 15 minutes."""
115 | while True:
116 | refresh_all_feeds()
117 | # Wait 15 minutes
118 | time.sleep(900)
119 |
120 | def process_episode(slug: str, episode_id: str, episode_url: str, episode_title: str = "Unknown", podcast_name: str = "Unknown"):
121 | """Process a single episode (transcribe, detect ads, remove ads)."""
122 | start_time = time.time()
123 |
124 | try:
125 | # Log start with title
126 | logger.info(f"[{slug}:{episode_id}] Starting: \"{episode_title}\"")
127 |
128 | # Update status to processing
129 | data = storage.load_data_json(slug)
130 | data['episodes'][episode_id] = {
131 | 'status': 'processing',
132 | 'original_url': episode_url,
133 | 'title': episode_title,
134 | 'processed_at': datetime.utcnow().isoformat() + 'Z'
135 | }
136 | storage.save_data_json(slug, data)
137 |
138 | # Step 1: Check if transcript exists
139 | transcript_path = storage.get_episode_path(slug, episode_id, "-transcript.txt")
140 | segments = None
141 | transcript_text = None
142 |
143 | if transcript_path.exists():
144 | logger.info(f"[{slug}:{episode_id}] Found existing transcript, skipping transcription")
145 | # Load existing transcript
146 | with open(transcript_path, 'r') as f:
147 | transcript_text = f.read()
148 | # Parse segments from transcript
149 | segments = []
150 | for line in transcript_text.split('\n'):
151 | if line.strip() and line.startswith('['):
152 | # Parse format: [00:00:00.000 --> 00:00:05.200] text
153 | try:
154 | time_part, text_part = line.split('] ', 1)
155 | time_range = time_part.strip('[')
156 | start_str, end_str = time_range.split(' --> ')
157 | # Convert timestamp to seconds
158 | def parse_timestamp(ts):
159 | parts = ts.split(':')
160 | return float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2])
161 | segments.append({
162 | 'start': parse_timestamp(start_str),
163 | 'end': parse_timestamp(end_str),
164 | 'text': text_part
165 | })
166 | except:
167 | continue
168 |
169 | if segments:
170 | segment_count = len(segments)
171 | duration_min = segments[-1]['end'] / 60 if segments else 0
172 | logger.info(f"[{slug}:{episode_id}] Loaded transcript: {segment_count} segments, {duration_min:.1f} minutes")
173 |
174 | # Still need to download audio for processing
175 | audio_path = transcriber.download_audio(episode_url)
176 | if not audio_path:
177 | raise Exception("Failed to download audio")
178 | else:
179 | # Download and transcribe
180 | logger.info(f"[{slug}:{episode_id}] Downloading audio")
181 | audio_path = transcriber.download_audio(episode_url)
182 | if not audio_path:
183 | raise Exception("Failed to download audio")
184 |
185 | logger.info(f"[{slug}:{episode_id}] Starting transcription")
186 | segments = transcriber.transcribe(audio_path)
187 | if not segments:
188 | raise Exception("Failed to transcribe audio")
189 |
190 | segment_count = len(segments)
191 | duration_min = segments[-1]['end'] / 60 if segments else 0
192 | logger.info(f"[{slug}:{episode_id}] Transcription completed: {segment_count} segments, {duration_min:.1f} minutes")
193 |
194 | # Save transcript
195 | transcript_text = transcriber.segments_to_text(segments)
196 | storage.save_transcript(slug, episode_id, transcript_text)
197 |
198 | try:
199 |
200 | # Step 2: Detect ads
201 | logger.info(f"[{slug}:{episode_id}] Sending to Claude API - Podcast: {podcast_name}, Episode: {episode_title}")
202 | ad_result = ad_detector.process_transcript(segments, podcast_name, episode_title, slug, episode_id)
203 | storage.save_ads_json(slug, episode_id, ad_result)
204 |
205 | ads = ad_result.get('ads', [])
206 | if ads:
207 | total_ad_time = sum(ad['end'] - ad['start'] for ad in ads)
208 | logger.info(f"[{slug}:{episode_id}] Claude detected {len(ads)} ad segments (total {total_ad_time/60:.1f} minutes)")
209 | else:
210 | logger.info(f"[{slug}:{episode_id}] No ads detected")
211 |
212 | # Step 3: Process audio to remove ads
213 | logger.info(f"[{slug}:{episode_id}] Starting FFMPEG")
214 | processed_path = audio_processor.process_episode(audio_path, ads)
215 | if not processed_path:
216 | raise Exception("Failed to process audio with FFMPEG")
217 |
218 | # Get durations for logging
219 | original_duration = audio_processor.get_audio_duration(audio_path)
220 | new_duration = audio_processor.get_audio_duration(processed_path)
221 |
222 | # Move processed file to final location
223 | final_path = storage.get_episode_path(slug, episode_id)
224 | shutil.move(processed_path, final_path)
225 |
226 | # Update status to processed
227 | data = storage.load_data_json(slug)
228 | data['episodes'][episode_id] = {
229 | 'status': 'processed',
230 | 'original_url': episode_url,
231 | 'title': episode_title,
232 | 'processed_file': f"episodes/{episode_id}.mp3",
233 | 'processed_at': datetime.utcnow().isoformat() + 'Z',
234 | 'original_duration': original_duration,
235 | 'new_duration': new_duration,
236 | 'ads_removed': len(ads)
237 | }
238 | storage.save_data_json(slug, data)
239 |
240 | # Calculate processing time
241 | processing_time = time.time() - start_time
242 |
243 | # Final summary log
244 | if original_duration and new_duration:
245 | time_saved = original_duration - new_duration
246 | logger.info(f"[{slug}:{episode_id}] Complete: \"{episode_title}\" | {original_duration/60:.1f}→{new_duration/60:.1f}min | {len(ads)} ads removed | {processing_time:.1f}s")
247 | else:
248 | logger.info(f"[{slug}:{episode_id}] Complete: \"{episode_title}\" | {len(ads)} ads removed | {processing_time:.1f}s")
249 |
250 | return True
251 |
252 | finally:
253 | # Clean up temp audio file
254 | if os.path.exists(audio_path):
255 | os.unlink(audio_path)
256 |
257 | except Exception as e:
258 | processing_time = time.time() - start_time
259 | logger.error(f"[{slug}:{episode_id}] Failed: \"{episode_title}\" | Error: {e} | {processing_time:.1f}s")
260 |
261 | # Update status to failed
262 | data = storage.load_data_json(slug)
263 | data['episodes'][episode_id] = {
264 | 'status': 'failed',
265 | 'original_url': episode_url,
266 | 'title': episode_title,
267 | 'error': str(e),
268 | 'failed_at': datetime.utcnow().isoformat() + 'Z'
269 | }
270 | storage.save_data_json(slug, data)
271 | return False
272 |
273 | @app.route('/<slug>')
274 | def serve_rss(slug):
275 | """Serve modified RSS feed."""
276 | if slug not in FEED_MAP:
277 | # Refresh all feeds to pick up any new ones
278 | logger.info(f"[{slug}] Not found in feeds, refreshing all")
279 | refresh_all_feeds()
280 |
281 | # Check again after refresh
282 | if slug not in FEED_MAP:
283 | logger.warning(f"[{slug}] Still not found after refresh")
284 | abort(404)
285 |
286 | # Check if RSS cache exists or is stale
287 | cached_rss = storage.get_rss(slug)
288 | data = storage.load_data_json(slug)
289 | last_checked = data.get('last_checked')
290 |
291 | # If no cache or stale (>15 min), refresh immediately
292 | should_refresh = False
293 | if not cached_rss:
294 | should_refresh = True
295 | logger.info(f"[{slug}] No RSS cache, fetching immediately")
296 | elif last_checked:
297 | try:
298 | last_time = datetime.fromisoformat(last_checked.replace('Z', '+00:00'))
299 | age_minutes = (datetime.utcnow() - last_time.replace(tzinfo=None)).total_seconds() / 60
300 | if age_minutes > 15:
301 | should_refresh = True
302 | logger.info(f"[{slug}] RSS cache stale ({age_minutes:.1f} minutes old), refreshing")
303 | except:
304 | should_refresh = True
305 |
306 | if should_refresh:
307 | refresh_rss_feed(slug, FEED_MAP[slug]['in'])
308 | cached_rss = storage.get_rss(slug)
309 |
310 | if cached_rss:
311 | logger.info(f"[{slug}] Serving RSS feed")
312 | return Response(cached_rss, mimetype='application/rss+xml')
313 | else:
314 | logger.error(f"[{slug}] RSS feed not available")
315 | abort(503)
316 |
317 | @app.route('/episodes/<slug>/<episode_id>.mp3')
318 | def serve_episode(slug, episode_id):
319 | """Serve processed episode audio (JIT processing)."""
320 | if slug not in FEED_MAP:
321 | # Refresh all feeds to pick up any new ones
322 | logger.info(f"[{slug}] Not found in feeds for episode {episode_id}, refreshing all")
323 | refresh_all_feeds()
324 |
325 | # Check again after refresh
326 | if slug not in FEED_MAP:
327 | logger.warning(f"[{slug}] Still not found after refresh for episode {episode_id}")
328 | abort(404)
329 |
330 | # Validate episode ID (alphanumeric + dash/underscore)
331 | if not all(c.isalnum() or c in '-_' for c in episode_id):
332 | logger.warning(f"[{slug}] Invalid episode ID: {episode_id}")
333 | abort(400)
334 |
335 | # Check episode status
336 | data = storage.load_data_json(slug)
337 | episode_info = data['episodes'].get(episode_id, {})
338 | status = episode_info.get('status')
339 |
340 | if status == 'processed':
341 | # Serve cached processed file
342 | file_path = storage.get_episode_path(slug, episode_id)
343 | if file_path.exists():
344 | logger.info(f"[{slug}:{episode_id}] Cache hit, serving processed file")
345 | return send_file(file_path, mimetype='audio/mpeg')
346 | else:
347 | logger.error(f"[{slug}:{episode_id}] Processed file missing")
348 | status = None # Reprocess
349 |
350 | elif status == 'failed':
351 | # Always retry processing instead of serving fallback
352 | logger.info(f"[{slug}:{episode_id}] Previous failure detected, retrying processing")
353 | status = None # Reset status to trigger reprocessing
354 |
355 | elif status == 'processing':
356 | # Already processing, return temporary unavailable
357 | logger.info(f"[{slug}:{episode_id}] Episode currently processing")
358 | abort(503)
359 |
360 | # Status is None or unknown - need to process
361 | # First, we need to find the original URL from the RSS feed
362 | cached_rss = storage.get_rss(slug)
363 | if not cached_rss:
364 | logger.error(f"[{slug}:{episode_id}] No RSS feed available")
365 | abort(404)
366 |
367 | # Parse RSS to find original URL
368 | original_feed = rss_parser.fetch_feed(FEED_MAP[slug]['in'])
369 | if not original_feed:
370 | logger.error(f"[{slug}:{episode_id}] Could not fetch original RSS")
371 | abort(503)
372 |
373 | # Parse the feed to get podcast name
374 | parsed_feed = rss_parser.parse_feed(original_feed)
375 | podcast_name = parsed_feed.feed.get('title', 'Unknown') if parsed_feed else 'Unknown'
376 |
377 | episodes = rss_parser.extract_episodes(original_feed)
378 | original_url = None
379 | episode_title = "Unknown"
380 | for ep in episodes:
381 | if ep['id'] == episode_id:
382 | original_url = ep['url']
383 | episode_title = ep.get('title', 'Unknown')
384 | break
385 |
386 | if not original_url:
387 | logger.error(f"[{slug}:{episode_id}] Episode not found in RSS feed")
388 | abort(404)
389 |
390 | logger.info(f"[{slug}:{episode_id}] Starting new processing for {podcast_name}")
391 |
392 | # Process episode (blocking)
393 | if process_episode(slug, episode_id, original_url, episode_title, podcast_name):
394 | # Serve the newly processed file
395 | file_path = storage.get_episode_path(slug, episode_id)
396 | if file_path.exists():
397 | return send_file(file_path, mimetype='audio/mpeg')
398 |
399 | # Processing failed, serve original
400 | logger.info(f"[{slug}:{episode_id}] Processing failed, serving original")
401 | return Response(status=302, headers={'Location': original_url})
402 |
403 | @app.route('/health')
404 | def health_check():
405 | """Health check endpoint."""
406 | return {'status': 'ok', 'feeds': len(FEEDS)}
407 |
408 | if __name__ == '__main__':
409 | # Log BASE_URL configuration
410 | base_url = os.getenv('BASE_URL', 'http://localhost:8000')
411 | logger.info(f"BASE_URL configured as: {base_url}")
412 |
413 | # Start background RSS refresh thread
414 | refresh_thread = threading.Thread(target=background_rss_refresh, daemon=True)
415 | refresh_thread.start()
416 | logger.info("Started background RSS refresh thread")
417 |
418 | # Do initial RSS refresh for all feeds
419 | logger.info("Performing initial RSS refresh for all feeds")
420 | for slug, feed_info in FEED_MAP.items():
421 | refresh_rss_feed(slug, feed_info['in'])
422 | logger.info(f"Feed available at: {base_url}/{slug}")
423 |
424 | # Start Flask server
425 | logger.info("Starting Flask server on port 8000")
426 | app.run(host='0.0.0.0', port=8000, debug=False)
--------------------------------------------------------------------------------