├── assets ├── replace.mp3 ├── replace_old.mp3 └── README.md ├── .gitignore ├── config └── feeds-example.json ├── requirements.txt ├── docker-compose.cpu.yml ├── docker-compose.yml ├── Dockerfile.cpu ├── LICENSE ├── Dockerfile ├── README.md └── src ├── storage.py ├── rss_parser.py ├── audio_processor.py ├── ad_detector.py ├── transcriber.py └── main.py /assets/replace.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemant6488/podcast-server/HEAD/assets/replace.mp3 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | .env 3 | config/feeds.json 4 | planning/ 5 | __pycache__/ 6 | *.pyc 7 | .DS_Store -------------------------------------------------------------------------------- /assets/replace_old.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemant6488/podcast-server/HEAD/assets/replace_old.mp3 -------------------------------------------------------------------------------- /config/feeds-example.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "in": "https://example.com/podcast/feed.rss", 4 | "out": "/mypodcast" 5 | } 6 | ] 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | faster-whisper>=1.1.0 2 | ctranslate2==4.4.0 3 | anthropic==0.49.0 4 | feedparser==6.0.11 5 | flask==3.0.3 6 | requests==2.32.3 7 | python-slugify==8.0.4 -------------------------------------------------------------------------------- /assets/README.md: -------------------------------------------------------------------------------- 1 | # Assets Directory 2 | 3 | Place your `replace.mp3` file here. This should be a 1-second beep/tone audio file that will replace advertisement segments in podcasts. 4 | 5 | The file should be named exactly: `replace.mp3` -------------------------------------------------------------------------------- /docker-compose.cpu.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | podcast-server: 5 | build: 6 | context: . 7 | dockerfile: Dockerfile.cpu 8 | ports: 9 | - "8000:8000" 10 | volumes: 11 | - ./data:/app/data 12 | - ./config:/app/config:ro 13 | - ./assets:/app/assets:ro 14 | environment: 15 | - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} 16 | - WHISPER_MODEL=small 17 | - BASE_URL=${BASE_URL:-http://localhost:8000} 18 | - WHISPER_DEVICE=cpu 19 | restart: unless-stopped 20 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | podcast-server: 5 | build: . 
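    # NOTE: this compose file assumes a host with an NVIDIA GPU and the NVIDIA
    # container toolkit (see the nvidia device reservation below and the README
    # "Requirements" section); use docker-compose.cpu.yml on machines without one.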
6 | ports: 7 | - "8000:8000" 8 | volumes: 9 | - ./data:/app/data 10 | - ./config:/app/config:ro 11 | - ./assets:/app/assets:ro 12 | environment: 13 | - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} 14 | - WHISPER_MODEL=small 15 | - BASE_URL=${BASE_URL:-http://localhost:8000} 16 | - WHISPER_DEVICE=cuda 17 | deploy: 18 | resources: 19 | reservations: 20 | devices: 21 | - driver: nvidia 22 | count: 1 23 | capabilities: [gpu] 24 | restart: unless-stopped -------------------------------------------------------------------------------- /Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | # Install system dependencies 4 | RUN apt-get update && apt-get install -y \ 5 | ffmpeg \ 6 | && rm -rf /var/lib/apt/lists/* 7 | 8 | # Set working directory 9 | WORKDIR /app 10 | 11 | # Copy requirements first for better caching 12 | COPY requirements.txt . 13 | 14 | # Install Python dependencies 15 | RUN pip install --no-cache-dir -r requirements.txt 16 | 17 | # Pre-download Faster Whisper model 18 | ENV WHISPER_MODEL=small 19 | RUN python3 -c "import os; from faster_whisper import download_model; download_model(os.getenv('WHISPER_MODEL', 'small'))" 20 | 21 | # Copy application code 22 | COPY src/ ./src/ 23 | COPY config/ ./config/ 24 | COPY assets/ ./assets/ 25 | 26 | # Create data directory 27 | RUN mkdir -p /app/data 28 | 29 | # Expose port 30 | EXPOSE 8000 31 | 32 | # Run the application 33 | CMD ["python", "src/main.py"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Hemant Kumar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04 2 | 3 | # Install Python 3.11 and system dependencies 4 | RUN apt-get update && apt-get install -y \ 5 | python3.11 \ 6 | python3.11-dev \ 7 | python3-pip \ 8 | ffmpeg \ 9 | wget \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | # Set python3.11 as default python3 13 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 14 | RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 15 | 16 | # Set working directory 17 | WORKDIR /app 18 | 19 | # Copy requirements first for better caching 20 | COPY requirements.txt . 21 | 22 | # Install Python dependencies 23 | RUN pip install --no-cache-dir -r requirements.txt 24 | 25 | # Pre-download Faster Whisper model 26 | ENV WHISPER_MODEL=small 27 | RUN python3 -c "import os; from faster_whisper import download_model; download_model(os.getenv('WHISPER_MODEL', 'small'))" 28 | 29 | # Copy application code 30 | COPY src/ ./src/ 31 | COPY config/ ./config/ 32 | COPY assets/ ./assets/ 33 | 34 | # Create data directory 35 | RUN mkdir -p /app/data 36 | 37 | # Expose port 38 | EXPOSE 8000 39 | 40 | # Run the application 41 | CMD ["python", "src/main.py"] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Podcast Ad Removal Server 2 | 3 | Removes ads from podcasts using Whisper transcription. Serves modified RSS feeds that work with any podcast app. 4 | 5 | > **Disclaimer:** This tool is for personal use only. Only use it with podcasts you have permission to modify or where such modification is permitted under applicable laws. Respect content creators and their terms of service. 6 | 7 | ## How It Works 8 | 9 | 1. **Transcription** - Whisper converts audio to text with timestamps 10 | 2. **Ad Detection** - Claude API analyzes transcript to identify ad segments 11 | 3. **Audio Processing** - FFmpeg removes detected ads and inserts short audio markers 12 | 4. **Serving** - Flask serves modified RSS feeds and processed audio files 13 | 14 | Processing happens on-demand when you play an episode. First play takes a few minutes, subsequent plays are instant (cached). 15 | 16 | ## Requirements 17 | 18 | - Docker with NVIDIA GPU support (for Whisper) 19 | - Anthropic API key 20 | 21 | ## Setup 22 | 23 | ```bash 24 | # 1. Create environment file 25 | echo "ANTHROPIC_API_KEY=your-key-here" > .env 26 | 27 | # 2. Configure feeds 28 | cp config/feeds-example.json config/feeds.json 29 | # Edit config/feeds.json with your podcast RSS URLs 30 | 31 | # 3. Run (GPU) 32 | docker-compose up --build 33 | 34 | # Or for CPU-only mode (no NVIDIA GPU required) 35 | docker-compose -f docker-compose.cpu.yml up --build 36 | ``` 37 | 38 | ### CPU-Only Mode 39 | 40 | CPU transcription is significantly slower—processing can take longer than the episode duration. Since episodes are processed on-demand when you play them, your podcast app will likely timeout waiting for the first request. To work around this: 41 | 42 | 1. Tap download/play on an episode to trigger processing 43 | 2. The request will timeout, but processing continues in the background 44 | 3. Wait a few minutes (check `docker logs` for progress) for the file to get processed 45 | 4. 
Try playing again, the processed file will be served from cache 46 | 47 | ## Configuration 48 | 49 | Edit `config/feeds.json`: 50 | ```json 51 | [ 52 | { 53 | "in": "https://example.com/podcast/feed.rss", 54 | "out": "/mypodcast" 55 | } 56 | ] 57 | ``` 58 | 59 | - `in` - Original podcast RSS feed URL 60 | - `out` - URL path for your modified feed (e.g., `/mypodcast` → `http://localhost:8000/mypodcast`) 61 | 62 | ## Finding Podcast RSS Feeds 63 | 64 | Most podcasts publish RSS feeds. Common ways to find them: 65 | 66 | 1. **Podcast website** - Look for "RSS" link in footer or subscription options 67 | 2. **Apple Podcasts** - Search on [podcastindex.org](https://podcastindex.org) using the Apple Podcasts URL 68 | 3. **Spotify-exclusive** - Not available (Spotify doesn't expose RSS feeds) 69 | 4. **Hosting platforms** - Common patterns: 70 | - Libsyn: `https://showname.libsyn.com/rss` 71 | - Spreaker: `https://www.spreaker.com/show/{id}/episodes/feed` 72 | - Omny: Check page source for `omnycontent.com` URLs 73 | 74 | ## Usage 75 | 76 | Add your modified feed URL to any podcast app: 77 | ``` 78 | http://your-server:8000/mypodcast 79 | ``` 80 | 81 | ## Environment Variables 82 | 83 | | Variable | Default | Description | 84 | |----------|---------|-------------| 85 | | `ANTHROPIC_API_KEY` | required | Claude API key | 86 | | `BASE_URL` | `http://localhost:8000` | Public URL for generated feed links | 87 | | `WHISPER_MODEL` | `small` | Whisper model size (tiny/base/small/medium/large) | 88 | | `WHISPER_DEVICE` | `cuda` | Device for Whisper (cuda/cpu) | 89 | -------------------------------------------------------------------------------- /src/storage.py: -------------------------------------------------------------------------------- 1 | """Storage management with dynamic directory creation.""" 2 | import os 3 | import json 4 | import logging 5 | from pathlib import Path 6 | from typing import Dict, Any, Optional 7 | import tempfile 8 | import shutil 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class Storage: 13 | def __init__(self, data_dir: str = "/app/data"): 14 | self.data_dir = Path(data_dir) 15 | # Ensure base data directory exists 16 | self.data_dir.mkdir(exist_ok=True) 17 | logger.info(f"Storage initialized with data_dir: {self.data_dir}") 18 | 19 | def get_podcast_dir(self, slug: str) -> Path: 20 | """Get podcast directory, creating if necessary.""" 21 | podcast_dir = self.data_dir / slug 22 | podcast_dir.mkdir(exist_ok=True) 23 | 24 | # Ensure episodes directory exists 25 | episodes_dir = podcast_dir / "episodes" 26 | episodes_dir.mkdir(exist_ok=True) 27 | 28 | logger.info(f"[{slug}] Podcast directory ready: {podcast_dir}") 29 | return podcast_dir 30 | 31 | def load_data_json(self, slug: str) -> Dict[str, Any]: 32 | """Load data.json for a podcast, creating if necessary.""" 33 | podcast_dir = self.get_podcast_dir(slug) 34 | data_file = podcast_dir / "data.json" 35 | 36 | if data_file.exists(): 37 | try: 38 | with open(data_file, 'r') as f: 39 | data = json.load(f) 40 | logger.info(f"[{slug}] Loaded data.json with {len(data.get('episodes', {}))} episodes") 41 | return data 42 | except json.JSONDecodeError as e: 43 | logger.error(f"[{slug}] Invalid data.json, creating new: {e}") 44 | 45 | # Create default structure 46 | data = { 47 | "episodes": {}, 48 | "last_checked": None 49 | } 50 | self.save_data_json(slug, data) 51 | return data 52 | 53 | def save_data_json(self, slug: str, data: Dict[str, Any]) -> None: 54 | """Save data.json atomically.""" 55 | podcast_dir = 
self.get_podcast_dir(slug) 56 | data_file = podcast_dir / "data.json" 57 | 58 | # Atomic write: write to temp, then rename 59 | with tempfile.NamedTemporaryFile(mode='w', delete=False, dir=podcast_dir, suffix='.tmp') as tmp: 60 | json.dump(data, tmp, indent=2) 61 | tmp_path = tmp.name 62 | 63 | shutil.move(tmp_path, data_file) 64 | logger.info(f"[{slug}] Saved data.json") 65 | 66 | def get_episode_path(self, slug: str, episode_id: str, extension: str = ".mp3") -> Path: 67 | """Get path for episode file.""" 68 | podcast_dir = self.get_podcast_dir(slug) 69 | return podcast_dir / "episodes" / f"{episode_id}{extension}" 70 | 71 | def save_rss(self, slug: str, content: str) -> None: 72 | """Save modified RSS feed.""" 73 | podcast_dir = self.get_podcast_dir(slug) 74 | rss_file = podcast_dir / "modified-rss.xml" 75 | 76 | # Atomic write 77 | with tempfile.NamedTemporaryFile(mode='w', delete=False, dir=podcast_dir, suffix='.tmp') as tmp: 78 | tmp.write(content) 79 | tmp_path = tmp.name 80 | 81 | shutil.move(tmp_path, rss_file) 82 | logger.info(f"[{slug}] Saved modified RSS feed") 83 | 84 | def get_rss(self, slug: str) -> Optional[str]: 85 | """Get cached RSS feed.""" 86 | podcast_dir = self.get_podcast_dir(slug) 87 | rss_file = podcast_dir / "modified-rss.xml" 88 | 89 | if rss_file.exists(): 90 | with open(rss_file, 'r') as f: 91 | return f.read() 92 | return None 93 | 94 | def save_transcript(self, slug: str, episode_id: str, transcript: str) -> None: 95 | """Save episode transcript.""" 96 | path = self.get_episode_path(slug, episode_id, "-transcript.txt") 97 | with open(path, 'w') as f: 98 | f.write(transcript) 99 | logger.info(f"[{slug}:{episode_id}] Saved transcript") 100 | 101 | def save_ads_json(self, slug: str, episode_id: str, ads_data: Any) -> None: 102 | """Save Claude's ad detection response.""" 103 | path = self.get_episode_path(slug, episode_id, "-ads.json") 104 | with open(path, 'w') as f: 105 | json.dump(ads_data, f, indent=2) 106 | logger.info(f"[{slug}:{episode_id}] Saved ads detection data") 107 | 108 | def save_prompt(self, slug: str, episode_id: str, prompt: str) -> None: 109 | """Save Claude prompt for debugging.""" 110 | path = self.get_episode_path(slug, episode_id, "-prompt.txt") 111 | with open(path, 'w') as f: 112 | f.write(prompt) 113 | logger.info(f"[{slug}:{episode_id}] Saved Claude prompt") -------------------------------------------------------------------------------- /src/rss_parser.py: -------------------------------------------------------------------------------- 1 | """RSS feed parsing and management.""" 2 | import feedparser 3 | import logging 4 | import hashlib 5 | import os 6 | from datetime import datetime 7 | from typing import Dict, List, Optional 8 | import requests 9 | from slugify import slugify 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class RSSParser: 14 | def __init__(self, base_url: str = None): 15 | self.base_url = base_url or os.getenv('BASE_URL', 'http://localhost:8000') 16 | 17 | def fetch_feed(self, url: str, timeout: int = 30) -> Optional[str]: 18 | """Fetch RSS feed from URL.""" 19 | try: 20 | logger.info(f"Fetching RSS feed from: {url}") 21 | response = requests.get(url, timeout=timeout) 22 | response.raise_for_status() 23 | logger.info(f"Successfully fetched RSS feed, size: {len(response.content)} bytes") 24 | return response.text 25 | except requests.RequestException as e: 26 | logger.error(f"Failed to fetch RSS feed: {e}") 27 | return None 28 | 29 | def parse_feed(self, feed_content: str) -> Dict: 30 | """Parse RSS feed 
content.""" 31 | try: 32 | feed = feedparser.parse(feed_content) 33 | if feed.bozo: 34 | logger.warning(f"RSS parse warning: {feed.bozo_exception}") 35 | 36 | logger.info(f"Parsed RSS feed: {feed.feed.get('title', 'Unknown')} with {len(feed.entries)} entries") 37 | return feed 38 | except Exception as e: 39 | logger.error(f"Failed to parse RSS feed: {e}") 40 | return None 41 | 42 | def generate_episode_id(self, episode_url: str) -> str: 43 | """Generate consistent episode ID from URL.""" 44 | # Use MD5 hash of URL for consistent ID 45 | return hashlib.md5(episode_url.encode()).hexdigest()[:12] 46 | 47 | def modify_feed(self, feed_content: str, slug: str) -> str: 48 | """Modify RSS feed to use our server URLs.""" 49 | feed = self.parse_feed(feed_content) 50 | if not feed: 51 | return feed_content 52 | 53 | # Build modified RSS 54 | lines = [] 55 | lines.append('') 56 | lines.append('') 57 | lines.append('') 58 | 59 | # Copy channel metadata 60 | channel = feed.feed 61 | lines.append(f'{self._escape_xml(channel.get("title", ""))}') 62 | lines.append(f'{self._escape_xml(channel.get("link", ""))}') 63 | lines.append(f'{self._escape_xml(channel.get("description", ""))}') 64 | lines.append(f'{self._escape_xml(channel.get("language", "en"))}') 65 | 66 | # Mark as private feed for personal use only 67 | lines.append('Yes') 68 | 69 | if 'image' in channel: 70 | lines.append(f'') 71 | lines.append(f' {self._escape_xml(channel.image.get("href", ""))}') 72 | lines.append(f' {self._escape_xml(channel.image.get("title", ""))}') 73 | lines.append(f' {self._escape_xml(channel.image.get("link", ""))}') 74 | lines.append(f'') 75 | 76 | # Process each episode 77 | for entry in feed.entries: 78 | episode_url = None 79 | # Find audio URL in enclosures 80 | for enclosure in entry.get('enclosures', []): 81 | if 'audio' in enclosure.get('type', ''): 82 | episode_url = enclosure.get('href', '') 83 | break 84 | 85 | if not episode_url: 86 | # Skip entries without audio 87 | logger.warning(f"Skipping entry without audio: {entry.get('title', 'Unknown')}") 88 | continue 89 | 90 | episode_id = self.generate_episode_id(episode_url) 91 | modified_url = f"{self.base_url}/episodes/{slug}/{episode_id}.mp3" 92 | 93 | lines.append('') 94 | lines.append(f' {self._escape_xml(entry.get("title", ""))}') 95 | lines.append(f' {self._escape_xml(entry.get("description", ""))}') 96 | lines.append(f' {self._escape_xml(entry.get("link", ""))}') 97 | lines.append(f' {self._escape_xml(entry.get("id", episode_url))}') 98 | lines.append(f' {self._escape_xml(entry.get("published", ""))}') 99 | 100 | # Modified enclosure URL 101 | lines.append(f' ') 102 | 103 | # iTunes specific tags 104 | if 'itunes_duration' in entry: 105 | lines.append(f' {entry.itunes_duration}') 106 | if 'itunes_explicit' in entry: 107 | lines.append(f' {entry.itunes_explicit}') 108 | 109 | lines.append('') 110 | 111 | lines.append('') 112 | lines.append('') 113 | 114 | modified_rss = '\n'.join(lines) 115 | logger.info(f"[{slug}] Modified RSS feed with {len(feed.entries)} episodes") 116 | return modified_rss 117 | 118 | def _escape_xml(self, text: str) -> str: 119 | """Escape XML special characters.""" 120 | if not text: 121 | return "" 122 | return (text 123 | .replace('&', '&') 124 | .replace('<', '<') 125 | .replace('>', '>') 126 | .replace('"', '"') 127 | .replace("'", ''')) 128 | 129 | def extract_episodes(self, feed_content: str) -> List[Dict]: 130 | """Extract episode information from feed.""" 131 | feed = self.parse_feed(feed_content) 132 | if not feed: 133 | 
return [] 134 | 135 | episodes = [] 136 | for entry in feed.entries: 137 | episode_url = None 138 | for enclosure in entry.get('enclosures', []): 139 | if 'audio' in enclosure.get('type', ''): 140 | episode_url = enclosure.get('href', '') 141 | break 142 | 143 | if episode_url: 144 | episodes.append({ 145 | 'id': self.generate_episode_id(episode_url), 146 | 'url': episode_url, 147 | 'title': entry.get('title', 'Unknown'), 148 | 'published': entry.get('published', ''), 149 | }) 150 | 151 | return episodes -------------------------------------------------------------------------------- /src/audio_processor.py: -------------------------------------------------------------------------------- 1 | """Audio processing with FFMPEG.""" 2 | import logging 3 | import subprocess 4 | import tempfile 5 | import os 6 | import shutil 7 | from typing import List, Dict, Optional 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class AudioProcessor: 12 | def __init__(self, replace_audio_path: str = "./assets/replace.mp3"): 13 | self.replace_audio_path = replace_audio_path 14 | 15 | def check_ffmpeg(self) -> bool: 16 | """Check if FFMPEG is available.""" 17 | try: 18 | subprocess.run(['ffmpeg', '-version'], 19 | capture_output=True, check=True, timeout=5) 20 | return True 21 | except (subprocess.SubprocessError, FileNotFoundError): 22 | logger.error("FFMPEG not found or not working") 23 | return False 24 | 25 | def get_audio_duration(self, audio_path: str) -> Optional[float]: 26 | """Get duration of audio file in seconds.""" 27 | try: 28 | cmd = [ 29 | 'ffprobe', '-v', 'error', 30 | '-show_entries', 'format=duration', 31 | '-of', 'default=noprint_wrappers=1:nokey=1', 32 | audio_path 33 | ] 34 | result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) 35 | if result.returncode == 0: 36 | return float(result.stdout.strip()) 37 | except Exception as e: 38 | logger.error(f"Failed to get audio duration: {e}") 39 | return None 40 | 41 | def remove_ads(self, input_path: str, ad_segments: List[Dict], output_path: str) -> bool: 42 | """Remove ad segments from audio file.""" 43 | if not ad_segments: 44 | # No ads to remove, just copy file 45 | logger.info("No ads to remove, copying original file") 46 | shutil.copy2(input_path, output_path) 47 | return True 48 | 49 | if not os.path.exists(self.replace_audio_path): 50 | logger.error(f"Replace audio not found: {self.replace_audio_path}") 51 | return False 52 | 53 | try: 54 | # Get total duration 55 | total_duration = self.get_audio_duration(input_path) 56 | if not total_duration: 57 | logger.error("Could not get audio duration") 58 | return False 59 | 60 | logger.info(f"Processing audio: {total_duration:.1f}s total, {len(ad_segments)} ad segments") 61 | 62 | # Sort ad segments by start time 63 | sorted_segments = sorted(ad_segments, key=lambda x: x['start']) 64 | 65 | # Merge segments with < 1 second gaps 66 | merged_ads = [] 67 | current_segment = None 68 | 69 | for ad in sorted_segments: 70 | if current_segment and ad['start'] - current_segment['end'] < 1.0: 71 | # Extend current segment 72 | current_segment['end'] = ad['end'] 73 | if 'reason' in ad: 74 | current_segment['reason'] = current_segment.get('reason', '') + '; ' + ad['reason'] 75 | else: 76 | if current_segment: 77 | merged_ads.append(current_segment) 78 | current_segment = {'start': ad['start'], 'end': ad['end']} 79 | if 'reason' in ad: 80 | current_segment['reason'] = ad['reason'] 81 | 82 | if current_segment: 83 | merged_ads.append(current_segment) 84 | 85 | ads = merged_ads 86 | 
logger.info(f"After merging: {len(ads)} ad segments") 87 | 88 | # Build complex filter for FFMPEG 89 | # Strategy: Split audio into segments, replace ad segments with beep 90 | filter_parts = [] 91 | concat_parts = [] 92 | current_time = 0 93 | segment_idx = 0 94 | 95 | for ad in ads: 96 | ad_start = ad['start'] 97 | ad_end = ad['end'] 98 | 99 | # Add content before ad 100 | if ad_start > current_time: 101 | filter_parts.append(f"[0:a]atrim={current_time}:{ad_start}[s{segment_idx}]") 102 | concat_parts.append(f"[s{segment_idx}]") 103 | segment_idx += 1 104 | 105 | # Add single replacement audio with volume reduction to 40% 106 | filter_parts.append(f"[1:a]volume=0.4[beep{segment_idx}]") 107 | concat_parts.append(f"[beep{segment_idx}]") 108 | 109 | current_time = ad_end 110 | 111 | # Add remaining content after last ad 112 | if current_time < total_duration: 113 | filter_parts.append(f"[0:a]atrim={current_time}:{total_duration}[s{segment_idx}]") 114 | concat_parts.append(f"[s{segment_idx}]") 115 | 116 | # Concatenate all parts 117 | filter_str = ';'.join(filter_parts) 118 | if filter_str: 119 | filter_str += ';' 120 | filter_str += ''.join(concat_parts) + f"concat=n={len(concat_parts)}:v=0:a=1[out]" 121 | 122 | # Run FFMPEG 123 | cmd = [ 124 | 'ffmpeg', '-y', 125 | '-i', input_path, 126 | '-i', self.replace_audio_path, 127 | '-filter_complex', filter_str, 128 | '-map', '[out]', 129 | '-acodec', 'libmp3lame', 130 | '-ab', '128k', 131 | output_path 132 | ] 133 | 134 | logger.info(f"Running FFMPEG to remove ads") 135 | result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) 136 | 137 | if result.returncode != 0: 138 | logger.error(f"FFMPEG failed: {result.stderr}") 139 | return False 140 | 141 | # Verify output 142 | new_duration = self.get_audio_duration(output_path) 143 | if new_duration: 144 | removed_time = total_duration - new_duration 145 | logger.info(f"FFMPEG processing complete: {total_duration:.1f}s → {new_duration:.1f}s (removed {removed_time:.1f}s)") 146 | return True 147 | else: 148 | logger.error("Could not verify output file") 149 | return False 150 | 151 | except subprocess.TimeoutExpired: 152 | logger.error("FFMPEG processing timed out") 153 | return False 154 | except Exception as e: 155 | logger.error(f"Audio processing failed: {e}") 156 | return False 157 | 158 | def process_episode(self, input_path: str, ad_segments: List[Dict]) -> Optional[str]: 159 | """Process episode audio to remove ads.""" 160 | with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as tmp: 161 | temp_output = tmp.name 162 | 163 | try: 164 | if self.remove_ads(input_path, ad_segments, temp_output): 165 | return temp_output 166 | else: 167 | # Clean up on failure 168 | if os.path.exists(temp_output): 169 | os.unlink(temp_output) 170 | return None 171 | except Exception as e: 172 | logger.error(f"Episode processing failed: {e}") 173 | if os.path.exists(temp_output): 174 | os.unlink(temp_output) 175 | return None -------------------------------------------------------------------------------- /src/ad_detector.py: -------------------------------------------------------------------------------- 1 | """Ad detection using Claude API.""" 2 | import logging 3 | import json 4 | import os 5 | from typing import List, Dict, Optional 6 | from anthropic import Anthropic 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class AdDetector: 11 | def __init__(self, api_key: Optional[str] = None): 12 | self.api_key = api_key or os.environ.get('ANTHROPIC_API_KEY') 13 | if not self.api_key: 14 
| logger.warning("No Anthropic API key found") 15 | self.client = None 16 | 17 | def initialize_client(self): 18 | """Initialize Anthropic client.""" 19 | if self.client is None and self.api_key: 20 | try: 21 | from anthropic import Anthropic 22 | self.client = Anthropic(api_key=self.api_key) 23 | logger.info("Anthropic client initialized") 24 | except Exception as e: 25 | logger.error(f"Failed to initialize Anthropic client: {e}") 26 | raise 27 | 28 | def detect_ads(self, segments: List[Dict], podcast_name: str = "Unknown", episode_title: str = "Unknown", slug: str = None, episode_id: str = None) -> Optional[List[Dict]]: 29 | """Detect ad segments using Claude API.""" 30 | if not self.api_key: 31 | logger.warning("Skipping ad detection - no API key") 32 | return [] 33 | 34 | try: 35 | self.initialize_client() 36 | 37 | # Prepare transcript with timestamps for Claude 38 | transcript_lines = [] 39 | for segment in segments: 40 | start = segment['start'] 41 | end = segment['end'] 42 | text = segment['text'] 43 | transcript_lines.append(f"[{start:.1f}s - {end:.1f}s] {text}") 44 | 45 | transcript = "\n".join(transcript_lines) 46 | 47 | # Call Claude API 48 | logger.info(f"Sending transcript to Claude for ad detection: {podcast_name} - {episode_title}") 49 | 50 | prompt = f"""Podcast: {podcast_name} 51 | Episode: {episode_title} 52 | 53 | Transcript: 54 | {transcript} 55 | 56 | INSTRUCTIONS: 57 | Analyze this podcast transcript and identify ALL advertisement segments. Look for: 58 | - Product endorsements, sponsored content, or promotional messages 59 | - Promo codes, special offers, or calls to action 60 | - Clear transitions to/from ads (e.g., "This episode is brought to you by...") 61 | - Host-read advertisements 62 | - Pre-roll, mid-roll, or post-roll ads 63 | - Long intro sections filled with multiple ads before actual content begins 64 | - Mentions of other podcasts/shows from the network (cross-promotion) 65 | - Sponsor messages about credit cards, apps, products, or services 66 | - ANY podcast promos (e.g., "Listen to X on iHeart Radio app") 67 | 68 | CRITICAL MERGING RULES: 69 | 1. Analyze the FULL transcript before deciding segment boundaries - don't stop at gaps 70 | 2. Multiple ads separated by gaps of 15 seconds or less should be treated as ONE CONTINUOUS SEGMENT 71 | 3. Brief transitions, silence, or gaps between ads do NOT count as content - they're part of the same ad block 72 | 4. Only split ads if there's REAL SHOW CONTENT (actual discussion, interview, topic content) for at least 30 seconds between them 73 | 5. Consider the entire context: if ads at 1500s, 1520s, 1540s are all promotional content, return ONE segment from 1500-1560, not three separate ones 74 | 6. When in doubt, merge the segments - better to remove too much than leave ads in 75 | 7. If there's a gap followed by content that doesn't continue the previous discussion but instead introduces a completely new topic/person/show, it's likely still part of the ad block 76 | 77 | Return ONLY a JSON array of ad segments with start/end times in seconds. Be aggressive in detecting ads. 78 | 79 | Format: 80 | [{{"start": 0.0, "end": 240.0, "reason": "Continuous ad block: multiple sponsors"}}, ...] 
81 | 82 | If no ads are found, return an empty array: []""" 83 | 84 | # Save the prompt for debugging 85 | if slug and episode_id: 86 | try: 87 | from storage import Storage 88 | storage = Storage() 89 | storage.save_prompt(slug, episode_id, prompt) 90 | except Exception as e: 91 | logger.warning(f"Could not save prompt: {e}") 92 | 93 | response = self.client.messages.create( 94 | model="claude-opus-4-1-20250805", # Use Claude Opus 4.1 for better ad detection 95 | max_tokens=2000, 96 | temperature=0.2, 97 | system="You are an ad detection specialist with extensive experience in identifying all forms of advertisements, sponsorships, and promotional content in podcasts. Your users absolutely cannot tolerate ads - they find them disruptive and want them completely removed. Be extremely aggressive in detecting ads. When in doubt, mark it as an ad. It's better to remove a few seconds of content than to leave any advertisement in the podcast.", 98 | messages=[{ 99 | "role": "user", 100 | "content": prompt 101 | }] 102 | ) 103 | 104 | # Extract JSON from response 105 | response_text = response.content[0].text if response.content else "" 106 | logger.info(f"Claude response received: {len(response_text)} chars") 107 | 108 | # Try to parse JSON from response 109 | try: 110 | # Look for JSON array in response 111 | start_idx = response_text.find('[') 112 | end_idx = response_text.rfind(']') + 1 113 | if start_idx >= 0 and end_idx > start_idx: 114 | json_str = response_text[start_idx:end_idx] 115 | ads = json.loads(json_str) 116 | 117 | # Validate structure 118 | if isinstance(ads, list): 119 | valid_ads = [] 120 | for ad in ads: 121 | if isinstance(ad, dict) and 'start' in ad and 'end' in ad: 122 | valid_ads.append({ 123 | 'start': float(ad['start']), 124 | 'end': float(ad['end']), 125 | 'reason': ad.get('reason', 'Advertisement detected') 126 | }) 127 | 128 | total_ad_time = sum(ad['end'] - ad['start'] for ad in valid_ads) 129 | logger.info(f"Claude detected {len(valid_ads)} ad segments (total {total_ad_time/60:.1f} minutes)") 130 | 131 | # Store full response for debugging 132 | return { 133 | "ads": valid_ads, 134 | "raw_response": response_text, 135 | "model": "claude-sonnet-4-5-20250929" 136 | } 137 | else: 138 | logger.warning("No JSON array found in Claude response") 139 | return {"ads": [], "raw_response": response_text, "error": "No JSON found"} 140 | 141 | except json.JSONDecodeError as e: 142 | logger.error(f"Failed to parse JSON from Claude response: {e}") 143 | return {"ads": [], "raw_response": response_text, "error": str(e)} 144 | 145 | except Exception as e: 146 | logger.error(f"Ad detection failed: {e}") 147 | return {"ads": [], "error": str(e)} 148 | 149 | def process_transcript(self, segments: List[Dict], podcast_name: str = "Unknown", episode_title: str = "Unknown", slug: str = None, episode_id: str = None) -> Dict: 150 | """Process transcript for ad detection.""" 151 | result = self.detect_ads(segments, podcast_name, episode_title, slug, episode_id) 152 | if result is None: 153 | return {"ads": [], "error": "Detection failed"} 154 | return result -------------------------------------------------------------------------------- /src/transcriber.py: -------------------------------------------------------------------------------- 1 | """Transcription using Faster Whisper.""" 2 | import logging 3 | import tempfile 4 | import os 5 | import requests 6 | from typing import List, Dict, Optional, Tuple 7 | from pathlib import Path 8 | from faster_whisper import WhisperModel, 
BatchedInferencePipeline 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class WhisperModelSingleton: 13 | _instance = None 14 | _base_model = None 15 | 16 | @classmethod 17 | def get_instance(cls) -> Tuple[WhisperModel, BatchedInferencePipeline]: 18 | """ 19 | Get both the base model and batched pipeline instance 20 | Returns: 21 | Tuple[WhisperModel, BatchedInferencePipeline]: Base model for operations like language detection, 22 | and batched pipeline for transcription 23 | """ 24 | if cls._instance is None: 25 | model_size = os.getenv("WHISPER_MODEL", "small") 26 | device = os.getenv("WHISPER_DEVICE", "cpu") 27 | 28 | # Set compute type based on device 29 | if device == "cuda": 30 | compute_type = "float16" # Use FP16 for GPU 31 | logger.info(f"Initializing Whisper model: {model_size} on CUDA with float16") 32 | else: 33 | compute_type = "int8" # Use INT8 for CPU 34 | logger.info(f"Initializing Whisper model: {model_size} on CPU with int8") 35 | 36 | # Initialize base model 37 | cls._base_model = WhisperModel( 38 | model_size, 39 | device=device, 40 | compute_type=compute_type, 41 | ) 42 | 43 | # Initialize batched pipeline 44 | cls._instance = BatchedInferencePipeline( 45 | cls._base_model 46 | ) 47 | logger.info("Whisper model and batched pipeline initialized") 48 | 49 | return cls._base_model, cls._instance 50 | 51 | @classmethod 52 | def get_base_model(cls) -> WhisperModel: 53 | """ 54 | Get just the base model for operations like language detection 55 | Returns: 56 | WhisperModel: Base Whisper model 57 | """ 58 | if cls._base_model is None: 59 | cls.get_instance() 60 | return cls._base_model 61 | 62 | @classmethod 63 | def get_batched_pipeline(cls) -> BatchedInferencePipeline: 64 | """ 65 | Get just the batched pipeline for transcription 66 | Returns: 67 | BatchedInferencePipeline: Batched pipeline for efficient transcription 68 | """ 69 | if cls._instance is None: 70 | cls.get_instance() 71 | return cls._instance 72 | 73 | class Transcriber: 74 | def __init__(self): 75 | # Model is now managed by singleton 76 | pass 77 | 78 | def download_audio(self, url: str, timeout: int = 600) -> Optional[str]: 79 | """Download audio file from URL.""" 80 | try: 81 | logger.info(f"Downloading audio from: {url}") 82 | response = requests.get(url, stream=True, timeout=timeout) 83 | response.raise_for_status() 84 | 85 | # Check file size 86 | content_length = response.headers.get('Content-Length') 87 | if content_length: 88 | size_mb = int(content_length) / (1024 * 1024) 89 | if size_mb > 500: 90 | logger.error(f"Audio file too large: {size_mb:.1f}MB (max 500MB)") 91 | return None 92 | logger.info(f"Audio file size: {size_mb:.1f}MB") 93 | 94 | # Save to temp file 95 | with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as tmp: 96 | for chunk in response.iter_content(chunk_size=8192): 97 | tmp.write(chunk) 98 | temp_path = tmp.name 99 | 100 | logger.info(f"Downloaded audio to: {temp_path}") 101 | return temp_path 102 | except Exception as e: 103 | logger.error(f"Failed to download audio: {e}") 104 | return None 105 | 106 | def transcribe(self, audio_path: str) -> List[Dict]: 107 | """Transcribe audio file using Faster Whisper with batched pipeline.""" 108 | try: 109 | # Get the batched pipeline for efficient transcription 110 | model = WhisperModelSingleton.get_batched_pipeline() 111 | 112 | logger.info(f"Starting transcription of: {audio_path}") 113 | 114 | # Create a simple prompt for podcast context 115 | initial_prompt = "This is a podcast episode." 
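        # Note: the batch size chosen below only controls how many VAD-detected
        # speech chunks the BatchedInferencePipeline transcribes per forward pass;
        # larger values are faster on a GPU but use more memory, which is why the
        # CPU path falls back to a smaller batch.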
116 | 117 | # Adjust batch size based on device 118 | device = os.getenv("WHISPER_DEVICE", "cpu") 119 | if device == "cuda": 120 | batch_size = 16 # Larger batch for GPU 121 | logger.info("Using GPU-optimized batch size: 16") 122 | else: 123 | batch_size = 8 # Smaller batch for CPU 124 | 125 | # Use the batched pipeline for transcription 126 | segments_generator, info = model.transcribe( 127 | audio_path, 128 | language="en", 129 | initial_prompt=initial_prompt, 130 | beam_size=5, 131 | batch_size=batch_size, 132 | vad_filter=True, # Enable VAD filter to skip silent parts 133 | vad_parameters=dict( 134 | min_silence_duration_ms=500, 135 | speech_pad_ms=400 136 | ) 137 | ) 138 | 139 | # Collect segments with real-time progress logging 140 | result = [] 141 | segment_count = 0 142 | last_log_time = 0 143 | 144 | for segment in segments_generator: 145 | segment_count += 1 146 | segment_dict = { 147 | "start": segment.start, 148 | "end": segment.end, 149 | "text": segment.text.strip() 150 | } 151 | result.append(segment_dict) 152 | 153 | # Log progress every 10 segments 154 | if segment_count % 10 == 0: 155 | progress_min = segment.end / 60 156 | logger.info(f"Transcription progress: {segment_count} segments, {progress_min:.1f} minutes processed") 157 | 158 | # Log every 30 seconds of audio processed 159 | if segment.end - last_log_time > 30: 160 | last_log_time = segment.end 161 | # Log the last segment's text (truncated) 162 | text_preview = segment.text.strip()[:100] + "..." if len(segment.text.strip()) > 100 else segment.text.strip() 163 | logger.info(f"[{self.format_timestamp(segment.start)}] {text_preview}") 164 | 165 | duration_min = result[-1]['end'] / 60 if result else 0 166 | logger.info(f"Transcription completed: {len(result)} segments, {duration_min:.1f} minutes") 167 | return result 168 | except Exception as e: 169 | logger.error(f"Transcription failed: {e}") 170 | return None 171 | 172 | def format_timestamp(self, seconds: float) -> str: 173 | """Convert seconds to timestamp format.""" 174 | hours = int(seconds // 3600) 175 | minutes = int((seconds % 3600) // 60) 176 | secs = seconds % 60 177 | return f"{hours:02d}:{minutes:02d}:{secs:06.3f}" 178 | 179 | def segments_to_text(self, segments: List[Dict]) -> str: 180 | """Convert segments to readable text format.""" 181 | lines = [] 182 | for segment in segments: 183 | start_ts = self.format_timestamp(segment['start']) 184 | end_ts = self.format_timestamp(segment['end']) 185 | lines.append(f"[{start_ts} --> {end_ts}] {segment['text']}") 186 | return '\n'.join(lines) 187 | 188 | def process_episode(self, episode_url: str) -> Optional[Dict]: 189 | """Complete transcription pipeline for an episode.""" 190 | audio_path = None 191 | try: 192 | # Download audio 193 | audio_path = self.download_audio(episode_url) 194 | if not audio_path: 195 | return None 196 | 197 | # Transcribe 198 | segments = self.transcribe(audio_path) 199 | if not segments: 200 | return None 201 | 202 | # Format transcript 203 | transcript_text = self.segments_to_text(segments) 204 | 205 | return { 206 | "segments": segments, 207 | "transcript": transcript_text, 208 | "segment_count": len(segments), 209 | "duration": segments[-1]['end'] if segments else 0 210 | } 211 | finally: 212 | # Clean up temp file 213 | if audio_path and os.path.exists(audio_path): 214 | try: 215 | os.unlink(audio_path) 216 | logger.info(f"Cleaned up temp file: {audio_path}") 217 | except: 218 | pass -------------------------------------------------------------------------------- /src/main.py: 
-------------------------------------------------------------------------------- 1 | """Main Flask web server for podcast ad removal.""" 2 | import logging 3 | import json 4 | import os 5 | import threading 6 | import time 7 | from datetime import datetime 8 | from pathlib import Path 9 | from flask import Flask, Response, send_file, abort 10 | from slugify import slugify 11 | import shutil 12 | 13 | from storage import Storage 14 | from rss_parser import RSSParser 15 | from transcriber import Transcriber 16 | from ad_detector import AdDetector 17 | from audio_processor import AudioProcessor 18 | 19 | # Configure logging to both file and console 20 | logging.basicConfig( 21 | level=logging.INFO, 22 | format='[%(asctime)s] [%(levelname)s] %(message)s', 23 | datefmt='%Y-%m-%d %H:%M:%S', 24 | handlers=[ 25 | logging.FileHandler('/app/data/server.log'), 26 | logging.StreamHandler() # Keep console output for Docker logs 27 | ] 28 | ) 29 | logger = logging.getLogger(__name__) 30 | 31 | # Initialize Flask app 32 | app = Flask(__name__) 33 | 34 | # Initialize components 35 | storage = Storage() 36 | rss_parser = RSSParser() 37 | transcriber = Transcriber() 38 | ad_detector = AdDetector() 39 | audio_processor = AudioProcessor() 40 | 41 | # Load feed configuration 42 | def load_feeds(): 43 | """Load feed configuration from JSON.""" 44 | config_path = Path("./config/feeds.json") 45 | if not config_path.exists(): 46 | logger.error("feeds.json not found") 47 | return [] 48 | 49 | try: 50 | with open(config_path, 'r') as f: 51 | feeds = json.load(f) 52 | logger.info(f"Loaded {len(feeds)} feed configurations") 53 | return feeds 54 | except Exception as e: 55 | logger.error(f"Failed to load feeds.json: {e}") 56 | return [] 57 | 58 | def reload_feeds(): 59 | """Reload feed configuration and update global FEED_MAP.""" 60 | global FEEDS, FEED_MAP 61 | FEEDS = load_feeds() 62 | FEED_MAP = {slugify(feed['out'].strip('/')): feed for feed in FEEDS} 63 | logger.info(f"Reloaded feeds: {list(FEED_MAP.keys())}") 64 | return FEED_MAP 65 | 66 | # Initial load of feed configuration 67 | FEEDS = load_feeds() 68 | FEED_MAP = {slugify(feed['out'].strip('/')): feed for feed in FEEDS} 69 | 70 | def refresh_rss_feed(slug: str, feed_url: str): 71 | """Refresh RSS feed for a podcast.""" 72 | try: 73 | logger.info(f"[{slug}] Starting RSS refresh from: {feed_url}") 74 | 75 | # Fetch original RSS 76 | feed_content = rss_parser.fetch_feed(feed_url) 77 | if not feed_content: 78 | logger.error(f"[{slug}] Failed to fetch RSS feed") 79 | return False 80 | 81 | # Modify feed URLs 82 | modified_rss = rss_parser.modify_feed(feed_content, slug) 83 | 84 | # Save modified RSS 85 | storage.save_rss(slug, modified_rss) 86 | 87 | # Update last_checked timestamp 88 | data = storage.load_data_json(slug) 89 | data['last_checked'] = datetime.utcnow().isoformat() + 'Z' 90 | storage.save_data_json(slug, data) 91 | 92 | logger.info(f"[{slug}] RSS refresh complete") 93 | return True 94 | except Exception as e: 95 | logger.error(f"[{slug}] RSS refresh failed: {e}") 96 | return False 97 | 98 | def refresh_all_feeds(): 99 | """Refresh all RSS feeds once (no loop).""" 100 | try: 101 | logger.info("Refreshing all RSS feeds") 102 | # Reload feeds.json to pick up any changes 103 | reload_feeds() 104 | 105 | for slug, feed_info in FEED_MAP.items(): 106 | refresh_rss_feed(slug, feed_info['in']) 107 | logger.info("RSS refresh complete") 108 | return True 109 | except Exception as e: 110 | logger.error(f"RSS refresh failed: {e}") 111 | return False 112 | 113 | 
def background_rss_refresh(): 114 | """Background task to refresh RSS feeds every 15 minutes.""" 115 | while True: 116 | refresh_all_feeds() 117 | # Wait 15 minutes 118 | time.sleep(900) 119 | 120 | def process_episode(slug: str, episode_id: str, episode_url: str, episode_title: str = "Unknown", podcast_name: str = "Unknown"): 121 | """Process a single episode (transcribe, detect ads, remove ads).""" 122 | start_time = time.time() 123 | 124 | try: 125 | # Log start with title 126 | logger.info(f"[{slug}:{episode_id}] Starting: \"{episode_title}\"") 127 | 128 | # Update status to processing 129 | data = storage.load_data_json(slug) 130 | data['episodes'][episode_id] = { 131 | 'status': 'processing', 132 | 'original_url': episode_url, 133 | 'title': episode_title, 134 | 'processed_at': datetime.utcnow().isoformat() + 'Z' 135 | } 136 | storage.save_data_json(slug, data) 137 | 138 | # Step 1: Check if transcript exists 139 | transcript_path = storage.get_episode_path(slug, episode_id, "-transcript.txt") 140 | segments = None 141 | transcript_text = None 142 | 143 | if transcript_path.exists(): 144 | logger.info(f"[{slug}:{episode_id}] Found existing transcript, skipping transcription") 145 | # Load existing transcript 146 | with open(transcript_path, 'r') as f: 147 | transcript_text = f.read() 148 | # Parse segments from transcript 149 | segments = [] 150 | for line in transcript_text.split('\n'): 151 | if line.strip() and line.startswith('['): 152 | # Parse format: [00:00:00.000 --> 00:00:05.200] text 153 | try: 154 | time_part, text_part = line.split('] ', 1) 155 | time_range = time_part.strip('[') 156 | start_str, end_str = time_range.split(' --> ') 157 | # Convert timestamp to seconds 158 | def parse_timestamp(ts): 159 | parts = ts.split(':') 160 | return float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2]) 161 | segments.append({ 162 | 'start': parse_timestamp(start_str), 163 | 'end': parse_timestamp(end_str), 164 | 'text': text_part 165 | }) 166 | except: 167 | continue 168 | 169 | if segments: 170 | segment_count = len(segments) 171 | duration_min = segments[-1]['end'] / 60 if segments else 0 172 | logger.info(f"[{slug}:{episode_id}] Loaded transcript: {segment_count} segments, {duration_min:.1f} minutes") 173 | 174 | # Still need to download audio for processing 175 | audio_path = transcriber.download_audio(episode_url) 176 | if not audio_path: 177 | raise Exception("Failed to download audio") 178 | else: 179 | # Download and transcribe 180 | logger.info(f"[{slug}:{episode_id}] Downloading audio") 181 | audio_path = transcriber.download_audio(episode_url) 182 | if not audio_path: 183 | raise Exception("Failed to download audio") 184 | 185 | logger.info(f"[{slug}:{episode_id}] Starting transcription") 186 | segments = transcriber.transcribe(audio_path) 187 | if not segments: 188 | raise Exception("Failed to transcribe audio") 189 | 190 | segment_count = len(segments) 191 | duration_min = segments[-1]['end'] / 60 if segments else 0 192 | logger.info(f"[{slug}:{episode_id}] Transcription completed: {segment_count} segments, {duration_min:.1f} minutes") 193 | 194 | # Save transcript 195 | transcript_text = transcriber.segments_to_text(segments) 196 | storage.save_transcript(slug, episode_id, transcript_text) 197 | 198 | try: 199 | 200 | # Step 2: Detect ads 201 | logger.info(f"[{slug}:{episode_id}] Sending to Claude API - Podcast: {podcast_name}, Episode: {episode_title}") 202 | ad_result = ad_detector.process_transcript(segments, podcast_name, episode_title, slug, episode_id) 
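            # ad_result mirrors what AdDetector.process_transcript returns: a dict
            # with an "ads" list of {"start", "end", "reason"} segments plus
            # debugging fields such as "raw_response" or "error", so it is safe to
            # persist as-is and to read the "ads" list below.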
203 |             storage.save_ads_json(slug, episode_id, ad_result)
204 | 
205 |             ads = ad_result.get('ads', [])
206 |             if ads:
207 |                 total_ad_time = sum(ad['end'] - ad['start'] for ad in ads)
208 |                 logger.info(f"[{slug}:{episode_id}] Claude detected {len(ads)} ad segments (total {total_ad_time/60:.1f} minutes)")
209 |             else:
210 |                 logger.info(f"[{slug}:{episode_id}] No ads detected")
211 | 
212 |             # Step 3: Process audio to remove ads
213 |             logger.info(f"[{slug}:{episode_id}] Starting FFMPEG")
214 |             processed_path = audio_processor.process_episode(audio_path, ads)
215 |             if not processed_path:
216 |                 raise Exception("Failed to process audio with FFMPEG")
217 | 
218 |             # Get durations for logging
219 |             original_duration = audio_processor.get_audio_duration(audio_path)
220 |             new_duration = audio_processor.get_audio_duration(processed_path)
221 | 
222 |             # Move processed file to final location
223 |             final_path = storage.get_episode_path(slug, episode_id)
224 |             shutil.move(processed_path, final_path)
225 | 
226 |             # Update status to processed
227 |             data = storage.load_data_json(slug)
228 |             data['episodes'][episode_id] = {
229 |                 'status': 'processed',
230 |                 'original_url': episode_url,
231 |                 'title': episode_title,
232 |                 'processed_file': f"episodes/{episode_id}.mp3",
233 |                 'processed_at': datetime.utcnow().isoformat() + 'Z',
234 |                 'original_duration': original_duration,
235 |                 'new_duration': new_duration,
236 |                 'ads_removed': len(ads)
237 |             }
238 |             storage.save_data_json(slug, data)
239 | 
240 |             # Calculate processing time
241 |             processing_time = time.time() - start_time
242 | 
243 |             # Final summary log
244 |             if original_duration and new_duration:
245 |                 time_saved = original_duration - new_duration
246 |                 logger.info(f"[{slug}:{episode_id}] Complete: \"{episode_title}\" | {original_duration/60:.1f}→{new_duration/60:.1f}min | {len(ads)} ads removed | {processing_time:.1f}s")
247 |             else:
248 |                 logger.info(f"[{slug}:{episode_id}] Complete: \"{episode_title}\" | {len(ads)} ads removed | {processing_time:.1f}s")
249 | 
250 |             return True
251 | 
252 |         finally:
253 |             # Clean up temp audio file
254 |             if os.path.exists(audio_path):
255 |                 os.unlink(audio_path)
256 | 
257 |     except Exception as e:
258 |         processing_time = time.time() - start_time
259 |         logger.error(f"[{slug}:{episode_id}] Failed: \"{episode_title}\" | Error: {e} | {processing_time:.1f}s")
260 | 
261 |         # Update status to failed
262 |         data = storage.load_data_json(slug)
263 |         data['episodes'][episode_id] = {
264 |             'status': 'failed',
265 |             'original_url': episode_url,
266 |             'title': episode_title,
267 |             'error': str(e),
268 |             'failed_at': datetime.utcnow().isoformat() + 'Z'
269 |         }
270 |         storage.save_data_json(slug, data)
271 |         return False
272 | 
273 | @app.route('/<slug>')
274 | def serve_rss(slug):
275 |     """Serve modified RSS feed."""
276 |     if slug not in FEED_MAP:
277 |         # Refresh all feeds to pick up any new ones
278 |         logger.info(f"[{slug}] Not found in feeds, refreshing all")
279 |         refresh_all_feeds()
280 | 
281 |         # Check again after refresh
282 |         if slug not in FEED_MAP:
283 |             logger.warning(f"[{slug}] Still not found after refresh")
284 |             abort(404)
285 | 
286 |     # Check if RSS cache exists or is stale
287 |     cached_rss = storage.get_rss(slug)
288 |     data = storage.load_data_json(slug)
289 |     last_checked = data.get('last_checked')
290 | 
291 |     # If no cache or stale (>15 min), refresh immediately
292 |     should_refresh = False
293 |     if not cached_rss:
294 |         should_refresh = True
295 |         logger.info(f"[{slug}] No RSS cache, fetching immediately")
296 |     elif last_checked:
297 |         try:
298 |             last_time = datetime.fromisoformat(last_checked.replace('Z', '+00:00'))
299 |             age_minutes = (datetime.utcnow() - last_time.replace(tzinfo=None)).total_seconds() / 60
300 |             if age_minutes > 15:
301 |                 should_refresh = True
302 |                 logger.info(f"[{slug}] RSS cache stale ({age_minutes:.1f} minutes old), refreshing")
303 |         except:
304 |             should_refresh = True
305 | 
306 |     if should_refresh:
307 |         refresh_rss_feed(slug, FEED_MAP[slug]['in'])
308 |         cached_rss = storage.get_rss(slug)
309 | 
310 |     if cached_rss:
311 |         logger.info(f"[{slug}] Serving RSS feed")
312 |         return Response(cached_rss, mimetype='application/rss+xml')
313 |     else:
314 |         logger.error(f"[{slug}] RSS feed not available")
315 |         abort(503)
316 | 
317 | @app.route('/episodes/<slug>/<episode_id>.mp3')
318 | def serve_episode(slug, episode_id):
319 |     """Serve processed episode audio (JIT processing)."""
320 |     if slug not in FEED_MAP:
321 |         # Refresh all feeds to pick up any new ones
322 |         logger.info(f"[{slug}] Not found in feeds for episode {episode_id}, refreshing all")
323 |         refresh_all_feeds()
324 | 
325 |         # Check again after refresh
326 |         if slug not in FEED_MAP:
327 |             logger.warning(f"[{slug}] Still not found after refresh for episode {episode_id}")
328 |             abort(404)
329 | 
330 |     # Validate episode ID (alphanumeric + dash/underscore)
331 |     if not all(c.isalnum() or c in '-_' for c in episode_id):
332 |         logger.warning(f"[{slug}] Invalid episode ID: {episode_id}")
333 |         abort(400)
334 | 
335 |     # Check episode status
336 |     data = storage.load_data_json(slug)
337 |     episode_info = data['episodes'].get(episode_id, {})
338 |     status = episode_info.get('status')
339 | 
340 |     if status == 'processed':
341 |         # Serve cached processed file
342 |         file_path = storage.get_episode_path(slug, episode_id)
343 |         if file_path.exists():
344 |             logger.info(f"[{slug}:{episode_id}] Cache hit, serving processed file")
345 |             return send_file(file_path, mimetype='audio/mpeg')
346 |         else:
347 |             logger.error(f"[{slug}:{episode_id}] Processed file missing")
348 |             status = None  # Reprocess
349 | 
350 |     elif status == 'failed':
351 |         # Always retry processing instead of serving fallback
352 |         logger.info(f"[{slug}:{episode_id}] Previous failure detected, retrying processing")
353 |         status = None  # Reset status to trigger reprocessing
354 | 
355 |     elif status == 'processing':
356 |         # Already processing, return temporary unavailable
357 |         logger.info(f"[{slug}:{episode_id}] Episode currently processing")
358 |         abort(503)
359 | 
360 |     # Status is None or unknown - need to process
361 |     # First, we need to find the original URL from the RSS feed
362 |     cached_rss = storage.get_rss(slug)
363 |     if not cached_rss:
364 |         logger.error(f"[{slug}:{episode_id}] No RSS feed available")
365 |         abort(404)
366 | 
367 |     # Parse RSS to find original URL
368 |     original_feed = rss_parser.fetch_feed(FEED_MAP[slug]['in'])
369 |     if not original_feed:
370 |         logger.error(f"[{slug}:{episode_id}] Could not fetch original RSS")
371 |         abort(503)
372 | 
373 |     # Parse the feed to get podcast name
374 |     parsed_feed = rss_parser.parse_feed(original_feed)
375 |     podcast_name = parsed_feed.feed.get('title', 'Unknown') if parsed_feed else 'Unknown'
376 | 
377 |     episodes = rss_parser.extract_episodes(original_feed)
378 |     original_url = None
379 |     episode_title = "Unknown"
380 |     for ep in episodes:
381 |         if ep['id'] == episode_id:
382 |             original_url = ep['url']
383 |             episode_title = ep.get('title', 'Unknown')
384 |             break
385 | 
386 |     if not original_url:
387 |         logger.error(f"[{slug}:{episode_id}] Episode not found in RSS feed")
388
| abort(404) 389 | 390 | logger.info(f"[{slug}:{episode_id}] Starting new processing for {podcast_name}") 391 | 392 | # Process episode (blocking) 393 | if process_episode(slug, episode_id, original_url, episode_title, podcast_name): 394 | # Serve the newly processed file 395 | file_path = storage.get_episode_path(slug, episode_id) 396 | if file_path.exists(): 397 | return send_file(file_path, mimetype='audio/mpeg') 398 | 399 | # Processing failed, serve original 400 | logger.info(f"[{slug}:{episode_id}] Processing failed, serving original") 401 | return Response(status=302, headers={'Location': original_url}) 402 | 403 | @app.route('/health') 404 | def health_check(): 405 | """Health check endpoint.""" 406 | return {'status': 'ok', 'feeds': len(FEEDS)} 407 | 408 | if __name__ == '__main__': 409 | # Log BASE_URL configuration 410 | base_url = os.getenv('BASE_URL', 'http://localhost:8000') 411 | logger.info(f"BASE_URL configured as: {base_url}") 412 | 413 | # Start background RSS refresh thread 414 | refresh_thread = threading.Thread(target=background_rss_refresh, daemon=True) 415 | refresh_thread.start() 416 | logger.info("Started background RSS refresh thread") 417 | 418 | # Do initial RSS refresh for all feeds 419 | logger.info("Performing initial RSS refresh for all feeds") 420 | for slug, feed_info in FEED_MAP.items(): 421 | refresh_rss_feed(slug, feed_info['in']) 422 | logger.info(f"Feed available at: {base_url}/{slug}") 423 | 424 | # Start Flask server 425 | logger.info("Starting Flask server on port 8000") 426 | app.run(host='0.0.0.0', port=8000, debug=False) --------------------------------------------------------------------------------
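A quick way to sanity-check a running instance is to hit the `/health` endpoint and then request a rewritten feed. The sketch below is illustrative and not part of the repository: it assumes the default `BASE_URL` of `http://localhost:8000` and a feed configured with `"out": "/mypodcast"`; adjust both to match your `.env` and `config/feeds.json`.

```python
"""Minimal smoke test for a running podcast-server instance (illustrative only)."""
import requests

BASE_URL = "http://localhost:8000"  # assumption: default BASE_URL from the README
SLUG = "mypodcast"                  # assumption: matches an "out" entry in config/feeds.json

# /health is served by main.py and returns {'status': 'ok', 'feeds': <count>}
health = requests.get(f"{BASE_URL}/health", timeout=10)
health.raise_for_status()
print("health:", health.json())

# The rewritten RSS feed is served at /<slug>; counting <item> entries is a rough check.
feed = requests.get(f"{BASE_URL}/{SLUG}", timeout=60)
feed.raise_for_status()
print(f"feed: {len(feed.text)} bytes, {feed.text.count('<item>')} episodes")

# Requesting an episode URL from the feed would trigger on-demand processing,
# which can take minutes (especially on CPU), so it is intentionally skipped here.
```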