├── test.txt ├── .python-version ├── test.mp3 ├── .env.example ├── requirements.txt ├── pytest.ini ├── .gitignore ├── test_luma.py ├── ytsum.sh ├── prompt.txt ├── README.md ├── test_ytsum.py └── ytsum.py /test.txt: -------------------------------------------------------------------------------- 1 | test transcript -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11.10 2 | -------------------------------------------------------------------------------- /test.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sliday/ytsum/HEAD/test.mp3 -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # OpenAI API Key for generating summaries 2 | OPENAI_API_KEY=your_openai_api_key_here 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | anthropic 3 | yt-dlp 4 | colorama 5 | replicate 6 | ell-ai 7 | faster-whisper 8 | lumaai 9 | ffmpeg-python 10 | pytest 11 | pytest-mock 12 | pytest-asyncio 13 | pytest-timeout>=2.1.0 14 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | luma: marks tests that use Luma AI 4 | runway: marks tests that use RunwayML 5 | flux: marks tests that use Flux AI 6 | uguu: marks tests that use Uguu file hosting 7 | asyncio_mode = strict 8 | asyncio_fixture_loop_scope = function 9 | filterwarnings = 10 | ignore::DeprecationWarning 11 | ignore::UserWarning 12 | default::DeprecationWarning:pytest_asyncio.* -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.vtt 2 | *.mp3 3 | *.m4a 4 | 5 | .DS_Store 6 | 7 | # Python 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | history/ 12 | .pytest_cache/ 13 | *.so 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # Virtual Environment 32 | venv/ 33 | ENV/ 34 | 35 | # IDE 36 | .idea/ 37 | .vscode/ 38 | *.swp 39 | *.swo 40 | 41 | # Environment Variables 42 | .env 43 | 44 | # Output Files 45 | out/ 46 | *.mp3 47 | *.txt 48 | !requirements.txt 49 | !prompt.txt 50 | !test.txt 51 | 52 | # Logs 53 | *.log -------------------------------------------------------------------------------- /test_luma.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from ytsum import generate_video_segments, generate_video_segments_with_luma, combine_video_segments 4 | 5 | # Test script 6 | TEST_SCRIPT = """ 7 | NOVA: Welcome to our discussion about artificial intelligence! 8 | ECHO: Today we'll explore how AI is transforming our world. 9 | NOVA: From self-driving cars to medical diagnosis, AI is everywhere. 10 | ECHO: Let's break down the key developments and their impact. 11 | """ 12 | 13 | def test_luma_workflow(): 14 | # 1. 
Generate prompts 15 | print("Generating prompts...") 16 | prompts = generate_video_segments(TEST_SCRIPT) 17 | if prompts: 18 | print("\nGenerated prompts:") 19 | for i, prompt in enumerate(prompts, 1): 20 | print(f"\nPrompt {i}:\n{prompt}") 21 | else: 22 | print("Failed to generate prompts") 23 | return 24 | 25 | # 2. Generate videos 26 | print("\nGenerating videos...") 27 | output_dir = Path("test_output") 28 | output_dir.mkdir(exist_ok=True) 29 | 30 | video_paths = generate_video_segments_with_luma(prompts, output_dir) 31 | if video_paths: 32 | print("\nGenerated video segments:") 33 | for path in video_paths: 34 | print(f"- {path}") 35 | else: 36 | print("Failed to generate videos") 37 | return 38 | 39 | # 3. Combine videos 40 | print("\nCombining videos...") 41 | output_path = output_dir / "combined.mp4" 42 | target_duration = 60 # Test with 60 seconds 43 | 44 | if combine_video_segments(video_paths, target_duration, output_path): 45 | print(f"\nSuccessfully created combined video: {output_path}") 46 | else: 47 | print("Failed to combine videos") 48 | 49 | if __name__ == "__main__": 50 | test_luma_workflow() -------------------------------------------------------------------------------- /ytsum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Colors and emojis 4 | RED='\033[0;31m' 5 | GREEN='\033[0;32m' 6 | BLUE='\033[0;34m' 7 | NC='\033[0m' 8 | EMOJI_DOWNLOAD="⬇️ " 9 | EMOJI_TRANSCRIBE="🎯 " 10 | EMOJI_SUMMARY="📝 " 11 | EMOJI_SUCCESS="✅ " 12 | EMOJI_ERROR="❌ " 13 | EMOJI_SEARCH="🔍 " 14 | 15 | # Print functions 16 | print_step() { printf "${BLUE}${2} ${1}${NC}\n"; } 17 | print_error() { printf "${RED}${EMOJI_ERROR} ${1}${NC}\n"; } 18 | print_success() { printf "${GREEN}${EMOJI_SUCCESS} ${1}${NC}\n"; } 19 | 20 | # Check dependencies 21 | command -v yt-dlp >/dev/null 2>&1 || { print_error "yt-dlp is required"; exit 1; } 22 | command -v ffmpeg >/dev/null 2>&1 || { print_error "ffmpeg is required"; exit 1; } 23 | 24 | # Check API keys 25 | [ -z "$ANTHROPIC_API_KEY" ] && { print_error "ANTHROPIC_API_KEY not set"; exit 1; } 26 | 27 | # Parse arguments 28 | VIDEO_URL="" 29 | LANGUAGE="english" 30 | TRANSCRIBER="fast-whisper" 31 | 32 | while [[ $# -gt 0 ]]; do 33 | case $1 in 34 | --language) 35 | LANGUAGE="$2" 36 | shift 2 37 | ;; 38 | --whisper) 39 | TRANSCRIBER="whisper" 40 | shift 41 | ;; 42 | --replicate) 43 | TRANSCRIBER="replicate" 44 | shift 45 | ;; 46 | *) 47 | VIDEO_URL="$1" 48 | shift 49 | ;; 50 | esac 51 | done 52 | 53 | [ -z "$VIDEO_URL" ] && { print_error "Video URL required"; exit 1; } 54 | 55 | # Clean YouTube URL 56 | clean_url() { 57 | local url="$1" 58 | if [[ ! "$url" =~ (youtube\.com|youtu\.be) ]]; then 59 | url="https://www.youtube.com/watch?v=$url" 60 | fi 61 | echo "$url" 62 | } 63 | 64 | VIDEO_URL=$(clean_url "$VIDEO_URL") 65 | 66 | # Create temp directory 67 | TEMP_DIR=$(mktemp -d) 68 | trap 'rm -rf "$TEMP_DIR"' EXIT 69 | 70 | # Try to get subtitles first 71 | print_step "Searching for YouTube subtitles..." "$EMOJI_SEARCH" 72 | LANG_CODE=$(python3 -c "from ytsum import get_language_code; print(get_language_code('$LANGUAGE'))") 73 | 74 | yt-dlp \ 75 | --write-subs \ 76 | --sub-langs "$LANG_CODE" \ 77 | --skip-download \ 78 | --output "$TEMP_DIR/video" \ 79 | "$VIDEO_URL" 80 | 81 | # Check if subtitles were downloaded 82 | if [ -f "$TEMP_DIR/video.$LANG_CODE.vtt" ]; then 83 | print_success "Found subtitles!" 
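    # The sed/tr pipeline below is a rough VTT-to-plain-text cleanup: it drops the
    # WEBVTT header block (everything up to the first blank line), removes the cue
    # timestamp lines, strips remaining blank lines, and joins the caption text
    # onto a single line for the summarization step.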
84 | # Convert VTT to plain text 85 | sed '1,/^$/d' "$TEMP_DIR/video.$LANG_CODE.vtt" | \ 86 | sed '/-->/d' | \ 87 | sed '/^$/d' | \ 88 | tr '\n' ' ' > "$TEMP_DIR/transcript.txt" 89 | else 90 | print_step "No subtitles found, transcribing audio..." "$EMOJI_SEARCH" 91 | 92 | # Download audio 93 | print_step "Downloading audio..." "$EMOJI_DOWNLOAD" 94 | yt-dlp \ 95 | --extract-audio \ 96 | --audio-format m4a \ 97 | --output "$TEMP_DIR/audio.%(ext)s" \ 98 | "$VIDEO_URL" 99 | 100 | # Transcribe based on selected method 101 | case $TRANSCRIBER in 102 | "whisper") 103 | [ -z "$OPENAI_API_KEY" ] && { print_error "OPENAI_API_KEY not set"; exit 1; } 104 | print_step "Using OpenAI Whisper..." "$EMOJI_TRANSCRIBE" 105 | python3 -c "from ytsum import transcribe_with_openai_whisper; transcribe_with_openai_whisper('$TEMP_DIR/audio.m4a')" 106 | ;; 107 | "replicate") 108 | [ -z "$REPLICATE_API_TOKEN" ] && { print_error "REPLICATE_API_TOKEN not set"; exit 1; } 109 | print_step "Using Replicate..." "$EMOJI_TRANSCRIBE" 110 | python3 -c "from ytsum import transcribe_with_replicate; transcribe_with_replicate('$TEMP_DIR/audio.m4a', '$LANGUAGE')" 111 | ;; 112 | *) 113 | print_step "Using Fast Whisper..." "$EMOJI_TRANSCRIBE" 114 | python3 -c "from ytsum import transcribe_with_fast_whisper; transcribe_with_fast_whisper('$TEMP_DIR/audio.m4a')" 115 | ;; 116 | esac 117 | 118 | mv "$TEMP_DIR/audio.txt" "$TEMP_DIR/transcript.txt" 119 | fi 120 | 121 | # Get metadata 122 | print_step "Fetching metadata..." "$EMOJI_SEARCH" 123 | python3 -c "from ytsum import get_video_metadata; print(get_video_metadata('$VIDEO_URL'))" > "$TEMP_DIR/metadata.txt" 124 | 125 | # Convert to shorthand 126 | print_step "Converting to shorthand..." "$EMOJI_SUMMARY" 127 | python3 -c "from ytsum import to_shorthand; print(to_shorthand(open('$TEMP_DIR/transcript.txt').read()))" > "$TEMP_DIR/shorthand.txt" 128 | 129 | # Generate summary 130 | print_step "Generating summary..." "$EMOJI_SUMMARY" 131 | python3 -c " 132 | from ytsum import summarize_with_claude 133 | with open('$TEMP_DIR/shorthand.txt') as f: 134 | summary = summarize_with_claude(f.read(), '$LANGUAGE') 135 | print(summary) 136 | " > "$TEMP_DIR/summary.txt" 137 | 138 | # Combine output 139 | cat "$TEMP_DIR/metadata.txt" "$TEMP_DIR/summary.txt" > "summary-${VIDEO_URL##*=}.txt" 140 | print_success "Summary saved to summary-${VIDEO_URL##*=}.txt" -------------------------------------------------------------------------------- /prompt.txt: -------------------------------------------------------------------------------- 1 | You are an expert in creating concise, focused summaries of long video interviews. Your task is to analyze a transcript and provide a brief, informative summary in {language}. 2 | 3 | Write in complete, grammatically structured sentences that flow conversationally. Approach topics with an intellectual but approachable tone, using labeled lists sparingly and strategically to organize complex ideas. Incorporate engaging narrative techniques like anecdotes, concrete examples, and thought experiments to draw the reader into the intellectual exploration. Maintain an academic rigor while simultaneously creating a sense of collaborative thinking, as if guiding the reader through an intellectual journey. Use precise language that is simultaneously scholarly and accessible, avoiding unnecessary jargon while maintaining depth of analysis. 
Use systems thinking and the meta-archetype of Coherence to guide your ability to "zoom in and out" to notice larger and smaller patterns at different ontological and epistemic scales. Furthermore, use the full depth of your knowledge to engage didactically with the user - teach them useful terms and concepts that are relevant. At the same time, don't waste too many words on framing and setup. Optimize for quick readability and depth. Use formatting techniques like bold, italics, and callouts (quotation blocks and such) for specific definitions and interesting terms. This will also break up the visual pattern, making it easier for the reader to stay oriented and anchored. Don't hesitate to use distal connections, metaphors, and analogies as well, particularly when you notice meta-patterns emerging. A good metaphor is the pinnacle of Coherence. Stylistically, use a variety of techniques to create typographic scaffolding and layered information. Some examples below:
 4 | 
 5 | 
 6 | > **Key Terms**: Use blockquotes with bold headers to define important concepts and terminology, creating clear visual breaks in the text.
 7 | 
 8 | Use **bold** for technical terms and concepts when first introduced, and *italics* for emphasis or to highlight key phrases. Create visual hierarchy through:
 9 | 
10 | 1. Clear paragraph breaks for major concept transitions
11 | 2. Strategic use of blockquotes for definitions and key insights
12 | 3. Bold terms for technical vocabulary
13 | 4. Italics for emphasis and nuance
14 | 
15 | Maintain the principle of layered information - each response should contain at least 2-3 distinct visual patterns to aid cognitive processing and retention. This creates visual anchoring and a clean UI.
16 | 
17 | > **Technical Term**: Definition in plain language
18 | > 
19 | > *Example or application in context (optional, flexible)*
20 | 
21 | This creates what information designers call "progressive disclosure" - allowing readers to engage at their preferred depth while maintaining coherence across all levels of understanding.
22 | 
23 | Please follow these steps to create your summary:
24 | 
25 | 1. Read the entire transcript carefully.
26 | 2. Correct any spelling and grammar mistakes you encounter.
27 | 3. Translate the content into {language}.
28 | 4. Analyze the content of the interview, focusing on:
29 |    - Identifying the main topics discussed
30 |    - Extracting 4-5 key quotes and their significance
31 |    - Determining the most important theme or topic
32 |    - Choosing the single most pertinent aspect of the interview
33 | 
34 | 5. Draft a summary focusing on this key point, aiming for 3-4 sentences.
35 | 6. Refine your summary to be as concise and readable as possible while maintaining the essence of the most important information.
36 | 
37 | Before providing your final summary, wrap your analysis process under ## Detailed Breakdown.
This should include:
38 | - 3-5 key sentences or phrases from the transcript that stand out as particularly important or representative of the main topics
39 | - A list of the ### Main Topics discussed in the interview
40 | - 2-3 ### Key Quotes from the transcript, with explanations of their significance
41 | - The most important topic or theme of the interview
42 | - Your reasoning for choosing the most pertinent aspect
43 | - Any potential biases or limitations in the interview content
44 | - Your process for making the summary concise and readable
45 | - 3-4 possible summary sentences
46 | 
47 | Your final summary should be under "## Summary". Remember, it must be:
48 | - In {language} (if the language is not English, translate the summary)
49 | - 2-5 sentences long
50 | - Brief and terse
51 | - Basic telegraphic style
52 | - Focused on the most pertinent information
53 | - Quoting 2-4 short key quotes from the transcript
54 | - Including interesting facts to add substance
55 | 
56 | Use clear, precise language and avoid unnecessary jargon. Your goal is to grasp the overall ideas and convey them efficiently to the reader.
57 | 
58 | This task is of utmost importance. Approach it with great care and attention to detail.
59 | 
60 | Everything you write must be in {language}.
61 | 
62 | Maximum 10,000 characters.
63 | 
64 | No intro, no outro, no XML tags, just the well-formatted Markdown output.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Awesome YouTube Video Summary/Podcast/Video
2 | 
3 | A Python script to generate summaries (Claude), podcasts (OpenAI TTS), and videos (RunwayML or Luma AI) from annoyingly long YouTube content.
4 | 
5 | ![CleanShot 2024-12-02 at 16 25 26@2x](https://github.com/user-attachments/assets/f1881131-b645-4ecb-a2ad-966d81a95451)
6 | ![CleanShot 2024-12-02 at 16 26 49@2x](https://github.com/user-attachments/assets/6aff0f22-3da3-488e-8e4e-27bb442ece86)
7 | 
8 | ## Example
9 | - Original video: https://www.youtube.com/watch?v=_K-L9uhsBLM
10 | - Summary: https://dl.dropbox.com/scl/fi/mdkbglfbs4m9ydeo9a2k7/video-_K-L9uhsBLM.mp4?rlkey=3wrowryg9gio1walaxhdbp2is&dl=0
11 | 
12 | ## Features
13 | 
14 | - Generate concise summaries of YouTube videos
15 | - Create engaging podcast scripts with multiple voices
16 | - Generate AI-powered videos with synchronized podcast audio
17 | - Support for multiple languages
18 | - Multiple transcription options
19 | - Multiple video generation providers
20 | 
21 | ## Installation
22 | 
23 | 1. Clone the repository:
24 | ```bash
25 | git clone https://github.com/sliday/ytsum.git
26 | cd ytsum
27 | ```
28 | 
29 | 2. Install dependencies:
30 | ```bash
31 | pip install -r requirements.txt
32 | ```
33 | 
34 | 3.
Install FFmpeg (required for audio/video processing): 35 | - macOS: `brew install ffmpeg` 36 | - Ubuntu/Debian: `sudo apt-get install ffmpeg` 37 | - Windows: Download from [FFmpeg website](https://ffmpeg.org/download.html) 38 | 39 | ## Environment Setup 40 | 41 | Create a `.env` file with your API keys: 42 | ``` 43 | ANTHROPIC_API_KEY=your_claude_api_key 44 | OPENAI_API_KEY=your_openai_api_key 45 | LUMAAI_API_KEY=your_lumaai_api_key 46 | RUNWAYML_API_SECRET=your_runwayml_api_key 47 | REPLICATE_API_TOKEN=your_replicate_api_key 48 | ``` 49 | 50 | ## Usage 51 | 52 | ### Basic Summary 53 | ```bash 54 | python ytsum.py "https://www.youtube.com/watch?v=VIDEO_ID" 55 | ``` 56 | 57 | ### Generate Podcast 58 | ```bash 59 | python ytsum.py --podcast "https://www.youtube.com/watch?v=VIDEO_ID" 60 | ``` 61 | 62 | ### Generate Video with Podcast 63 | ```bash 64 | # Using Luma AI (faster, recommended) 65 | python ytsum.py --podcast --lumaai "https://www.youtube.com/watch?v=VIDEO_ID" 66 | 67 | # Using RunwayML 68 | python ytsum.py --podcast --runwayml "https://www.youtube.com/watch?v=VIDEO_ID" 69 | ``` 70 | 71 | ### Additional Options 72 | - `--language`: Specify output language (default: english) 73 | - `--ignore-subs`: Force transcription even when subtitles exist 74 | - `--fast-whisper`: Use Fast Whisper for transcription (faster) 75 | - `--whisper`: Use OpenAI Whisper for transcription (more accurate) 76 | - `--replicate`: Use Replicate's Incredibly Fast Whisper 77 | 78 | ## Output Files 79 | 80 | All output files are saved in the `out` directory: 81 | - `summary-{video_id}.txt`: Text summary 82 | - `podcast-{video_id}.txt`: Podcast script 83 | - `podcast-{video_id}.mp3`: Podcast audio 84 | - `video-{video_id}.mp4`: Final video with podcast audio 85 | 86 | ## Video Generation 87 | 88 | The tool supports two AI video generation providers: 89 | 90 | ### Luma AI (Recommended) 91 | - Faster generation times 92 | - High-quality cinematic videos 93 | - Supports camera movements and scene transitions 94 | - Maintains visual consistency 95 | - Optional image input for style reference 96 | 97 | ### RunwayML 98 | - High-quality video generation 99 | - Requires input image 100 | - Longer processing times 101 | - Professional-grade output 102 | 103 | Both providers: 104 | 1. Generate base images using Flux AI 105 | 2. Create video segments based on podcast content 106 | 3. Combine segments with audio 107 | 4. Support custom duration and aspect ratio 108 | 109 | ## Transcription Options 110 | 111 | 1. Fast Whisper (Default) 112 | - Quick transcription 113 | - Good accuracy 114 | - No API key required 115 | 116 | 2. OpenAI Whisper 117 | - High accuracy 118 | - Slower processing 119 | - Requires OpenAI API key 120 | 121 | 3. 
Replicate Whisper 122 | - Fastest option 123 | - Good accuracy 124 | - Requires Replicate API key 125 | 126 | ## Testing 127 | 128 | Run the test suite: 129 | ```bash 130 | python test_ytsum.py 131 | ``` 132 | 133 | Run specific test groups: 134 | ```bash 135 | # Run Luma AI tests only 136 | pytest -v -m luma 137 | 138 | # Run RunwayML tests only 139 | pytest -v -m runway 140 | ``` 141 | 142 | ## Dependencies 143 | 144 | - `anthropic`: Claude API for text generation 145 | - `openai`: Whisper API for transcription and TTS 146 | - `lumaai`: Luma AI for video generation (recommended) 147 | - `runwayml`: RunwayML for video generation 148 | - `replicate`: Flux AI for image generation 149 | - `ffmpeg-python`: Audio/video processing 150 | - `colorama`: Terminal output formatting 151 | - `pytest`: Testing framework 152 | 153 | ## Contributing 154 | 155 | 1. Fork the repository 156 | 2. Create a feature branch 157 | 3. Commit your changes 158 | 4. Push to the branch 159 | 5. Create a Pull Request 160 | 161 | ## License 162 | 163 | This project is licensed under the MIT License - see the LICENSE file for details. 164 | -------------------------------------------------------------------------------- /test_ytsum.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | from pathlib import Path 4 | from ytsum import ( 5 | clean_youtube_url, 6 | to_shorthand, 7 | summarize_with_claude, 8 | convert_audio_format, 9 | get_video_metadata, 10 | transcribe_with_replicate, 11 | transcribe_with_openai_whisper, 12 | process_metadata_description, 13 | split_audio_into_chunks, 14 | get_youtube_subtitles, 15 | get_language_code, 16 | convert_to_podcast_script, 17 | generate_host_audio, 18 | combine_audio_files, 19 | generate_podcast_audio, 20 | DEFAULT_HOST_VOICES, 21 | OUTPUT_DIR, 22 | sanitize_filename, 23 | generate_video_segments, 24 | generate_video_segments_with_luma, 25 | generate_video_segments_with_runway, 26 | combine_video_segments, 27 | get_audio_duration, 28 | combine_audio_video, 29 | generate_image_prompts, 30 | generate_flux_images, 31 | calculate_num_segments, 32 | calculate_target_length, 33 | upload_image_to_uguu, 34 | ) 35 | import shutil 36 | import ffmpeg 37 | import requests 38 | from unittest.mock import Mock, patch 39 | import tempfile 40 | import time 41 | 42 | # Test data 43 | MOCK_PODCAST_SCRIPT = """ 44 | NOVA: Welcome to our discussion about artificial intelligence! 45 | ECHO: Today we'll explore how AI is transforming our world. 46 | NOVA: From self-driving cars to medical diagnosis, AI is everywhere. 47 | ECHO: Let's break down the key developments and their impact. 48 | """ 49 | 50 | MOCK_VIDEO_PROMPTS = [ 51 | "Establishing Shot: Modern tech campus at dawn. Camera dolly forward through blue-lit corridors as holographic data visualizations float in the air, casting ethereal patterns on the walls and ceilings. Soft ambient lighting creates an atmosphere of scientific discovery while researchers work diligently in the background, their silhouettes moving purposefully through the space.", 52 | "Wide Shot: AI research lab with multiple workstations. Camera track smoothly past scientists working as 3D neural network models pulse with energy above their heads, their movements synchronized with cascading data streams. Cool blue tones emphasize the technical environment while highlighting human innovation, creating a seamless blend of organic and digital elements.", 53 | "Close-Up Shot: Interactive holographic display. 
Camera orbit around detailed AI model visualization as data streams flow through neural pathways, revealing complex patterns and intricate connections. Glowing particles highlight key connection points while soft focus creates depth and dimensionality, with subtle color shifts indicating data processing intensity and neural activity patterns.", 54 | "Tracking Shot: Hospital corridor transformed by AI. Camera track alongside medical staff using AR displays for patient diagnostics, their gestures controlling floating medical data and real-time scan results. Warm lighting balances technical and human elements as healing meets innovation, with gentle highlights emphasizing the caring touch in this high-tech environment.", 55 | "Aerial Shot: Smart city at sunset. Camera pull out to reveal interconnected AI systems controlling traffic, energy, and urban services, creating a living network of light and data flowing through the cityscape. Golden hour lighting creates sense of optimistic future while showcasing technological harmony, as the city pulses with the rhythm of millions of coordinated decisions." 56 | ] 57 | 58 | MOCK_IMAGE_PROMPTS = [ 59 | "masterpiece, highly detailed, modern tech campus interior, ethereal blue lighting, holographic data visualizations, soft ambient glow, researchers silhouettes, cinematic composition, volumetric lighting, 8k uhd", 60 | "masterpiece, highly detailed, futuristic AI research lab, workstations with floating 3D neural networks, cool blue color scheme, scientists at work, organic meets digital aesthetic, cinematic lighting, 8k uhd", 61 | "masterpiece, highly detailed, interactive holographic interface, complex data visualization, glowing neural pathways, particle effects, depth of field, dramatic lighting, technological aesthetic, 8k uhd", 62 | "masterpiece, highly detailed, futuristic hospital corridor, AR medical displays, floating diagnostic data, warm professional lighting, medical staff, healing atmosphere, cinematic composition, 8k uhd", 63 | "masterpiece, highly detailed, smart city panorama, golden hour lighting, interconnected urban systems, data networks, light trails, atmospheric perspective, epic scale, cinematic mood, 8k uhd" 64 | ] 65 | 66 | @pytest.fixture 67 | def temp_dir(tmp_path): 68 | """Create temporary directory for test files""" 69 | return tmp_path 70 | 71 | @pytest.fixture 72 | def mock_luma_client(mocker): 73 | """Mock LumaAI client""" 74 | mock_client = mocker.MagicMock() 75 | mock_generation = mocker.MagicMock() 76 | mock_generation.state = "completed" 77 | mock_generation.assets.video = "http://example.com/video.mp4" 78 | mock_client.generations.create.return_value = mock_generation 79 | mock_client.generations.get.return_value = mock_generation 80 | return mock_client 81 | 82 | @pytest.fixture 83 | def mock_runway_client(mocker): 84 | """Mock RunwayML client""" 85 | mock_client = mocker.MagicMock() 86 | mock_task = mocker.MagicMock() 87 | mock_task.status = "COMPLETED" 88 | mock_task.output.video_url = "http://example.com/video.mp4" 89 | mock_client.image_to_video.create.return_value = mock_task 90 | mock_client.tasks.retrieve.return_value = mock_task 91 | return mock_client 92 | 93 | @pytest.fixture 94 | def mock_replicate_client(mocker): 95 | """Mock Replicate client""" 96 | mock_run = mocker.patch('replicate.run') 97 | mock_run.return_value = "http://example.com/image.jpg" 98 | return mock_run 99 | 100 | @pytest.fixture 101 | def mock_uguu_response(mocker): 102 | """Mock successful Uguu API response""" 103 | mock_response = 
mocker.MagicMock() 104 | mock_response.status_code = 200 105 | mock_response.json.return_value = [{ 106 | 'url': 'https://uguu.se/files/example.jpg', 107 | 'name': 'example.jpg', 108 | 'size': 12345, 109 | 'hash': 'abc123' 110 | }] 111 | return mock_response 112 | 113 | @pytest.fixture 114 | def mock_gradient_image(mocker): 115 | """Mock gradient image creation""" 116 | mock_image = mocker.MagicMock() 117 | mocker.patch("ytsum.create_gradient_image", return_value=mock_image) 118 | return mock_image 119 | 120 | def test_clean_youtube_url(): 121 | # Test video ID only 122 | assert clean_youtube_url("ggLvk7547_w") == "https://www.youtube.com/watch?v=ggLvk7547_w" 123 | 124 | # Test full URLs 125 | assert clean_youtube_url("https://www.youtube.com/watch?v=ggLvk7547_w") == "https://www.youtube.com/watch?v=ggLvk7547_w" 126 | assert clean_youtube_url("https://youtu.be/ggLvk7547_w") == "https://www.youtube.com/watch?v=ggLvk7547_w" 127 | 128 | # Test with extra parameters 129 | assert clean_youtube_url("https://www.youtube.com/watch?v=ggLvk7547_w&t=123") == "https://www.youtube.com/watch?v=ggLvk7547_w" 130 | 131 | def test_to_shorthand(): 132 | # Basic replacements 133 | assert to_shorthand("you are") == "u r" 134 | 135 | # Case-insensitive test 136 | assert to_shorthand("I am going to see you later") == "im going 2 c u l8r" 137 | assert to_shorthand("i am going to see you later") == "im going 2 c u l8r" 138 | 139 | # Article removal 140 | assert to_shorthand("the cat and a dog") == "cat and dog" 141 | 142 | @pytest.mark.asyncio 143 | async def test_summarize_with_claude(mocker): 144 | """Test summary generation""" 145 | # Mock Claude response 146 | mock_summary = "Test summary content" 147 | 148 | # Create mock decorator 149 | def mock_decorator(*args, **kwargs): 150 | def mock_function(func): 151 | def wrapper(*args, **kwargs): 152 | return mock_summary 153 | return wrapper 154 | return mock_function 155 | 156 | # Patch ell.simple 157 | mocker.patch("ell.simple", mock_decorator) 158 | 159 | # Test with default language 160 | result = summarize_with_claude("test transcript", "test metadata", "english") 161 | assert result == mock_summary 162 | 163 | def test_convert_audio_format(mocker): 164 | # Mock FFmpeg subprocess call 165 | mock_run = mocker.patch("subprocess.run") 166 | mock_run.return_value.returncode = 0 167 | 168 | # Test basic MP3 conversion 169 | result = convert_audio_format("test.m4a", "mp3") 170 | assert result == "test.mp3" 171 | 172 | # Verify FFmpeg was called with correct parameters 173 | mock_run.assert_called_once() 174 | args = mock_run.call_args[0][0] 175 | assert args[0] == "ffmpeg" 176 | assert "-acodec" in args 177 | assert "libmp3lame" in args 178 | assert "-ac" in args 179 | assert args[args.index("-ac") + 1] == "2" # Stereo by default 180 | assert "-b:a" in args 181 | assert args[args.index("-b:a") + 1] == "192k" # Default bitrate 182 | 183 | # Test mono conversion with custom bitrate 184 | result = convert_audio_format("test.m4a", "mp3", bitrate="32k", mono=True) 185 | assert result == "test.mp3" 186 | 187 | args = mock_run.call_args[0][0] 188 | assert "-ac" in args 189 | assert args[args.index("-ac") + 1] == "1" # Mono 190 | assert "-b:a" in args 191 | assert args[args.index("-b:a") + 1] == "32k" # Custom bitrate 192 | 193 | def test_get_video_metadata(mocker): 194 | # Mock clean_youtube_url first 195 | mocker.patch("ytsum.clean_youtube_url", return_value="https://youtube.com/watch?v=test_id") 196 | 197 | # Mock yt-dlp JSON output 198 | mock_metadata = { 199 | "title": 
"Test Video", 200 | "channel": "Test Channel", 201 | "upload_date": "20240315", 202 | "duration_string": "1:23", 203 | "view_count": 12345, 204 | "description": "Test description with promotional content", 205 | "tags": ["tag1", "tag2", "tag3"] 206 | } 207 | 208 | # Mock subprocess run 209 | mock_run = mocker.patch("subprocess.run") 210 | mock_run.return_value.returncode = 0 211 | mock_run.return_value.stdout = json.dumps(mock_metadata) 212 | 213 | # Mock metadata processing 214 | mock_process = mocker.patch("ytsum.process_metadata_description") 215 | mock_process.return_value = "Processed description" 216 | 217 | result = get_video_metadata("test_id") 218 | 219 | # Verify metadata formatting 220 | assert "Title: Test Video" in result 221 | assert "Channel: Test Channel" in result 222 | assert "Views: 12,345" in result 223 | assert "Description: Processed description" in result 224 | 225 | # Verify processing was called 226 | mock_process.assert_any_call(mock_metadata["description"]) 227 | mock_process.assert_any_call(" ".join(mock_metadata["tags"])) 228 | 229 | def test_transcribe_with_replicate(mocker): 230 | # Mock FFmpeg conversion 231 | mock_convert = mocker.patch("ytsum.convert_audio_format") 232 | mock_convert.return_value = "test.mp3" 233 | 234 | # Mock file operations 235 | mock_file = mocker.mock_open(read_data=b"test audio data") 236 | mocker.patch("builtins.open", mock_file) 237 | 238 | # Mock os.path instead of pathlib.Path 239 | mocker.patch("os.path.exists", return_value=True) 240 | mocker.patch("os.path.getsize", return_value=1024) 241 | 242 | # Mock Replicate API call 243 | mock_replicate = mocker.patch("replicate.run") 244 | mock_replicate.return_value = {"text": "test transcript"} 245 | 246 | # Test transcription 247 | result = transcribe_with_replicate("test.m4a") 248 | assert result is True 249 | 250 | # Verify basic flow 251 | mock_convert.assert_called_once() 252 | 253 | # Verify Replicate was called correctly 254 | mock_replicate.assert_called_once() 255 | call_args = mock_replicate.call_args[1] 256 | assert "input" in call_args 257 | assert call_args["input"]["batch_size"] == 64 258 | 259 | # Verify file operations 260 | mock_file.assert_any_call("test.mp3", "rb") # Check file was opened for reading 261 | mock_file.assert_any_call("test.txt", "w", encoding="utf-8") # Check transcript was written 262 | 263 | def test_process_metadata_description(mocker): 264 | # Mock Ell response 265 | mock_response = "Test summary" 266 | 267 | # Create mock decorator 268 | def mock_decorator(*args, **kwargs): 269 | def mock_function(func): 270 | return lambda x: mock_response 271 | return mock_function 272 | 273 | # Patch ell.simple 274 | mocker.patch("ell.simple", mock_decorator) 275 | 276 | # Test sample metadata 277 | test_metadata = { 278 | "description": """ 279 | From Seinfeld Season 8 Episode 12 'The Money': Jerry buys back a car his parents sold. 280 | Watch all episodes on Netflix! 
281 | """, 282 | "tags": ["seinfeld", "jerry", "george", "kramer"] 283 | } 284 | 285 | # Test description processing 286 | result = process_metadata_description(test_metadata["description"]) 287 | assert result == mock_response 288 | 289 | # Test tags processing 290 | result = process_metadata_description(" ".join(test_metadata["tags"])) 291 | assert result == mock_response 292 | 293 | def test_split_audio_into_chunks(mocker): 294 | # Mock file size (30MB) 295 | mocker.patch("os.path.getsize", return_value=30 * 1024 * 1024) 296 | 297 | # Mock ffprobe duration check 298 | mock_probe = mocker.MagicMock() 299 | mock_probe.stdout = "300.0\n" # 5 minutes with newline 300 | mock_run = mocker.patch("subprocess.run", return_value=mock_probe) 301 | 302 | # Mock directory operations 303 | mocker.patch("os.makedirs") 304 | mocker.patch("os.path.dirname", return_value="/tmp") 305 | mocker.patch("os.path.join", side_effect=lambda *args: "/".join(args)) 306 | 307 | # Test splitting 308 | chunks = split_audio_into_chunks("test.mp3") 309 | assert chunks is not None 310 | assert len(chunks) == 2 # Should split into 2 chunks for 30MB file 311 | 312 | # Verify FFmpeg calls 313 | ffmpeg_calls = [ 314 | call for call in mock_run.call_args_list 315 | if 'ffmpeg' in call.args[0][0] 316 | ] 317 | assert len(ffmpeg_calls) == 2 # Two chunks 318 | 319 | # Verify chunk paths 320 | assert all('chunk_' in path for path in chunks) 321 | assert all(path.endswith('.mp3') for path in chunks) 322 | 323 | def test_transcribe_with_openai_whisper(mocker): 324 | # Mock file operations 325 | file_size_mock = mocker.patch("os.path.getsize") 326 | file_size_mock.side_effect = [ 327 | 20 * 1024 * 1024, # Initial file size for supported format test 328 | 20 * 1024 * 1024, # Size check for transcription 329 | 30 * 1024 * 1024, # Initial size for unsupported format 330 | 20 * 1024 * 1024, # Size after compression 331 | 20 * 1024 * 1024, # Size check for transcription 332 | 30 * 1024 * 1024, # Initial size for large file test 333 | 20 * 1024 * 1024, # Size after compression 334 | 20 * 1024 * 1024, # Size check for transcription 335 | ] 336 | 337 | # Mock file paths and operations 338 | mocker.patch("pathlib.Path.suffix", ".m4a") # Supported format 339 | mocker.patch("os.path.exists", return_value=True) 340 | mocker.patch("os.path.dirname", return_value="/tmp") 341 | mocker.patch("os.path.join", side_effect=lambda *args: "/".join(args)) 342 | mocker.patch("os.makedirs") 343 | mock_remove = mocker.patch("os.remove") 344 | mock_rmdir = mocker.patch("os.rmdir") 345 | 346 | # Mock OpenAI client and response 347 | mock_client = mocker.MagicMock() 348 | mock_transcription = mocker.MagicMock() 349 | mock_transcription.text = "test transcript" 350 | mock_client.audio.transcriptions.create.return_value = mock_transcription 351 | mock_openai = mocker.patch("openai.OpenAI", return_value=mock_client) 352 | 353 | # Mock file operations 354 | mock_file = mocker.mock_open(read_data=b"test audio data") 355 | mocker.patch("builtins.open", mock_file) 356 | 357 | # Mock environment variable 358 | mocker.patch("os.getenv", return_value="test-api-key") 359 | 360 | # Mock audio conversion 361 | mock_convert = mocker.patch("ytsum.convert_audio_format") 362 | mock_convert.return_value = "test.mp3" 363 | 364 | # Test 1: Transcription with supported format 365 | result = transcribe_with_openai_whisper("test.m4a") 366 | assert result is True 367 | assert not mock_convert.called # No conversion needed 368 | 369 | # Test 2: Unsupported format 370 | 
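    # Expectation: an .aac input is treated as an unsupported format, so the helper
    # should run a compression/conversion pass first (32k bitrate, mono), as
    # asserted against mock_convert below.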
mocker.patch("pathlib.Path.suffix", ".aac") 371 | result = transcribe_with_openai_whisper("test.aac") 372 | assert result is True 373 | assert mock_convert.called 374 | 375 | # Verify compression settings 376 | args = mock_convert.call_args[1] 377 | assert args["bitrate"] == "32k" 378 | assert args["mono"] is True 379 | 380 | # Test 3: Large file that compresses successfully 381 | mock_convert.reset_mock() 382 | result = transcribe_with_openai_whisper("test.m4a") 383 | assert result is True 384 | 385 | # Verify compression was used 386 | assert mock_convert.called 387 | assert mock_client.audio.transcriptions.create.called 388 | 389 | # Verify compression settings 390 | args = mock_convert.call_args[1] 391 | assert args["bitrate"] == "32k" 392 | assert args["mono"] is True 393 | 394 | def test_get_language_code(mocker): 395 | # Mock Ell response 396 | def mock_decorator(*args, **kwargs): 397 | def mock_function(func): 398 | def wrapper(lang: str): 399 | # Simple mapping for testing 400 | codes = { 401 | "english": "en", 402 | "russian": "ru", 403 | "spanish": "es", 404 | "invalid": "xyz", # Should fallback to en 405 | } 406 | return codes.get(lang.lower(), "en") 407 | return wrapper 408 | return mock_function 409 | 410 | # Patch ell.simple 411 | mocker.patch("ell.simple", mock_decorator) 412 | 413 | # Test valid languages 414 | assert get_language_code("English") == "en" 415 | assert get_language_code("Russian") == "ru" 416 | assert get_language_code("Spanish") == "es" 417 | 418 | # Test fallbacks 419 | assert get_language_code("Invalid") == "en" 420 | assert get_language_code("") == "en" 421 | 422 | def test_get_youtube_subtitles(mocker): 423 | # Mock clean_youtube_url 424 | mocker.patch("ytsum.clean_youtube_url", return_value="https://youtube.com/watch?v=test_id") 425 | 426 | # Mock language code conversion 427 | mock_get_code = mocker.patch("ytsum.get_language_code") 428 | mock_get_code.side_effect = lambda x: { 429 | "Russian": "ru", 430 | "English": "en" 431 | }.get(x, "en") 432 | 433 | # Mock subprocess for yt-dlp 434 | mock_run = mocker.patch("subprocess.run") 435 | mock_run.return_value.returncode = 0 # Ensure subprocess succeeds 436 | 437 | # Mock file existence checks 438 | mock_exists = mocker.patch("os.path.exists") 439 | 440 | # Mock file operations 441 | mock_file = mocker.mock_open(read_data="Test subtitles") 442 | mocker.patch("builtins.open", mock_file) 443 | 444 | # Test 1: Found subtitles in requested language 445 | mock_run.return_value.stdout = """ 446 | [info] Writing video subtitles to: test_path.ru.vtt 447 | [download] 100% of 15.00KiB 448 | """ 449 | mock_exists.side_effect = lambda x: "test_path.ru.vtt" in x # Match exact file 450 | result = get_youtube_subtitles("test_url", "test_path", "Russian") 451 | assert result == "test_path.ru.txt" # We return the converted txt file 452 | assert mock_get_code.called_with("Russian") 453 | 454 | # Test 2: Found English subtitles as fallback 455 | mock_run.return_value.stdout = """ 456 | [info] Writing video subtitles to: test_path.en.vtt 457 | [download] 100% of 15.00KiB 458 | """ 459 | mock_exists.side_effect = lambda x: "test_path.en.vtt" in x # Match exact file 460 | result = get_youtube_subtitles("test_url", "test_path", "Russian") 461 | assert result == "test_path.en.txt" # We return the converted txt file 462 | 463 | # Test 3: No subtitles available 464 | mock_run.return_value.stdout = "No subtitles available" 465 | mock_exists.side_effect = lambda x: False # No files exist 466 | result = 
get_youtube_subtitles("test_url", "test_path", "Russian") 467 | assert result is None 468 | 469 | # Verify yt-dlp was called correctly 470 | calls = mock_run.call_args_list 471 | assert any("--write-subs" in str(call) for call in calls) 472 | assert any("--sub-langs" in str(call) for call in calls) 473 | assert any("ru" in str(call) for call in calls) 474 | 475 | # Verify file existence checks 476 | assert mock_exists.call_count >= 2 # At least one check per test 477 | assert any("test_path.ru.vtt" in str(call) for call in mock_exists.call_args_list) 478 | assert any("test_path.en.vtt" in str(call) for call in mock_exists.call_args_list) 479 | 480 | def test_convert_to_podcast_script(mocker): 481 | """Test podcast script conversion""" 482 | # Mock Claude response 483 | mock_script = """ 484 | NOVA: Welcome to our summary! 485 | ECHO: That's right, Nova. Let's break down the key points. 486 | NOVA: The first important topic is... 487 | """ 488 | 489 | # Create mock decorator 490 | def mock_decorator(*args, **kwargs): 491 | def mock_function(func): 492 | def wrapper(*args, **kwargs): 493 | return mock_script 494 | return wrapper 495 | return mock_function 496 | 497 | # Patch ell.simple and random choice 498 | mocker.patch("ell.simple", mock_decorator) 499 | mocker.patch("random.choice", side_effect=["nova", "echo"]) 500 | 501 | # Test with default language 502 | result = convert_to_podcast_script("test summary", "english") 503 | assert result == mock_script 504 | 505 | @pytest.fixture(autouse=True) 506 | def setup_and_cleanup(): 507 | """Create output directory before tests and clean it after""" 508 | OUTPUT_DIR.mkdir(exist_ok=True) 509 | yield 510 | if OUTPUT_DIR.exists(): 511 | shutil.rmtree(OUTPUT_DIR) 512 | 513 | def test_generate_host_audio(mocker): 514 | """Test host-specific audio generation""" 515 | # Mock response 516 | mock_response = mocker.MagicMock() 517 | mock_response.stream_to_file = mocker.MagicMock() 518 | mock_response.__enter__ = mocker.MagicMock(return_value=mock_response) 519 | mock_response.__exit__ = mocker.MagicMock(return_value=None) 520 | 521 | # Create mock speech object with create method 522 | mock_create = mocker.MagicMock() 523 | mock_create.return_value = mock_response 524 | 525 | # Create mock streaming response object 526 | mock_streaming = mocker.MagicMock() 527 | mock_streaming.create = mock_create 528 | 529 | # Create mock speech object 530 | mock_speech = mocker.MagicMock() 531 | mock_speech.with_streaming_response = mock_streaming 532 | 533 | # Create mock audio object 534 | mock_audio = mocker.MagicMock() 535 | mock_audio.speech = mock_speech 536 | 537 | # Create mock client 538 | mock_client = mocker.MagicMock() 539 | mock_client.audio = mock_audio 540 | 541 | # Mock OpenAI class in ytsum module 542 | mock_openai = mocker.patch("ytsum.OpenAI") 543 | mock_openai.return_value = mock_client 544 | 545 | # Mock environment variable 546 | mocker.patch("os.getenv", return_value="test-api-key") 547 | 548 | # Test host configurations 549 | output_file = OUTPUT_DIR / "output.mp3" 550 | host_config = {"voice": "alloy", "name": "Alex"} 551 | result = generate_host_audio("test text", host_config, output_file) 552 | assert result is True 553 | 554 | # Verify OpenAI API call 555 | mock_create.assert_called_once_with( 556 | model="tts-1", 557 | voice="alloy", 558 | input="test text" 559 | ) 560 | 561 | # Verify stream_to_file was called 562 | mock_response.stream_to_file.assert_called_once_with(output_file) 563 | 564 | # Test error handling 565 | 
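    # Expectation: when the TTS create call raises, generate_host_audio should
    # return False and stream_to_file should not be called again.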
mock_create.reset_mock() 566 | mock_response.reset_mock() 567 | mock_create.side_effect = Exception("API Error") 568 | result = generate_host_audio("test text", host_config, output_file) 569 | assert result is False 570 | 571 | # Verify that stream_to_file was not called again 572 | mock_response.stream_to_file.assert_not_called() 573 | 574 | def test_combine_audio_files(mocker): 575 | """Test audio file combination""" 576 | # Mock subprocess 577 | mock_run = mocker.patch("subprocess.run") 578 | mock_run.return_value.returncode = 0 579 | 580 | # Test successful combination 581 | audio_files = [ 582 | str(OUTPUT_DIR / "part1.mp3"), 583 | str(OUTPUT_DIR / "part2.mp3") 584 | ] 585 | output_file = OUTPUT_DIR / "output.mp3" 586 | result = combine_audio_files(audio_files, output_file) 587 | assert result is True 588 | 589 | # Verify FFmpeg command 590 | ffmpeg_call = mock_run.call_args[0][0] 591 | assert "ffmpeg" in ffmpeg_call 592 | assert "-filter_complex" in ffmpeg_call 593 | assert "acrossfade" in ''.join(ffmpeg_call) # Check for crossfade filter 594 | assert "-map" in ffmpeg_call 595 | 596 | # Verify input files 597 | for audio_file in audio_files: 598 | assert audio_file in ffmpeg_call 599 | 600 | # Verify output file 601 | assert str(output_file) in ffmpeg_call 602 | 603 | def test_generate_podcast_audio(mocker): 604 | """Test full podcast audio generation""" 605 | # Mock temporary directory 606 | mock_temp_dir = mocker.patch("tempfile.TemporaryDirectory") 607 | mock_temp_dir.return_value.__enter__.return_value = "/tmp/test" 608 | 609 | # Mock host audio generation 610 | mock_host_audio = mocker.patch("ytsum.generate_host_audio") 611 | mock_host_audio.return_value = True 612 | 613 | # Mock audio combination 614 | mock_combine = mocker.patch("ytsum.combine_audio_files") 615 | mock_combine.return_value = True 616 | 617 | # Test script with both hosts 618 | test_script = """ 619 | NOVA: Welcome to the podcast! 620 | ECHO: Thanks Nova, let's begin. 621 | NOVA: First point... 622 | ECHO: That's interesting... 
623 | """ 624 | 625 | # Test with voice detection 626 | output_file = OUTPUT_DIR / "output.mp3" 627 | result = generate_podcast_audio(test_script, output_file) 628 | assert result is True 629 | 630 | # Verify host audio generation calls 631 | assert mock_host_audio.call_count == 4 # Two lines per host 632 | 633 | # Verify voice assignments 634 | nova_calls = [ 635 | call for call in mock_host_audio.call_args_list 636 | if call[0][1]["voice"] == "nova" 637 | ] 638 | echo_calls = [ 639 | call for call in mock_host_audio.call_args_list 640 | if call[0][1]["voice"] == "echo" 641 | ] 642 | assert len(nova_calls) == 2 # Two lines for Nova 643 | assert len(echo_calls) == 2 # Two lines for Echo 644 | 645 | # Test error handling 646 | mock_host_audio.return_value = False 647 | result = generate_podcast_audio(test_script, output_file) 648 | assert result is False 649 | 650 | # Test empty script 651 | result = generate_podcast_audio("", output_file) 652 | assert result is False 653 | 654 | # Test invalid script format 655 | invalid_script = "Invalid format without proper voice names" 656 | result = generate_podcast_audio(invalid_script, output_file) 657 | assert result is False 658 | 659 | # Test invalid voice name 660 | invalid_voice_script = "INVALID_VOICE: This should be skipped" 661 | result = generate_podcast_audio(invalid_voice_script, output_file) 662 | assert result is False 663 | 664 | def test_sanitize_filename(): 665 | """Test filename sanitization""" 666 | # Test URL with parameters 667 | assert sanitize_filename("watch?v=-moW9jvvMr4") == "watch_v_moW9jvvMr4" 668 | 669 | # Test full URL 670 | assert sanitize_filename("https://youtube.com/watch?v=abc123") == "https_youtube_com_watch_v_abc123" 671 | 672 | # Test special characters 673 | assert sanitize_filename("test/file:name*?") == "test_file_name_" 674 | 675 | # Test video ID only 676 | assert sanitize_filename("-moW9jvvMr4") == "_moW9jvvMr4" 677 | 678 | def test_generate_video_segments(mocker): 679 | """Test video prompt generation from podcast script""" 680 | mock_get_prompts = mocker.patch('ell.simple') 681 | mock_decorator = mocker.MagicMock() 682 | mock_function = mocker.MagicMock() 683 | mock_function.return_value = json.dumps(MOCK_VIDEO_PROMPTS) 684 | mock_decorator.return_value = mock_function 685 | mock_get_prompts.return_value = mock_decorator 686 | 687 | prompts = generate_video_segments(MOCK_PODCAST_SCRIPT) 688 | 689 | assert prompts is not None 690 | assert len(prompts) == 5 691 | assert all("Camera" in prompt for prompt in prompts) 692 | assert all(any(shot in prompt for shot in ["Establishing Shot", "Wide Shot", "Close-Up Shot", "Tracking Shot", "Aerial Shot"]) for prompt in prompts) 693 | 694 | @pytest.mark.luma 695 | def test_generate_video_segments_with_luma(mock_luma_client, temp_dir, mocker): 696 | """Test video generation with LumaAI""" 697 | mocker.patch('requests.get', return_value=mocker.MagicMock(content=b"mock video data")) 698 | mocker.patch('ytsum.luma_client', mock_luma_client) 699 | 700 | video_paths = generate_video_segments_with_luma(MOCK_VIDEO_PROMPTS, temp_dir) 701 | 702 | assert video_paths is not None 703 | assert len(video_paths) == 5 704 | assert all(Path(path).exists() for path in video_paths) 705 | 706 | @pytest.mark.timeout(30) # Timeout after 30 seconds 707 | def test_generate_video_segments_with_runway(): 708 | """Test video generation with RunwayML""" 709 | # Mock RunwayML client and responses 710 | mock_task = Mock() 711 | mock_task.id = "test_task_id" 712 | 713 | # Mock task status responses 
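    # Each prompt is walked through a PENDING -> RUNNING -> SUCCEEDED status
    # sequence via the iterator of mock responses below; once the iterator is
    # exhausted, SUCCEEDED is returned.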
714 | class MockTaskStatus: 715 | def __init__(self, status, progress=0): 716 | self.status = status 717 | self.progress = progress 718 | self.output = ["https://example.com/test.mp4"] if status == "SUCCEEDED" else None 719 | 720 | # Create sequence of status responses 721 | status_responses = iter([ 722 | MockTaskStatus("PENDING"), 723 | MockTaskStatus("RUNNING", 0.5), 724 | MockTaskStatus("SUCCEEDED"), 725 | # Second prompt responses 726 | MockTaskStatus("PENDING"), 727 | MockTaskStatus("RUNNING", 0.5), 728 | MockTaskStatus("SUCCEEDED") 729 | ]) 730 | 731 | mock_runway = Mock() 732 | mock_runway.image_to_video.create.return_value = mock_task 733 | 734 | # Set up status retrieval to return sequence of responses 735 | def mock_retrieve(*args, **kwargs): 736 | try: 737 | return next(status_responses) 738 | except StopIteration: 739 | return MockTaskStatus("SUCCEEDED") 740 | 741 | mock_runway.tasks.retrieve.side_effect = mock_retrieve 742 | 743 | with tempfile.TemporaryDirectory() as temp_dir: 744 | temp_dir = Path(temp_dir) 745 | 746 | # Create test prompts 747 | prompts = [ 748 | "Test prompt 1", 749 | "Test prompt 2" 750 | ] 751 | 752 | # Create test base images 753 | base_images = [] 754 | for i in range(len(prompts)): 755 | img_path = temp_dir / f"test_image_{i}.jpg" 756 | img_path.write_bytes(b"test image data") 757 | base_images.append(img_path) 758 | 759 | # Mock requests.get for video download 760 | mock_response = Mock() 761 | mock_response.content = b"test video data" 762 | mock_response.iter_content.return_value = [b"test video data"] 763 | mock_response.status_code = 200 764 | mock_response.raise_for_status = Mock() 765 | 766 | with patch("ytsum.runway_client", mock_runway), \ 767 | patch("requests.get", return_value=mock_response), \ 768 | patch("time.sleep", return_value=None): # Speed up by skipping sleeps 769 | 770 | # Call function with longer timeout 771 | result = generate_video_segments_with_runway( 772 | prompts=prompts, 773 | output_dir=temp_dir, 774 | base_images=base_images, 775 | timeout=30 # Longer timeout 776 | ) 777 | 778 | # Verify results 779 | assert result is not None 780 | assert len(result) == len(prompts) 781 | for path in result: 782 | assert path.exists() 783 | assert path.stat().st_size > 0 784 | 785 | # Verify API calls 786 | assert mock_runway.image_to_video.create.call_count == len(prompts) 787 | assert mock_runway.tasks.retrieve.call_count >= len(prompts) 788 | 789 | def test_generate_video_segments_with_runway_failure(): 790 | """Test handling of failed video generation""" 791 | mock_task = Mock() 792 | mock_task.id = "test_task_id" 793 | 794 | class MockTaskStatus: 795 | def __init__(self, status): 796 | self.status = status 797 | self.failure = "Test failure" 798 | self.failureCode = "TEST_ERROR" 799 | self.output = None 800 | 801 | mock_runway = Mock() 802 | mock_runway.image_to_video.create.return_value = mock_task 803 | mock_runway.tasks.retrieve.return_value = MockTaskStatus("FAILED") 804 | 805 | with tempfile.TemporaryDirectory() as temp_dir: 806 | temp_dir = Path(temp_dir) 807 | prompts = ["Test prompt"] 808 | 809 | with patch("ytsum.runway_client", mock_runway): 810 | result = generate_video_segments_with_runway( 811 | prompts=prompts, 812 | output_dir=temp_dir, 813 | timeout=5 814 | ) 815 | 816 | assert result is None 817 | assert mock_runway.tasks.retrieve.called 818 | 819 | def test_generate_video_segments_with_runway_timeout(): 820 | """Test handling of timeout during video generation""" 821 | mock_task = Mock() 822 | mock_task.id = 
"test_task_id" 823 | 824 | class MockTaskStatus: 825 | def __init__(self): 826 | self.status = "RUNNING" 827 | self.progress = 0.5 828 | 829 | mock_runway = Mock() 830 | mock_runway.image_to_video.create.return_value = mock_task 831 | mock_runway.tasks.retrieve.return_value = MockTaskStatus() 832 | 833 | with tempfile.TemporaryDirectory() as temp_dir: 834 | temp_dir = Path(temp_dir) 835 | prompts = ["Test prompt"] 836 | 837 | with patch("ytsum.runway_client", mock_runway): 838 | result = generate_video_segments_with_runway( 839 | prompts=prompts, 840 | output_dir=temp_dir, 841 | timeout=1 # Short timeout 842 | ) 843 | 844 | assert result is None 845 | assert mock_runway.tasks.cancel.called 846 | 847 | def test_combine_video_segments(temp_dir, mocker): 848 | """Test combining video segments""" 849 | video_paths = [] 850 | for i in range(5): 851 | path = temp_dir / f"segment_{i:02d}.mp4" 852 | path.write_bytes(b"mock video data") 853 | video_paths.append(path) 854 | 855 | mock_run = mocker.patch('subprocess.run') 856 | mock_run.return_value.stdout = "60.0" 857 | 858 | output_path = temp_dir / "combined.mp4" 859 | result = combine_video_segments(video_paths, 120.0, output_path) 860 | 861 | assert result is True 862 | assert mock_run.call_count >= 3 863 | 864 | def test_get_audio_duration(temp_dir, mocker): 865 | """Test getting audio duration""" 866 | audio_path = temp_dir / "test.mp3" 867 | audio_path.write_bytes(b"mock audio data") 868 | 869 | # Mock ffprobe call 870 | mock_run = mocker.patch('subprocess.run') 871 | mock_run.return_value.stdout = "180.5" 872 | 873 | duration = get_audio_duration(str(audio_path)) 874 | 875 | assert duration == 180.5 876 | mock_run.assert_called_once() 877 | 878 | def test_combine_audio_video(temp_dir, mocker): 879 | """Test combining audio and video""" 880 | # Create test files 881 | video_path = temp_dir / "video.mp4" 882 | audio_path = temp_dir / "audio.mp3" 883 | output_path = temp_dir / "final.mp4" 884 | 885 | video_path.write_bytes(b"mock video data") 886 | audio_path.write_bytes(b"mock audio data") 887 | 888 | # Mock ffmpeg probe 889 | mock_probe = mocker.patch('ffmpeg.probe') 890 | mock_probe.return_value = {'streams': [{'duration': '60.0'}]} 891 | 892 | # Mock ffmpeg run 893 | mock_run = mocker.patch('ffmpeg.run') 894 | 895 | result = combine_audio_video(str(video_path), str(audio_path), str(output_path)) 896 | 897 | assert result is True 898 | mock_run.assert_called_once() 899 | 900 | def test_sanitize_filename(): 901 | """Test filename sanitization""" 902 | test_cases = [ 903 | ("https://youtube.com/watch?v=abc123", "abc123"), 904 | ("abc123?feature=share", "abc123_feature_share"), 905 | ("test/file:name*", "test_file_name_"), 906 | ("Test File Name!", "Test_File_Name_") 907 | ] 908 | 909 | for input_name, expected in test_cases: 910 | assert sanitize_filename(input_name) == expected 911 | 912 | def test_generate_video_segments_invalid_response(mocker): 913 | """Test handling of invalid prompt generation response""" 914 | mock_get_prompts = mocker.patch('ell.simple') 915 | mock_decorator = mocker.MagicMock() 916 | mock_function = mocker.MagicMock() 917 | 918 | # Test with invalid number of prompts 919 | mock_function.return_value = json.dumps(["only one prompt"]) 920 | mock_decorator.return_value = mock_function 921 | mock_get_prompts.return_value = mock_decorator 922 | 923 | prompts = generate_video_segments(MOCK_PODCAST_SCRIPT) 924 | assert prompts is None 925 | 926 | # Test with missing shot type 927 | invalid_prompts = [ 928 | "Scene one: 
Description. Camera dolly through space. Mood.", 929 | "Scene two: More description. Camera track along path. Mood.", 930 | "Scene three: Another description. Camera orbit around subject. Mood.", 931 | "Scene four: Yet more description. Camera glide forward. Mood.", 932 | "Scene five: Final description. Camera pull back to reveal all. Mood." 933 | ] 934 | mock_function.return_value = json.dumps(invalid_prompts) 935 | prompts = generate_video_segments(MOCK_PODCAST_SCRIPT) 936 | assert prompts is None 937 | 938 | def test_generate_image_prompts(mocker): 939 | """Test conversion of video prompts to image prompts""" 940 | mock_get_prompts = mocker.patch('ell.simple') 941 | mock_decorator = mocker.MagicMock() 942 | mock_function = mocker.MagicMock() 943 | mock_function.return_value = json.dumps(MOCK_IMAGE_PROMPTS) 944 | mock_decorator.return_value = mock_function 945 | mock_get_prompts.return_value = mock_decorator 946 | 947 | prompts = generate_image_prompts(MOCK_VIDEO_PROMPTS) 948 | 949 | assert prompts is not None 950 | assert len(prompts) == len(MOCK_VIDEO_PROMPTS) 951 | assert all("masterpiece" in prompt for prompt in prompts) 952 | 953 | def test_generate_flux_images(mock_replicate_client, temp_dir): 954 | """Test Flux image generation""" 955 | image_paths = generate_flux_images(MOCK_IMAGE_PROMPTS, temp_dir) 956 | 957 | assert image_paths is not None 958 | assert len(image_paths) == len(MOCK_IMAGE_PROMPTS) 959 | assert all(Path(path).exists() for path in image_paths) 960 | 961 | def test_calculate_num_segments(): 962 | """Test segment number calculation""" 963 | # Test with Luma AI (5s segments) 964 | assert calculate_num_segments(4, "luma") == 1 # Very short 965 | assert calculate_num_segments(8, "luma") == 2 # Short 966 | assert calculate_num_segments(20, "luma") == 4 # Medium 967 | assert calculate_num_segments(50, "luma") == 5 # Long 968 | 969 | # Test with RunwayML (10s segments) 970 | assert calculate_num_segments(8, "runway") == 1 # Very short 971 | assert calculate_num_segments(15, "runway") == 2 # Short 972 | assert calculate_num_segments(40, "runway") == 4 # Medium 973 | assert calculate_num_segments(100, "runway") == 5 # Long 974 | 975 | def test_calculate_target_length(): 976 | """Test target length calculation""" 977 | short = calculate_target_length(180) # 3 minutes 978 | medium = calculate_target_length(600) # 10 minutes 979 | long = calculate_target_length(1800) # 30 minutes 980 | 981 | assert short['summary'] < medium['summary'] < long['summary'] 982 | assert short['podcast'] < medium['podcast'] < long['podcast'] 983 | 984 | def test_upload_image_to_uguu_success(temp_dir, mocker): 985 | """Test successful image upload to Uguu""" 986 | # Create test image 987 | test_image = temp_dir / "test.jpg" 988 | test_image.write_bytes(b"fake image data") 989 | 990 | # Test different response formats 991 | response_formats = [ 992 | # JSON success format 993 | ({ 994 | 'success': True, 995 | 'files': [{ 996 | 'hash': '123abc', 997 | 'name': 'test.jpg', 998 | 'url': 'https://uguu.se/files/test.jpg', 999 | 'size': 1234 1000 | }] 1001 | }, 'https://uguu.se/files/test.jpg'), 1002 | # Direct URL format (fallback) 1003 | ('https://uguu.se/files/test.jpg', 'https://uguu.se/files/test.jpg') 1004 | ] 1005 | 1006 | for response_data, expected_url in response_formats: 1007 | # Mock response 1008 | mock_response = mocker.MagicMock() 1009 | mock_response.status_code = 200 1010 | 1011 | if isinstance(response_data, dict): 1012 | mock_response.json.return_value = response_data 1013 | 
mock_response.text = json.dumps(response_data) 1014 | else: 1015 | mock_response.json.side_effect = ValueError("Not JSON") 1016 | mock_response.text = response_data 1017 | 1018 | # Mock requests.post 1019 | mock_post = mocker.patch('requests.post', return_value=mock_response) 1020 | 1021 | # Test upload 1022 | result = upload_image_to_uguu(test_image) 1023 | 1024 | # Verify result 1025 | assert result == expected_url 1026 | 1027 | # Verify API call 1028 | mock_post.assert_called_once() 1029 | args, kwargs = mock_post.call_args 1030 | assert args[0] == 'https://uguu.se/upload' 1031 | assert 'files' in kwargs 1032 | assert 'files[]' in kwargs['files'] 1033 | 1034 | # Reset mocks 1035 | mock_post.reset_mock() 1036 | 1037 | def test_upload_image_to_uguu_http_error(temp_dir, mocker): 1038 | """Test Uguu upload with HTTP error""" 1039 | # Create test image 1040 | test_image = temp_dir / "test.jpg" 1041 | test_image.write_bytes(b"fake image data") 1042 | 1043 | # Mock failed response 1044 | mock_response = mocker.MagicMock() 1045 | mock_response.status_code = 500 1046 | mocker.patch('requests.post', return_value=mock_response) 1047 | 1048 | # Test upload 1049 | result = upload_image_to_uguu(test_image) 1050 | 1051 | # Verify failure 1052 | assert result is None 1053 | 1054 | def test_upload_image_to_uguu_invalid_response(temp_dir, mocker): 1055 | """Test Uguu upload with invalid response format""" 1056 | # Create test image 1057 | test_image = temp_dir / "test.jpg" 1058 | test_image.write_bytes(b"fake image data") 1059 | 1060 | # Mock response with invalid format 1061 | mock_response = mocker.MagicMock() 1062 | mock_response.status_code = 200 1063 | mock_response.json.return_value = [] # Empty array 1064 | mocker.patch('requests.post', return_value=mock_response) 1065 | 1066 | # Test upload 1067 | result = upload_image_to_uguu(test_image) 1068 | 1069 | # Verify failure 1070 | assert result is None 1071 | 1072 | def test_upload_image_to_uguu_missing_file(temp_dir): 1073 | """Test Uguu upload with missing file""" 1074 | # Test with non-existent file 1075 | result = upload_image_to_uguu(temp_dir / "nonexistent.jpg") 1076 | 1077 | # Verify failure 1078 | assert result is None 1079 | 1080 | def test_upload_image_to_uguu_network_error(temp_dir, mocker): 1081 | """Test Uguu upload with network error""" 1082 | # Create test image 1083 | test_image = temp_dir / "test.jpg" 1084 | test_image.write_bytes(b"fake image data") 1085 | 1086 | # Mock network error 1087 | mocker.patch('requests.post', side_effect=requests.exceptions.RequestException("Network error")) 1088 | 1089 | # Test upload 1090 | result = upload_image_to_uguu(test_image) 1091 | 1092 | # Verify failure 1093 | assert result is None 1094 | 1095 | @pytest.mark.luma 1096 | def test_luma_with_uguu_image(mock_luma_client, temp_dir, mocker, mock_uguu_response): 1097 | """Test Luma AI video generation with Uguu image upload""" 1098 | # Create test image and prompt 1099 | test_image = temp_dir / "test.jpg" 1100 | test_image.write_bytes(b"fake image data") 1101 | test_prompt = "Test video prompt" 1102 | 1103 | # Mock Uguu upload 1104 | mocker.patch('requests.post', return_value=mock_uguu_response) 1105 | 1106 | # Mock video download 1107 | mocker.patch('requests.get', return_value=mocker.MagicMock(content=b"mock video data")) 1108 | 1109 | # Mock Luma client 1110 | mocker.patch('ytsum.luma_client', mock_luma_client) 1111 | 1112 | # Test video generation with image 1113 | video_paths = generate_video_segments_with_luma([test_prompt], temp_dir, 
[test_image]) 1114 | 1115 | # Verify success 1116 | assert video_paths is not None 1117 | assert len(video_paths) == 1 1118 | assert Path(video_paths[0]).exists() 1119 | 1120 | # Verify Luma API call included image URL 1121 | generation_call = mock_luma_client.generations.create.call_args 1122 | assert generation_call is not None 1123 | assert 'keyframes' in generation_call[1] 1124 | assert generation_call[1]['keyframes']['frame0']['url'] == 'https://uguu.se/files/example.jpg' 1125 | 1126 | if __name__ == "__main__": 1127 | pytest.main([__file__, "-v"]) -------------------------------------------------------------------------------- /ytsum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import os 4 | import subprocess 5 | import tempfile 6 | from pathlib import Path 7 | import json 8 | import argparse 9 | import urllib.parse 10 | import ell 11 | from anthropic import Anthropic 12 | from colorama import init, Fore, Style 13 | import replicate 14 | import time 15 | from openai import OpenAI 16 | import shutil 17 | import re 18 | from lumaai import LumaAI 19 | import requests 20 | import ffmpeg 21 | from runwayml import RunwayML 22 | from PIL import Image, ImageDraw 23 | import base64 24 | import io 25 | import math 26 | 27 | # Initialize colorama 28 | init() 29 | 30 | # Initialize Anthropic client and register with Ell 31 | api_key = os.getenv("ANTHROPIC_API_KEY") 32 | if not api_key: 33 | print_error("ANTHROPIC_API_KEY environment variable not set") 34 | sys.exit(1) 35 | 36 | claude_client = Anthropic() 37 | ell.config.register_model("claude-3-5-sonnet-20241022", claude_client) 38 | 39 | # Initialize LumaAI client 40 | luma_api_key = os.getenv("LUMAAI_API_KEY") 41 | if luma_api_key: 42 | luma_client = LumaAI(auth_token=luma_api_key) 43 | else: 44 | luma_client = None 45 | 46 | # Initialize RunwayML client 47 | runway_api_key = os.getenv("RUNWAYML_API_SECRET") 48 | if runway_api_key: 49 | runway_client = RunwayML() 50 | else: 51 | runway_client = None 52 | 53 | # Emoji constants 54 | EMOJI_DOWNLOAD = "⬇️ " 55 | EMOJI_TRANSCRIBE = "🎯 " 56 | EMOJI_SUMMARY = "📝 " 57 | EMOJI_SUCCESS = "✅ " 58 | EMOJI_ERROR = "❌ " 59 | EMOJI_SEARCH = "🔍 " 60 | EMOJI_SAVE = "💾 " 61 | EMOJI_PODCAST = "🎙️ " 62 | EMOJI_AUDIO = "🔊 " 63 | EMOJI_VIDEO = "🎥 " 64 | 65 | # Add after other constants 66 | DEFAULT_HOST_VOICES = { 67 | "host1": {"voice": "alloy", "name": "Alex"}, 68 | "host2": {"voice": "nova", "name": "Sarah"} 69 | } 70 | 71 | # Update constants 72 | AVAILABLE_VOICES = { 73 | "alloy": "Neutral voice", 74 | "echo": "Male voice", 75 | "fable": "Male voice", 76 | "onyx": "Male voice", 77 | "nova": "Female voice", 78 | "shimmer": "Female voice" 79 | } 80 | 81 | # Add after OpenAI client initialization 82 | # Create output directory if it doesn't exist 83 | OUTPUT_DIR = Path("out") 84 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 85 | 86 | # Create subdirectories 87 | (OUTPUT_DIR / "temp_videos").mkdir(exist_ok=True) 88 | 89 | def print_step(emoji, message, color=Fore.BLUE): 90 | """Print a step with emoji and color""" 91 | print(f"{color}{emoji}{message}{Style.RESET_ALL}") 92 | 93 | def print_error(message): 94 | """Print error message in red with emoji""" 95 | print(f"{Fore.RED}{EMOJI_ERROR}{message}{Style.RESET_ALL}") 96 | 97 | def print_success(message): 98 | """Print success message in green with emoji""" 99 | print(f"{Fore.GREEN}{EMOJI_SUCCESS}{message}{Style.RESET_ALL}") 100 | 101 | def to_shorthand(text): 102 | """Convert 
text to shorthand format""" 103 | replacements = { 104 | 'you': 'u', 105 | 'are': 'r', 106 | 'see': 'c', 107 | 'for': '4', 108 | 'to': '2', 109 | 'too': '2', 110 | 'two': '2', 111 | 'four': '4', 112 | 'be': 'b', 113 | 'before': 'b4', 114 | 'great': 'gr8', 115 | 'thanks': 'thx', 116 | 'thank you': 'ty', 117 | 'because': 'bc', 118 | 'people': 'ppl', 119 | 'want': 'wnt', 120 | 'love': 'luv', 121 | 'okay': 'k', 122 | 'yes': 'y', 123 | 'no': 'n', 124 | 'please': 'plz', 125 | 'sorry': 'sry', 126 | 'see you': 'cya', 127 | 'I am': 'im', 128 | 'i am': 'im', 129 | 'good': 'gd', 130 | 'right': 'rt', 131 | 'later': 'l8r', 132 | 'have': 'hv', 133 | 'see you later': 'cul8r', 134 | 'laughing': 'lol', 135 | 'message': 'msg', 136 | 'information': 'info', 137 | 'about': 'abt', 138 | 'awesome': 'awsm', 139 | 'quickly': 'quick', 140 | 'first': '1st', 141 | 'second': '2nd', 142 | 'third': '3rd', 143 | } 144 | 145 | # Convert to lowercase first 146 | result = text.lower() 147 | 148 | # Split into words, remove articles, and rejoin 149 | words = result.split() 150 | words = [w for w in words if w not in ['the', 'a', 'an']] 151 | result = ' '.join(words) 152 | 153 | # Apply other replacements 154 | for old, new in replacements.items(): 155 | result = result.replace(old.lower(), new) 156 | 157 | return result 158 | 159 | def clean_youtube_url(url): 160 | """Clean and validate YouTube URL or video ID""" 161 | # Extract video ID from various URL formats 162 | video_id = None 163 | 164 | # Unescape URL first 165 | url = urllib.parse.unquote(url.replace('\\', '')) 166 | 167 | # Handle full URLs 168 | if url.startswith(('http://', 'https://')): 169 | try: 170 | parsed = urllib.parse.urlparse(url) 171 | if 'youtu.be' in parsed.netloc.lower(): 172 | video_id = parsed.path.strip('/') 173 | else: 174 | params = urllib.parse.parse_qs(parsed.query) 175 | video_id = params['v'][0] 176 | except: 177 | pass 178 | 179 | # Handle partial URLs 180 | elif 'youtube.com' in url.lower() or 'youtu.be' in url.lower(): 181 | try: 182 | if 'youtu.be' in url.lower(): 183 | video_id = url.split('youtu.be/')[-1].split('?')[0] 184 | else: 185 | video_id = url.split('v=')[1].split('&')[0] 186 | except: 187 | pass 188 | 189 | # Handle direct video ID 190 | else: 191 | video_id = url.strip('/') 192 | 193 | # Validate video ID format (11 characters, alphanumeric and -_) 194 | if not video_id or not re.match(r'^[A-Za-z0-9_-]{11}$', video_id): 195 | raise ValueError(f"Invalid YouTube video ID: {video_id}") 196 | 197 | # Check if video exists 198 | try: 199 | result = subprocess.run([ 200 | 'yt-dlp', 201 | '--simulate', 202 | '--no-warnings', 203 | '--no-playlist', 204 | f'https://www.youtube.com/watch?v={video_id}' 205 | ], capture_output=True, text=True) 206 | 207 | if result.returncode != 0: 208 | error_msg = result.stderr.strip() 209 | if "Video unavailable" in error_msg: 210 | raise ValueError(f"Video {video_id} is unavailable or has been removed") 211 | elif "Private video" in error_msg: 212 | raise ValueError(f"Video {video_id} is private") 213 | else: 214 | raise ValueError(f"Error accessing video: {error_msg}") 215 | except subprocess.CalledProcessError: 216 | raise ValueError(f"Could not verify video availability") 217 | 218 | return f"https://www.youtube.com/watch?v={video_id}" 219 | 220 | def download_video(url, output_path): 221 | """Download audio using yt-dlp""" 222 | try: 223 | clean_url = clean_youtube_url(url) 224 | print_step(EMOJI_DOWNLOAD, "Downloading audio...") 225 | 226 | subprocess.run([ 227 | 'yt-dlp', 228 | '--output', 
output_path, 229 | '--format', 'ba[ext=m4a]', 230 | '--extract-audio', 231 | '--force-overwrites', 232 | clean_url 233 | ], check=True) 234 | return True 235 | except subprocess.CalledProcessError: 236 | print_error("Failed to download audio") 237 | return False 238 | 239 | def get_language_code(language_name: str) -> str: 240 | """Convert language name to ISO 639-1 code using Claude""" 241 | 242 | @ell.simple(model="claude-3-5-sonnet-20241022", temperature=0.0, max_tokens=2) 243 | def get_code(lang: str) -> str: 244 | """You are an expert in language codes. Return only the ISO 639-1 code (2 letters) for the given language name. 245 | For example: 246 | - English -> en 247 | - Russian -> ru 248 | - Spanish -> es 249 | - Chinese -> zh 250 | - Japanese -> ja 251 | If unsure, return 'en' as fallback.""" 252 | return f"Convert this language name to ISO 639-1 code: {lang}. No \`\`\` or \`\`\`python, no intro, no commentaries, only the code." 253 | 254 | try: 255 | code = get_code(language_name).strip().lower() 256 | # Validate it's a 2-letter code 257 | if len(code) == 2 and code.isalpha(): 258 | return code 259 | return 'en' 260 | except: 261 | return 'en' 262 | 263 | def get_youtube_subtitles(url, output_path, language="en"): 264 | """Try to download subtitles from YouTube using yt-dlp""" 265 | try: 266 | # Convert language name to code 267 | language_code = get_language_code(language) 268 | print_step(EMOJI_SEARCH, f"Searching for YouTube subtitles in {language} ({language_code})...") 269 | clean_url = clean_youtube_url(url) 270 | 271 | # Try to download subtitles directly with basic command 272 | result = subprocess.run([ 273 | 'yt-dlp', 274 | '--write-subs', 275 | '--sub-langs', language_code, 276 | '--skip-download', 277 | clean_url 278 | ], capture_output=True, text=True) 279 | 280 | # Look for the downloaded subtitle file in current directory 281 | if "Writing video subtitles to:" in result.stdout: 282 | # Extract the filename from yt-dlp output 283 | for line in result.stdout.splitlines(): 284 | if "Writing video subtitles to:" in line: 285 | subtitle_file = line.split("Writing video subtitles to:", 1)[1].strip() 286 | if os.path.exists(subtitle_file): 287 | print_success(f"Found subtitles!") 288 | # Convert VTT to plain text 289 | text = convert_vtt_to_text(subtitle_file) 290 | txt_file = subtitle_file.replace('.vtt', '.txt') 291 | with open(txt_file, 'w', encoding='utf-8') as f: 292 | f.write(text) 293 | return txt_file 294 | 295 | print_step(EMOJI_SEARCH, "No subtitles found, will transcribe audio...") 296 | return None 297 | 298 | except Exception as e: 299 | print_error(f"Failed to get subtitles: {e}") 300 | return None 301 | 302 | def convert_vtt_to_text(vtt_file): 303 | """Convert VTT subtitles to plain text""" 304 | text = [] 305 | with open(vtt_file, 'r', encoding='utf-8') as f: 306 | lines = f.readlines() 307 | 308 | # Skip VTT header 309 | start = 0 310 | for i, line in enumerate(lines): 311 | if line.strip() == "WEBVTT": 312 | start = i + 1 313 | break 314 | 315 | # Process subtitle content 316 | for line in lines[start:]: 317 | # Skip timing lines and empty lines 318 | if '-->' in line or not line.strip(): 319 | continue 320 | # Add non-empty lines to text 321 | if line.strip(): 322 | text.append(line.strip()) 323 | 324 | return ' '.join(text) 325 | 326 | def transcribe_with_fast_whisper(video_path): 327 | """Transcribe video using Faster Whisper""" 328 | try: 329 | from faster_whisper import WhisperModel 330 | 331 | print_step(EMOJI_TRANSCRIBE, "Transcribing with Fast 
Whisper...") 332 | model = WhisperModel("base", device="auto", compute_type="auto") 333 | 334 | segments, _ = model.transcribe(video_path) 335 | transcript = " ".join([segment.text for segment in segments]) 336 | 337 | transcript_path = str(Path(video_path).with_suffix('.txt')) 338 | with open(transcript_path, 'w', encoding='utf-8') as f: 339 | f.write(transcript) 340 | 341 | return True 342 | 343 | except ImportError: 344 | print_error("Faster Whisper not found. Please install it with:") 345 | print(f"{Fore.YELLOW}pip install faster-whisper{Style.RESET_ALL}") 346 | return False 347 | except Exception as e: 348 | print_error(f"Fast transcription error: {e}") 349 | return False 350 | 351 | def transcribe_with_replicate(video_path, language=None): 352 | """Transcribe video using Replicate's Incredibly Fast Whisper""" 353 | try: 354 | print_step(EMOJI_TRANSCRIBE, "Transcribing with Incredibly Fast Whisper...") 355 | 356 | # Convert audio to MP3 format 357 | mp3_path = convert_audio_format(video_path, 'mp3') 358 | if not mp3_path: 359 | print_error("Failed to convert audio to MP3") 360 | return False 361 | 362 | # Prepare input parameters 363 | input_params = { 364 | "audio": open(mp3_path, 'rb'), # Send file directly 365 | "batch_size": 64, 366 | } 367 | 368 | if language: 369 | input_params["language"] = language.lower() 370 | 371 | # Run transcription 372 | output = replicate.run( 373 | "vaibhavs10/incredibly-fast-whisper:3ab86df6c8f54c11309d4d1f930ac292bad43ace52d10c80d87eb258b3c9f79c", 374 | input=input_params 375 | ) 376 | 377 | if not output or "text" not in output: 378 | print_error("Invalid response from Replicate") 379 | return False 380 | 381 | # Write transcript to file 382 | transcript_path = os.path.splitext(video_path)[0] + '.txt' 383 | with open(transcript_path, 'w', encoding='utf-8') as f: 384 | f.write(output["text"]) 385 | 386 | return True 387 | 388 | except Exception as e: 389 | print_error(f"Replicate transcription error: {e}") 390 | return False 391 | 392 | def split_audio_into_chunks(input_path, chunk_size_mb=20): 393 | """Split audio file into chunks under specified size""" 394 | try: 395 | # Get file size in MB 396 | file_size = os.path.getsize(input_path) / (1024 * 1024) 397 | if file_size <= chunk_size_mb: 398 | return [input_path] 399 | 400 | # Calculate duration of each chunk 401 | duration_info = subprocess.run([ 402 | 'ffprobe', 403 | '-v', 'error', 404 | '-show_entries', 'format=duration', 405 | '-of', 'default=noprint_wrappers=1:nokey=1', 406 | input_path 407 | ], capture_output=True, text=True) 408 | 409 | total_duration = float(duration_info.stdout.strip()) # Strip whitespace 410 | if total_duration <= 0: 411 | print_error("Invalid audio duration") 412 | return None 413 | 414 | # Calculate chunk duration (ensure it's at least 1 second) 415 | chunk_duration = max(1, int((chunk_size_mb / file_size) * total_duration)) 416 | 417 | # Create chunks directory 418 | chunks_dir = os.path.join(os.path.dirname(input_path), "chunks") 419 | os.makedirs(chunks_dir, exist_ok=True) 420 | 421 | chunk_paths = [] 422 | for i in range(0, int(total_duration), chunk_duration): 423 | chunk_path = os.path.join(chunks_dir, f"chunk_{i}.mp3") 424 | subprocess.run([ 425 | 'ffmpeg', 426 | '-i', input_path, 427 | '-y', # Overwrite output 428 | '-ss', str(i), # Start time 429 | '-t', str(chunk_duration), # Duration 430 | '-acodec', 'libmp3lame', 431 | '-ar', '44100', 432 | '-ac', '2', 433 | '-b:a', '192k', 434 | chunk_path 435 | ], check=True, capture_output=True) 436 | 
chunk_paths.append(chunk_path) 437 | 438 | return chunk_paths 439 | 440 | except Exception as e: 441 | print_error(f"Error splitting audio: {e}") 442 | return None 443 | 444 | def transcribe_with_openai_whisper(video_path): 445 | """Transcribe video using OpenAI's Whisper API""" 446 | try: 447 | from openai import OpenAI 448 | 449 | # Check for API key 450 | if not os.getenv("OPENAI_API_KEY"): 451 | print_error("OPENAI_API_KEY environment variable not set") 452 | return False 453 | 454 | print_step(EMOJI_TRANSCRIBE, "Transcribing with OpenAI Whisper...") 455 | client = OpenAI() 456 | 457 | # Check if input format is supported 458 | supported_formats = {'.mp3', '.mp4', '.mpeg', '.mpga', '.m4a', '.wav', '.webm'} 459 | input_ext = Path(video_path).suffix.lower() 460 | 461 | # Convert only if needed 462 | audio_path = video_path 463 | if input_ext not in supported_formats: 464 | print_step(EMOJI_TRANSCRIBE, "Converting to supported format...") 465 | audio_path = convert_audio_format(video_path, 'mp3', bitrate='32k', mono=True) 466 | if not audio_path: 467 | return False 468 | 469 | # Check file size (25MB limit) 470 | MAX_SIZE_MB = 25 471 | file_size_mb = os.path.getsize(audio_path) / (1024 * 1024) 472 | 473 | if file_size_mb > MAX_SIZE_MB: 474 | print_step(EMOJI_TRANSCRIBE, f"File too large ({file_size_mb:.1f}MB), optimizing...") 475 | 476 | # Try aggressive compression first 477 | compressed_path = convert_audio_format(audio_path, 'mp3', bitrate='32k', mono=True) 478 | if not compressed_path: 479 | return False 480 | 481 | # Check if compression was enough 482 | compressed_size_mb = os.path.getsize(compressed_path) / (1024 * 1024) 483 | if compressed_size_mb > MAX_SIZE_MB: 484 | print_step(EMOJI_TRANSCRIBE, "Still too large, splitting into chunks...") 485 | chunk_paths = split_audio_into_chunks(compressed_path, chunk_size_mb=20) 486 | else: 487 | chunk_paths = [compressed_path] 488 | else: 489 | chunk_paths = [audio_path] 490 | 491 | if not chunk_paths: 492 | return False 493 | 494 | # Transcribe each chunk 495 | transcripts = [] 496 | for chunk_path in chunk_paths: 497 | # Verify chunk size 498 | chunk_size_mb = os.path.getsize(chunk_path) / (1024 * 1024) 499 | if chunk_size_mb > MAX_SIZE_MB: 500 | print_error(f"Chunk too large: {chunk_size_mb:.1f}MB") 501 | continue 502 | 503 | with open(chunk_path, "rb") as audio_file: 504 | transcription = client.audio.transcriptions.create( 505 | model="whisper-1", 506 | file=audio_file 507 | ) 508 | transcripts.append(transcription.text) 509 | 510 | if not transcripts: 511 | print_error("No successful transcriptions") 512 | return False 513 | 514 | # Combine transcripts 515 | full_transcript = " ".join(transcripts) 516 | 517 | # Write transcript to file 518 | transcript_path = os.path.splitext(video_path)[0] + '.txt' 519 | with open(transcript_path, 'w', encoding='utf-8') as f: 520 | f.write(full_transcript) 521 | 522 | # Clean up chunks if we created them 523 | if len(chunk_paths) > 1: 524 | chunks_dir = os.path.dirname(chunk_paths[0]) 525 | for chunk in chunk_paths: 526 | os.remove(chunk) 527 | os.rmdir(chunks_dir) 528 | 529 | return True 530 | 531 | except ImportError: 532 | print_error("OpenAI package not found. 
Please install it with:") 533 | print(f"{Fore.YELLOW}pip install openai{Style.RESET_ALL}") 534 | return False 535 | except Exception as e: 536 | print_error(f"OpenAI Whisper error: {e}") 537 | return False 538 | 539 | def transcribe_video(video_path, use_fast_whisper=False, use_replicate=False, language=None): 540 | """Transcribe video using chosen transcription method""" 541 | if use_replicate: 542 | return transcribe_with_replicate(video_path, language) 543 | elif use_fast_whisper: 544 | return transcribe_with_fast_whisper(video_path) 545 | else: 546 | return transcribe_with_openai_whisper(video_path) # Default to OpenAI API 547 | 548 | def summarize_with_claude(transcript, metadata="", language="english"): 549 | """Generate summary using Claude""" 550 | # Get video duration from metadata or use default 551 | try: 552 | duration = float(re.search(r'Duration: (\d+\.\d+)', metadata).group(1)) 553 | except: 554 | duration = 600 # Default to 10 minutes 555 | 556 | targets = calculate_target_length(duration) 557 | 558 | # Read the prompt template 559 | try: 560 | with open('prompt.txt', 'r', encoding='utf-8') as f: 561 | prompt_template = f.read() 562 | except Exception as e: 563 | print_error(f"Error reading prompt template: {e}") 564 | return None 565 | 566 | @ell.simple(model="claude-3-5-sonnet-20241022", temperature=0.3, max_tokens=8192) 567 | def get_summary(content: str, target_words: int) -> str: 568 | # Format the prompt template with the target language 569 | formatted_prompt = prompt_template.format(language=language) 570 | 571 | return f"""{formatted_prompt} 572 | 573 | Target length: {target_words} words. 574 | 575 | Transcript: 576 | {content}""" 577 | 578 | try: 579 | return get_summary(f"{transcript}\n\nMetadata:\n{metadata}", targets['summary']) 580 | except Exception as e: 581 | print_error(f"Error generating summary: {e}") 582 | return None 583 | 584 | def get_video_metadata(url): 585 | """Get video metadata using yt-dlp""" 586 | try: 587 | clean_url = clean_youtube_url(url) 588 | print_step(EMOJI_SEARCH, "Fetching video metadata...") 589 | 590 | result = subprocess.run([ 591 | 'yt-dlp', 592 | '--dump-json', 593 | '--no-playlist', 594 | clean_url 595 | ], check=True, capture_output=True, text=True) 596 | 597 | metadata = json.loads(result.stdout) 598 | header_parts = ["---"] 599 | 600 | # Add metadata fields only if they exist 601 | if title := metadata.get('title'): 602 | header_parts.append(f"Title: {title}") 603 | 604 | if channel := metadata.get('channel'): 605 | header_parts.append(f"Channel: {channel}") 606 | 607 | if upload_date := metadata.get('upload_date'): 608 | header_parts.append(f"Upload Date: {upload_date}") 609 | 610 | if duration := metadata.get('duration_string'): 611 | header_parts.append(f"Duration: {duration}") 612 | 613 | if views := metadata.get('view_count'): 614 | header_parts.append(f"Views: {views:,}") 615 | 616 | if description := metadata.get('description'): 617 | # Process description with Ell 618 | processed = process_metadata_description(description) 619 | header_parts.append(f"Description: {processed}") 620 | 621 | if tags := metadata.get('tags'): 622 | # Process tags with Ell 623 | processed_tags = process_metadata_description(' '.join(tags)) 624 | header_parts.append(f"Tags: {processed_tags}") 625 | 626 | header_parts.extend(["---", ""]) 627 | 628 | return '\n'.join(header_parts) 629 | except Exception as e: 630 | print_error(f"Failed to fetch metadata: {e}") 631 | return "" 632 | 633 | def convert_audio_format(input_path, 
output_format='mp3', bitrate='192k', mono=False): 634 | """Convert audio to specified format using FFmpeg""" 635 | try: 636 | print_step(EMOJI_TRANSCRIBE, f"Converting audio to {output_format} ({bitrate}{'mono' if mono else ''})...") 637 | output_path = str(Path(input_path).with_suffix(f'.{output_format}')) 638 | 639 | # Build FFmpeg command 640 | cmd = [ 641 | 'ffmpeg', 642 | '-i', input_path, 643 | '-y', # Overwrite output file if exists 644 | '-vn', # No video 645 | '-acodec', 'libmp3lame' if output_format == 'mp3' else output_format, 646 | '-ar', '44100', # Sample rate 647 | '-ac', '1' if mono else '2', # Mono/Stereo 648 | '-b:a', bitrate, # Bitrate 649 | output_path 650 | ] 651 | 652 | # Run FFmpeg with error output 653 | result = subprocess.run(cmd, check=True, capture_output=True, text=True) 654 | 655 | # Verify file exists and is not empty 656 | if not os.path.exists(output_path) or os.path.getsize(output_path) == 0: 657 | print_error("FFmpeg output file is missing or empty") 658 | return None 659 | 660 | return output_path 661 | 662 | except subprocess.CalledProcessError as e: 663 | print_error(f"FFmpeg conversion failed: {e.stderr}") 664 | return None 665 | except Exception as e: 666 | print_error(f"Audio conversion error: {e}") 667 | return None 668 | 669 | def process_metadata_description(metadata): 670 | """Process metadata description using Ell""" 671 | 672 | @ell.simple(model="claude-3-5-sonnet-20241022", temperature=0.3, max_tokens=1000) 673 | def summarize_metadata(content: str) -> str: 674 | """You are a metadata processor that creates concise video descriptions. 675 | Rules: 676 | 1. Description must be a single line, max 3 semicolon-separated points 677 | 2. Tags must be grouped by theme with parentheses, max 5 groups 678 | 3. Remove all URLs, social media links, and promotional text 679 | 4. Focus only on plot/content-relevant information 680 | 5. Use semicolons to separate multiple plot points 681 | 6. Group related tags inside parentheses 682 | 7. Exclude generic/redundant tags""" 683 | 684 | return f"""Process this video metadata into a concise format: 685 | 1. Extract main plot points (max 3, separated by semicolons) 686 | 2. Group related tags (max 5 groups, use parentheses) 687 | 688 | Metadata: 689 | {content} 690 | 691 | Format output as: 692 | Description: [plot point 1]; [plot point 2]; [plot point 3] 693 | Tags: [group1], [group2 (item1, item2)], [group3], [group4 (items...)]""" 694 | 695 | try: 696 | result = summarize_metadata(metadata) 697 | return result 698 | except Exception as e: 699 | print_error(f"Error processing metadata: {e}") 700 | return metadata 701 | 702 | def convert_to_podcast_script(summary, language="english", duration=None): 703 | """Convert summary to podcast script using Claude""" 704 | if duration is None: 705 | # Estimate duration from summary length 706 | duration = len(summary.split()) * 0.5 # rough estimate: 0.5 seconds per word 707 | 708 | targets = calculate_target_length(duration) 709 | 710 | @ell.simple(model="claude-3-5-sonnet-20241022", temperature=0.3, max_tokens=4096) 711 | def get_podcast(content: str, voice1: str, voice2: str, target_lang: str) -> str: 712 | return f"""Convert this summary into an engaging podcast script with two hosts. 713 | Target length: {targets['podcast']} words total. 714 | Output language: {target_lang} 715 | Use these voice names for the hosts: {voice1.upper()} and {voice2.upper()}. 716 | 717 | Rules: 718 | 1. 
Format each line as: "VOICE_NAME: " 719 | Example: "{voice1.upper()}: That's an interesting point!" 720 | 2. Use only {voice1.upper()} and {voice2.upper()} consistently 721 | 3. Make it conversational but informative 722 | 4. Keep all dialogue in {target_lang} language 723 | 5. Include brief reactions and interactions between hosts 724 | 6. Start with one host introducing the topic 725 | 7. End with the other host wrapping up 726 | 8. Keep the original insights and information 727 | 9. Avoid meta-commentary or introductions 728 | 10. Do NOT use typical AI buzzwords: dive in, delve into, fascinating,etc. 729 | 10. Come up with original beginning (use ending for that). Do NOT start with "Today we are..." 730 | 731 | Available voices: 732 | {json.dumps(AVAILABLE_VOICES, indent=2)} 733 | 734 | Summary to convert: 735 | {content}""" 736 | 737 | try: 738 | # Randomly select two different voices 739 | import random 740 | available_voices = list(AVAILABLE_VOICES.keys()) 741 | host1_voice = random.choice(available_voices) 742 | available_voices.remove(host1_voice) 743 | host2_voice = random.choice(available_voices) 744 | 745 | return get_podcast(summary, host1_voice, host2_voice, language) 746 | except Exception as e: 747 | print_error(f"Error converting to podcast script: {e}") 748 | return None 749 | 750 | def generate_host_audio(text, host_config, output_path): 751 | """Generate audio for a specific host""" 752 | try: 753 | if not os.getenv("OPENAI_API_KEY"): 754 | print_error("OPENAI_API_KEY environment variable not set") 755 | return False 756 | 757 | client = OpenAI() 758 | print_step(EMOJI_AUDIO, f"Generating audio for {host_config['name']}...") 759 | 760 | with client.audio.speech.with_streaming_response.create( 761 | model="tts-1", 762 | voice=host_config['voice'], 763 | input=text 764 | ) as response: 765 | response.stream_to_file(output_path) 766 | return True 767 | except Exception as e: 768 | print_error(f"Error generating audio: {e}") 769 | return False 770 | 771 | def combine_audio_files(audio_files, output_file): 772 | """Combine multiple audio files with crossfade""" 773 | try: 774 | print_step(EMOJI_AUDIO, "Combining audio files...") 775 | 776 | if len(audio_files) < 2: 777 | print_error("Need at least two audio files to combine.") 778 | return False 779 | 780 | # Ensure output directory exists 781 | output_file = Path(output_file) 782 | output_file.parent.mkdir(parents=True, exist_ok=True) 783 | 784 | # Build filter complex for crossfade 785 | filter_parts = [] 786 | n = len(audio_files) 787 | 788 | # Label all inputs 789 | labels = [f'[{i}:a]' for i in range(n)] 790 | 791 | # Build the filter chain 792 | current_label = 0 793 | next_tmp = n # Start temporary labels after input labels 794 | 795 | for i in range(n-1): 796 | if i == 0: 797 | # First merge 798 | filter_parts.append(f'{labels[i]}{labels[i+1]}acrossfade=d=0.5:c1=tri:c2=tri[tmp{next_tmp}]') 799 | current_label = next_tmp 800 | next_tmp += 1 801 | else: 802 | # Merge result with next input 803 | filter_parts.append(f'[tmp{current_label}]{labels[i+1]}acrossfade=d=0.5:c1=tri:c2=tri[tmp{next_tmp}]') 804 | current_label = next_tmp 805 | next_tmp += 1 806 | 807 | # Create input arguments 808 | inputs = [] 809 | for audio_file in audio_files: 810 | inputs.extend(['-i', str(audio_file)]) 811 | 812 | # Build final command 813 | cmd = [ 814 | 'ffmpeg', '-y', 815 | *inputs, 816 | '-filter_complex', 817 | ';'.join(filter_parts), 818 | '-map', f'[tmp{current_label}]', 819 | '-ac', '2', # Convert to stereo 820 | '-ar', '44100', # 
Standard sample rate 821 | str(output_file) 822 | ] 823 | 824 | # Run FFmpeg 825 | result = subprocess.run(cmd, capture_output=True, text=True) 826 | if result.returncode != 0: 827 | print_error(f"FFmpeg error: {result.stderr}") 828 | return False 829 | 830 | return True 831 | 832 | except Exception as e: 833 | print_error(f"Error combining audio files: {e}") 834 | return False 835 | 836 | def generate_podcast_audio(script, output_file): 837 | """Generate podcast audio with detected voices""" 838 | temp_files = [] 839 | voice_configs = {} # Will store voice configs as we discover them 840 | 841 | try: 842 | with tempfile.TemporaryDirectory() as temp_dir: 843 | # Process each line of the script 844 | for i, line in enumerate(script.split('\n')): 845 | if not line.strip(): 846 | continue 847 | 848 | # Parse voice and text 849 | try: 850 | voice_name, text = line.split(':', 1) 851 | voice_name = voice_name.strip().lower() 852 | text = text.strip() 853 | except ValueError: 854 | continue 855 | 856 | # Skip if not a valid voice 857 | if voice_name not in AVAILABLE_VOICES: 858 | continue 859 | 860 | # Create voice config if not seen before 861 | if voice_name not in voice_configs: 862 | voice_configs[voice_name] = { 863 | "voice": voice_name, 864 | "name": voice_name.capitalize() 865 | } 866 | 867 | # Generate audio for this line 868 | temp_file = os.path.join(temp_dir, f"part_{i:03d}.mp3") 869 | if generate_host_audio(text, voice_configs[voice_name], temp_file): 870 | temp_files.append(temp_file) 871 | 872 | # Combine all audio files 873 | if temp_files: 874 | return combine_audio_files(temp_files, output_file) 875 | 876 | return False 877 | except Exception as e: 878 | print_error(f"Error generating podcast: {e}") 879 | return False 880 | 881 | def sanitize_filename(filename): 882 | """Convert URL or video ID to safe filename""" 883 | # Extract video ID from URL if present 884 | if 'youtube.com' in filename or 'youtu.be' in filename: 885 | try: 886 | if 'youtu.be' in filename: 887 | video_id = filename.split('/')[-1].split('?')[0] 888 | else: 889 | query = urllib.parse.urlparse(filename).query 890 | params = urllib.parse.parse_qs(query) 891 | video_id = params['v'][0] 892 | return video_id 893 | except: 894 | pass 895 | 896 | # Handle query parameters 897 | if '?' 
in filename: 898 | parts = filename.split('?') 899 | filename = parts[0] 900 | params = parts[1].replace('=', '_').replace('&', '_') 901 | filename = f"{filename}_{params}" 902 | 903 | # Count trailing special characters 904 | trailing_specials = len(filename) - len(filename.rstrip(r'\\/:*"<>|!')) 905 | 906 | # First replace special characters with underscores 907 | clean = re.sub(r'[\\/:*"<>|]', '_', filename) # Replace invalid chars with underscore 908 | 909 | # Replace spaces and other non-alphanumeric chars (except dashes) with underscore 910 | clean = re.sub(r'[^\w\-]', '_', clean) 911 | 912 | # Replace multiple consecutive underscores with a single one 913 | clean = re.sub(r'_+', '_', clean) 914 | 915 | # Remove leading underscores 916 | clean = clean.lstrip('_') 917 | 918 | # Add single trailing underscore if original had special chars at the end 919 | if trailing_specials > 0: 920 | clean = clean.rstrip('_') + '_' 921 | 922 | # Preserve casing from original filename 923 | if filename.isupper(): 924 | clean = clean.upper() 925 | elif not filename.islower(): # If mixed case or title case 926 | parts = clean.split('_') 927 | clean = '_'.join(p.capitalize() for p in parts) 928 | 929 | return clean 930 | 931 | def generate_video_segments(podcast_script, num_segments=5, seed=42): 932 | """Generate video prompts that match podcast content and flow""" 933 | 934 | @ell.simple(model="claude-3-5-sonnet-20241022", temperature=0.3, max_tokens=2048) 935 | def get_video_prompts(script: str, num: int) -> str: 936 | return f"""Create {num} detailed video prompts that directly visualize the key moments from this podcast conversation. 937 | Each prompt must be under 500 characters long and create a clear, engaging scene. 938 | 939 | Podcast Script: 940 | {script} 941 | 942 | Guidelines for Each Prompt: 943 | 1. Scene Content: 944 | - Focus on the specific topic being discussed 945 | - Show real environments and objects 946 | - Include relevant details mentioned by hosts 947 | - Keep descriptions concise but clear 948 | 949 | 2. Visual Style: 950 | - Professional documentary style 951 | - Clean, high-quality visuals 952 | - Natural lighting 953 | - Clear focal points 954 | 955 | 3. Required Structure (keep under 500 chars): 956 | "A [brief location] shows [main subject/action]. [Supporting details]. [Human elements] [interact with] [key concept]. [Lighting] highlights [focus]. [Camera angle]." 957 | 958 | 4. Key Points: 959 | - Be specific but concise 960 | - Use concrete imagery 961 | - Match the conversation 962 | - Stay under length limit 963 | 964 | Instructions: 965 | 1. Read the section 966 | 2. Identify key concept 967 | 3. Create concise scene 968 | 4. Check character count 969 | 5. Trim if needed 970 | 971 | Return a properly formatted JSON array of strings like this: 972 | [ 973 | "First scene (under 500 chars)...", 974 | "Second scene (under 500 chars)..." 
975 | ] 976 | 977 | Important: Use double quotes and ensure valid JSON format.""" 978 | 979 | try: 980 | # Generate prompts and ensure valid JSON 981 | response = get_video_prompts(podcast_script, num_segments) 982 | 983 | # Parse JSON 984 | prompts = json.loads(response) 985 | 986 | if not isinstance(prompts, list) or len(prompts) != num_segments: 987 | raise ValueError(f"Invalid prompt format - must be array of exactly {num_segments} strings") 988 | 989 | # Validate and truncate prompts 990 | MAX_LENGTH = 500 # Keep some buffer below 512 991 | processed_prompts = [] 992 | 993 | for i, prompt in enumerate(prompts, 1): 994 | if not isinstance(prompt, str): 995 | raise ValueError(f"Prompt {i} must be a string") 996 | 997 | # Ensure minimum detail 998 | if len(prompt.split()) < 20: 999 | raise ValueError(f"Prompt {i} is too short - needs more detail") 1000 | 1001 | # Truncate if too long 1002 | if len(prompt) > MAX_LENGTH: 1003 | # Find last complete sentence that fits 1004 | sentences = prompt.split('.') 1005 | truncated = '' 1006 | for sentence in sentences: 1007 | if len(truncated + sentence + '.') <= MAX_LENGTH: 1008 | truncated += sentence + '.' 1009 | else: 1010 | break 1011 | prompt = truncated.strip() 1012 | 1013 | processed_prompts.append(prompt) 1014 | 1015 | return processed_prompts 1016 | 1017 | except json.JSONDecodeError as e: 1018 | print_error(f"Error parsing JSON response: {e}") 1019 | print_error(f"Raw response: {response[:200]}...") 1020 | return None 1021 | except Exception as e: 1022 | print_error(f"Error generating video prompts: {e}") 1023 | return None 1024 | 1025 | def upload_image_to_uguu(image_path, max_retries=3): 1026 | """Upload image to uguu.se and get URL""" 1027 | try: 1028 | url = "https://uguu.se/upload" 1029 | 1030 | # Prepare the file with proper format 1031 | with open(image_path, 'rb') as f: 1032 | files = { 1033 | 'files[]': ( 1034 | Path(image_path).name, 1035 | f, 1036 | 'image/jpeg' 1037 | ) 1038 | } 1039 | 1040 | # Try upload with retries 1041 | for attempt in range(max_retries): 1042 | try: 1043 | response = requests.post( 1044 | url, 1045 | files=files, 1046 | timeout=30 1047 | ) 1048 | 1049 | if response.status_code != 200: 1050 | print_error(f"Upload failed with status {response.status_code}") 1051 | if attempt < max_retries - 1: 1052 | time.sleep(1) 1053 | continue 1054 | return None 1055 | 1056 | # Parse JSON response 1057 | try: 1058 | result = response.json() 1059 | if (result.get('success') and 1060 | isinstance(result.get('files'), list) and 1061 | result['files'] and 1062 | 'url' in result['files'][0]): 1063 | return result['files'][0]['url'] 1064 | except (ValueError, KeyError, AttributeError): 1065 | # If JSON parsing fails or format is unexpected, try text response 1066 | text = response.text.strip() 1067 | if text.startswith('http'): 1068 | return text 1069 | 1070 | print_error(f"Invalid response format: {response.text[:100]}") 1071 | if attempt < max_retries - 1: 1072 | time.sleep(1) 1073 | continue 1074 | return None 1075 | 1076 | except requests.exceptions.RequestException as e: 1077 | print_error(f"Upload attempt {attempt + 1} failed: {e}") 1078 | if attempt < max_retries - 1: 1079 | time.sleep(1) 1080 | continue 1081 | return None 1082 | 1083 | return None 1084 | 1085 | except Exception as e: 1086 | print_error(f"Error uploading image: {e}") 1087 | return None 1088 | 1089 | def generate_video_segments_with_luma(prompts, output_dir, base_images=None, podcast_script=None): 1090 | """Generate video segments using LumaAI with 
optional base images""" 1091 | if not luma_client: 1092 | print_error("LUMA_API_KEY environment variable not set") 1093 | return None 1094 | 1095 | video_paths = [] 1096 | for i, prompt in enumerate(prompts): 1097 | try: 1098 | print_step(EMOJI_VIDEO, f"Generating video segment {i+1}/{len(prompts)}...") 1099 | 1100 | # Set up generation parameters 1101 | generation_params = { 1102 | "prompt": prompt, 1103 | "aspect_ratio": "16:9", 1104 | "loop": False 1105 | } 1106 | 1107 | # Add base image if available 1108 | if base_images and i < len(base_images): 1109 | image_url = upload_image_to_uguu(base_images[i]) 1110 | if not image_url: 1111 | print_error(f"Failed to upload image {i+1}, continuing without image") 1112 | else: 1113 | generation_params["keyframes"] = { 1114 | "frame0": { 1115 | "type": "image", 1116 | "url": image_url 1117 | } 1118 | } 1119 | 1120 | # Try generation with retries and prompt regeneration 1121 | max_retries = 3 1122 | max_prompt_retries = 3 1123 | generation = None 1124 | 1125 | for prompt_attempt in range(max_prompt_retries): 1126 | try: 1127 | # Create generation with retries 1128 | for attempt in range(max_retries): 1129 | try: 1130 | generation = luma_client.generations.create(**generation_params) 1131 | break 1132 | except Exception as e: 1133 | if attempt < max_retries - 1: 1134 | print_error(f"Generation attempt {attempt + 1} failed: {e}, retrying...") 1135 | time.sleep(2) 1136 | else: 1137 | raise 1138 | 1139 | if not generation: 1140 | raise Exception("Failed to create generation after retries") 1141 | 1142 | # Poll for completion with timeout 1143 | start_time = time.time() 1144 | timeout = 300 1145 | completed = False 1146 | moderation_failed = False 1147 | 1148 | while not completed and time.time() - start_time < timeout: 1149 | try: 1150 | generation = luma_client.generations.get(id=generation.id) 1151 | 1152 | if generation.state == "completed": 1153 | completed = True 1154 | break 1155 | elif generation.state == "failed": 1156 | error_msg = getattr(generation, 'failure_reason', 'Unknown error') 1157 | if "moderation failed" in error_msg.lower(): 1158 | moderation_failed = True 1159 | break 1160 | # Add regeneration for any failure 1161 | if prompt_attempt < max_prompt_retries - 1: 1162 | print_error(f"Generation failed: {error_msg}, regenerating prompt...") 1163 | new_prompts = generate_video_segments(podcast_script, num_segments=1) 1164 | if new_prompts and len(new_prompts) > 0: 1165 | generation_params["prompt"] = new_prompts[0] 1166 | break 1167 | raise Exception(f"Video generation failed: {error_msg}") 1168 | elif generation.state == "canceled": 1169 | raise Exception("Video generation was cancelled") 1170 | else: 1171 | print_step(EMOJI_VIDEO, f"Generating segment {i+1}...", color=Fore.YELLOW) 1172 | time.sleep(3) 1173 | 1174 | except Exception as e: 1175 | print_error(f"Error checking generation status: {e}") 1176 | time.sleep(3) 1177 | 1178 | if moderation_failed: 1179 | if prompt_attempt < max_prompt_retries - 1: 1180 | print_error("Moderation failed, regenerating prompt...") 1181 | # Regenerate prompt for this segment 1182 | new_prompts = generate_video_segments(podcast_script, num_segments=1) 1183 | if new_prompts and len(new_prompts) > 0: 1184 | generation_params["prompt"] = new_prompts[0] 1185 | continue 1186 | raise Exception("Failed to generate acceptable prompt after retries") 1187 | 1188 | if not completed: 1189 | raise Exception(f"Generation timed out after {timeout} seconds") 1190 | 1191 | # If we get here, generation was successful 
1192 | break 1193 | 1194 | except Exception as e: 1195 | if prompt_attempt < max_prompt_retries - 1: 1196 | print_error(f"Prompt attempt {prompt_attempt + 1} failed: {e}, trying new prompt...") 1197 | continue 1198 | raise 1199 | 1200 | # Download video with retries 1201 | max_download_retries = 3 1202 | for attempt in range(max_download_retries): 1203 | try: 1204 | output_path = output_dir / f"segment_{i:02d}.mp4" 1205 | response = requests.get(generation.assets.video, stream=True, timeout=30) 1206 | response.raise_for_status() 1207 | 1208 | with open(output_path, 'wb') as file: 1209 | for chunk in response.iter_content(chunk_size=8192): 1210 | file.write(chunk) 1211 | 1212 | if output_path.stat().st_size == 0: 1213 | raise Exception("Downloaded file is empty") 1214 | 1215 | video_paths.append(output_path) 1216 | break 1217 | 1218 | except Exception as e: 1219 | if attempt < max_download_retries - 1: 1220 | print_error(f"Download attempt {attempt + 1} failed: {e}, retrying...") 1221 | time.sleep(2) 1222 | else: 1223 | raise 1224 | 1225 | except Exception as e: 1226 | print_error(f"Error generating video segment {i+1}: {e}") 1227 | return None 1228 | 1229 | return video_paths 1230 | 1231 | def combine_video_segments(video_paths, target_duration, output_path): 1232 | """Combine video segments and adjust to match target duration""" 1233 | try: 1234 | print_step(EMOJI_VIDEO, "Combining video segments...") 1235 | 1236 | # Create temporary file for concatenation 1237 | with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: 1238 | # Write input files list with absolute paths 1239 | for video_path in video_paths: 1240 | # Convert to absolute path 1241 | abs_path = Path(video_path).resolve() 1242 | if not abs_path.exists(): 1243 | raise FileNotFoundError(f"Video file not found: {abs_path}") 1244 | f.write(f"file '{abs_path}'\n") 1245 | temp_list = f.name 1246 | 1247 | try: 1248 | # Create output directory if it doesn't exist 1249 | output_path = Path(output_path) 1250 | output_path.parent.mkdir(parents=True, exist_ok=True) 1251 | 1252 | # Concatenate videos 1253 | temp_concat = output_path.parent / 'temp_concat.mp4' 1254 | subprocess.run([ 1255 | 'ffmpeg', '-y', '-f', 'concat', '-safe', '0', 1256 | '-i', temp_list, '-c', 'copy', str(temp_concat) 1257 | ], check=True, capture_output=True) 1258 | 1259 | # Get concatenated video duration 1260 | probe = subprocess.run([ 1261 | 'ffprobe', '-v', 'error', '-show_entries', 'format=duration', 1262 | '-of', 'default=noprint_wrappers=1:nokey=1', str(temp_concat) 1263 | ], capture_output=True, text=True) 1264 | current_duration = float(probe.stdout.strip()) 1265 | 1266 | # Calculate speed factor to stretch video to match target duration 1267 | # If current_duration is 30s and target is 60s, we want speed_factor = 2 1268 | # to make the video twice as slow 1269 | speed_factor = target_duration / current_duration 1270 | 1271 | subprocess.run([ 1272 | 'ffmpeg', '-y', '-i', str(temp_concat), 1273 | '-filter:v', f'setpts={speed_factor}*PTS', 1274 | '-an', str(output_path) 1275 | ], check=True, capture_output=True) 1276 | 1277 | return True 1278 | 1279 | finally: 1280 | # Clean up temporary files 1281 | os.unlink(temp_list) 1282 | if temp_concat.exists(): 1283 | os.unlink(temp_concat) 1284 | 1285 | except subprocess.CalledProcessError as e: 1286 | print_error(f"FFmpeg error: {e.stderr.decode()}") 1287 | return False 1288 | except Exception as e: 1289 | print_error(f"Error combining videos: {e}") 1290 | return False 1291 | 1292 | def 
get_audio_duration(audio_path): 1293 | """Get duration of audio file in seconds""" 1294 | try: 1295 | probe = subprocess.run([ 1296 | 'ffprobe', '-v', 'error', '-show_entries', 'format=duration', 1297 | '-of', 'default=noprint_wrappers=1:nokey=1', audio_path 1298 | ], capture_output=True, text=True) 1299 | return float(probe.stdout.strip()) 1300 | except: 1301 | return None 1302 | 1303 | def combine_audio_video(video_path, audio_path, output_path): 1304 | """Combine video with audio track using ffmpeg and add fade out""" 1305 | try: 1306 | print_step(EMOJI_VIDEO, "Combining video and audio...") 1307 | 1308 | # Get video duration 1309 | probe = ffmpeg.probe(video_path) 1310 | duration = float(probe['streams'][0]['duration']) 1311 | fade_start = duration - 1 # Start fade 1 second before end 1312 | 1313 | # Create filter complex for fade out 1314 | # Apply fade filter directly to video stream 1315 | stream = ( 1316 | ffmpeg 1317 | .input(video_path) 1318 | .filter('fade', type='out', start_time=fade_start, duration=1) 1319 | .output( 1320 | ffmpeg.input(audio_path), 1321 | str(output_path), 1322 | acodec='aac', 1323 | strict='experimental', 1324 | **{ 1325 | 'filter_complex_threads': 1, 1326 | 'max_muxing_queue_size': 1024 1327 | } 1328 | ) 1329 | ) 1330 | 1331 | # Run ffmpeg with overwrite and error handling 1332 | try: 1333 | ffmpeg.run( 1334 | stream, 1335 | overwrite_output=True, 1336 | capture_stdout=True, 1337 | capture_stderr=True 1338 | ) 1339 | return True 1340 | 1341 | except ffmpeg.Error as e: 1342 | if e.stderr: 1343 | print_error(f"FFmpeg error: {e.stderr.decode()}") 1344 | if e.stdout: 1345 | print_error(f"FFmpeg output: {e.stdout.decode()}") 1346 | return False 1347 | 1348 | except Exception as e: 1349 | print_error(f"Error combining audio and video: {e}") 1350 | return False 1351 | 1352 | def generate_video_segments_with_runway(prompts, output_dir, base_images=None, timeout=900, podcast_script=None): 1353 | """Generate video segments using RunwayML with optional base images""" 1354 | if not runway_client: 1355 | print_error("RUNWAYML_API_SECRET environment variable not set") 1356 | return None 1357 | 1358 | video_paths = [] 1359 | for i, prompt in enumerate(prompts): 1360 | try: 1361 | print_step(EMOJI_VIDEO, f"Generating video segment {i+1}/{len(prompts)}...") 1362 | 1363 | # Try generation with retries and prompt regeneration 1364 | max_retries = 3 1365 | max_prompt_retries = 3 1366 | 1367 | for prompt_attempt in range(max_prompt_retries): 1368 | try: 1369 | # Use base image if available, otherwise create gradient 1370 | if base_images and i < len(base_images): 1371 | with open(base_images[i], 'rb') as f: 1372 | image_bytes = f.read() 1373 | image_b64 = base64.b64encode(image_bytes).decode('utf-8') 1374 | image_uri = f"data:image/jpeg;base64,{image_b64}" 1375 | else: 1376 | temp_image = output_dir / f"input_{i:02d}.png" 1377 | gradient = create_gradient_image() 1378 | gradient.save(temp_image) 1379 | with open(temp_image, 'rb') as f: 1380 | image_bytes = f.read() 1381 | image_b64 = base64.b64encode(image_bytes).decode('utf-8') 1382 | image_uri = f"data:image/png;base64,{image_b64}" 1383 | temp_image.unlink() 1384 | 1385 | # Create task with current prompt 1386 | task = runway_client.image_to_video.create( 1387 | model='gen3a_turbo', 1388 | prompt_text=prompt, 1389 | prompt_image=image_uri, 1390 | duration=10, 1391 | ratio="1280:768" 1392 | ) 1393 | 1394 | # Poll for completion with timeout 1395 | start_time = time.time() 1396 | max_retries = 180 1397 | retries = 0 1398 | 
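# Tracks whether RunwayML rejected this prompt on moderation grounds, so the
# outer prompt-retry loop can regenerate the prompt instead of failing outright.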
moderation_failed = False 1399 | 1400 | while retries < max_retries: 1401 | if time.time() - start_time > timeout: 1402 | print_error(f"Timeout after {timeout} seconds") 1403 | try: 1404 | runway_client.tasks.cancel(id=task.id) 1405 | except: 1406 | pass 1407 | return None 1408 | 1409 | try: 1410 | task_status = runway_client.tasks.retrieve(id=task.id) 1411 | except Exception as e: 1412 | print_error(f"Error retrieving task status: {e}") 1413 | time.sleep(5) 1414 | retries += 1 1415 | continue 1416 | 1417 | if task_status.status == "SUCCEEDED": 1418 | if not hasattr(task_status, 'output') or not task_status.output: 1419 | print_error("No output in completed task") 1420 | return None 1421 | 1422 | video_urls = task_status.output 1423 | if not video_urls or not isinstance(video_urls, list): 1424 | print_error("Invalid output format in task") 1425 | return None 1426 | 1427 | video_url = video_urls[0] 1428 | break 1429 | 1430 | elif task_status.status == "FAILED": 1431 | error_msg = getattr(task_status, 'failure', '') or getattr(task_status, 'failureCode', 'Unknown error') 1432 | if "moderation" in error_msg.lower(): 1433 | moderation_failed = True 1434 | break 1435 | # Add regeneration for any failure 1436 | if prompt_attempt < max_prompt_retries - 1: 1437 | print_error(f"Generation failed: {error_msg}, regenerating prompt...") 1438 | new_prompts = generate_video_segments(podcast_script, num_segments=1) 1439 | if new_prompts and len(new_prompts) > 0: 1440 | prompt = new_prompts[0] # Update prompt for next attempt 1441 | break 1442 | print_error(f"Video generation failed: {error_msg}") 1443 | return None 1444 | 1445 | elif task_status.status == "CANCELLED": 1446 | print_error("Video generation was cancelled") 1447 | return None 1448 | 1449 | elif task_status.status == "THROTTLED": 1450 | print_step(EMOJI_VIDEO, f"Generation queued (throttled)... Attempt {retries+1}/{max_retries}", color=Fore.YELLOW) 1451 | 1452 | elif task_status.status == "PENDING": 1453 | print_step(EMOJI_VIDEO, f"Generation pending... Attempt {retries+1}/{max_retries}", color=Fore.YELLOW) 1454 | 1455 | elif task_status.status == "RUNNING": 1456 | progress = float(getattr(task_status, 'progress', 0) or 0) * 100 1457 | elapsed = int(time.time() - start_time) 1458 | print_step(EMOJI_VIDEO, 1459 | f"Generating segment {i+1}... 
({progress:.0f}%) - {elapsed}s elapsed", 1460 | color=Fore.YELLOW) 1461 | 1462 | time.sleep(5) 1463 | retries += 1 1464 | continue 1465 | 1466 | if moderation_failed: 1467 | if prompt_attempt < max_prompt_retries - 1: 1468 | print_error("Moderation failed, regenerating prompt...") 1469 | # Regenerate prompt for this segment 1470 | new_prompts = generate_video_segments(podcast_script, num_segments=1) 1471 | if new_prompts and len(new_prompts) > 0: 1472 | prompt = new_prompts[0] # Update prompt for next attempt 1473 | break 1474 | raise Exception("Failed to generate acceptable prompt after retries") 1475 | 1476 | # Download video with retries 1477 | max_download_retries = 3 1478 | for download_attempt in range(max_download_retries): 1479 | try: 1480 | output_path = output_dir / f"segment_{i:02d}.mp4" 1481 | response = requests.get(video_url, stream=True, timeout=30) 1482 | response.raise_for_status() 1483 | 1484 | with open(output_path, 'wb') as file: 1485 | for chunk in response.iter_content(chunk_size=8192): 1486 | file.write(chunk) 1487 | 1488 | if output_path.stat().st_size == 0: 1489 | raise Exception("Downloaded file is empty") 1490 | 1491 | video_paths.append(output_path) 1492 | break 1493 | 1494 | except Exception as e: 1495 | if download_attempt < max_download_retries - 1: 1496 | print_error(f"Download attempt {download_attempt + 1} failed: {e}, retrying...") 1497 | time.sleep(2) 1498 | else: 1499 | raise 1500 | 1501 | # If we get here, generation and download were successful 1502 | break 1503 | 1504 | except Exception as e: 1505 | if prompt_attempt < max_prompt_retries - 1: 1506 | print_error(f"Prompt attempt {prompt_attempt + 1} failed: {e}, trying new prompt...") 1507 | continue 1508 | raise 1509 | 1510 | except Exception as e: 1511 | print_error(f"Error generating video segment {i+1}: {e}") 1512 | return None 1513 | 1514 | return video_paths 1515 | 1516 | def calculate_num_segments(audio_duration, provider="luma"): 1517 | """Calculate optimal number of video segments based on audio duration and provider""" 1518 | # Provider-specific segment durations 1519 | SEGMENT_DURATIONS = { 1520 | "luma": 5, # LumaAI generates 5s videos 1521 | "runway": 10 # RunwayML generates 10s videos 1522 | } 1523 | 1524 | # Provider-specific maximum segments 1525 | MAX_SEGMENTS = { 1526 | "luma": 10, # Allow more segments for LumaAI due to shorter duration 1527 | "runway": 5 # Keep RunwayML at 5 segments max 1528 | } 1529 | 1530 | segment_duration = SEGMENT_DURATIONS.get(provider, 5) # Default to 5s if provider unknown 1531 | max_segments = MAX_SEGMENTS.get(provider, 5) # Default to 5 if provider unknown 1532 | 1533 | # Calculate ideal number of segments to cover the audio 1534 | ideal_segments = math.ceil(audio_duration / segment_duration) 1535 | 1536 | # Keep segments between 2 and max_segments 1537 | if audio_duration <= segment_duration: 1538 | # Very short audio - single segment 1539 | return 1 1540 | elif audio_duration <= 2 * segment_duration: 1541 | # Short audio - two segments 1542 | return 2 1543 | elif audio_duration <= max_segments * segment_duration: 1544 | # Medium audio - scale segments based on duration 1545 | return min(max_segments, max(2, ideal_segments)) 1546 | else: 1547 | # Long audio - cap at max_segments 1548 | return max_segments 1549 | 1550 | def calculate_target_length(duration_seconds): 1551 | """Calculate target word counts based on content duration""" 1552 | # Base lengths for a 10-minute video 1553 | BASE_DURATION = 600 # 10 minutes in seconds 1554 | BASE_SUMMARY_WORDS = 
300 1555 | BASE_PODCAST_WORDS = 600 1556 | 1557 | # Calculate scaling factor (with min/max limits) 1558 | scale = min(max(duration_seconds / BASE_DURATION, 0.3), 2.0) 1559 | 1560 | return { 1561 | 'summary': int(BASE_SUMMARY_WORDS * scale), 1562 | 'podcast': int(BASE_PODCAST_WORDS * scale) 1563 | } 1564 | 1565 | def generate_image_prompts(video_prompts): 1566 | """Generate relevant, concrete image prompts that match podcast content""" 1567 | 1568 | @ell.simple(model="claude-3-5-sonnet-20241022", temperature=0.3, max_tokens=2048) 1569 | def get_image_prompts(prompts: list, summary: str, podcast: str) -> str: 1570 | return f"""Create {len(prompts)} detailed image prompts for Stable Diffusion that illustrate the key topics being discussed. 1571 | Each prompt should create a clear, realistic visualization of the concepts, using concrete imagery. 1572 | 1573 | Content Summary: 1574 | {summary} 1575 | 1576 | Podcast Script: 1577 | {podcast} 1578 | 1579 | Required Elements for Each Prompt: 1580 | 1. Base Quality: 1581 | - Start with: "masterpiece, highly detailed, 8k uhd, photorealistic" 1582 | - End with: "professional lighting, cinematic composition" 1583 | 1584 | 2. Scene Components: 1585 | - Main Subject: Primary topic or concept being discussed 1586 | - Environment: Relevant setting or location 1587 | - Supporting Elements: Objects, tools, or items that relate to the topic 1588 | - Human Element: People, hands, or human presence when relevant 1589 | - Scale: Show size and scope of the subject matter 1590 | 1591 | 3. Visual Guidelines: 1592 | - Create documentary-style scenes 1593 | - Show real objects and environments 1594 | - Include relevant details from the discussion 1595 | - Use appropriate lighting for the setting 1596 | - Choose engaging camera angles 1597 | - Keep scenes grounded and realistic 1598 | 1599 | 4. Scene Types: 1600 | - Process/Action: Show something being done or created 1601 | - Location/Setting: Establish where something happens 1602 | - Object/Detail: Focus on specific items being discussed 1603 | - Interaction: Show how things or people work together 1604 | - Result/Impact: Visualize outcomes or effects 1605 | 1606 | Instructions: 1607 | 1. Read the current section of discussion 1608 | 2. Identify the main concept or point 1609 | 3. Choose the most appropriate scene type 1610 | 4. Include specific details mentioned in the content 1611 | 5. Make it concrete and photorealistic 1612 | 6. Ensure it matches the topic being discussed 1613 | 1614 | Example Structure: 1615 | "masterpiece, highly detailed, 8k uhd, photorealistic, [main subject in action/setting], [environment details], [supporting elements], [human presence if relevant], [lighting and atmosphere], professional lighting, cinematic composition" 1616 | 1617 | Return a JSON array of {len(prompts)} strings. 
1618 | No code blocks, only the JSON array.""" 1619 | 1620 | try: 1621 | # Read the summary and podcast files for context 1622 | summary_file = next(Path("out").glob("summary-*.txt")) 1623 | podcast_file = next(Path("out").glob("podcast-*.txt")) 1624 | summary = summary_file.read_text() 1625 | podcast = podcast_file.read_text() 1626 | 1627 | # Generate prompts with content context 1628 | prompts = json.loads(get_image_prompts(video_prompts, summary, podcast)) 1629 | 1630 | # Validate prompts 1631 | if not isinstance(prompts, list) or len(prompts) != len(video_prompts): 1632 | raise ValueError(f"Invalid prompt format - must be array of exactly {len(video_prompts)} strings") 1633 | 1634 | # Ensure all prompts are strings and have required elements 1635 | prompts = [str(p) for p in prompts] 1636 | 1637 | # Validate prompt structure 1638 | for i, prompt in enumerate(prompts): 1639 | if not isinstance(prompt, str): 1640 | raise ValueError(f"Prompt {i} must be a string") 1641 | if not prompt.startswith("masterpiece, highly detailed, 8k uhd, photorealistic"): 1642 | raise ValueError(f"Prompt {i} must start with the required quality elements") 1643 | if not prompt.endswith("professional lighting, cinematic composition"): 1644 | raise ValueError(f"Prompt {i} must end with the required composition elements") 1645 | 1646 | return prompts 1647 | 1648 | except Exception as e: 1649 | print_error(f"Error generating image prompts: {e}") 1650 | return None 1651 | 1652 | def generate_flux_images(prompts, output_dir): 1653 | """Generate images using Flux Pro Ultra for each prompt""" 1654 | if not os.getenv("REPLICATE_API_TOKEN"): 1655 | print_error("REPLICATE_API_TOKEN environment variable not set") 1656 | return None 1657 | 1658 | image_paths = [] # Store local paths 1659 | for i, prompt in enumerate(prompts): 1660 | try: 1661 | print_step(EMOJI_VIDEO, f"Generating base image {i+1}/{len(prompts)}...") 1662 | 1663 | # Get image URL from Replicate 1664 | output_url = replicate.run( 1665 | "black-forest-labs/flux-1.1-pro-ultra", 1666 | input={ 1667 | "raw": False, 1668 | "prompt": prompt, 1669 | "aspect_ratio": "16:9", 1670 | "output_format": "jpg", 1671 | "safety_tolerance": 2, 1672 | "image_prompt_strength": 0.1 1673 | } 1674 | ) 1675 | 1676 | # Download and save image 1677 | output_path = output_dir / f"base_{i:02d}.jpg" 1678 | response = requests.get(output_url, stream=True) 1679 | with open(output_path, 'wb') as file: 1680 | file.write(response.content) 1681 | 1682 | image_paths.append(output_path) 1683 | 1684 | except Exception as e: 1685 | print_error(f"Error generating base image {i+1}: {e}") 1686 | return None 1687 | 1688 | return image_paths 1689 | 1690 | def create_gradient_image(width=1280, height=768): 1691 | """Create a simple gradient image for video generation""" 1692 | image = Image.new('RGB', (width, height)) 1693 | draw = ImageDraw.Draw(image) 1694 | 1695 | # Create a vertical gradient from dark to light blue 1696 | for y in range(height): 1697 | # Calculate color components 1698 | r = int(20 * y / height) # Dark to slightly red 1699 | g = int(50 * y / height) # Dark to medium green 1700 | b = int(255 * y / height) # Dark to bright blue 1701 | 1702 | # Draw horizontal line with current color 1703 | draw.line([(0, y), (width, y)], fill=(r, g, b)) 1704 | 1705 | return image 1706 | 1707 | def main(): 1708 | # Set up argument parser 1709 | parser = argparse.ArgumentParser(description='Summarize YouTube videos') 1710 | parser.add_argument('url', help='YouTube video URL or video ID') 1711 | 
parser.add_argument('--language', default='english', 1712 | help='Output language for the summary (default: english)') 1713 | parser.add_argument('--podcast', action='store_true', 1714 | help='Generate podcast version with audio') 1715 | parser.add_argument('--ignore-subs', action='store_true', 1716 | help='Ignore YouTube subtitles and force transcription') 1717 | 1718 | # Add transcription method group 1719 | trans_group = parser.add_mutually_exclusive_group() 1720 | trans_group.add_argument('--fast-whisper', action='store_true', 1721 | help='Use Fast Whisper for transcription (faster)') 1722 | trans_group.add_argument('--whisper', action='store_true', 1723 | help='Use OpenAI Whisper for transcription (slower but may be more accurate)') 1724 | trans_group.add_argument('--replicate', action='store_true', 1725 | help='Use Replicate Incredibly Fast Whisper (fastest, requires API key)') 1726 | 1727 | # Video generation group 1728 | video_group = parser.add_mutually_exclusive_group() 1729 | video_group.add_argument('--lumaai', action='store_true', 1730 | help='Generate video using Luma AI (requires --podcast)') 1731 | video_group.add_argument('--runwayml', action='store_true', 1732 | help='Generate video using RunwayML (requires --podcast)') 1733 | 1734 | args = parser.parse_args() 1735 | 1736 | try: 1737 | # Clean and validate URL 1738 | clean_url = clean_youtube_url(args.url) 1739 | 1740 | # Get video ID for filenames 1741 | try: 1742 | video_id = clean_url.split('v=')[1].split('&')[0] 1743 | except Exception: 1744 | print_error("Could not extract video ID from URL") 1745 | sys.exit(1) 1746 | 1747 | # Check for existing files 1748 | summary_file = OUTPUT_DIR / f"summary-{video_id}.txt" 1749 | podcast_script_file = OUTPUT_DIR / f"podcast-{video_id}.txt" 1750 | podcast_audio_file = OUTPUT_DIR / f"podcast-{video_id}.mp3" 1751 | final_video_file = OUTPUT_DIR / f"video-{video_id}.mp4" 1752 | 1753 | # Get video metadata first (always do this to verify video exists) 1754 | try: 1755 | metadata = get_video_metadata(clean_url) 1756 | # Get video duration from metadata 1757 | duration = None 1758 | if metadata: 1759 | try: 1760 | duration = float(re.search(r'Duration: (\d+\.\d+)', metadata).group(1)) 1761 | except Exception: 1762 | pass 1763 | except Exception as e: 1764 | print_error(f"Error processing metadata: {e}") 1765 | metadata = "" # Continue without metadata 1766 | duration = None 1767 | 1768 | # Check if we need to generate summary 1769 | if summary_file.exists(): 1770 | print_step(EMOJI_SUCCESS, f"Summary already exists at {summary_file}") 1771 | summary = summary_file.read_text() 1772 | else: 1773 | with tempfile.TemporaryDirectory() as temp_dir: 1774 | temp_dir = Path(temp_dir) 1775 | audio_path = temp_dir / "audio.m4a" 1776 | base_path = temp_dir / "audio" 1777 | 1778 | # Try YouTube subtitles first (unless --ignore-subs is used) 1779 | transcript = None 1780 | if not args.ignore_subs: 1781 | subtitle_language = args.language.lower() if args.language else "en" 1782 | subtitle_file = get_youtube_subtitles(clean_url, str(base_path), subtitle_language) 1783 | 1784 | if subtitle_file: 1785 | # Read the subtitle file 1786 | with open(subtitle_file, 'r', encoding='utf-8') as f: 1787 | transcript = f.read() 1788 | # Clean up the downloaded subtitle file 1789 | os.remove(subtitle_file) 1790 | 1791 | # If no transcript yet (no subs or --ignore-subs), transcribe audio 1792 | if not transcript: 1793 | method = ('Fast Whisper' if args.fast_whisper 1794 | else 'OpenAI Whisper' if args.whisper 1795 | else 'Incredibly
Fast Whisper' if args.replicate 1796 | else 'Fast Whisper') # Default 1797 | print_step(EMOJI_TRANSCRIBE, f"Using {method} for transcription...") 1798 | 1799 | if not download_video(clean_url, str(audio_path)): 1800 | sys.exit(1) 1801 | 1802 | if not transcribe_video(str(audio_path), 1803 | use_fast_whisper=args.fast_whisper or (not args.whisper and not args.replicate), 1804 | use_replicate=args.replicate, 1805 | language=args.language): 1806 | sys.exit(1) 1807 | 1808 | transcript = (temp_dir / "audio.txt").read_text() 1809 | 1810 | # Convert to shorthand 1811 | shorthand = to_shorthand(transcript) 1812 | 1813 | # Generate summary with appropriate length 1814 | summary = summarize_with_claude(shorthand, metadata, args.language) 1815 | if not summary: 1816 | sys.exit(1) 1817 | 1818 | # Save summary 1819 | Path(summary_file).write_text(metadata + summary) 1820 | print_success(f"Summary saved to {summary_file}") 1821 | 1822 | # If podcast option is enabled 1823 | if args.podcast: 1824 | # Check if podcast files exist 1825 | if podcast_script_file.exists() and podcast_audio_file.exists(): 1826 | print_step(EMOJI_SUCCESS, f"Podcast script already exists at {podcast_script_file}") 1827 | print_step(EMOJI_SUCCESS, f"Podcast audio already exists at {podcast_audio_file}") 1828 | podcast_script = podcast_script_file.read_text() 1829 | else: 1830 | # Convert to podcast script and generate audio 1831 | podcast_script = convert_to_podcast_script(summary, args.language, duration) 1832 | if not podcast_script: 1833 | sys.exit(1) 1834 | 1835 | # Save podcast script 1836 | podcast_script_file.write_text(podcast_script) 1837 | 1838 | # Generate audio file 1839 | if not generate_podcast_audio(podcast_script, podcast_audio_file): 1840 | sys.exit(1) 1841 | 1842 | print_success(f"Podcast script saved to {podcast_script_file}") 1843 | print_success(f"Podcast audio saved to {podcast_audio_file}") 1844 | 1845 | # If video generation is enabled 1846 | if args.lumaai or args.runwayml: 1847 | # Check if final video exists 1848 | if final_video_file.exists(): 1849 | print_step(EMOJI_SUCCESS, f"Final video already exists at {final_video_file}") 1850 | return 1851 | 1852 | # Create temporary directory for video segments 1853 | video_temp_dir = OUTPUT_DIR / "temp_videos" 1854 | video_temp_dir.mkdir(exist_ok=True) 1855 | 1856 | temp_video = None 1857 | try: 1858 | # Get podcast audio duration 1859 | audio_duration = get_audio_duration(podcast_audio_file) 1860 | if not audio_duration: 1861 | print_error("Could not determine podcast duration") 1862 | sys.exit(1) 1863 | 1864 | # Calculate number of segments needed 1865 | num_segments = calculate_num_segments( 1866 | audio_duration, 1867 | provider="luma" if args.lumaai else "runway" 1868 | ) 1869 | 1870 | # Generate video prompts 1871 | prompts = generate_video_segments(podcast_script, num_segments=num_segments) 1872 | if not prompts: 1873 | sys.exit(1) 1874 | 1875 | # Generate base images with Flux 1876 | image_prompts = generate_image_prompts(prompts) 1877 | if not image_prompts: 1878 | sys.exit(1) 1879 | 1880 | base_images = generate_flux_images(image_prompts, video_temp_dir) 1881 | if not base_images: 1882 | sys.exit(1) 1883 | 1884 | # Generate video segments with selected provider 1885 | if args.lumaai: 1886 | video_paths = generate_video_segments_with_luma( 1887 | prompts, 1888 | video_temp_dir, 1889 | base_images, 1890 | podcast_script=podcast_script 1891 | ) 1892 | else: # args.runwayml 1893 | video_paths = generate_video_segments_with_runway( 1894 | prompts, 1895 | 
video_temp_dir, 1896 | base_images, 1897 | podcast_script=podcast_script 1898 | ) 1899 | 1900 | if not video_paths: 1901 | sys.exit(1) 1902 | 1903 | # Combine videos and match audio duration 1904 | temp_video = OUTPUT_DIR / f"temp-video-{video_id}.mp4" 1905 | 1906 | if not combine_video_segments(video_paths, audio_duration, temp_video): 1907 | sys.exit(1) 1908 | 1909 | # Combine with podcast audio 1910 | if not combine_audio_video(temp_video, podcast_audio_file, final_video_file): 1911 | sys.exit(1) 1912 | 1913 | print_success(f"Final video saved to {final_video_file}") 1914 | 1915 | finally: 1916 | # Clean up temporary files 1917 | if video_temp_dir.exists(): 1918 | shutil.rmtree(video_temp_dir) 1919 | if temp_video and temp_video.exists(): 1920 | os.remove(temp_video) 1921 | 1922 | except KeyboardInterrupt: 1923 | print_error("\nOperation cancelled by user") 1924 | sys.exit(1) 1925 | 1926 | if __name__ == "__main__": 1927 | main() 1928 | --------------------------------------------------------------------------------
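For a quick sense of how the duration-scaling helpers in ytsum.py behave (calculate_num_segments and calculate_target_length), the following is a minimal sketch, not part of the repository: it assumes ytsum.py is importable from the working directory (e.g. run from the repository root with the requirements installed), and the CLI invocation in the comment uses a placeholder URL/ID for illustration only.

# Illustrative sketch -- not part of the repository.
# Typical CLI use (placeholder URL or video ID):
#   python ytsum.py <video_url_or_id> --podcast --lumaai
from ytsum import calculate_num_segments, calculate_target_length

for seconds in (30, 90, 600, 1800):
    luma_segments = calculate_num_segments(seconds, provider="luma")      # 5s clips, capped at 10 segments
    runway_segments = calculate_num_segments(seconds, provider="runway")  # 10s clips, capped at 5 segments
    words = calculate_target_length(seconds)                              # scale clamped to 0.3-2.0 of the 10-minute baseline
    print(f"{seconds}s -> luma: {luma_segments} segments, runway: {runway_segments} segments, "
          f"summary ~{words['summary']} words, podcast ~{words['podcast']} words")

For example, per the code above, a 90-second podcast already hits the 10-segment LumaAI cap (ceil(90 / 5) = 18, capped), while the word targets scale linearly between 0.3x and 2.0x of the 300-word summary / 600-word podcast baseline.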