├── __init__.py ├── .python-version ├── llm_modules ├── __init__.py └── llm_zephyr.py ├── tts_modules ├── __init__.py └── tts_coqui.py ├── t2i_modules ├── __init__.py ├── t2i_sdxl.py └── t2i_juggernaut.py ├── t2v_modules ├── __init__.py ├── t2v_zeroscope.py ├── t2v_wan.py └── t2v_ltx.py ├── i2v_modules ├── __init__.py ├── i2v_slideshow.py ├── i2v_ltx.py ├── i2v_svd.py └── i2v_wan.py ├── check_versions.py ├── .gitignore ├── pyproject.toml ├── todo.todo ├── module_discovery.py ├── config_manager.py ├── package_code.sh ├── mp3_to_wav_converter.py ├── system.py ├── utils.py ├── base_modules.py ├── __requirements.txt ├── ui_task_executor.py ├── task_executor.py ├── README.md ├── video_assembly.py ├── project_manager.py └── app.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 -------------------------------------------------------------------------------- /llm_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .llm_zephyr import ZephyrLLM -------------------------------------------------------------------------------- /tts_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .tts_coqui import CoquiTTSModule -------------------------------------------------------------------------------- /t2i_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .t2i_juggernaut import JuggernautT2I 2 | from .t2i_sdxl import SdxlT2I -------------------------------------------------------------------------------- /t2v_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .t2v_zeroscope import ZeroscopeT2V 2 | from .t2v_wan import WanT2V 3 | from .t2v_ltx import LtxT2V -------------------------------------------------------------------------------- /i2v_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .i2v_ltx import LtxI2V 2 | from .i2v_svd import SvdI2V 3 | from .i2v_slideshow import SlideshowI2V 4 | from .i2v_wan import WanI2V -------------------------------------------------------------------------------- /check_versions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import streamlit 3 | import sys 4 | 5 | print(f"Python version: {sys.version}") 6 | print(f"PyTorch version: {torch.__version__}") 7 | print(f"Streamlit version: {streamlit.__version__}") -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Environment 24 | .env 25 | .venv 26 | env/ 27 | venv/ 28 | ENV/ 29 | 30 | # IDE 31 | .idea/ 32 | .vscode/ 33 | *.swp 34 | *.swo 35 | 36 | # Project specific 37 | prompt_helpers/ 38 | instagram_content/ 39 | output/ 40 | my_reels/ 41 | *.mp4 42 | *.wav 43 | *.png 44 | 45 | project.json 46 | system.json 47 | 48 | 
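# Generated output tree (ContentConfig.output_dir in config_manager.py defaults to this folder)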
modular_reels_output/ 49 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "influencer" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "accelerate>=1.7.0", 9 | "coqui-tts>=0.24.3", 10 | "diffusers>=0.33.1", 11 | "ftfy>=6.3.1", 12 | "gputil>=1.4.0", 13 | "hf-transfer>=0.1.9", 14 | "hf-xet>=1.1.1", 15 | "huggingface-hub[cli]>=0.31.2", 16 | "jupyter>=1.1.1", 17 | "llvmlite>=0.44.0", 18 | "moviepy>=2.1.2", 19 | "mutagen>=1.47.0", 20 | "nicegui>=2.19.0", 21 | "numpy>=1.26.4", 22 | "psutil>=7.0.0", 23 | "pydantic>=2.11.5", 24 | "pydub>=0.25.1", 25 | "sentencepiece>=0.2.0", 26 | "streamlit>=1.45.1", 27 | "torch>=2.7.1", 28 | "torchaudio>=2.7.1", 29 | "torchvision>=0.22.1", 30 | "transformers>=4.51.3", 31 | ] 32 | -------------------------------------------------------------------------------- /todo.todo: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | narration to show or not.. font and position and size selection in project 5 | try shot and script 3 times for format then fallback to fallback system. 6 | record user generation form in project status what user actually selcted to create ths video 7 | store time taken in each segment, image, video, audio, assembly and final 8 | show better project in list, status, video duration etc. play from there. 9 | save vram and system ram in system.json and use it to filter models 10 | show module name from its config to dropdown 11 | prompt finetunder button touse llm with different prompt (or a separate module ?) 12 | are we saving reference sound in project somewhere and its path .. and showing back in dashboard page? 13 | is the logic separate from UI so we can change ui part any time without chainging logic? 14 | 15 | ✓ on dashboard keep expander of characters closed and show characters name on expander title 16 | ✗ audio tts emotion parameters (not in narration) 17 | ✗ tts language selection 18 | ✓ scene delete add facility 19 | ✓ add all characters in all scene as default -------------------------------------------------------------------------------- /module_discovery.py: -------------------------------------------------------------------------------- 1 | # In module_discovery.py 2 | 3 | import os 4 | import importlib 5 | import inspect 6 | from typing import Dict, List, Any, Type 7 | # Correctly import from base_modules 8 | from base_modules import BaseLLM, BaseTTS, BaseT2I, BaseI2V, BaseT2V, ModuleCapabilities 9 | 10 | MODULE_TYPES = { 11 | "llm": {"base_class": BaseLLM, "path": "llm_modules"}, 12 | "tts": {"base_class": BaseTTS, "path": "tts_modules"}, 13 | "t2i": {"base_class": BaseT2I, "path": "t2i_modules"}, 14 | "i2v": {"base_class": BaseI2V, "path": "i2v_modules"}, 15 | "t2v": {"base_class": BaseT2V, "path": "t2v_modules"}, 16 | } 17 | 18 | def discover_modules() -> Dict[str, List[Dict[str, Any]]]: 19 | """ 20 | Scans module directories, imports classes, and gets their capabilities. 
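    The result is keyed by module type ("llm", "tts", "t2i", "i2v", "t2v"), each mapping to a
    list of entries shaped roughly like (illustrative values):
        {"name": "SdxlT2I", "path": "t2i_modules.t2i_sdxl.SdxlT2I",
         "caps": <ModuleCapabilities>, "class": <class SdxlT2I>}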
21 | """ 22 | discovered_modules = {key: [] for key in MODULE_TYPES} 23 | 24 | for module_type, info in MODULE_TYPES.items(): 25 | module_path = info["path"] 26 | base_class = info["base_class"] 27 | 28 | if not os.path.exists(module_path): 29 | continue 30 | 31 | for filename in os.listdir(module_path): 32 | if filename.endswith(".py") and not filename.startswith("__"): 33 | module_name = f"{module_path}.{filename[:-3]}" 34 | try: 35 | module = importlib.import_module(module_name) 36 | for attribute_name in dir(module): 37 | attribute = getattr(module, attribute_name) 38 | if inspect.isclass(attribute) and issubclass(attribute, base_class) and attribute is not base_class: 39 | caps = attribute.get_capabilities() 40 | discovered_modules[module_type].append({ 41 | "name": attribute.__name__, 42 | "path": f"{module_name}.{attribute.__name__}", 43 | "caps": caps, 44 | "class": attribute 45 | }) 46 | except Exception as e: 47 | print(f"Warning: Could not load module {module_name}. Error: {e}") 48 | 49 | return discovered_modules -------------------------------------------------------------------------------- /config_manager.py: -------------------------------------------------------------------------------- 1 | # In config_manager.py 2 | import os 3 | import torch 4 | import gc 5 | from pydantic import BaseModel, Field 6 | from typing import Dict, Tuple, Literal 7 | 8 | DEVICE = "cuda" if torch.cuda.is_available() else "cpu" 9 | if DEVICE == "cuda": os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" 10 | 11 | class ContentConfig(BaseModel): 12 | """Configuration for overall content generation parameters, using Pydantic.""" 13 | # --- User-defined settings from the UI --- 14 | target_video_length_hint: float = 20.0 15 | min_scenes: int = 2 16 | max_scenes: int = 5 17 | aspect_ratio_format: Literal["Portrait", "Landscape"] = "Landscape" 18 | use_svd_flow: bool = True 19 | add_narration_text_to_video: bool = True 20 | seed: int = -1 # <--- NEW: -1 means random seed 21 | 22 | # --- NEW: To be filled from UI selections --- 23 | module_selections: Dict[str, str] = Field(default_factory=dict) 24 | language: str = "en" 25 | 26 | # --- Static project-wide settings --- 27 | fps: int = 24 28 | output_dir: str = "modular_reels_output" 29 | font_for_subtitles: str = "Arial" 30 | 31 | # --- DYNAMIC settings, to be populated by the TaskExecutor --- 32 | model_max_video_shot_duration: float = 2.0 # A safe default 33 | generation_resolution: Tuple[int, int] = (1024, 1024) # A safe default 34 | 35 | @property 36 | def max_scene_narration_duration_hint(self) -> float: 37 | if self.max_scenes > 0 and self.min_scenes > 0: 38 | avg_scenes = (self.min_scenes + self.max_scenes) / 2 39 | return round(self.target_video_length_hint / avg_scenes, 1) 40 | return 6.0 41 | 42 | @property 43 | def final_output_resolution(self) -> Tuple[int, int]: 44 | if self.aspect_ratio_format == "Landscape": 45 | return (1920, 1080) 46 | return (1080, 1920) 47 | 48 | def __init__(self, **data): 49 | super().__init__(**data) 50 | os.makedirs(self.output_dir, exist_ok=True) 51 | 52 | 53 | def clear_vram_globally(*items_to_del): 54 | print(f"Attempting to clear VRAM. 
Received {len(items_to_del)} items to delete.") 55 | for item in items_to_del: 56 | if hasattr(item, 'to') and hasattr(item, 'dtype') and item.dtype != torch.float16: 57 | try: 58 | item.to('cpu') 59 | except Exception as e: 60 | print(f"Could not move item of type {type(item)} to CPU: {e}") 61 | del items_to_del 62 | gc.collect() 63 | if torch.cuda.is_available(): 64 | torch.cuda.empty_cache() 65 | torch.cuda.ipc_collect() 66 | print("VRAM clearing attempt finished.") -------------------------------------------------------------------------------- /package_code.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default output file name 4 | OUTPUT_FILE="combined_code.txt" 5 | 6 | # --- Configuration: Define what to include --- 7 | # Add your source directories and specific files here. 8 | # Paths should be relative to where you run the script from. 9 | # Directories will be scanned recursively. 10 | # Use spaces to separate items. 11 | FILES_TO_INCLUDE=( 12 | "README.md" 13 | "base_modules.py" 14 | "utils.py" 15 | "module_discovery.py" 16 | "app.py" 17 | "project_manager.py" 18 | "task_executor.py" 19 | "ui_task_executor.py" 20 | "config_manager.py" 21 | "video_assembly.py" 22 | "llm_modules/" 23 | "tts_modules/" 24 | "t2i_modules/" 25 | "i2v_modules/" 26 | "t2v_modules/" 27 | ) 28 | 29 | # --- End of Configuration --- 30 | 31 | # Check if an output file name was provided as an argument 32 | if [ "$1" ]; then 33 | OUTPUT_FILE="$1" 34 | echo "Using custom output file name: $OUTPUT_FILE" 35 | fi 36 | 37 | # Clear the output file to start fresh 38 | > "$OUTPUT_FILE" 39 | echo "Cleared old content from $OUTPUT_FILE." 40 | 41 | # A function to process and append a file to the output 42 | process_file() { 43 | local file_path=$1 44 | echo "Processing: $file_path" 45 | 46 | # Write the header with the relative file path 47 | echo "==== $file_path ====" >> "$OUTPUT_FILE" 48 | 49 | # Append the content of the file 50 | cat "$file_path" >> "$OUTPUT_FILE" 51 | 52 | # Add multiple newlines at the end for better separation 53 | echo -e "\n\n\n" >> "$OUTPUT_FILE" 54 | } 55 | 56 | # Loop through the configured list of files and directories 57 | for item in "${FILES_TO_INCLUDE[@]}"; do 58 | if [ -f "$item" ]; then 59 | # If it's a single file, process it directly 60 | process_file "$item" 61 | elif [ -d "$item" ]; then 62 | # If it's a directory, find all relevant files inside it 63 | # - The `find` command is powerful. 64 | # - It searches for items of type 'f' (file). 65 | # - It ignores paths containing '__pycache__', '.git', '.vscode', etc. 66 | # - It only includes files ending in '.py' or other specified extensions. 67 | find "$item" -type f \( -name "*.py" -o -name "*.sh" \) \ 68 | -not -path "*/__pycache__/*" \ 69 | -not -path "*/.git/*" \ 70 | -not -path "*/.venv/*" \ 71 | -not -path "*/.vscode/*" \ 72 | | sort | while read -r file; do 73 | process_file "$file" 74 | done 75 | else 76 | echo "Warning: Item '$item' not found. Skipping." 77 | fi 78 | done 79 | 80 | echo "=========================================" 81 | echo "✅ All done!" 
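# Illustrative usage: `./package_code.sh my_snapshot.txt` writes to a custom file;
# with no argument the default combined_code.txt configured above is used.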
82 | echo "Combined code saved to: $OUTPUT_FILE" 83 | echo "=========================================" -------------------------------------------------------------------------------- /tts_modules/tts_coqui.py: -------------------------------------------------------------------------------- 1 | # tts_modules/tts_coqui.py 2 | import os 3 | import torch 4 | import numpy as np 5 | from typing import Tuple, Optional 6 | from TTS.api import TTS as CoquiTTS 7 | from moviepy import AudioFileClip 8 | from scipy.io import wavfile 9 | 10 | from base_modules import BaseTTS, BaseModuleConfig, ModuleCapabilities 11 | from config_manager import DEVICE, clear_vram_globally 12 | 13 | class CoquiTTSConfig(BaseModuleConfig): 14 | model_id: str = "tts_models/multilingual/multi-dataset/xtts_v2" 15 | 16 | class CoquiTTSModule(BaseTTS): 17 | Config = CoquiTTSConfig 18 | 19 | @classmethod 20 | def get_capabilities(cls) -> ModuleCapabilities: 21 | return ModuleCapabilities( 22 | title="XTTS, Multi-Language, Documentary Style", 23 | vram_gb_min=2.0, # XTTS is relatively lightweight 24 | ram_gb_min=8.0, 25 | supported_tts_languages=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko", "hi"] 26 | ) 27 | 28 | def _load_model(self): 29 | if self.model is None: 30 | print(f"Loading TTS model: {self.config.model_id}...") 31 | self.model = CoquiTTS(model_name=self.config.model_id, progress_bar=True).to(DEVICE) 32 | print("TTS model loaded.") 33 | 34 | def clear_vram(self): 35 | print("Clearing TTS VRAM...") 36 | if self.model is not None: 37 | clear_vram_globally(self.model) 38 | self.model = None 39 | print("TTS VRAM cleared.") 40 | 41 | def generate_audio( 42 | self, text: str, output_dir: str, scene_idx: int, language: str, speaker_wav: Optional[str] = None 43 | ) -> Tuple[str, float]: 44 | self._load_model() 45 | 46 | print(f"Generating audio in {language} for scene {scene_idx}: \"{text[:50]}...\"") 47 | output_path = os.path.join(output_dir, f"scene_{scene_idx}_audio.wav") 48 | 49 | tts_kwargs = {"language": language, "file_path": output_path} 50 | 51 | if "xtts" in self.config.model_id.lower(): 52 | if speaker_wav and os.path.exists(speaker_wav): 53 | tts_kwargs["speaker_wav"] = speaker_wav 54 | else: 55 | if speaker_wav: print(f"Warning: Speaker WAV {speaker_wav} not found. XTTS using default voice.") 56 | 57 | self.model.tts_to_file(text, **tts_kwargs) 58 | 59 | duration = 0.0 60 | try: 61 | if os.path.exists(output_path) and os.path.getsize(output_path) > 0: 62 | with AudioFileClip(output_path) as audio_clip: 63 | duration = audio_clip.duration + 0.1 # Small buffer 64 | else: raise ValueError("Audio file not generated or is empty.") 65 | except Exception as e: 66 | print(f"Error getting duration for {output_path}: {e}. Creating fallback.") 67 | samplerate = 22050 68 | wavfile.write(output_path, samplerate, np.zeros(int(0.1 * samplerate), dtype=np.int16)) 69 | duration = 0.1 70 | 71 | print(f"Actual audio duration for scene {scene_idx}: {duration:.2f}s") 72 | return output_path, duration -------------------------------------------------------------------------------- /mp3_to_wav_converter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | MP3 to WAV Converter 4 | Converts all MP3 files in the Downloads folder to WAV format. 
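Example invocations (illustrative; pydub needs ffmpeg on PATH to decode MP3):
    python mp3_to_wav_converter.py                       # convert ~/Downloads in place
    python mp3_to_wav_converter.py -i ~/Music -o ~/Music/wav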
5 | """ 6 | 7 | import os 8 | import sys 9 | from pathlib import Path 10 | from pydub import AudioSegment 11 | 12 | def convert_mp3_to_wav(downloads_folder="~/Downloads", output_folder=None): 13 | """ 14 | Convert all MP3 files in the Downloads folder to WAV format. 15 | 16 | Args: 17 | downloads_folder (str): Path to the Downloads folder 18 | output_folder (str): Path to output folder (defaults to same as input) 19 | """ 20 | # Expand the tilde to full path 21 | downloads_path = Path(downloads_folder).expanduser() 22 | 23 | if output_folder is None: 24 | output_path = downloads_path 25 | else: 26 | output_path = Path(output_folder).expanduser() 27 | 28 | # Create output directory if it doesn't exist 29 | output_path.mkdir(parents=True, exist_ok=True) 30 | 31 | # Find all MP3 files 32 | mp3_files = list(downloads_path.glob("*.mp3")) 33 | 34 | if not mp3_files: 35 | print("No MP3 files found in the Downloads folder.") 36 | return 37 | 38 | print(f"Found {len(mp3_files)} MP3 file(s) to convert:") 39 | for mp3_file in mp3_files: 40 | print(f" - {mp3_file.name}") 41 | 42 | print("\nStarting conversion...") 43 | 44 | converted_count = 0 45 | failed_count = 0 46 | 47 | for mp3_file in mp3_files: 48 | try: 49 | print(f"Converting: {mp3_file.name}") 50 | 51 | # Load the MP3 file 52 | audio = AudioSegment.from_mp3(str(mp3_file)) 53 | 54 | # Create output filename (replace .mp3 with .wav) 55 | wav_filename = mp3_file.stem + ".wav" 56 | wav_path = output_path / wav_filename 57 | 58 | # Export as WAV 59 | audio.export(str(wav_path), format="wav") 60 | 61 | print(f" ✓ Successfully converted to: {wav_filename}") 62 | converted_count += 1 63 | 64 | except Exception as e: 65 | print(f" ✗ Failed to convert {mp3_file.name}: {str(e)}") 66 | failed_count += 1 67 | 68 | print(f"\nConversion complete!") 69 | print(f"Successfully converted: {converted_count} files") 70 | if failed_count > 0: 71 | print(f"Failed conversions: {failed_count} files") 72 | 73 | def main(): 74 | """Main function to handle command line arguments.""" 75 | import argparse 76 | 77 | parser = argparse.ArgumentParser(description="Convert MP3 files to WAV format") 78 | parser.add_argument("--input", "-i", default="~/Downloads", 79 | help="Input folder containing MP3 files (default: ~/Downloads)") 80 | parser.add_argument("--output", "-o", 81 | help="Output folder for WAV files (default: same as input folder)") 82 | 83 | args = parser.parse_args() 84 | 85 | try: 86 | convert_mp3_to_wav(args.input, args.output) 87 | except KeyboardInterrupt: 88 | print("\nConversion interrupted by user.") 89 | sys.exit(1) 90 | except Exception as e: 91 | print(f"Error: {str(e)}") 92 | sys.exit(1) 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /system.py: -------------------------------------------------------------------------------- 1 | # In system.py 2 | import json 3 | import os 4 | from pydantic import BaseModel, Field 5 | from typing import Optional, Tuple 6 | 7 | # --- START OF MODIFICATION --- 8 | # Import necessary libraries for detection 9 | try: 10 | import psutil 11 | except ImportError: 12 | psutil = None 13 | 14 | try: 15 | import GPUtil 16 | except ImportError: 17 | GPUtil = None 18 | # --- END OF MODIFICATION --- 19 | 20 | 21 | SYSTEM_CONFIG_FILE = "system.json" 22 | 23 | class SystemConfig(BaseModel): 24 | """Stores the user's available system resources.""" 25 | vram_gb: float = Field(description="Available GPU VRAM in GB.") 26 | ram_gb: float = 
Field(description="Available system RAM in GB.") 27 | 28 | def save_system_config(vram_gb: float, ram_gb: float) -> None: 29 | """Saves the system resource configuration to system.json.""" 30 | config = SystemConfig(vram_gb=vram_gb, ram_gb=ram_gb) 31 | with open(SYSTEM_CONFIG_FILE, 'w') as f: 32 | f.write(config.model_dump_json(indent=4)) 33 | print(f"System configuration saved to {SYSTEM_CONFIG_FILE}") 34 | 35 | def load_system_config() -> Optional[SystemConfig]: 36 | """Loads the system resource configuration from system.json if it exists.""" 37 | if not os.path.exists(SYSTEM_CONFIG_FILE): 38 | return None 39 | try: 40 | with open(SYSTEM_CONFIG_FILE, 'r') as f: 41 | data = json.load(f) 42 | return SystemConfig(**data) 43 | except (json.JSONDecodeError, TypeError) as e: 44 | print(f"Error loading or parsing {SYSTEM_CONFIG_FILE}: {e}. Please re-enter details.") 45 | return None 46 | 47 | # --- START OF MODIFICATION --- 48 | def detect_system_specs() -> Tuple[float, float]: 49 | """ 50 | Attempts to detect available system RAM and GPU VRAM. 51 | Returns (vram_in_gb, ram_in_gb). 52 | Defaults to 8.0 for VRAM and 16.0 for RAM if detection fails. 53 | """ 54 | # Default values 55 | detected_ram_gb = 16.0 56 | detected_vram_gb = 8.0 57 | 58 | # 1. Detect System RAM 59 | if psutil: 60 | try: 61 | ram_bytes = psutil.virtual_memory().total 62 | # Round to the nearest whole number for a cleaner UI 63 | detected_ram_gb = round(ram_bytes / (1024**3)) 64 | print(f"Detected System RAM: {detected_ram_gb} GB") 65 | except Exception as e: 66 | print(f"Could not detect system RAM using psutil: {e}. Falling back to default.") 67 | else: 68 | print("psutil not installed. Cannot detect RAM. Falling back to default.") 69 | 70 | # 2. Detect GPU VRAM 71 | if GPUtil: 72 | try: 73 | gpus = GPUtil.getGPUs() 74 | if gpus: 75 | # Use the VRAM of the first detected GPU 76 | gpu = gpus[0] 77 | # VRAM is in MB, convert to GB and round to one decimal place 78 | detected_vram_gb = round(gpu.memoryTotal / 1024, 1) 79 | print(f"Detected GPU: {gpu.name} with {detected_vram_gb} GB VRAM") 80 | else: 81 | print("GPUtil found no GPUs. Falling back to default VRAM.") 82 | except Exception as e: 83 | print(f"Could not detect GPU VRAM using GPUtil: {e}. Falling back to default.") 84 | else: 85 | print("GPUtil not installed. Cannot detect VRAM. Falling back to default.") 86 | 87 | return detected_vram_gb, detected_ram_gb 88 | # --- END OF MODIFICATION --- -------------------------------------------------------------------------------- /i2v_modules/i2v_slideshow.py: -------------------------------------------------------------------------------- 1 | # In i2v_modules/i2v_slideshow.py 2 | from typing import Dict, Any, List, Optional, Union 3 | # --- THIS IS THE FIX: Importing ImageClip directly, matching the project's pattern --- 4 | from moviepy.video.VideoClip import ImageClip 5 | 6 | from base_modules import BaseI2V, BaseModuleConfig, ModuleCapabilities 7 | from config_manager import ContentConfig 8 | 9 | class SlideshowI2VConfig(BaseModuleConfig): 10 | # This module doesn't load a model, but the config is part of the contract. 11 | model_id: str = "moviepy_image_clip" 12 | 13 | class SlideshowI2V(BaseI2V): 14 | Config = SlideshowI2VConfig 15 | 16 | @classmethod 17 | def get_capabilities(cls) -> ModuleCapabilities: 18 | """ 19 | Defines the capabilities of this simple, non-AI module. 20 | It uses minimal resources and doesn't support AI-specific features. 
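        The nominal figures below (~0.1 GB VRAM, 1 GB RAM) reflect that it only wraps the
        source image in a MoviePy ImageClip rather than running any model.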
21 | """ 22 | return ModuleCapabilities( 23 | title="Slideshow (Static Image)", 24 | vram_gb_min=0.1, # Uses virtually no VRAM 25 | ram_gb_min=1.0, # Uses very little RAM 26 | supported_formats=["Portrait", "Landscape"], 27 | supports_ip_adapter=False, # Not an AI model 28 | supports_lora=False, # Not an AI model 29 | max_subjects=0, 30 | accepts_text_prompt=False, # Ignores prompts 31 | accepts_negative_prompt=False 32 | ) 33 | 34 | def get_model_capabilities(self) -> Dict[str, Any]: 35 | """ 36 | This module has no native resolution and can handle long durations. 37 | """ 38 | return { 39 | # It can handle any resolution, as it just wraps the image. 40 | "resolutions": {"Portrait": (1080, 1920), "Landscape": (1920, 1080)}, 41 | "max_shot_duration": 60.0 # Can be very long 42 | } 43 | 44 | def _load_pipeline(self): 45 | """No pipeline to load for this module.""" 46 | print("SlideshowI2V: No pipeline to load.") 47 | pass 48 | 49 | def clear_vram(self): 50 | """No VRAM to clear for this module.""" 51 | print("SlideshowI2V: No VRAM to clear.") 52 | pass 53 | 54 | def enhance_prompt(self, prompt: str, prompt_type: str = "visual") -> str: 55 | """This module ignores prompts, so no enhancement is needed.""" 56 | return prompt 57 | 58 | def generate_video_from_image(self, image_path: str, output_video_path: str, target_duration: float, content_config: ContentConfig, visual_prompt: str, motion_prompt: Optional[str], ip_adapter_image: Optional[Union[str, List[str]]] = None) -> str: 59 | """ 60 | Creates a video by holding a static image for the target duration. 61 | """ 62 | print(f"SlideshowI2V: Creating static video for {target_duration:.2f}s from {image_path}") 63 | 64 | video_clip = None 65 | try: 66 | # Create a video clip from the static image and set its duration. 67 | video_clip = ImageClip(image_path).with_duration(target_duration) 68 | 69 | # Use the correct syntax for write_videofile, matching video_assembly.py 70 | video_clip.write_videofile( 71 | output_video_path, 72 | fps=content_config.fps, 73 | codec="libx264", 74 | audio=False, # This is a visual-only shot 75 | threads=4, 76 | preset="medium", 77 | logger=None # Suppress verbose moviepy logs 78 | ) 79 | 80 | print(f"Slideshow video shot saved to {output_video_path}") 81 | return output_video_path 82 | 83 | except Exception as e: 84 | print(f"Error creating slideshow video: {e}") 85 | return "" # Return empty string on failure 86 | finally: 87 | # Ensure the clip resources are released 88 | if video_clip: 89 | video_clip.close() -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # In utils.py 2 | import datetime 3 | import json 4 | import os 5 | from PIL import Image, ImageOps 6 | from moviepy import VideoFileClip 7 | import streamlit as st # Keep st for st.error 8 | 9 | def load_and_correct_image_orientation(image_source): 10 | """ 11 | Loads an image from a source (file path or uploaded file object) 12 | and corrects its orientation based on EXIF data. 13 | """ 14 | try: 15 | image = Image.open(image_source) 16 | # The magic is in exif_transpose 17 | corrected_image = ImageOps.exif_transpose(image) 18 | return corrected_image 19 | except Exception as e: 20 | # Using st.error here is okay for a simple app, but for true separation, 21 | # you might log the error and return None, letting the caller handle the UI. 22 | # For this project, this is fine. 
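        # A UI-free variant would look roughly like this (hypothetical sketch):
        #   logging.warning("Could not load/correct image %s: %s", image_source, e)
        #   return None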
23 | st.error(f"Could not load or correct image: {e}") 24 | return None 25 | 26 | def list_projects(): 27 | """Lists all projects from the output directory with extended details including modules.""" 28 | projects = [] 29 | base_dir = "modular_reels_output" 30 | if not os.path.exists(base_dir): return [] 31 | for project_dir in os.listdir(base_dir): 32 | project_path = os.path.join(base_dir, project_dir) 33 | if os.path.isdir(project_path): 34 | project_file = os.path.join(project_path, "project.json") 35 | if os.path.exists(project_file): 36 | try: 37 | with open(project_file, 'r') as f: 38 | data = json.load(f) 39 | 40 | project_info = data.get('project_info', {}) 41 | # --- START OF MODIFICATION --- 42 | # Use title, but fall back to topic for old projects, then to dir name. 43 | title = project_info.get('title', project_info.get('topic', project_dir)) 44 | topic = project_info.get('topic', 'N/A') # Keep topic for potential detailed views 45 | # --- END OF MODIFICATION --- 46 | 47 | config = project_info.get('config', {}) 48 | final_video_info = data.get('final_video', {}) 49 | status = project_info.get('status', 'unknown') 50 | 51 | flow = "Image-to-Video" if config.get('use_svd_flow', True) else "Text-to-Video" 52 | 53 | final_video_path = None 54 | duration = 0.0 55 | if status == 'completed': 56 | stored_path = final_video_info.get('path') 57 | if stored_path and os.path.exists(stored_path): 58 | final_video_path = stored_path 59 | try: 60 | with VideoFileClip(final_video_path) as clip: 61 | duration = clip.duration 62 | except Exception as e: 63 | print(f"Could not read video duration for {final_video_path}: {e}") 64 | duration = 0.0 65 | 66 | modules = config.get('module_selections', {}) 67 | 68 | # --- START OF MODIFICATION --- 69 | projects.append({ 70 | 'name': project_dir, 71 | 'title': title, # Use the new title field 72 | 'topic': topic, # Keep topic field for completeness 73 | 'created_at': datetime.datetime.fromtimestamp(project_info.get('created_at', 0)), 74 | 'status': status, 75 | 'flow': flow, 76 | 'final_video_path': final_video_path, 77 | 'duration': duration, 78 | 'modules': modules, 79 | }) 80 | # --- END OF MODIFICATION --- 81 | except Exception as e: 82 | print(f"Error loading project {project_dir}: {e}") 83 | return sorted(projects, key=lambda p: p['created_at'], reverse=True) -------------------------------------------------------------------------------- /t2i_modules/t2i_sdxl.py: -------------------------------------------------------------------------------- 1 | # t2i_modules/t2i_sdxl.py 2 | import torch 3 | from typing import List, Optional, Dict, Any, Union 4 | from diffusers import StableDiffusionXLPipeline, DiffusionPipeline 5 | from diffusers.utils import load_image 6 | 7 | from base_modules import BaseT2I, BaseModuleConfig, ModuleCapabilities 8 | from config_manager import DEVICE, clear_vram_globally 9 | 10 | class SdxlT2IConfig(BaseModuleConfig): 11 | model_id: str = "stabilityai/stable-diffusion-xl-base-1.0" 12 | refiner_id: Optional[str] = "stabilityai/stable-diffusion-xl-refiner-1.0" 13 | num_inference_steps: int = 30 14 | guidance_scale: float = 7.5 15 | base_denoising_end: float = 0.8 16 | refiner_denoising_start: float = 0.8 17 | 18 | class SdxlT2I(BaseT2I): 19 | Config = SdxlT2IConfig 20 | 21 | def __init__(self, config: SdxlT2IConfig): 22 | super().__init__(config) 23 | self.refiner_pipe = None 24 | 25 | @classmethod 26 | def get_capabilities(cls) -> ModuleCapabilities: 27 | return ModuleCapabilities( 28 | title="SDXL + Refiner (High VRAM): No 
Subjects considered", 29 | vram_gb_min=10.0, # SDXL with refiner is heavy 30 | ram_gb_min=16.0, 31 | supported_formats=["Portrait", "Landscape"], 32 | # Even if we don't implement IP-Adapter here, we declare support 33 | # because the pipeline is capable. A more advanced version could add it. 34 | supports_ip_adapter=True, 35 | supports_lora=True, 36 | max_subjects=2, 37 | accepts_text_prompt=True, 38 | accepts_negative_prompt=True 39 | ) 40 | 41 | def get_model_capabilities(self) -> Dict[str, Any]: 42 | return { 43 | "resolutions": {"Portrait": (896, 1152), "Landscape": (1344, 768)}, 44 | "max_shot_duration": 3.0 45 | } 46 | 47 | def _load_pipeline(self): 48 | if self.pipe is None: 49 | print(f"Loading T2I pipeline (SDXL): {self.config.model_id}...") 50 | self.pipe = StableDiffusionXLPipeline.from_pretrained( 51 | self.config.model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True 52 | ).to(DEVICE) 53 | print("SDXL Base pipeline loaded.") 54 | if self.config.refiner_id: 55 | print(f"Loading T2I Refiner pipeline: {self.config.refiner_id}...") 56 | self.refiner_pipe = DiffusionPipeline.from_pretrained( 57 | self.config.refiner_id, text_encoder_2=self.pipe.text_encoder_2, 58 | vae=self.pipe.vae, torch_dtype=torch.float16, 59 | use_safetensors=True, variant="fp16" 60 | ).to(DEVICE) 61 | print("SDXL Refiner pipeline loaded.") 62 | 63 | def clear_vram(self): 64 | print("Clearing T2I (SDXL) VRAM...") 65 | models = [m for m in [self.pipe, self.refiner_pipe] if m is not None] 66 | if models: clear_vram_globally(*models) 67 | self.pipe, self.refiner_pipe = None, None 68 | print("T2I (SDXL) VRAM cleared.") 69 | 70 | # --- START OF FIX: Updated method signature and implementation --- 71 | def generate_image(self, prompt: str, negative_prompt: str, output_path: str, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None, seed: int = -1) -> str: 72 | self._load_pipeline() 73 | 74 | if ip_adapter_image: 75 | print("Warning: SDXLT2I module received IP-Adapter image but does not currently implement its use.") 76 | 77 | generator = None 78 | if seed != -1: 79 | print(f"Using fixed seed for generation: {seed}") 80 | # Ensure the generator is on the same device as the pipeline 81 | generator = torch.Generator(device=self.pipe.device).manual_seed(seed) 82 | else: 83 | print("Using random seed for generation.") 84 | 85 | kwargs = { 86 | "prompt": prompt, 87 | "negative_prompt": negative_prompt, # Now passing this argument 88 | "width": width, "height": height, 89 | "num_inference_steps": self.config.num_inference_steps, 90 | "guidance_scale": self.config.guidance_scale, 91 | "generator": generator # Now passing the generator 92 | } 93 | if self.refiner_pipe: 94 | kwargs["output_type"] = "latent" 95 | kwargs["denoising_end"] = self.config.base_denoising_end 96 | 97 | image = self.pipe(**kwargs).images[0] 98 | 99 | if self.refiner_pipe: 100 | print("Refining image...") 101 | refiner_kwargs = { 102 | "prompt": prompt, 103 | "negative_prompt": negative_prompt, 104 | "image": image, 105 | "denoising_start": self.config.refiner_denoising_start, 106 | "num_inference_steps": self.config.num_inference_steps, 107 | "generator": generator 108 | } 109 | image = self.refiner_pipe(**refiner_kwargs).images[0] 110 | 111 | image.save(output_path) 112 | print(f"Image saved to {output_path}") 113 | return output_path 114 | # --- END OF FIX --- -------------------------------------------------------------------------------- /i2v_modules/i2v_ltx.py: 
-------------------------------------------------------------------------------- 1 | # i2v_modules/i2v_ltx.py 2 | import torch 3 | from typing import Dict, Any, List, Optional, Union 4 | from diffusers import LTXImageToVideoPipeline 5 | from diffusers.utils import export_to_video, load_image 6 | from PIL import Image 7 | 8 | from base_modules import BaseI2V, BaseModuleConfig, ModuleCapabilities 9 | from config_manager import DEVICE, clear_vram_globally, ContentConfig 10 | 11 | class LtxI2VConfig(BaseModuleConfig): 12 | model_id: str = "Lightricks/LTX-Video" 13 | num_inference_steps: int = 50 14 | guidance_scale: float = 7.5 15 | 16 | class LtxI2V(BaseI2V): 17 | Config = LtxI2VConfig 18 | 19 | @classmethod 20 | def get_capabilities(cls) -> ModuleCapabilities: 21 | return ModuleCapabilities( 22 | title="LTX, 8bit Load, Port/LandScape, 2 Sub, Take +/- Prompts, max 4 sec", 23 | vram_gb_min=8.0, 24 | ram_gb_min=12.0, 25 | supported_formats=["Portrait", "Landscape"], 26 | supports_ip_adapter=True, 27 | supports_lora=True, # Juggernaut is a fine-tune, can easily use LoRAs 28 | max_subjects=2, # Can handle one or two IP adapter images 29 | accepts_text_prompt=True, 30 | accepts_negative_prompt=True 31 | ) 32 | 33 | 34 | def get_model_capabilities(self) -> Dict[str, Any]: 35 | return { 36 | "resolutions": {"Portrait": (480, 704), "Landscape": (704, 480)}, 37 | "max_shot_duration": 4 38 | } 39 | 40 | def enhance_prompt(self, prompt: str, prompt_type: str = "visual") -> str: 41 | # SVD doesn't use text prompts, but this shows how you could add model-specific keywords. 42 | # For example, for a different model you might do: 43 | if prompt_type == "visual": 44 | return f"{prompt}, 8k, photorealistic, cinematic lighting" 45 | return prompt # Return original for SVD 46 | 47 | def _load_pipeline(self): 48 | if self.pipe is None: 49 | print(f"Loading I2V pipeline (LTX): {self.config.model_id}...") 50 | self.pipe = LTXImageToVideoPipeline.from_pretrained(self.config.model_id, torch_dtype=torch.bfloat16) 51 | self.pipe.enable_model_cpu_offload() 52 | print("I2V (LTX) pipeline loaded.") 53 | 54 | def clear_vram(self): 55 | print("Clearing I2V (LTX) VRAM...") 56 | if self.pipe is not None: clear_vram_globally(self.pipe) 57 | self.pipe = None 58 | print("I2V (LTX) VRAM cleared.") 59 | 60 | def _resize_and_pad(self, image: Image.Image, target_width: int, target_height: int) -> Image.Image: 61 | original_aspect = image.width / image.height; target_aspect = target_width / target_height 62 | if original_aspect > target_aspect: new_width, new_height = target_width, int(target_width / original_aspect) 63 | else: new_height, new_width = target_height, int(target_height * original_aspect) 64 | resized_image = image.resize((new_width, new_height), Image.LANCZOS) 65 | background = Image.new('RGB', (target_width, target_height), (0, 0, 0)) 66 | background.paste(resized_image, ((target_width - new_width) // 2, (target_height - new_height) // 2)) 67 | return background 68 | 69 | def generate_video_from_image(self, image_path: str, output_video_path: str, target_duration: float, content_config: ContentConfig, visual_prompt: str, motion_prompt: Optional[str], ip_adapter_image: Optional[Union[str, List[str]]] = None) -> str: 70 | self._load_pipeline() 71 | 72 | input_image = load_image(image_path) 73 | target_res = self.get_model_capabilities()["resolutions"] 74 | aspect_ratio = "Landscape" if input_image.width > input_image.height else "Portrait" 75 | target_width, target_height = target_res[aspect_ratio] 76 | prepared_image 
= self._resize_and_pad(input_image, target_width, target_height) 77 | 78 | num_frames = max(16, int(target_duration * content_config.fps)) 79 | full_prompt = f"{visual_prompt}, {motion_prompt}" if motion_prompt else visual_prompt 80 | 81 | # --- NEW LOGIC TO HANDLE ip_adapter_image --- 82 | # While LTX doesn't have a formal IP-Adapter, we can use the character 83 | # reference to guide the style by adding it to the prompt. 84 | if ip_adapter_image: 85 | print("LTX I2V: Using character reference to guide prompt style.") 86 | # For simplicity, we add a generic phrase. A more complex system could use an image-to-text model. 87 | full_prompt = f"in the style of the reference character, {full_prompt}" 88 | 89 | print(f"LTX I2V using prompt: {full_prompt}") 90 | 91 | video = self.pipe( 92 | prompt=full_prompt, image=prepared_image, width=target_width, height=target_height, 93 | num_frames=num_frames, num_inference_steps=self.config.num_inference_steps, 94 | guidance_scale=self.config.guidance_scale, 95 | negative_prompt="worst quality, inconsistent motion, blurry" 96 | ).frames[0] 97 | 98 | export_to_video(video, output_video_path, fps=content_config.fps) 99 | print(f"LTX video shot saved to {output_video_path}") 100 | return output_video_path -------------------------------------------------------------------------------- /t2v_modules/t2v_zeroscope.py: -------------------------------------------------------------------------------- 1 | # In t2v_modules/t2v_zeroscope.py 2 | import torch 3 | from typing import Dict, Any, List, Optional, Union 4 | from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler 5 | from diffusers.utils import export_to_video 6 | 7 | from base_modules import BaseT2V, BaseModuleConfig, ModuleCapabilities 8 | from config_manager import DEVICE, clear_vram_globally 9 | 10 | class ZeroscopeT2VConfig(BaseModuleConfig): 11 | model_id: str = "cerspense/zeroscope_v2_576w" 12 | upscaler_model_id: str = "cerspense/zeroscope_v2_xl" 13 | 14 | num_inference_steps: int = 30 15 | guidance_scale: float = 9.0 16 | # --- START OF FIX: Add strength for the upscaling process --- 17 | upscaler_strength: float = 0.7 18 | # --- END OF FIX --- 19 | 20 | class ZeroscopeT2V(BaseT2V): 21 | Config = ZeroscopeT2VConfig 22 | 23 | @classmethod 24 | def get_capabilities(cls) -> ModuleCapabilities: 25 | return ModuleCapabilities( 26 | title="Zeroscope, Port/Landscape, No Subject, 2 sec", 27 | vram_gb_min=8.0, 28 | ram_gb_min=12.0, 29 | supported_formats=["Portrait", "Landscape"], 30 | supports_ip_adapter=False, # Zeroscope does not support IP-Adapter 31 | supports_lora=False, # Zeroscope does not support LoRA loading 32 | max_subjects=0, 33 | accepts_text_prompt=True, 34 | accepts_negative_prompt=True 35 | ) 36 | 37 | 38 | def __init__(self, config: ZeroscopeT2VConfig): 39 | super().__init__(config) 40 | self.upscaler_pipe = None 41 | 42 | def get_model_capabilities(self) -> Dict[str, Any]: 43 | # Zeroscope has a fixed native resolution that is then upscaled 44 | base_resolution = (576, 320) 45 | return { 46 | "resolutions": {"Portrait": base_resolution, "Landscape": base_resolution}, 47 | "max_shot_duration": 2.0 48 | } 49 | 50 | def _load_pipeline(self): 51 | if self.pipe is None: 52 | print(f"Loading T2V pipeline ({self.config.model_id})...") 53 | self.pipe = DiffusionPipeline.from_pretrained(self.config.model_id, torch_dtype=torch.float16) 54 | self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config) 55 | self.pipe.enable_model_cpu_offload() 56 | print(f"T2V 
({self.config.model_id}) pipeline loaded.") 57 | 58 | if self.upscaler_pipe is None: 59 | print(f"Loading T2V Upscaler pipeline ({self.config.upscaler_model_id})...") 60 | self.upscaler_pipe = DiffusionPipeline.from_pretrained(self.config.upscaler_model_id, torch_dtype=torch.float16) 61 | self.upscaler_pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.upscaler_pipe.scheduler.config) 62 | self.upscaler_pipe.enable_model_cpu_offload() 63 | print(f"T2V Upscaler ({self.config.upscaler_model_id}) pipeline loaded.") 64 | 65 | def clear_vram(self): 66 | print(f"Clearing T2V VRAM...") 67 | models_to_clear = [m for m in [self.pipe, self.upscaler_pipe] if m is not None] 68 | if models_to_clear: clear_vram_globally(*models_to_clear) 69 | self.pipe, self.upscaler_pipe = None, None 70 | print("T2V VRAM cleared.") 71 | 72 | def generate_video_from_text( 73 | self, prompt: str, output_video_path: str, num_frames: int, fps: int, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None 74 | ) -> str: 75 | self._load_pipeline() 76 | 77 | if ip_adapter_image: 78 | print("Warning: ZeroscopeT2V module received IP-Adapter image but does not currently implement its use.") 79 | 80 | negative_prompt = "blurry, low quality, watermark, bad anatomy, text, letters, distorted" 81 | 82 | # Note: Zeroscope generates at a fixed resolution, so we use its capabilities directly 83 | model_res = self.get_model_capabilities()["resolutions"]["Landscape"] 84 | 85 | print(f"Stage 1: Generating T2V ({model_res[0]}x{model_res[1]}) for prompt: \"{prompt[:70]}...\"") 86 | 87 | video_frames_tensor = self.pipe( 88 | prompt=prompt, negative_prompt=negative_prompt, 89 | num_inference_steps=self.config.num_inference_steps, 90 | height=model_res[1], width=model_res[0], num_frames=num_frames, 91 | guidance_scale=self.config.guidance_scale, output_type="pt" 92 | ).frames 93 | 94 | print("Stage 2: Upscaling video to HD...") 95 | 96 | # --- START OF FIX --- 97 | upscaled_video_frames = self.upscaler_pipe( 98 | prompt=prompt, 99 | negative_prompt=negative_prompt, 100 | video=video_frames_tensor, # The argument is 'video', not 'image'. 101 | strength=self.config.upscaler_strength, # Add the strength parameter 102 | num_inference_steps=self.config.num_inference_steps, 103 | guidance_scale=self.config.guidance_scale, 104 | ).frames[0] 105 | # --- END OF FIX --- 106 | 107 | export_to_video(upscaled_video_frames, output_video_path, fps=fps) 108 | 109 | print(f"High-quality T2V video shot saved to {output_video_path}") 110 | return output_video_path -------------------------------------------------------------------------------- /t2v_modules/t2v_wan.py: -------------------------------------------------------------------------------- 1 | # In t2v_modules/t2v_wan.py 2 | import torch 3 | from typing import Dict, Any, List, Optional, Union 4 | 5 | # --- Important: Import the specific classes for this model --- 6 | from diffusers import WanPipeline, AutoencoderKLWan 7 | from diffusers.utils import export_to_video 8 | 9 | from base_modules import BaseT2V, BaseModuleConfig, ModuleCapabilities 10 | from config_manager import DEVICE, clear_vram_globally 11 | 12 | class WanT2VConfig(BaseModuleConfig): 13 | """Configuration for the Wan 2.1 T2V model.""" 14 | model_id: str = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers" 15 | # Parameters from the model card example 16 | num_inference_steps: int = 30 17 | guidance_scale: float = 5.0 18 | 19 | class WanT2V(BaseT2V): 20 | """ 21 | Text-to-Video module using Wan 2.1 T2V 1.3B model. 
22 | This model is efficient and produces high-quality video but does not support 23 | character consistency (IP-Adapter). 24 | """ 25 | Config = WanT2VConfig 26 | 27 | @classmethod 28 | def get_capabilities(cls) -> ModuleCapabilities: 29 | """Declare the capabilities of the Wan 2.1 model.""" 30 | return ModuleCapabilities( 31 | title="Wan 2.1 (1.3B, Fast, 5s Shots)", 32 | vram_gb_min=15.0, # Based on the 8.19 GB requirement from the model card 33 | ram_gb_min=12.0, 34 | supported_formats=["Portrait", "Landscape"], 35 | # This model does not support IP-Adapter, so we are honest here. 36 | supports_ip_adapter=False, 37 | supports_lora=False, # The pipeline does not have a LoRA loader 38 | max_subjects=0, 39 | accepts_text_prompt=True, 40 | accepts_negative_prompt=True 41 | ) 42 | 43 | def get_model_capabilities(self) -> Dict[str, Any]: 44 | """Return the specific resolutions and max duration for this model.""" 45 | return { 46 | # Based on the example: width=832, height=480 47 | "resolutions": {"Portrait": (480, 832), "Landscape": (832, 480)}, 48 | # Based on the example: "generate a 5-second 480P video" 49 | "max_shot_duration": 5.0 50 | } 51 | 52 | def _load_pipeline(self): 53 | """Loads the custom WanPipeline and its required VAE.""" 54 | if self.pipe is not None: 55 | return 56 | 57 | print(f"Loading T2V pipeline ({self.config.model_id})...") 58 | 59 | # This model requires loading the VAE separately first 60 | vae = AutoencoderKLWan.from_pretrained( 61 | self.config.model_id, 62 | subfolder="vae", 63 | torch_dtype=torch.float32 # VAE often works better in float32 64 | ) 65 | 66 | # Then, load the main pipeline, passing the VAE to it 67 | self.pipe = WanPipeline.from_pretrained( 68 | self.config.model_id, 69 | vae=vae, 70 | torch_dtype=torch.bfloat16 # bfloat16 is recommended in the example 71 | ) 72 | 73 | self.pipe.enable_model_cpu_offload() 74 | 75 | print(f"T2V ({self.config.model_id}) pipeline loaded to {DEVICE}.") 76 | 77 | def clear_vram(self): 78 | """Clears the VRAM used by the pipeline.""" 79 | print(f"Clearing T2V (Wan) VRAM...") 80 | if self.pipe is not None: 81 | clear_vram_globally(self.pipe) 82 | self.pipe = None 83 | print("T2V (Wan) VRAM cleared.") 84 | 85 | def generate_video_from_text( 86 | self, prompt: str, output_video_path: str, num_frames: int, fps: int, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None 87 | ) -> str: 88 | """Generates a video shot using the Wan T2V pipeline.""" 89 | self._load_pipeline() 90 | 91 | # Gracefully handle the case where character images are passed to a non-supporting model. 
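        # Illustrative call shape, assuming the executor derives num_frames from duration * fps:
        #   generate_video_from_text(prompt, path, num_frames=int(5.0 * fps), fps=fps,
        #                            width=832, height=480)   # limits from get_model_capabilities()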
92 | if ip_adapter_image: 93 | print("="*50) 94 | print("WARNING: The WanT2V module does not support IP-Adapters for character consistency.") 95 | print("The provided character images will be ignored for this T2V generation.") 96 | print("="*50) 97 | 98 | # Use the detailed negative prompt from the model card for best results 99 | negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" 100 | 101 | print(f"Generating Wan T2V ({width}x{height}) for prompt: \"{prompt[:70]}...\"") 102 | 103 | video_frames = self.pipe( 104 | prompt=prompt, 105 | negative_prompt=negative_prompt, 106 | height=height, 107 | width=width, 108 | num_frames=num_frames, 109 | guidance_scale=self.config.guidance_scale, 110 | num_inference_steps=self.config.num_inference_steps 111 | ).frames[0] 112 | 113 | # The system's config determines the final FPS, not the model's example 114 | export_to_video(video_frames, output_video_path, fps=fps) 115 | 116 | print(f"Wan T2V video shot saved to {output_video_path}") 117 | return output_video_path -------------------------------------------------------------------------------- /i2v_modules/i2v_svd.py: -------------------------------------------------------------------------------- 1 | # i2v_modules/i2v_svd.py 2 | import torch 3 | from typing import Dict, Any, List, Optional, Union 4 | from diffusers import StableVideoDiffusionPipeline 5 | from diffusers.utils import load_image, export_to_video 6 | from PIL import Image 7 | 8 | from base_modules import BaseI2V, BaseModuleConfig, ModuleCapabilities 9 | from config_manager import DEVICE, clear_vram_globally, ContentConfig 10 | 11 | class SvdI2VConfig(BaseModuleConfig): 12 | model_id: str = "stabilityai/stable-video-diffusion-img2vid-xt" 13 | decode_chunk_size: int = 8 14 | motion_bucket_id: int = 127 15 | noise_aug_strength: float = 0.02 16 | model_native_frames: int = 25 17 | 18 | class SvdI2V(BaseI2V): 19 | Config = SvdI2VConfig 20 | 21 | @classmethod 22 | def get_capabilities(cls) -> ModuleCapabilities: 23 | return ModuleCapabilities( 24 | title="SVD, Float16, Port/Landscape, No Prompt just image, Max 2 Sec", 25 | vram_gb_min=8.0, 26 | ram_gb_min=12.0, 27 | supported_formats=["Portrait", "Landscape"], 28 | supports_ip_adapter=True, 29 | supports_lora=True, # Juggernaut is a fine-tune, can easily use LoRAs 30 | max_subjects=2, # Can handle one or two IP adapter images 31 | accepts_text_prompt=False, 32 | accepts_negative_prompt=True 33 | ) 34 | 35 | 36 | def get_model_capabilities(self) -> Dict[str, Any]: 37 | return { 38 | "resolutions": {"Portrait": (576, 1024), "Landscape": (1024, 576)}, 39 | "max_shot_duration": 2.0 40 | } 41 | 42 | def enhance_prompt(self, prompt: str, prompt_type: str = "visual") -> str: 43 | # SVD doesn't use text prompts, but this shows how you could add model-specific keywords. 
44 | # For example, for a different model you might do: 45 | # if prompt_type == "visual": 46 | # return f"{prompt}, 8k, photorealistic, cinematic lighting" 47 | return prompt # Return original for SVD 48 | 49 | def _load_pipeline(self): 50 | if self.pipe is None: 51 | print(f"Loading I2V pipeline (SVD): {self.config.model_id}...") 52 | self.pipe = StableVideoDiffusionPipeline.from_pretrained( 53 | self.config.model_id, torch_dtype=torch.float16 54 | ) 55 | self.pipe.enable_model_cpu_offload() 56 | print("I2V (SVD) pipeline loaded.") 57 | 58 | def clear_vram(self): 59 | print("Clearing I2V (SVD) VRAM...") 60 | if self.pipe is not None: clear_vram_globally(self.pipe) 61 | self.pipe = None 62 | print("I2V (SVD) VRAM cleared.") 63 | 64 | def _resize_and_pad(self, image: Image.Image, target_width: int, target_height: int) -> Image.Image: 65 | original_aspect = image.width / image.height; target_aspect = target_width / target_height 66 | if original_aspect > target_aspect: new_width, new_height = target_width, int(target_width / original_aspect) 67 | else: new_height, new_width = target_height, int(target_height * original_aspect) 68 | resized_image = image.resize((new_width, new_height), Image.LANCZOS) 69 | background = Image.new('RGB', (target_width, target_height), (0, 0, 0)) 70 | background.paste(resized_image, ((target_width - new_width) // 2, (target_height - new_height) // 2)) 71 | return background 72 | 73 | def generate_video_from_image(self, image_path: str, output_video_path: str, target_duration: float, content_config: ContentConfig, visual_prompt: str, motion_prompt: Optional[str], ip_adapter_image: Optional[Union[str, List[str]]] = None) -> str: 74 | self._load_pipeline() 75 | 76 | if ip_adapter_image: 77 | print("Warning: SvdI2V module received IP-Adapter image but does not currently implement its use.") 78 | 79 | input_image = load_image(image_path) 80 | svd_target_res = self.get_model_capabilities()["resolutions"] 81 | aspect_ratio = "Landscape" if input_image.width > input_image.height else "Portrait" 82 | svd_target_width, svd_target_height = svd_target_res[aspect_ratio] 83 | prepared_image = self._resize_and_pad(input_image, svd_target_width, svd_target_height) 84 | 85 | calculated_fps = max(1, round(self.config.model_native_frames / target_duration)) if target_duration > 0 else 8 86 | motion_bucket_id = self.config.motion_bucket_id 87 | if motion_prompt: 88 | motion_prompt_lower = motion_prompt.lower() 89 | if any(w in motion_prompt_lower for w in ['fast', 'quick', 'rapid', 'zoom in', 'pan right']): motion_bucket_id = min(255, motion_bucket_id + 50) 90 | elif any(w in motion_prompt_lower for w in ['slow', 'gentle', 'subtle', 'still']): motion_bucket_id = max(0, motion_bucket_id - 50) 91 | print(f"Adjusted motion_bucket_id to {motion_bucket_id} based on prompt: '{motion_prompt}'") 92 | 93 | video_frames = self.pipe( 94 | image=prepared_image, height=svd_target_height, width=svd_target_width, 95 | decode_chunk_size=self.config.decode_chunk_size, num_frames=self.config.model_native_frames, 96 | motion_bucket_id=motion_bucket_id, noise_aug_strength=self.config.noise_aug_strength, 97 | ).frames[0] 98 | 99 | export_to_video(video_frames, output_video_path, fps=calculated_fps) 100 | print(f"SVD video shot saved to {output_video_path}") 101 | return output_video_path -------------------------------------------------------------------------------- /t2v_modules/t2v_ltx.py: -------------------------------------------------------------------------------- 1 | # In t2v_modules/t2v_ltx.py 
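# Note (assumption worth verifying): the 8-bit path below uses BitsAndBytesConfig(load_in_8bit=True),
# which needs the `bitsandbytes` package at runtime; it is not listed in pyproject.toml's dependencies.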
2 | import torch 3 | from typing import Dict, Any, List, Optional, Union 4 | import os 5 | 6 | # --- Import the necessary pipelines and configs --- 7 | from diffusers import LTXPipeline, LTXVideoTransformer3DModel 8 | from diffusers.utils import export_to_video 9 | from transformers import T5EncoderModel, BitsAndBytesConfig as TransformersBitsAndBytesConfig 10 | from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig 11 | 12 | from base_modules import BaseT2V, BaseModuleConfig, ModuleCapabilities 13 | from config_manager import DEVICE, clear_vram_globally 14 | 15 | class LtxT2VConfig(BaseModuleConfig): 16 | model_id: str = "Lightricks/LTX-Video" 17 | use_8bit_quantization: bool = True 18 | num_inference_steps: int = 50 19 | guidance_scale: float = 7.5 20 | decode_timestep: float = 0.03 21 | decode_noise_scale: float = 0.025 22 | # No IP-Adapter configs needed as this pipeline doesn't support them 23 | 24 | class LtxT2V(BaseT2V): 25 | Config = LtxT2VConfig 26 | 27 | # No __init__ needed if we just have the default behavior 28 | 29 | @classmethod 30 | def get_capabilities(cls) -> ModuleCapabilities: 31 | """This module is for pure T2V and does NOT support IP-Adapters.""" 32 | return ModuleCapabilities( 33 | title="LTX, Port/Landscape, No Subject, 5 sec", 34 | vram_gb_min=8.0, 35 | ram_gb_min=12.0, 36 | supported_formats=["Portrait", "Landscape"], 37 | # --- THE CRITICAL CHANGE: Be honest about capabilities --- 38 | supports_ip_adapter=False, 39 | supports_lora=False, # This pipeline doesn't have a LoRA loader either 40 | max_subjects=0, 41 | accepts_text_prompt=True, 42 | accepts_negative_prompt=True 43 | ) 44 | 45 | def get_model_capabilities(self) -> Dict[str, Any]: 46 | return {"resolutions": {"Portrait": (512, 768), "Landscape": (768, 512)}, "max_shot_duration": 5.0} 47 | 48 | def _load_pipeline(self): 49 | if self.pipe is not None: return 50 | 51 | if self.config.use_8bit_quantization: 52 | print(f"Loading T2V pipeline ({self.config.model_id}) with 8-bit quantization...") 53 | text_encoder_quant_config = TransformersBitsAndBytesConfig(load_in_8bit=True) 54 | text_encoder_8bit = T5EncoderModel.from_pretrained(self.config.model_id, subfolder="text_encoder", quantization_config=text_encoder_quant_config, torch_dtype=torch.float16) 55 | transformer_quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True) 56 | transformer_8bit = LTXVideoTransformer3DModel.from_pretrained(self.config.model_id, subfolder="transformer", quantization_config=transformer_quant_config, torch_dtype=torch.float16) 57 | 58 | # Note: We are no longer passing the `image_encoder` as it was being ignored. 
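            # device_map="balanced" lets accelerate place the quantized submodules across the
            # available device(s); the full-precision branch below uses enable_model_cpu_offload() instead.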
59 | self.pipe = LTXPipeline.from_pretrained( 60 | self.config.model_id, 61 | text_encoder=text_encoder_8bit, 62 | transformer=transformer_8bit, 63 | torch_dtype=torch.float16, 64 | device_map="balanced", 65 | ) 66 | print("Quantized T2V pipeline loaded successfully.") 67 | else: 68 | print(f"Loading T2V pipeline ({self.config.model_id}) in full precision...") 69 | self.pipe = LTXPipeline.from_pretrained( 70 | self.config.model_id, 71 | torch_dtype=torch.bfloat16 72 | ) 73 | self.pipe.enable_model_cpu_offload() 74 | 75 | self.pipe.vae.enable_tiling() 76 | print("VAE tiling enabled for memory efficiency.") 77 | 78 | def clear_vram(self): 79 | print(f"Clearing T2V (LTX) VRAM...") 80 | if self.pipe is not None: 81 | clear_vram_globally(self.pipe) 82 | self.pipe = None 83 | print("T2V (LTX) VRAM cleared.") 84 | 85 | def generate_video_from_text( 86 | self, prompt: str, output_video_path: str, num_frames: int, fps: int, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None 87 | ) -> str: 88 | self._load_pipeline() 89 | 90 | # --- THE GRACEFUL HANDLING --- 91 | # If character images are passed, inform the user they are being ignored. 92 | if ip_adapter_image: 93 | print("="*50) 94 | print("WARNING: The LtxT2V module does not support IP-Adapters for character consistency.") 95 | print("The provided character images will be ignored for this T2V generation.") 96 | print("="*50) 97 | 98 | # All IP-Adapter logic is removed. We just call the pipeline. 99 | pipeline_kwargs = {} 100 | 101 | negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted, text, watermark, bad anatomy" 102 | print(f"Generating LTX T2V ({width}x{height}) for prompt: \"{prompt[:50]}...\"") 103 | 104 | video_frames = self.pipe( 105 | prompt=prompt, 106 | negative_prompt=negative_prompt, 107 | width=width, 108 | height=height, 109 | num_frames=num_frames, 110 | num_inference_steps=self.config.num_inference_steps, 111 | guidance_scale=self.config.guidance_scale, 112 | decode_timestep=self.config.decode_timestep, 113 | decode_noise_scale=self.config.decode_noise_scale, 114 | **pipeline_kwargs 115 | ).frames[0] 116 | 117 | export_to_video(video_frames, output_video_path, fps=fps) 118 | 119 | print(f"LTX T2V video shot saved to {output_video_path}") 120 | return output_video_path -------------------------------------------------------------------------------- /i2v_modules/i2v_wan.py: -------------------------------------------------------------------------------- 1 | # In i2v_modules/i2v_wan.py 2 | import torch 3 | import numpy as np 4 | from typing import Dict, Any, List, Optional, Union 5 | from PIL import Image 6 | 7 | # Import the necessary components 8 | from diffusers import WanImageToVideoPipeline, AutoencoderKLWan 9 | from diffusers.utils import export_to_video, load_image 10 | from transformers import CLIPVisionModel, UMT5EncoderModel, T5Tokenizer, CLIPImageProcessor 11 | 12 | from base_modules import BaseI2V, BaseModuleConfig, ModuleCapabilities 13 | from config_manager import DEVICE, clear_vram_globally, ContentConfig 14 | 15 | class WanI2VConfig(BaseModuleConfig): 16 | """Configuration for the Wan 2.1 I2V model.""" 17 | model_id: str = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers" 18 | 19 | num_inference_steps: int = 30 20 | guidance_scale: float = 5.0 21 | 22 | class WanI2V(BaseI2V): 23 | """ 24 | Image-to-Video module using the Wan 2.1 14B pipeline. 
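    Input images are resized to roughly a 480P pixel budget (base_pixel_area in
    get_model_capabilities) and snapped to the stride expected by the VAE/transformer patching,
    as computed in generate_video_from_image.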
25 | """ 26 | Config = WanI2VConfig 27 | 28 | @classmethod 29 | def get_capabilities(cls) -> ModuleCapabilities: 30 | """Declare the capabilities of the Wan 2.1 I2V model.""" 31 | return ModuleCapabilities( 32 | title="Wan 2.1 I2V (14B)", 33 | vram_gb_min=40.0, 34 | ram_gb_min=24.0, 35 | supported_formats=["Portrait", "Landscape"], 36 | supports_ip_adapter=False, 37 | supports_lora=False, 38 | max_subjects=0, 39 | accepts_text_prompt=True, 40 | accepts_negative_prompt=True 41 | ) 42 | 43 | def get_model_capabilities(self) -> Dict[str, Any]: 44 | """Return the specific resolutions and max duration for this model.""" 45 | return { 46 | "resolutions": {"base_pixel_area": 399360}, # 480P model base area 47 | "max_shot_duration": 4.0 48 | } 49 | 50 | def _load_pipeline(self): 51 | """ 52 | Loads the WanImageToVideoPipeline following the official documentation example. 53 | """ 54 | if self.pipe is not None: return 55 | 56 | print(f"Loading I2V pipeline ({self.config.model_id})...") 57 | 58 | # 1. Load individual components with appropriate dtypes 59 | image_encoder = CLIPVisionModel.from_pretrained( 60 | self.config.model_id, 61 | subfolder="image_encoder", 62 | torch_dtype=torch.float32 63 | ) 64 | 65 | vae = AutoencoderKLWan.from_pretrained( 66 | self.config.model_id, 67 | subfolder="vae", 68 | torch_dtype=torch.float32 69 | ) 70 | 71 | # 2. Create the pipeline with the components 72 | self.pipe = WanImageToVideoPipeline.from_pretrained( 73 | self.config.model_id, 74 | vae=vae, 75 | image_encoder=image_encoder, 76 | torch_dtype=torch.bfloat16 77 | ) 78 | 79 | # 3. Enable model CPU offload for memory efficienc y 80 | self.pipe.enable_model_cpu_offload() 81 | 82 | print("I2V (Wan 14B) pipeline loaded successfully.") 83 | 84 | def clear_vram(self): 85 | """Clears the VRAM used by all loaded components.""" 86 | print(f"Clearing I2V (Wan 14B) VRAM...") 87 | if self.pipe is not None: 88 | clear_vram_globally(self.pipe) 89 | self.pipe = None 90 | print("I2V (Wan 14B) VRAM cleared.") 91 | 92 | def generate_video_from_image( 93 | self, image_path: str, output_video_path: str, target_duration: float, 94 | content_config: ContentConfig, visual_prompt: str, motion_prompt: Optional[str], 95 | ip_adapter_image: Optional[Union[str, List[str]]] = None 96 | ) -> str: 97 | """Generates a video by animating a source image using the 14B model.""" 98 | self._load_pipeline() 99 | 100 | input_image = load_image(image_path) 101 | 102 | model_caps = self.get_model_capabilities() 103 | max_area = model_caps["resolutions"]["base_pixel_area"] 104 | aspect_ratio = input_image.height / input_image.width 105 | 106 | # Calculate dimensions using the correct scale factors 107 | mod_value = self.pipe.vae_scale_factor_spatial * self.pipe.transformer.config.patch_size[1] 108 | h = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value 109 | w = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value 110 | prepared_image = input_image.resize((w, h)) 111 | 112 | num_frames = int(target_duration * content_config.fps) 113 | full_prompt = f"{visual_prompt}, {motion_prompt}" if motion_prompt else visual_prompt 114 | negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking 
backwards" 115 | 116 | print(f"Generating Wan I2V ({w}x{h}) from image: {image_path}") 117 | print(f" - Prompt: \"{full_prompt[:70]}...\"") 118 | 119 | video_frames = self.pipe( 120 | image=prepared_image, 121 | prompt=full_prompt, 122 | negative_prompt=negative_prompt, 123 | height=h, 124 | width=w, 125 | num_frames=num_frames, 126 | guidance_scale=self.config.guidance_scale, 127 | num_inference_steps=self.config.num_inference_steps, 128 | ).frames[0] 129 | 130 | export_to_video(video_frames, output_video_path, fps=content_config.fps) 131 | 132 | print(f"Wan I2V 14B video shot saved to {output_video_path}") 133 | return output_video_path -------------------------------------------------------------------------------- /base_modules.py: -------------------------------------------------------------------------------- 1 | # In base_modules.py 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import List, Tuple, Dict, Any, Optional, Union, Literal 5 | from pydantic import BaseModel, Field 6 | 7 | # --- NEW: Define the ModuleCapabilities Contract --- 8 | class ModuleCapabilities(BaseModel): 9 | """A standardized spec sheet for all generation modules.""" 10 | 11 | title: str = Field(description="Title to show in dropdowns") 12 | 13 | # Resource Requirements 14 | vram_gb_min: float = Field(default=4.0, description="Minimum GPU VRAM required in GB.") 15 | ram_gb_min: float = Field(default=8.0, description="Minimum system RAM required in GB.") 16 | 17 | # Format & Control Support 18 | supported_formats: List[Literal["Portrait", "Landscape"]] = Field(default=["Portrait", "Landscape"]) 19 | supports_ip_adapter: bool = Field(default=False, description="True if the module can use IP-Adapter for subject consistency.") 20 | supports_lora: bool = Field(default=False, description="True if the module supports LoRA weights.") 21 | 22 | # Subject & Prompting 23 | max_subjects: int = Field(default=0, description="Maximum number of distinct subjects/characters the module can handle at once (e.g., via IP-Adapter).") 24 | accepts_text_prompt: bool = Field(default=True, description="True if the module uses a text prompt.") 25 | accepts_negative_prompt: bool = Field(default=True, description="True if the module uses a negative prompt.") 26 | 27 | # Type-Specific 28 | supported_tts_languages: List[str] = Field(default=[], description="List of languages supported by a TTS module (e.g., ['en', 'es']).") 29 | 30 | # Forward-declare to avoid circular imports 31 | class ContentConfig(BaseModel): pass 32 | class ProjectState(BaseModel): pass 33 | 34 | # --- Base Configuration Models --- 35 | class BaseModuleConfig(BaseModel): 36 | """Base for all module-specific configurations.""" 37 | model_id: str 38 | 39 | # --- Base Module Classes --- 40 | class BaseLLM(ABC): 41 | """Abstract Base Class for Language Model modules.""" 42 | def __init__(self, config: BaseModuleConfig): 43 | self.config = config 44 | self.model = None 45 | self.tokenizer = None 46 | 47 | # --- NEW: Enforce capabilities contract --- 48 | @classmethod 49 | @abstractmethod 50 | def get_capabilities(cls) -> ModuleCapabilities: 51 | """Returns the spec sheet for this module.""" 52 | raise NotImplementedError 53 | 54 | @abstractmethod 55 | def generate_script(self, topic: str, content_config: ContentConfig) -> Dict[str, Any]: 56 | """Generates the main script, visual prompts, hashtags, and context descriptions.""" 57 | pass 58 | 59 | @abstractmethod 60 | def generate_shot_visual_prompts(self, scene_narration: str, original_scene_prompt: str, 
num_shots: int, content_config: ContentConfig, main_subject: str, setting: str) -> List[Tuple[str, str]]: 61 | """Generates visual and motion prompts for each shot within a scene.""" 62 | pass 63 | 64 | @abstractmethod 65 | def clear_vram(self): 66 | """Clears the VRAM used by the model and tokenizer.""" 67 | pass 68 | 69 | class BaseTTS(ABC): 70 | """Abstract Base Class for Text-to-Speech modules.""" 71 | def __init__(self, config: BaseModuleConfig): 72 | self.config = config 73 | self.model = None 74 | 75 | # --- NEW: Enforce capabilities contract --- 76 | @classmethod 77 | @abstractmethod 78 | def get_capabilities(cls) -> ModuleCapabilities: 79 | """Returns the spec sheet for this module.""" 80 | raise NotImplementedError 81 | 82 | @abstractmethod 83 | def generate_audio(self, text: str, output_dir: str, scene_idx: int, language: str, speaker_wav: Optional[str] = None) -> Tuple[str, float]: 84 | """Generates audio from text.""" 85 | pass 86 | 87 | @abstractmethod 88 | def clear_vram(self): 89 | """Clears the VRAM used by the TTS model.""" 90 | pass 91 | 92 | class BaseVideoGen(ABC): 93 | """A common base for all video generation modules (T2I, I2V, T2V).""" 94 | def __init__(self, config: BaseModuleConfig): 95 | self.config = config 96 | self.pipe = None 97 | 98 | # --- NEW: Enforce capabilities contract --- 99 | @classmethod 100 | @abstractmethod 101 | def get_capabilities(cls) -> ModuleCapabilities: 102 | """Returns the spec sheet for this module.""" 103 | raise NotImplementedError 104 | 105 | @abstractmethod 106 | def get_model_capabilities(self) -> Dict[str, Any]: 107 | """Returns a dictionary of the model's capabilities, like resolutions.""" 108 | pass 109 | 110 | def enhance_prompt(self, prompt: str, prompt_type: str = "visual") -> str: 111 | return prompt 112 | 113 | @abstractmethod 114 | def clear_vram(self): 115 | """Clears the VRAM used by the pipeline.""" 116 | pass 117 | 118 | class BaseT2I(BaseVideoGen): 119 | """Abstract Base Class for Text-to-Image modules.""" 120 | @abstractmethod 121 | def generate_image(self, prompt: str, negative_prompt: str, output_path: str, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None, seed: int = -1) -> str: 122 | """Generates an image from a text prompt, optionally using an IP-Adapter image.""" 123 | pass 124 | 125 | class BaseI2V(BaseVideoGen): 126 | """Abstract Base Class for Image-to-Video modules.""" 127 | @abstractmethod 128 | def generate_video_from_image(self, image_path: str, output_video_path: str, target_duration: float, content_config: ContentConfig, visual_prompt: str, motion_prompt: Optional[str], ip_adapter_image: Optional[Union[str, List[str]]] = None) -> str: 129 | """Generates a video from an initial image, optionally using an IP-Adapter image for style/subject.""" 130 | pass 131 | 132 | class BaseT2V(BaseVideoGen): 133 | """Abstract Base Class for Text-to-Video modules.""" 134 | @abstractmethod 135 | def generate_video_from_text(self, prompt: str, output_video_path: str, num_frames: int, fps: int, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None) -> str: 136 | """Generates a video directly from a text prompt, optionally using an IP-Adapter image.""" 137 | pass -------------------------------------------------------------------------------- /__requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==2.2.2 2 | accelerate==1.6.0 3 | aiofiles==24.1.0 4 | aiohappyeyeballs==2.6.1 5 | aiohttp==3.11.18 6 | 
aiosignal==1.3.2 7 | albucore==0.0.24 8 | albumentations==2.0.7 9 | altair==5.5.0 10 | annotated-types==0.7.0 11 | anyascii==0.3.2 12 | anyio==4.9.0 13 | anykeystore==0.2 14 | apex==0.9.10.dev0 15 | argon2-cffi==23.1.0 16 | argon2-cffi-bindings==21.2.0 17 | arrow==1.3.0 18 | asttokens==3.0.0 19 | async-lru==2.0.5 20 | attrs==25.3.0 21 | audioread==3.0.1 22 | av==12.1.0 23 | babel==2.17.0 24 | beautifulsoup4==4.13.4 25 | bitsandbytes==0.45.5 26 | bleach==6.2.0 27 | blinker==1.9.0 28 | blis==0.7.11 29 | cachetools==5.5.2 30 | catalogue==2.0.10 31 | certifi==2025.4.26 32 | cffi==1.17.1 33 | charset-normalizer==3.4.2 34 | click==8.1.8 35 | cloudpathlib==0.21.1 36 | coloredlogs==15.0.1 37 | comm==0.2.2 38 | confection==0.1.5 39 | consisid-eva-clip==1.0.2 40 | contourpy>=1.3.0 41 | coqpit-config>=0.2.0 42 | coqui-tts>=0.26.0 43 | coqui-tts-trainer>=0.2.3 44 | cryptacular==1.6.2 45 | cycler==0.12.1 46 | cymem==2.0.11 47 | cython==3.1.0 48 | dateparser==1.1.8 49 | debugpy==1.8.14 50 | decorator==5.2.1 51 | defusedxml==0.7.1 52 | diffusers==0.33.1 53 | docopt==0.6.2 54 | easydict==1.13 55 | einops==0.8.1 56 | encodec==0.1.1 57 | executing==2.2.0 58 | facexlib==0.3.0 59 | fastapi==0.115.12 60 | fastjsonschema==2.21.1 61 | ffmpy==0.6.0 62 | filelock==3.18.0 63 | filterpy==1.4.5 64 | flatbuffers==25.2.10 65 | fonttools==4.58.0 66 | fqdn==1.5.1 67 | frozenlist==1.6.0 68 | fsspec==2025.5.1 69 | ftfy==6.3.1 70 | gitdb==4.0.12 71 | gitpython==3.1.44 72 | gradio==5.25.2 73 | gradio-client==1.8.0 74 | greenlet==3.2.2 75 | groovy==0.1.2 76 | grpcio==1.71.0 77 | gruut==2.4.0 78 | gruut-ipa==0.13.0 79 | gruut-lang-de==2.0.1 80 | gruut-lang-en==2.0.1 81 | gruut-lang-es==2.0.1 82 | gruut-lang-fr==2.0.2 83 | h11==0.16.0 84 | hf-xet==1.1.3 85 | httpcore==1.0.9 86 | httpx==0.28.1 87 | huggingface-hub==0.32.4 88 | humanfriendly==10.0 89 | hupper==1.12.1 90 | idna==3.10 91 | imageio==2.37.0 92 | imageio-ffmpeg==0.6.0 93 | importlib-metadata==8.7.0 94 | inflect==7.5.0 95 | inquirerpy==0.3.4 96 | insightface==0.7.3 97 | ipykernel==6.29.5 98 | ipython==9.2.0 99 | ipython-pygments-lexers==1.1.1 100 | ipywidgets==8.1.7 101 | isoduration==20.11.0 102 | jedi==0.19.2 103 | jinja2==3.1.6 104 | joblib==1.5.0 105 | json5==0.12.0 106 | jsonlines==1.2.0 107 | jsonpointer==3.0.0 108 | jsonschema==4.23.0 109 | jsonschema-specifications==2025.4.1 110 | jupyter==1.1.1 111 | jupyter-client==8.6.3 112 | jupyter-console==6.6.3 113 | jupyter-core==5.7.2 114 | jupyter-events==0.12.0 115 | jupyter-lsp==2.2.5 116 | jupyter-server==2.16.0 117 | jupyter-server-terminals==0.5.3 118 | jupyterlab==4.4.2 119 | jupyterlab-pygments==0.3.0 120 | jupyterlab-server==2.27.3 121 | jupyterlab-widgets==3.0.15 122 | kiwisolver==1.4.8 123 | langcodes==3.5.0 124 | language-data==1.3.0 125 | lazy-loader==0.4 126 | librosa>=0.11.0 127 | llvmlite==0.44.0 128 | marisa-trie==1.2.1 129 | markdown==3.8 130 | markdown-it-py==3.0.0 131 | markupsafe==3.0.2 132 | matplotlib==3.10.3 133 | matplotlib-inline==0.1.7 134 | mdurl==0.1.2 135 | mistune==3.1.3 136 | monotonic-alignment-search==0.1.1 137 | more-itertools==10.7.0 138 | moviepy==2.1.2 139 | mpmath==1.3.0 140 | msgpack==1.1.0 141 | multidict==6.4.3 142 | murmurhash==1.0.12 143 | narwhals==1.41.1 144 | nbclient==0.10.2 145 | nbconvert==7.16.6 146 | nbformat==5.10.4 147 | nest-asyncio==1.6.0 148 | networkx==3.5 149 | notebook==7.4.2 150 | notebook-shim==0.2.4 151 | num2words==0.5.14 152 | numba>=0.61.2 153 | numpy>=1.26.2 154 | nvidia-cublas-cu12==12.1.3.1 155 | nvidia-cuda-cupti-cu12==12.1.105 156 | 
nvidia-cuda-nvrtc-cu12==12.1.105 157 | nvidia-cuda-runtime-cu12==12.1.105 158 | nvidia-cudnn-cu12==9.1.0.70 159 | nvidia-cufft-cu12==11.0.2.54 160 | nvidia-cufile-cu12==1.11.1.6 161 | nvidia-curand-cu12==10.3.2.106 162 | nvidia-cusolver-cu12==11.4.5.107 163 | nvidia-cusparse-cu12==12.1.0.106 164 | nvidia-cusparselt-cu12==0.6.3 165 | nvidia-nccl-cu12==2.21.5 166 | nvidia-nvjitlink-cu12==12.6.85 167 | nvidia-nvtx-cu12==12.1.105 168 | oauthlib==3.2.2 169 | onnx==1.18.0 170 | onnxruntime-gpu==1.22.0 171 | opencv-contrib-python==4.11.0.86 172 | opencv-python==4.11.0.86 173 | opencv-python-headless==4.11.0.86 174 | orjson==3.10.18 175 | overrides==7.7.0 176 | packaging==24.2 177 | pandas==2.3.0 178 | pandocfilters==1.5.1 179 | parso==0.8.4 180 | pastedeploy==3.1.0 181 | pbkdf2==1.3 182 | peft==0.15.2 183 | pexpect==4.9.0 184 | pfzy==0.3.4 185 | pillow>=9.2.0,<11.0 186 | plaster==1.1.2 187 | plaster-pastedeploy==1.0.1 188 | platformdirs==4.3.8 189 | pooch==1.8.2 190 | preshed==3.0.9 191 | prettytable==3.16.0 192 | proglog==0.1.12 193 | prometheus-client==0.21.1 194 | prompt-toolkit==3.0.51 195 | propcache==0.3.1 196 | protobuf==6.31.0 197 | psutil==7.0.0 198 | ptyprocess==0.7.0 199 | pure-eval==0.2.3 200 | pyarrow==20.0.0 201 | pycparser==2.22 202 | pydantic==2.11.4 203 | pydantic-core==2.33.2 204 | pydeck==0.9.1 205 | pydub==0.25.1 206 | pyfacer==0.0.5 207 | pygments==2.19.1 208 | pyparsing==3.2.3 209 | pyramid==2.0.2 210 | pyramid-mailer==0.15.1 211 | pysbd==0.3.4 212 | python-crfsuite==0.9.11 213 | python-dateutil==2.9.0.post0 214 | python-dotenv==1.1.0 215 | python-json-logger==3.3.0 216 | python-multipart==0.0.20 217 | python3-openid==3.2.0 218 | pytz==2025.2 219 | pyyaml==6.0.2 220 | pyzmq==26.4.0 221 | referencing==0.36.2 222 | regex==2024.11.6 223 | repoze-sendmail==4.4.1 224 | requests==2.31.0 225 | requests-oauthlib==2.0.0 226 | rfc3339-validator==0.1.4 227 | rfc3986-validator==0.1.1 228 | rich==14.0.0 229 | rpds-py==0.25.0 230 | ruff==0.11.13 231 | safehttpx==0.1.6 232 | safetensors==0.5.3 233 | scikit-image==0.25.2 234 | scikit-learn==1.6.1 235 | scipy==1.12.0 236 | semantic-version==2.10.0 237 | send2trash==1.8.3 238 | sentencepiece==0.2.0 239 | setuptools==80.9.0 240 | shellingham==1.5.4 241 | simsimd==6.2.1 242 | six==1.17.0 243 | smart-open==7.1.0 244 | smmap==5.0.2 245 | sniffio==1.3.1 246 | soundfile==0.13.1 247 | soupsieve==2.7 248 | soxr==0.5.0.post1 249 | spacy==3.7.5 250 | spacy-legacy==3.0.12 251 | spacy-loggers==1.0.5 252 | sqlalchemy==2.0.41 253 | srsly==2.5.1 254 | stack-data==0.6.3 255 | starlette==0.46.2 256 | streamlit==1.45.0 257 | stringzilla==3.12.5 258 | sudachidict-core==20250129 259 | sudachipy==0.6.10 260 | sympy==1.13.1 261 | tenacity==9.1.2 262 | tensorboard==2.19.0 263 | tensorboard-data-server==0.7.2 264 | terminado==0.18.1 265 | thinc==8.2.5 266 | threadpoolctl==3.6.0 267 | tifffile==2025.5.10 268 | timm==1.0.15 269 | tinycss2==1.4.0 270 | tokenizers>=0.20.3 271 | toml==0.10.2 272 | tomlkit==0.13.3 273 | # torch==2.5.1+cu121 274 | # torchaudio==2.5.1+cu121 275 | torchsde==0.2.6 276 | # torchvision==0.20.1+cu121 277 | tornado==6.4.2 278 | tqdm==4.67.1 279 | traitlets==5.14.3 280 | trampoline==0.1.2 281 | transaction==5.0 282 | transformers>=4.46.2 283 | translationstring==1.4 284 | triton==3.1.0 285 | typeguard==4.4.2 286 | typer==0.15.4 287 | types-python-dateutil==2.9.0.20241206 288 | typing-extensions==4.14.0 289 | typing-inspection==0.4.0 290 | tzdata==2025.2 291 | tzlocal==5.3.1 292 | uri-template==1.3.0 293 | urllib3==2.4.0 294 | uvicorn==0.34.3 295 | 
validators==0.35.0 296 | velruse==1.1.1 297 | venusian==3.1.1 298 | wasabi==1.1.3 299 | watchdog==6.0.0 300 | wcwidth==0.2.13 301 | weasel==0.4.1 302 | webcolors==24.11.1 303 | webencodings==0.5.1 304 | webob==1.8.9 305 | websocket-client==1.8.0 306 | websockets==15.0.1 307 | werkzeug==3.1.3 308 | widgetsnbextension==4.0.14 309 | wrapt==1.17.2 310 | wtforms==3.2.1 311 | wtforms-recaptcha==0.3.2 312 | xformers==0.0.29.post1 313 | yarl==1.20.0 314 | zipp==3.22.0 315 | zope-deprecation==5.1 316 | zope-interface==7.2 317 | zope-sqlalchemy==3.1 318 | -------------------------------------------------------------------------------- /t2i_modules/t2i_juggernaut.py: -------------------------------------------------------------------------------- 1 | # In t2i_modules/t2i_juggernaut.py 2 | import torch 3 | from typing import List, Optional, Dict, Any, Union 4 | from diffusers import StableDiffusionXLPipeline, DiffusionPipeline 5 | from diffusers.utils import load_image 6 | from transformers import BitsAndBytesConfig 7 | from diffusers import DPMSolverMultistepScheduler as JuggernautScheduler 8 | 9 | from base_modules import BaseT2I, BaseModuleConfig, ModuleCapabilities 10 | from config_manager import DEVICE, clear_vram_globally 11 | 12 | class JuggernautT2IConfig(BaseModuleConfig): 13 | model_id: str = "RunDiffusion/Juggernaut-XL-v9" 14 | refiner_id: Optional[str] = None 15 | # --- NEW: Flag to control memory-saving quantization --- 16 | use_8bit_quantization: bool = True 17 | num_inference_steps: int = 35 18 | guidance_scale: float = 6.0 19 | ip_adapter_repo: str = "h94/IP-Adapter" 20 | ip_adapter_subfolder: str = "sdxl_models" 21 | ip_adapter_weight_name: str = "ip-adapter_sdxl.bin" 22 | 23 | 24 | class JuggernautT2I(BaseT2I): 25 | Config = JuggernautT2IConfig 26 | 27 | def __init__(self, config: JuggernautT2IConfig): 28 | super().__init__(config) 29 | self.refiner_pipe = None 30 | self._loaded_ip_adapter_count = 0 31 | 32 | @classmethod 33 | def get_capabilities(cls) -> ModuleCapabilities: 34 | return ModuleCapabilities( 35 | title="Juggernaut XL v9 (Quality), 2 Subjects considered", 36 | vram_gb_min=8.0, 37 | ram_gb_min=12.0, 38 | supported_formats=["Portrait", "Landscape"], 39 | supports_ip_adapter=True, 40 | supports_lora=True, 41 | max_subjects=2, 42 | accepts_text_prompt=True, 43 | accepts_negative_prompt=True 44 | ) 45 | 46 | def get_model_capabilities(self) -> Dict[str, Any]: 47 | return { 48 | "resolutions": {"Portrait": (832, 1216), "Landscape": (1216, 832)}, 49 | "max_shot_duration": 3.0 50 | } 51 | 52 | def enhance_prompt(self, prompt: str, prompt_type: str = "visual") -> str: 53 | quality_keywords = "cinematic photography, hyperdetailed, (skin details:1.1), 8k, professional lighting" 54 | if prompt.strip().endswith(','): 55 | return f"{prompt} {quality_keywords}" 56 | else: 57 | return f"{prompt}, {quality_keywords}" 58 | 59 | def _load_pipeline(self): 60 | if self.pipe is None: 61 | if self.config.use_8bit_quantization: 62 | print("Loading T2I pipeline (Juggernaut) with 8-bit quantization to save VRAM...") 63 | bnb_config = BitsAndBytesConfig( 64 | load_in_8bit=True, 65 | ) 66 | # --- START OF FIX: Remove device_map and use .to(DEVICE) instead --- 67 | # This prevents the accelerate hook conflict when loading IP-Adapters later. 
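# The pipeline below is loaded with quantization_config=bnb_config and moved with an
# explicit .to(DEVICE) rather than a device_map, so no accelerate dispatch hooks get
# attached and the later load_ip_adapter() call in generate_image() can work cleanly.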
68 | self.pipe = StableDiffusionXLPipeline.from_pretrained( 69 | self.config.model_id, 70 | quantization_config=bnb_config, 71 | torch_dtype=torch.float16, 72 | variant="fp16", 73 | use_safetensors=True, 74 | ).to(DEVICE) 75 | # --- END OF FIX --- 76 | else: 77 | print(f"Loading T2I pipeline (Juggernaut) in full precision to {DEVICE}...") 78 | self.pipe = StableDiffusionXLPipeline.from_pretrained( 79 | self.config.model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True 80 | ).to(DEVICE) 81 | 82 | self.pipe.scheduler = JuggernautScheduler.from_config(self.pipe.scheduler.config, use_karras_sigmas=True) 83 | print(f"Juggernaut pipeline configured with {JuggernautScheduler.__name__} sampler.") 84 | 85 | if self.config.refiner_id: 86 | print(f"Refiner specified but not typically used with Juggernaut, skipping load.") 87 | 88 | def clear_vram(self): 89 | print("Clearing T2I (Juggernaut) VRAM...") 90 | models = [m for m in [self.pipe, self.refiner_pipe] if m is not None] 91 | if models: clear_vram_globally(*models) 92 | self.pipe, self.refiner_pipe = None, None 93 | self._loaded_ip_adapter_count = 0 94 | print("T2I (Juggernaut) VRAM cleared.") 95 | 96 | def generate_image(self, prompt: str, negative_prompt: str, output_path: str, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None, seed: int = -1) -> str: 97 | self._load_pipeline() 98 | 99 | generator = None 100 | if seed != -1: 101 | print(f"Using fixed seed for generation: {seed}") 102 | generator = torch.Generator(device=self.pipe.device).manual_seed(seed) 103 | else: 104 | print("Using random seed for generation.") 105 | 106 | pipeline_kwargs = {"generator": generator} if generator else {} 107 | ip_images_to_load = [] 108 | 109 | if ip_adapter_image: 110 | if isinstance(ip_adapter_image, str): 111 | ip_images_to_load = [ip_adapter_image] 112 | else: 113 | ip_images_to_load = ip_adapter_image 114 | 115 | num_ip_images = len(ip_images_to_load) 116 | 117 | if num_ip_images > 0: 118 | print(f"Juggernaut T2I: Activating IP-Adapter with {num_ip_images} character image(s).") 119 | if self._loaded_ip_adapter_count != num_ip_images: 120 | print(f"Loading {num_ip_images} IP-Adapter(s) for the pipeline...") 121 | if hasattr(self.pipe, "unload_ip_adapter"): self.pipe.unload_ip_adapter() 122 | adapter_weights = [self.config.ip_adapter_weight_name] * num_ip_images 123 | self.pipe.load_ip_adapter( 124 | self.config.ip_adapter_repo, 125 | subfolder=self.config.ip_adapter_subfolder, 126 | weight_name=adapter_weights 127 | ) 128 | self._loaded_ip_adapter_count = num_ip_images 129 | print(f"Successfully loaded {self._loaded_ip_adapter_count} adapters.") 130 | 131 | scales = [0.6] * num_ip_images 132 | self.pipe.set_ip_adapter_scale(scales) 133 | ip_images = [load_image(p) for p in ip_images_to_load] 134 | pipeline_kwargs["ip_adapter_image"] = ip_images 135 | else: 136 | print("Juggernaut T2I: No IP-Adapter image provided.") 137 | if self._loaded_ip_adapter_count > 0: 138 | if hasattr(self.pipe, "unload_ip_adapter"): self.pipe.unload_ip_adapter() 139 | self._loaded_ip_adapter_count = 0 140 | 141 | enhanced_prompt = self.enhance_prompt(prompt) 142 | print(f"Juggernaut generating image with resolution: {width}x{height}") 143 | print(f" - Prompt: '{enhanced_prompt}'") 144 | print(f" - Negative: '{negative_prompt}'") 145 | 146 | image = self.pipe( 147 | prompt=enhanced_prompt, 148 | negative_prompt=negative_prompt, 149 | width=width, 150 | height=height, 151 | num_inference_steps=self.config.num_inference_steps, 152 | 
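# pipeline_kwargs (built above) forwards the seeded generator and, when character
# references were supplied, the ip_adapter_image list into this SDXL pipeline call.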
guidance_scale=self.config.guidance_scale, 153 | **pipeline_kwargs 154 | ).images[0] 155 | 156 | image.save(output_path) 157 | print(f"Image saved to {output_path}") 158 | return output_path -------------------------------------------------------------------------------- /ui_task_executor.py: -------------------------------------------------------------------------------- 1 | # In ui_task_executor.py 2 | 3 | import streamlit as st 4 | from task_executor import TaskExecutor 5 | from config_manager import ContentConfig 6 | import logging 7 | from typing import List, Optional, Any 8 | import os 9 | from utils import load_and_correct_image_orientation 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class UITaskExecutor: 14 | """Handles task execution triggered from the Streamlit UI, providing user feedback.""" 15 | 16 | def __init__(self, project_manager): 17 | self.project_manager = project_manager 18 | self.task_executor: Optional[TaskExecutor] = None 19 | self._initialize_task_executor() 20 | 21 | def _initialize_task_executor(self): 22 | if not self.project_manager.state: 23 | st.error("Cannot initialize task executor: Project state not found.") 24 | return 25 | try: 26 | self.task_executor = TaskExecutor(self.project_manager) 27 | except Exception as e: 28 | logger.error(f"Failed to initialize TaskExecutor: {e}", exc_info=True) 29 | st.error(f"Configuration Error: {e}") 30 | 31 | def update_narration_text(self, scene_idx: int, text: str): 32 | self.project_manager.update_narration_part_text(scene_idx, text) 33 | 34 | def update_shot_prompts(self, scene_idx: int, shot_idx: int, visual_prompt: Optional[str] = None, motion_prompt: Optional[str] = None): 35 | self.project_manager.update_shot_content(scene_idx, shot_idx, visual_prompt, motion_prompt) 36 | 37 | def regenerate_audio(self, scene_idx: int, text: str, speaker_audio: Optional[str] = None) -> bool: 38 | if not self.task_executor: return False 39 | self.project_manager.update_narration_part_text(scene_idx, text) 40 | task_data = {"scene_idx": scene_idx, "text": text, "speaker_wav": speaker_audio if speaker_audio and os.path.exists(speaker_audio) else None} 41 | success = self.task_executor.execute_task("generate_audio", task_data) 42 | if success: st.toast(f"Audio for Scene {scene_idx + 1} generated!", icon="🔊") 43 | else: st.error(f"Failed to generate audio for Scene {scene_idx + 1}.") 44 | self.project_manager.load_project() 45 | return success 46 | 47 | def create_scene(self, scene_idx: int) -> bool: 48 | if not self.task_executor: return False 49 | success = self.task_executor.execute_task("create_scene", {"scene_idx": scene_idx}) 50 | if success: st.toast(f"Scene {scene_idx + 1} shots created!", icon="🎬") 51 | else: st.error(f"Failed to create shots for Scene {scene_idx + 1}.") 52 | self.project_manager.load_project() 53 | return success 54 | 55 | # --- NEW METHOD --- 56 | def regenerate_scene_shots(self, scene_idx: int) -> bool: 57 | """Resets a scene and triggers the 'create_scene' task to regenerate shots.""" 58 | if not self.task_executor: return False 59 | 60 | # First, reset the scene, clearing old shots and assets 61 | self.project_manager.reset_scene_for_shot_regeneration(scene_idx) 62 | st.toast(f"Cleared old shots for Scene {scene_idx + 1}. 
Regenerating...", icon="♻️") 63 | 64 | # Now, execute the create_scene task which will find the scene missing and create it 65 | success = self.task_executor.execute_task("create_scene", {"scene_idx": scene_idx}) 66 | 67 | if success: 68 | st.toast(f"New shots for Scene {scene_idx + 1} generated!", icon="✨") 69 | else: 70 | st.error(f"Failed to regenerate shots for Scene {scene_idx + 1}.") 71 | 72 | self.project_manager.load_project() 73 | return success 74 | 75 | def regenerate_shot_image(self, scene_idx: int, shot_idx: int) -> bool: 76 | if not self.task_executor: return False 77 | self.project_manager.update_shot_content(scene_idx, shot_idx) 78 | shot = self.project_manager.get_scene_info(scene_idx).shots[shot_idx] 79 | task_data = {"scene_idx": scene_idx, "shot_idx": shot_idx, "visual_prompt": shot.visual_prompt} 80 | success = self.task_executor.execute_task("generate_shot_image", task_data) 81 | if success: st.toast(f"Image for Shot {shot_idx + 1} generated!", icon="🖼️") 82 | else: st.error(f"Failed to generate image for Shot {shot_idx + 1}.") 83 | self.project_manager.load_project() 84 | return success 85 | 86 | def regenerate_shot_video(self, scene_idx: int, shot_idx: int) -> bool: 87 | if not self.task_executor: return False 88 | self.project_manager.update_shot_content(scene_idx, shot_idx) 89 | shot = self.project_manager.get_scene_info(scene_idx).shots[shot_idx] 90 | task_data = { 91 | "scene_idx": scene_idx, "shot_idx": shot_idx, 92 | "visual_prompt": shot.visual_prompt, 93 | "motion_prompt": shot.motion_prompt 94 | } 95 | success = self.task_executor.execute_task("generate_shot_video", task_data) 96 | if success: st.toast(f"Video for Shot {shot_idx + 1} generated!", icon="📹") 97 | else: st.error(f"Failed to generate video for Shot {shot_idx + 1}.") 98 | self.project_manager.load_project() 99 | return success 100 | 101 | def regenerate_shot_t2v(self, scene_idx: int, shot_idx: int) -> bool: 102 | if not self.task_executor: return False 103 | self.project_manager.update_shot_content(scene_idx, shot_idx) 104 | shot = self.project_manager.get_scene_info(scene_idx).shots[shot_idx] 105 | task_data = {"scene_idx": scene_idx, "shot_idx": shot_idx, "visual_prompt": shot.visual_prompt} 106 | success = self.task_executor.execute_task("generate_shot_t2v", task_data) 107 | if success: st.toast(f"T2V Shot {shot_idx + 1} generated!", icon="📹") 108 | else: st.error(f"Failed to generate T2V Shot {shot_idx + 1}.") 109 | self.project_manager.load_project() 110 | return success 111 | 112 | def assemble_final_video(self) -> bool: 113 | if not self.task_executor: return False 114 | success = self.task_executor.execute_task("assemble_final", {}) 115 | if success: st.toast("Final video assembled successfully!", icon="🏆") 116 | else: st.error("Failed to assemble final video.") 117 | self.project_manager.load_project() 118 | return success 119 | 120 | def add_character(self, name: str, image_file: "UploadedFile"): 121 | if not self.project_manager.state: return False 122 | safe_name = name.replace(" ", "_") 123 | char_dir = os.path.join(self.project_manager.output_dir, "characters", safe_name) 124 | os.makedirs(char_dir, exist_ok=True) 125 | ref_image_path = os.path.join(char_dir, "reference.png") 126 | 127 | corrected_image = load_and_correct_image_orientation(image_file) 128 | if corrected_image: 129 | corrected_image.save(ref_image_path, "PNG") 130 | char_data = {"name": name, "reference_image_path": ref_image_path} 131 | self.project_manager.add_character(char_data) 132 | st.toast(f"Character 
'{name}' added!", icon="👤") 133 | return True 134 | else: 135 | st.error(f"Could not process image for new character {name}. Aborting.") 136 | return False 137 | 138 | def update_character(self, old_name: str, new_name: str, new_image_file: Optional["UploadedFile"]): 139 | ref_image_path = None 140 | if new_image_file: 141 | safe_name = (new_name or old_name).replace(" ", "_") 142 | char_dir = os.path.join(self.project_manager.output_dir, "characters", safe_name) 143 | os.makedirs(char_dir, exist_ok=True) 144 | ref_image_path = os.path.join(char_dir, "reference.png") 145 | 146 | corrected_image = load_and_correct_image_orientation(new_image_file) 147 | if corrected_image: 148 | corrected_image.save(ref_image_path, "PNG") 149 | else: 150 | st.error("Failed to process the new image. Character image was not updated.") 151 | ref_image_path = None 152 | 153 | self.project_manager.update_character(old_name, new_name, ref_image_path) 154 | st.toast(f"Character '{old_name}' updated!", icon="✏️") 155 | return True 156 | 157 | def delete_character(self, name: str): 158 | self.project_manager.delete_character(name) 159 | st.toast(f"Character '{name}' deleted!", icon="🗑️") 160 | return True 161 | 162 | def update_project_config(self, key: str, value: Any): 163 | """UI wrapper to update a specific project configuration value.""" 164 | self.project_manager.update_config_value(key, value) 165 | st.toast(f"Setting '{key.replace('_', ' ').title()}' updated.") 166 | st.rerun() 167 | 168 | def update_scene_characters(self, scene_idx: int, character_names: List[str]): 169 | self.project_manager.update_scene_characters(scene_idx, character_names) 170 | st.toast(f"Characters for Scene {scene_idx+1} updated.", icon="🎬") 171 | 172 | def add_new_scene(self, scene_idx: int): 173 | """UI wrapper to add a new scene.""" 174 | self.project_manager.add_new_scene_at(scene_idx) 175 | st.toast(f"New scene added at position {scene_idx + 1}!", icon="➕") 176 | return True 177 | 178 | def remove_scene(self, scene_idx: int): 179 | """UI wrapper to remove a scene.""" 180 | self.project_manager.remove_scene_at(scene_idx) 181 | st.toast(f"Scene {scene_idx + 1} removed!", icon="🗑️") 182 | return True -------------------------------------------------------------------------------- /llm_modules/llm_zephyr.py: -------------------------------------------------------------------------------- 1 | # llm_modules/llm_zephyr.py 2 | import torch 3 | import json 4 | import re 5 | from typing import List, Optional, Tuple, Dict, Any 6 | from transformers import AutoModelForCausalLM, AutoTokenizer 7 | 8 | from base_modules import BaseLLM, BaseModuleConfig, ModuleCapabilities 9 | from config_manager import ContentConfig, DEVICE, clear_vram_globally 10 | 11 | class ZephyrLLMConfig(BaseModuleConfig): 12 | model_id: str = "HuggingFaceH4/zephyr-7b-beta" 13 | max_new_tokens_script: int = 2048 # Increased for new fields 14 | max_new_tokens_shot_prompt: int = 256 15 | temperature: float = 0.7 16 | top_k: int = 50 17 | top_p: float = 0.95 18 | 19 | class ZephyrLLM(BaseLLM): 20 | Config = ZephyrLLMConfig 21 | 22 | @classmethod 23 | def get_capabilities(cls) -> ModuleCapabilities: 24 | return ModuleCapabilities( 25 | title="Zephyr 7B", 26 | vram_gb_min=8.0, 27 | ram_gb_min=16.0, 28 | # LLM-specific capabilities are not the main focus, so we use defaults. 
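# Defaults from ModuleCapabilities apply for the rest: supports_ip_adapter=False,
# supports_lora=False, max_subjects=0, both Portrait and Landscape formats, and
# text/negative prompts accepted.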
29 | ) 30 | 31 | def _load_model_and_tokenizer(self): 32 | if self.model is None or self.tokenizer is None: 33 | print(f"Loading LLM: {self.config.model_id}...") 34 | self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_id) 35 | if self.tokenizer.pad_token is None: 36 | self.tokenizer.pad_token = self.tokenizer.eos_token 37 | 38 | try: 39 | self.model = AutoModelForCausalLM.from_pretrained( 40 | self.config.model_id, torch_dtype=torch.float16 41 | ).to(DEVICE) 42 | except Exception as e: 43 | print(f"Failed to load LLM with device_map='auto' ({e}), trying with explicit device: {DEVICE}") 44 | self.model = AutoModelForCausalLM.from_pretrained( 45 | self.config.model_id, torch_dtype=torch.float16 46 | ).to(DEVICE) 47 | print("LLM loaded.") 48 | 49 | def clear_vram(self): 50 | print("Clearing LLM VRAM...") 51 | models_to_clear = [m for m in [self.model] if m is not None] 52 | if models_to_clear: clear_vram_globally(*models_to_clear) 53 | self.model, self.tokenizer = None, None 54 | print("LLM VRAM cleared.") 55 | 56 | def _parse_llm_json_response(self, decoded_output: str, context: str = "script") -> Optional[Dict]: 57 | match = re.search(r'\{[\s\S]*\}', decoded_output) 58 | json_text = match.group(0) if match else decoded_output 59 | try: 60 | return json.loads(re.sub(r',(\s*[}\]])', r'\1', json_text)) 61 | except json.JSONDecodeError as e: 62 | print(f"Error parsing LLM JSON for {context}: {e}. Raw output:\n{decoded_output}") 63 | return None 64 | 65 | def generate_script(self, topic: str, content_config: ContentConfig) -> Dict[str, Any]: 66 | self._load_model_and_tokenizer() 67 | print(f"Generating script for topic: '{topic}' in language: {content_config.language}") 68 | 69 | # --- MODIFICATION START --- 70 | # Map language code to full name for better prompting 71 | language_map = { 72 | 'en': 'English', 'es': 'Spanish', 'fr': 'French', 73 | 'de': 'German', 'it': 'Italian', 'pt': 'Portuguese', 74 | 'pl': 'Polish', 'tr': 'Turkish', 'ru': 'Russian', 75 | 'nl': 'Dutch', 'cs': 'Czech', 'ar': 'Arabic', 76 | 'zh-cn': 'Chinese (Simplified)', 'ja': 'Japanese', 77 | 'hu': 'Hungarian', 'ko': 'Korean', 'hi': 'Hindi' 78 | } 79 | target_language = language_map.get(content_config.language, 'English') 80 | 81 | system_prompt = ( 82 | "You are a multilingual AI assistant creating content for a short video. " 83 | "You will be asked to write the narration in a specific language, but all other content (visual prompts, descriptions, hashtags) must be in English for the video generation models. " 84 | "Your response must be a single, valid JSON object with these exact keys: " 85 | "\"main_subject_description\", \"setting_description\", \"narration\", \"visuals\", \"hashtags\"." 86 | ) 87 | 88 | user_prompt = f""" 89 | **IMPORTANT INSTRUCTIONS:** 90 | 1. The **"narration"** text MUST be written in **{target_language}**. Use the native script if applicable (e.g., Devanagari for Hindi). 91 | 2. Use proper punctuation (like commas and periods) in the narration for a natural-sounding voiceover. 92 | 3. All other fields ("main_subject_description", "setting_description", "visuals", "hashtags") MUST remain in **English**. 93 | 94 | --- 95 | Create content for a short video about "{topic}". 96 | The total narration should be ~{content_config.target_video_length_hint}s, with {content_config.min_scenes} to {content_config.max_scenes} scenes. 97 | Each scene's narration should be ~{content_config.max_scene_narration_duration_hint}s. 
98 | 99 | Return your response in this exact JSON format: 100 | {{ 101 | "main_subject_description": "A detailed, consistent description of the main character or subject (e.g., 'Fluffy, a chubby but cute orange tabby cat with green eyes'). MUST BE IN ENGLISH.", 102 | "setting_description": "A description of the primary environment (e.g., 'a cozy, sunlit living room with plush furniture'). MUST BE IN ENGLISH.", 103 | "narration": [ 104 | {{"scene": 1, "text": "First scene narration text, written in {target_language}.", "duration_estimate": {content_config.max_scene_narration_duration_hint}}} 105 | ], 106 | "visuals": [ 107 | {{"scene": 1, "prompt": "Detailed visual prompt for scene 1. MUST BE IN ENGLISH."}} 108 | ], 109 | "hashtags": ["relevantTag1", "relevantTag2"] 110 | }} 111 | """ 112 | # --- MODIFICATION END --- 113 | 114 | messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}] 115 | 116 | for attempt in range(3): 117 | print(f"Attempt {attempt + 1} of 3 to generate valid script JSON...") 118 | 119 | tokenized_chat = self.tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(self.model.device) 120 | outputs = self.model.generate( 121 | input_ids=tokenized_chat, max_new_tokens=self.config.max_new_tokens_script, 122 | do_sample=True, top_k=self.config.top_k, top_p=self.config.top_p, 123 | temperature=self.config.temperature, pad_token_id=self.tokenizer.eos_token_id 124 | ) 125 | decoded_output = self.tokenizer.decode(outputs[0][tokenized_chat.shape[-1]:], skip_special_tokens=True) 126 | response_data = self._parse_llm_json_response(decoded_output, "script") 127 | 128 | if response_data and all(k in response_data for k in ["narration", "visuals", "main_subject_description"]): 129 | print("Successfully generated and parsed valid script JSON.") 130 | return { 131 | "main_subject_description": response_data.get("main_subject_description"), 132 | "setting_description": response_data.get("setting_description"), 133 | "narration": sorted(response_data.get("narration", []), key=lambda x: x["scene"]), 134 | "visuals": [p["prompt"] for p in sorted(response_data.get("visuals", []), key=lambda x: x["scene"])], 135 | "hashtags": response_data.get("hashtags", []) 136 | } 137 | else: 138 | print(f"Attempt {attempt + 1} failed. The response was not a valid JSON or was missing required keys.") 139 | if attempt < 2: 140 | print("Retrying...") 141 | 142 | print("LLM script generation failed after 3 attempts. Using fallback.") 143 | # Fallback remains in English as a safe default 144 | return { 145 | "main_subject_description": topic, "setting_description": "a simple background", 146 | "narration": [{"text": f"An intro to {topic}.", "duration_estimate": 5.0}], 147 | "visuals": [f"Cinematic overview of {topic}."], "hashtags": [f"#{topic.replace(' ', '')}"] 148 | } 149 | 150 | def generate_shot_visual_prompts(self, scene_narration: str, original_scene_prompt: str, num_shots: int, content_config: ContentConfig, main_subject: str, setting: str) -> List[Tuple[str, str]]: 151 | self._load_model_and_tokenizer() 152 | shot_prompts = [] 153 | 154 | # Define the prompts, which are the same for each shot generation call 155 | system_prompt = ( 156 | "You are an Movie director. Your task is to generate a 'visual_prompt' and a 'motion_prompt' for a short video shot " 157 | "The prompts MUST incorporate the provided main subject and setting. Do NOT change the subject. 
" 158 | "Respond in this exact JSON format: {\"visual_prompt\": \"...\", \"motion_prompt\": \"...\"}" 159 | ) 160 | 161 | for shot_idx in range(num_shots): 162 | print(f"--- Generating prompts for Shot {shot_idx + 1}/{num_shots} ---") 163 | 164 | # --- NEW: Defensive check to prevent intermittent crashes --- 165 | # This handles rare cases where the model/tokenizer might be cleared from memory 166 | # between calls within the same task execution. 167 | if self.model is None or self.tokenizer is None: 168 | print("WARNING: LLM was unloaded unexpectedly. Forcing a reload before generating shot prompt.") 169 | self._load_model_and_tokenizer() 170 | 171 | user_prompt = f""" 172 | **Main Subject (MUST BE INCLUDED):** {main_subject} 173 | **Setting (MUST BE INCLUDED):** {setting} 174 | 175 | --- 176 | **Original Scene Goal:** "{original_scene_prompt}" 177 | **This Shot's Narration:** "{scene_narration}" 178 | 179 | Based on ALL the information above, create a visual and motion prompt for shot {shot_idx + 1}/{num_shots}. 180 | The visual prompt should be a specific, detailed moment consistent with the subject and setting. 181 | try to describe the visual prompt in minimum words but in very specific details what a director would want the image to look like. 182 | Descrive character, subject and envrionment in words, only chose important words no need to make complete sentances. 183 | try to describe the visual prompt in minimum words but in very specific details what a director would want the image to look like. 184 | Descrive character, subject and envrionment in words, only chose important words no need to make complete sentances. 185 | Also descirbe camera mm, shot type, location, lighting, color, mood, etc. 186 | Do not include any other text or comments other then given json format. 187 | """ 188 | messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}] 189 | 190 | visual_prompt, motion_prompt = None, None 191 | 192 | # --- MODIFICATION START: Add retry loop for each shot --- 193 | for attempt in range(3): 194 | print(f"Attempt {attempt + 1} of 3 to generate valid prompt JSON for shot {shot_idx + 1}...") 195 | 196 | tokenized_chat = self.tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(self.model.device) 197 | outputs = self.model.generate( 198 | input_ids=tokenized_chat, max_new_tokens=self.config.max_new_tokens_shot_prompt, 199 | do_sample=True, temperature=self.config.temperature, pad_token_id=self.tokenizer.eos_token_id 200 | ) 201 | decoded_output = self.tokenizer.decode(outputs[0][tokenized_chat.shape[-1]:], skip_special_tokens=True) 202 | response_data = self._parse_llm_json_response(decoded_output, f"shot {shot_idx+1} prompt") 203 | 204 | # Check for a dictionary with both required string keys 205 | if (isinstance(response_data, dict) and 206 | isinstance(response_data.get("visual_prompt"), str) and 207 | isinstance(response_data.get("motion_prompt"), str)): 208 | 209 | visual_prompt = response_data["visual_prompt"] 210 | motion_prompt = response_data["motion_prompt"] 211 | print(f"Successfully generated and parsed prompts for shot {shot_idx + 1}.") 212 | break # Exit the retry loop on success 213 | else: 214 | print(f"Attempt {attempt + 1} failed for shot {shot_idx + 1}. 
Invalid JSON or missing keys.") 215 | # --- MODIFICATION END --- 216 | 217 | # If after 3 attempts, we still don't have prompts, use the fallback 218 | if not visual_prompt or not motion_prompt: 219 | print(f"All attempts failed for shot {shot_idx + 1}. Using fallback prompts.") 220 | visual_prompt = f"{main_subject} in {setting}, {original_scene_prompt}" 221 | motion_prompt = "gentle camera movement" 222 | 223 | shot_prompts.append((visual_prompt, motion_prompt)) 224 | print(f" > Shot {shot_idx+1} Visual: \"{visual_prompt[:80]}...\"") 225 | print(f" > Shot {shot_idx+1} Motion: \"{motion_prompt[:80]}...\"") 226 | 227 | return shot_prompts -------------------------------------------------------------------------------- /task_executor.py: -------------------------------------------------------------------------------- 1 | # In task_executor.py 2 | import logging 3 | import math 4 | import os 5 | import random 6 | from typing import Optional, Dict 7 | import torch 8 | from importlib import import_module 9 | 10 | from project_manager import ProjectManager, STATUS_IMAGE_GENERATED, STATUS_VIDEO_GENERATED, STATUS_FAILED 11 | from config_manager import ContentConfig 12 | from video_assembly import assemble_final_reel, assemble_scene_video_from_sub_clips 13 | from base_modules import ModuleCapabilities 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | def _import_class(module_path_str: str): 18 | module_path, class_name = module_path_str.rsplit('.', 1) 19 | module = import_module(module_path) 20 | return getattr(module, class_name) 21 | 22 | class TaskExecutor: 23 | def __init__(self, project_manager: ProjectManager): 24 | self.project_manager = project_manager 25 | self.content_cfg = ContentConfig(**self.project_manager.state.project_info.config) 26 | 27 | module_selections = self.content_cfg.module_selections 28 | if not module_selections: 29 | raise ValueError("Project state is missing module selections. 
Cannot initialize TaskExecutor.") 30 | 31 | # --- START OF FIX: Use .get() for safe module loading to prevent crashes --- 32 | 33 | # LLM and TTS are always required 34 | LlmClass = _import_class(module_selections["llm"]) 35 | self.llm_module = LlmClass(LlmClass.Config()) 36 | 37 | TtsClass = _import_class(module_selections["tts"]) 38 | self.tts_module = TtsClass(TtsClass.Config()) 39 | 40 | # Video modules are optional depending on the flow 41 | self.t2i_module = None 42 | self.i2v_module = None 43 | self.t2v_module = None 44 | 45 | t2i_path = module_selections.get("t2i") 46 | if t2i_path: 47 | T2iClass = _import_class(t2i_path) 48 | self.t2i_module = T2iClass(T2iClass.Config()) 49 | 50 | i2v_path = module_selections.get("i2v") 51 | if i2v_path: 52 | I2vClass = _import_class(i2v_path) 53 | self.i2v_module = I2vClass(I2vClass.Config()) 54 | 55 | t2v_path = module_selections.get("t2v") 56 | if t2v_path: 57 | T2vClass = _import_class(t2v_path) 58 | self.t2v_module = T2vClass(T2vClass.Config()) 59 | 60 | # Determine capabilities based on which modules were actually loaded 61 | self.active_flow_supports_characters = False 62 | if self.content_cfg.use_svd_flow and self.t2i_module: 63 | t2i_caps = self.t2i_module.get_capabilities() 64 | self.active_flow_supports_characters = t2i_caps.supports_ip_adapter 65 | logger.info("Decisive module for character support: T2I module.") 66 | elif not self.content_cfg.use_svd_flow and self.t2v_module: 67 | t2v_caps = self.t2v_module.get_capabilities() 68 | self.active_flow_supports_characters = t2v_caps.supports_ip_adapter 69 | logger.info("Decisive module for character support: T2V module.") 70 | # --- END OF FIX --- 71 | 72 | logger.info(f"Holistic check: Active flow supports characters: {self.active_flow_supports_characters}") 73 | self._configure_from_model_capabilities() 74 | 75 | def _configure_from_model_capabilities(self): 76 | logger.info("--- TaskExecutor: Configuring run from model capabilities... ---") 77 | if self.content_cfg.use_svd_flow: 78 | if self.t2i_module and self.i2v_module: 79 | t2i_caps = self.t2i_module.get_model_capabilities() 80 | i2v_caps = self.i2v_module.get_model_capabilities() 81 | self.content_cfg.generation_resolution = t2i_caps["resolutions"].get(self.content_cfg.aspect_ratio_format) 82 | self.content_cfg.model_max_video_shot_duration = i2v_caps.get("max_shot_duration", 3.0) 83 | else: 84 | logger.warning("Warning: T2I or I2V module not loaded for I2V flow. Using default configurations.") 85 | else: # T2V Flow 86 | if self.t2v_module: 87 | t2v_caps = self.t2v_module.get_model_capabilities() 88 | self.content_cfg.generation_resolution = t2v_caps["resolutions"].get(self.content_cfg.aspect_ratio_format) 89 | self.content_cfg.model_max_video_shot_duration = t2v_caps.get("max_shot_duration", 2.0) 90 | else: 91 | logger.warning("Warning: T2V module not loaded for T2V flow. 
Using default configurations.") 92 | 93 | logger.info(f"Dynamically set Generation Resolution to: {self.content_cfg.generation_resolution}") 94 | logger.info(f"Dynamically set Max Shot Duration to: {self.content_cfg.model_max_video_shot_duration}s") 95 | self.project_manager.state.project_info.config = self.content_cfg.model_dump() 96 | self.project_manager._save_state() 97 | 98 | def execute_task(self, task: str, task_data: Dict) -> bool: 99 | try: 100 | # --- START OF FIX: Refresh config before every task to prevent stale state --- 101 | self.content_cfg = ContentConfig(**self.project_manager.state.project_info.config) 102 | logger.info(f"Executing task '{task}' with add_narration_text set to: {self.content_cfg.add_narration_text_to_video}") 103 | # --- END OF FIX --- 104 | 105 | task_map = { 106 | "generate_script": self._execute_generate_script, "generate_audio": self._execute_generate_audio, 107 | "create_scene": self._execute_create_scene, "generate_shot_image": self._execute_generate_shot_image, 108 | "generate_shot_video": self._execute_generate_shot_video, "generate_shot_t2v": self._execute_generate_shot_t2v, 109 | "assemble_scene": self._execute_assemble_scene, "assemble_final": self._execute_assemble_final, 110 | } 111 | if task in task_map: return task_map[task](**task_data) 112 | logger.error(f"Unknown task: {task}"); return False 113 | except Exception as e: 114 | logger.error(f"Error executing task {task}: {e}", exc_info=True); return False 115 | 116 | def _execute_generate_script(self, topic: str) -> bool: 117 | script_data = self.llm_module.generate_script(topic, self.content_cfg) 118 | self.llm_module.clear_vram() 119 | self.project_manager.update_script(script_data) 120 | return True 121 | 122 | def _execute_generate_audio(self, scene_idx: int, text: str, speaker_wav: Optional[str] = None) -> bool: 123 | path, duration = self.tts_module.generate_audio(text, self.content_cfg.output_dir, scene_idx, language=self.content_cfg.language, speaker_wav=speaker_wav) 124 | self.project_manager.update_narration_part_status(scene_idx, "generated", path, duration if duration > 0.1 else 0.0) 125 | return True 126 | 127 | def _execute_create_scene(self, scene_idx: int) -> bool: 128 | narration = self.project_manager.state.script.narration_parts[scene_idx] 129 | visual_prompt = self.project_manager.state.script.visual_prompts[scene_idx] 130 | main_subject = self.project_manager.state.script.main_subject_description 131 | setting = self.project_manager.state.script.setting_description 132 | 133 | actual_audio_duration = narration.duration 134 | max_shot_duration = self.content_cfg.model_max_video_shot_duration 135 | 136 | if actual_audio_duration <= 0 or max_shot_duration <= 0: 137 | num_shots = 1 138 | logger.warning(f"Warning: Invalid duration detected for Scene {scene_idx} (Audio: {actual_audio_duration}s, Max Shot: {max_shot_duration}s). 
Defaulting to 1 shot.") 139 | else: 140 | num_shots = math.ceil(actual_audio_duration / max_shot_duration) or 1 141 | 142 | logger.info("--- Calculating Shots for Scene {} ---".format(scene_idx)) 143 | logger.info(f" - Actual Audio Duration: {actual_audio_duration:.2f}s") 144 | logger.info(f" - Model's Max Shot Duration: {max_shot_duration:.2f}s") 145 | logger.info(f" - Calculated Number of Shots: {num_shots} ({actual_audio_duration:.2f}s / {max_shot_duration:.2f}s)") 146 | 147 | shot_prompts = self.llm_module.generate_shot_visual_prompts( 148 | narration.text, visual_prompt.prompt, num_shots, self.content_cfg, main_subject, setting 149 | ) 150 | self.llm_module.clear_vram() 151 | 152 | shots = [] 153 | for i, (visual, motion) in enumerate(shot_prompts): 154 | if i < num_shots - 1: 155 | duration = max_shot_duration 156 | else: 157 | duration = actual_audio_duration - (i * max_shot_duration) 158 | 159 | shots.append({"shot_idx": i, "target_duration": max(0.5, duration), "visual_prompt": visual, "motion_prompt": motion}) 160 | 161 | all_character_names = [char.name for char in self.project_manager.state.characters] 162 | logger.info(f"Creating Scene {scene_idx} and assigning default characters: {all_character_names}") 163 | self.project_manager.add_scene(scene_idx, shots, character_names=all_character_names) 164 | return True 165 | 166 | def _execute_generate_shot_image(self, scene_idx: int, shot_idx: int, visual_prompt: str, **kwargs) -> bool: 167 | if not self.t2i_module: 168 | logger.error("Attempted to generate image, but T2I module is not loaded for this workflow.") 169 | return False 170 | w, h = self.content_cfg.generation_resolution 171 | path = os.path.join(self.content_cfg.output_dir, f"scene_{scene_idx}_shot_{shot_idx}_keyframe.png") 172 | 173 | base_seed = self.content_cfg.seed 174 | shot_seed = random.randint(0, 2**32 - 1) if base_seed == -1 else base_seed + scene_idx * 100 + shot_idx 175 | 176 | negative_prompt = "worst quality, low quality, bad anatomy, text, watermark, jpeg artifacts, blurry" 177 | 178 | scene = self.project_manager.get_scene_info(scene_idx) 179 | ip_adapter_image_paths = [] 180 | if scene and scene.character_names: 181 | logger.info(f"Found characters for Scene {scene_idx}: {scene.character_names}") 182 | for name in scene.character_names: 183 | char = self.project_manager.get_character(name) 184 | if char and os.path.exists(char.reference_image_path): 185 | ip_adapter_image_paths.append(char.reference_image_path) 186 | 187 | self.t2i_module.generate_image( 188 | prompt=visual_prompt, negative_prompt=negative_prompt, output_path=path, 189 | width=w, height=h, ip_adapter_image=ip_adapter_image_paths or None, seed=shot_seed 190 | ) 191 | 192 | self.project_manager.update_shot_status(scene_idx, shot_idx, STATUS_IMAGE_GENERATED, keyframe_path=path) 193 | self.t2i_module.clear_vram() 194 | return True 195 | 196 | def _execute_generate_shot_video(self, scene_idx: int, shot_idx: int, visual_prompt: str, motion_prompt: Optional[str], **kwargs) -> bool: 197 | if not self.i2v_module: 198 | logger.error("Attempted to generate video from image, but I2V module is not loaded for this workflow.") 199 | return False 200 | shot = self.project_manager.get_scene_info(scene_idx).shots[shot_idx] 201 | if not shot.keyframe_image_path or not os.path.exists(shot.keyframe_image_path): return False 202 | 203 | enhanced_visual = self.i2v_module.enhance_prompt(visual_prompt, "visual") 204 | enhanced_motion = self.i2v_module.enhance_prompt(motion_prompt, "motion") 205 | 206 | scene = 
self.project_manager.get_scene_info(scene_idx) 207 | ip_adapter_image_paths = [self.project_manager.get_character(name).reference_image_path for name in scene.character_names if self.project_manager.get_character(name)] 208 | 209 | video_path = os.path.join(self.content_cfg.output_dir, f"scene_{scene_idx}_shot_{shot_idx}_svd.mp4") 210 | 211 | sub_clip_path = self.i2v_module.generate_video_from_image( 212 | image_path=shot.keyframe_image_path, output_video_path=video_path, target_duration=shot.target_duration, 213 | content_config=self.content_cfg, visual_prompt=enhanced_visual, motion_prompt=enhanced_motion, 214 | ip_adapter_image=ip_adapter_image_paths or None 215 | ) 216 | 217 | if sub_clip_path and os.path.exists(sub_clip_path): 218 | self.project_manager.update_shot_status(scene_idx, shot_idx, STATUS_VIDEO_GENERATED, video_path=sub_clip_path) 219 | return True 220 | self.project_manager.update_shot_status(scene_idx, shot_idx, STATUS_FAILED); return False 221 | 222 | def _execute_generate_shot_t2v(self, scene_idx: int, shot_idx: int, visual_prompt: str, **kwargs) -> bool: 223 | if not self.t2v_module: 224 | logger.error("Attempted to generate video from text, but T2V module is not loaded for this workflow.") 225 | return False 226 | shot = self.project_manager.get_scene_info(scene_idx).shots[shot_idx] 227 | num_frames = int(shot.target_duration * self.content_cfg.fps) 228 | w, h = self.content_cfg.generation_resolution 229 | 230 | enhanced_prompt = self.t2v_module.enhance_prompt(visual_prompt) 231 | 232 | scene = self.project_manager.get_scene_info(scene_idx) 233 | ip_adapter_image_paths = [self.project_manager.get_character(name).reference_image_path for name in scene.character_names if self.project_manager.get_character(name)] 234 | 235 | video_path = os.path.join(self.content_cfg.output_dir, f"scene_{scene_idx}_shot_{shot_idx}_t2v.mp4") 236 | 237 | sub_clip_path = self.t2v_module.generate_video_from_text( 238 | enhanced_prompt, video_path, num_frames, self.content_cfg.fps, w, h, 239 | ip_adapter_image=ip_adapter_image_paths or None 240 | ) 241 | 242 | if sub_clip_path and os.path.exists(sub_clip_path): 243 | self.project_manager.update_shot_status(scene_idx, shot_idx, STATUS_VIDEO_GENERATED, video_path=sub_clip_path) 244 | return True 245 | self.project_manager.update_shot_status(scene_idx, shot_idx, STATUS_FAILED); return False 246 | 247 | def _execute_assemble_scene(self, scene_idx: int, **kwargs) -> bool: 248 | scene = self.project_manager.get_scene_info(scene_idx) 249 | if not scene: return False 250 | video_paths = [c.video_path for c in scene.shots if c.status == STATUS_VIDEO_GENERATED] 251 | if len(video_paths) != len(scene.shots): return False 252 | 253 | narration_duration = self.project_manager.state.script.narration_parts[scene_idx].duration 254 | final_path = assemble_scene_video_from_sub_clips(video_paths, narration_duration, self.content_cfg, scene_idx) 255 | 256 | if final_path: 257 | self.project_manager.update_scene_status(scene_idx, "completed", assembled_video_path=final_path) 258 | return True 259 | self.project_manager.update_scene_status(scene_idx, "failed"); return False 260 | 261 | def _execute_assemble_final(self, **kwargs) -> bool: 262 | narration_parts = self.project_manager.state.script.narration_parts 263 | assets = [ 264 | (s.assembled_video_path, narration_parts[s.scene_idx].audio_path, {"text": narration_parts[s.scene_idx].text, "duration": narration_parts[s.scene_idx].duration}) 265 | for s in self.project_manager.state.scenes if s.status == 
"completed" 266 | ] 267 | if len(assets) != len(self.project_manager.state.scenes): return False 268 | 269 | topic = self.project_manager.state.project_info.topic 270 | final_path = assemble_final_reel(assets, self.content_cfg, output_filename=f"{topic.replace(' ','_')}_final.mp4") 271 | 272 | if final_path and os.path.exists(final_path): 273 | text = " ".join([a[2]["text"] for a in assets]) 274 | hashtags = self.project_manager.state.script.hashtags 275 | self.project_manager.update_final_video(final_path, "generated", text, hashtags) 276 | return True 277 | self.project_manager.update_final_video("", "pending", "", []); return False -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Modular AI Video Generation Pipeline 2 | 3 | [![Python Version](https://img.shields.io/badge/Python-3.10%2B-blue.svg)](https://www.python.org/) 4 | [![Framework](https://img.shields.io/badge/Framework-Streamlit-red.svg)](https://streamlit.io) 5 | [![License](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE) 6 | 7 | ## ⚠️ Important Notes 8 | 9 | **Video Quality Issues**: If your generated videos appear scrambled or distorted, this typically means you're not using the optimal video dimensions that the selected model was trained on. Each AI model has specific resolution requirements for best results. Check the model documentation for recommended dimensions and adjust your video settings accordingly. 10 | 11 | **Contributors Welcome!** 🚀 This project is open to contributions from the community. If you're interested in helping improve this pipeline, adding new models, or fixing bugs, please feel free to submit pull requests or open issues. 12 | 13 | **New Project Announcement**: I've started working on a completely separate and different video generation project. If you're interested in learning more or collaborating, feel free to reach out to me on [LinkedIn](https://www.linkedin.com/in/gowravvishwakarma/)! 14 | 15 | --- 16 | 17 | An extensible, modular pipeline for generating short-form videos using a variety of AI models. This tool provides a powerful Streamlit-based web interface to define a video topic, select different AI models for each generation step (language, speech, image, video), and orchestrate the entire content creation process from script to final rendered video. 18 | 19 | ## 🎥 Demo Video 20 | 21 |
22 | 
23 | _Demo video: "Modular AI Video Generation Pipeline Demo". Watch the full demo on YouTube._
24 | 
25 | 
26 | 
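As a quick illustration of the resolution note above (this snippet is not part of the pipeline): the project's `module_discovery.discover_modules()` helper lists every module it can find, and an instantiated module reports its recommended resolutions through `get_model_capabilities()`. Constructing `SvdI2V()` with its default config and running from the repository root are assumptions here; adapt the import to whichever module you actually select.

```python
# Sketch only: list discovered Image-to-Video modules, then ask one of them
# which resolutions it was trained for. Assumes the repo root is on PYTHONPATH
# and that SvdI2V can be built with its default Config (adjust if it cannot).
from module_discovery import discover_modules
from i2v_modules import SvdI2V  # example pick; any discovered module class works the same way

for mod in discover_modules().get("i2v", []):
    caps = mod["caps"]
    print(f"{mod['path']}: {caps.title} (needs {caps.vram_gb_min} GB VRAM)")

svd = SvdI2V()  # assumption: the module's default config is sufficient
info = svd.get_model_capabilities()
print(info["resolutions"])        # e.g. {"Portrait": (w, h), "Landscape": (w, h)}
print(info["max_shot_duration"])  # longest clip the model generates in one pass
```

Matching your project's generation resolution to one of the reported pairs should help avoid the scrambled output described above.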
27 | 28 | ## Core Features 29 | 30 | - **End-to-End Video Generation**: Go from a single topic idea to a fully edited video with narration, background visuals, and text overlays in one integrated workflow. 31 | - **Fully Modular Architecture**: Easily add, remove, or swap different AI models for each part of the pipeline. The system is designed for extension. 32 | - **Dynamic Model Discovery**: The application automatically discovers any new model modules you add, making them immediately available for selection in the UI. 33 | - **Dual Generation Workflows**: 34 | - **Image-to-Video (High Quality)**: Generates a keyframe image first, then animates it. Offers higher visual quality and control. 35 | - **Text-to-Video (Fast)**: Generates video clips directly from text prompts for a faster, more streamlined process. 36 | - **Character Consistency**: Utilizes IP-Adapters in supported models (like Juggernaut-XL) to maintain the appearance of a specific character or subject across different scenes. 37 | - **Interactive Project Dashboard**: Once a project is created, you have full control. Edit scripts, regenerate audio, modify visual prompts, and see the progress of every task in real-time. 38 | - **Stateful Project Management**: Stop and resume your work at any time. The entire project state is saved, allowing you to load existing projects, make changes, and continue where you left off. 39 | - **Multi-Language Voice Generation**: Generate narration in over 15 languages (including English, Spanish, French, German, Japanese, Hindi, and more) using advanced TTS models. 40 | - **Voice Cloning**: Provide a short `.wav` file of a reference voice to clone it for the video's narration, powered by Coqui XTTS. 41 | 42 | ## Future Development Plans 43 | 44 | ### TODO List 45 | 46 | 1. **Text-to-Music (TTM) Modules** 47 | 48 | - Background music generation for videos 49 | - Pure music production capabilities 50 | - Integration with existing video pipeline 51 | 52 | 2. **Additional Model Support** 53 | 54 | - FramePack and other advanced video generation models 55 | - Enhanced model compatibility and optimization 56 | - Lora (Low-Rank Adaptation) support for fine-tuning models 57 | - Custom Lora training and management interface 58 | - ControlNet integration for pose, depth, and style control 59 | - Advanced ControlNet features (canny, segmentation, etc.) 60 | 61 | 3. **Character Consistency Features** 62 | 63 | - Lora-based character consistency across scenes 64 | - Character style preservation and transfer 65 | - Multi-character management system 66 | - Character pose and expression control 67 | 68 | 4. **Advanced Editing Features** 69 | 70 | - Multilayer timeline style editor 71 | - Professional-grade video editing capabilities 72 | - Enhanced control over transitions and effects 73 | 74 | 5. **UI/UX Improvements** 75 | 76 | - Migration to FastAPI backend 77 | - Modern frontend with React/Vue 78 | - Enhanced user experience and performance 79 | 80 | 6. **Production Infrastructure** 81 | - Distributed model serving system 82 | - Load balancing across multiple GPUs/servers 83 | - Model caching and optimization 84 | - User quota and resource management 85 | - Queue management for multiple users 86 | - Real-time progress tracking and status updates 87 | - Automatic failover and recovery 88 | - Resource usage analytics and monitoring 89 | 90 | ## Architectural Overview 91 | 92 | The pipeline follows a state-driven, sequential process. 
The `ProjectManager` tracks the status of every task in a `project.json` file. The `TaskExecutor` then reads this state and executes the next pending task using the specific modules you selected for the project. 93 | 94 | ```mermaid 95 | graph TD 96 | A[Start: Create New Project in UI] --> B{Select Models & Workflow}; 97 | B --> C[Provide Topic & Settings]; 98 | C --> D[Project Initialized - project.json]; 99 | D --> E[Task: Generate Script LLM]; 100 | E --> F[Task: Generate Audio TTS]; 101 | F --> G[Task: Create Scene Shots - LLM]; 102 | 103 | subgraph "For Each Scene Shot" 104 | direction LR 105 | G --> H{I2V or T2V Flow?}; 106 | H -- I2V --> I[Task: Gen Image T2I]; 107 | I --> J[Task: Gen Video I2V]; 108 | H -- T2V --> K[Task: Gen Video T2V]; 109 | end 110 | 111 | J --> L[All Shots Done?]; 112 | K --> L; 113 | 114 | L -- Yes --> M[Task: Assemble Scene Videos]; 115 | M --> N[All Scenes Done?]; 116 | N -- Yes --> O[Task: Assemble Final Reel]; 117 | O --> P[✅ Final Video Complete]; 118 | 119 | 120 | ``` 121 | 122 | ## Installation 123 | 124 | This project uses `uv` for fast package management. 125 | 126 | **1. Prerequisites** 127 | 128 | - Python 3.12 or newer (per `requires-python` in `pyproject.toml`). 129 | - `git` for cloning the repository. 130 | - **For GPU acceleration (highly recommended):** NVIDIA GPU with CUDA drivers installed. 131 | - **FFmpeg**: Required by `moviepy` for video processing. Ensure it's installed and accessible in your system's PATH. 132 | - **Ubuntu**: `sudo apt update && sudo apt install ffmpeg` 133 | - **macOS (with Homebrew)**: `brew install ffmpeg` 134 | - **Windows**: Download from the [official site](https://ffmpeg.org/download.html) and add the `bin` folder to your PATH. 135 | 136 | **2. Clone the Repository** 137 | 138 | ```bash 139 | git clone https://github.com/your-username/your-repo-name.git 140 | cd your-repo-name 141 | ``` 142 | 143 | **3. Set up a Virtual Environment and Install Dependencies** 144 | 145 | First, install `uv`: 146 | 147 | ```bash 148 | pip install uv 149 | ``` 150 | 151 | Next, create a virtual environment and install all required packages using `uv`. This single command installs all dependencies, including PyTorch for your specific CUDA version (or CPU if CUDA is not available). 152 | 153 | ```bash 154 | # Create a virtual environment 155 | uv venv 156 | 157 | # Activate the environment 158 | # On macOS/Linux: 159 | source .venv/bin/activate 160 | # On Windows: 161 | .venv\Scripts\activate 162 | 163 | # Install all packages using the provided command 164 | uv pip install torch torchvision torchaudio coqui-tts transformers streamlit sentencepiece moviepy psutil gputil ftfy "huggingface-hub[cli]" hf-transfer accelerate bitsandbytes pydantic --no-build-package llvmlite 165 | ``` 166 | 167 | > **Note:** The `--no-build-package llvmlite` flag is included to prevent `uv` from trying to build the `llvmlite` package from source, which can fail without the proper LLVM toolchain. This forces it to use a pre-compiled wheel. 168 | 169 | ## Getting Started 170 | 171 | With your virtual environment activated, launch the Streamlit app: 172 | 173 | ```bash 174 | streamlit run app.py 175 | ``` 176 | 177 | Your web browser should automatically open to the application's UI. 178 | 179 | ### Workflow 180 | 181 | 1. **Create a New Project**: On the main page, fill out the "Create New Project" form. 182 | - **Generation Flow**: Choose between "Image to Video" (high quality) or "Text to Video" (fast). 183 | - **Model Selection**: Select your desired AI models from the dropdowns for each stage. 
184 | - **Topic**: Enter the subject of your video. 185 | - **Settings**: Configure the video format, length, and number of scenes. 186 | - **Characters (Optional)**: If you select a model flow that supports character consistency, you can upload reference images for your subjects. 187 | 2. **Processing Dashboard**: After creating the project, you'll be taken to the dashboard. 188 | - **Automatic Mode**: Toggle "Automatic Mode" and click "Start" to have the pipeline run through all the steps automatically. 189 | - **Manual Control**: With automatic mode off, you can manually trigger each step (e.g., "Gen Audio", "Gen Image"). This is perfect for fine-tuning. 190 | - **Edit Everything**: Click into any text box to edit the script narration or visual prompts, then regenerate that specific part. 191 | 3. **Final Assembly**: Once all scenes and clips are generated, a button will appear to assemble the final video. Click it to view the finished product, complete with subtitles and synchronized audio. 192 | 193 | --- 194 | 195 | ## ⭐ How to Develop Your Own Module ⭐ 196 | 197 | The pipeline is designed for easy extension. To add a new AI model, you simply need to create a Python class that inherits from one of the abstract base classes in `base_modules.py` and implements its required methods. 198 | 199 | **The Core Contract: `base_modules.py`** 200 | 201 | This file defines the interface for every module type: 202 | 203 | - `BaseLLM`: For language models. 204 | - `BaseTTS`: For text-to-speech models. 205 | - `BaseT2I`: For text-to-image models. 206 | - `BaseI2V`: For image-to-video models. 207 | - `BaseT2V`: For text-to-video models. 208 | 209 | ### Step-by-Step Guide to Adding a New I2V Model 210 | 211 | Let's create a new hypothetical Image-to-Video module called "MotionWeaver". 212 | 213 | **1. Create the File** 214 | 215 | Create a new file in the appropriate directory: `i2v_modules/i2v_motion_weaver.py`. 216 | 217 | **2. Define the `Config` and `Module` Classes** 218 | 219 | In your new file, set up the basic structure. 220 | 221 | ```python 222 | # In i2v_modules/i2v_motion_weaver.py 223 | import torch 224 | from typing import Dict, Any, List, Optional, Union 225 | 226 | # Import from the project's own files 227 | from base_modules import BaseI2V, BaseModuleConfig, ModuleCapabilities 228 | from config_manager import DEVICE, clear_vram_globally, ContentConfig 229 | 230 | # Step 2a: Define a Pydantic config for your model's parameters. 231 | class MotionWeaverI2VConfig(BaseModuleConfig): 232 | model_id: str = "some-repo/motion-weaver-pro" 233 | num_inference_steps: int = 20 234 | motion_strength: float = 0.9 235 | 236 | # Step 2b: Create the main class inheriting from the correct base class. 237 | class MotionWeaverI2V(BaseI2V): 238 | # Link your config class 239 | Config = MotionWeaverI2VConfig 240 | 241 | # Implement all required abstract methods... 242 | ``` 243 | 244 | **3. Implement `get_capabilities()`** 245 | 246 | This is the most important method for UI integration. It tells the application what your model can do, and this information is used to populate dropdowns and enable/disable features. 247 | 248 | ```python 249 | # Inside the MotionWeaverI2V class 250 | 251 | @classmethod 252 | def get_capabilities(cls) -> ModuleCapabilities: 253 | """Returns the spec sheet for this module.""" 254 | return ModuleCapabilities( 255 | # This title appears in the UI dropdown. Be descriptive! 
256 | title="MotionWeaver Pro (Smooth & Cinematic)", 257 | vram_gb_min=10.0, 258 | ram_gb_min=16.0, 259 | supports_ip_adapter=False, # This model doesn't support it 260 | max_subjects=0, 261 | ) 262 | ``` 263 | 264 | **4. Implement Core Functionality (`generate_video_from_image`)** 265 | 266 | This is where you call your model's code. The method signature is strictly defined by `BaseI2V`. 267 | 268 | ```python 269 | # Inside the MotionWeaverI2V class 270 | 271 | def generate_video_from_image(self, image_path: str, output_video_path: str, target_duration: float, content_config: ContentConfig, visual_prompt: str, motion_prompt: Optional[str], ip_adapter_image: Optional[Union[str, List[str]]] = None) -> str: 272 | # 1. Load the model (if not already loaded) 273 | self._load_pipeline() 274 | 275 | # 2. Prepare inputs (e.g., load image, calculate frames) 276 | from diffusers.utils import load_image, export_to_video 277 | input_image = load_image(image_path) 278 | num_frames = int(target_duration * content_config.fps) 279 | 280 | # 3. Call the pipeline 281 | video_frames = self.pipe( 282 | image=input_image, 283 | prompt=visual_prompt, # Use the prompts provided by the controller 284 | motion_prompt=motion_prompt, 285 | num_frames=num_frames, 286 | motion_strength=self.config.motion_strength 287 | ).frames 288 | 289 | # 4. Save the output and return the path 290 | export_to_video(video_frames, output_video_path, fps=content_config.fps) 291 | print(f"MotionWeaver video saved to {output_video_path}") 292 | return output_video_path 293 | ``` 294 | 295 | **5. Implement VRAM Management and Other Helpers** 296 | 297 | To manage memory, the pipeline loads and unloads models as needed. 298 | 299 | ```python 300 | # Inside the MotionWeaverI2V class 301 | 302 | def _load_pipeline(self): 303 | """Loads the model into memory. Should be idempotent.""" 304 | if self.pipe is None: 305 | from some_library import MotionWeaverPipeline # Local import 306 | print(f"Loading MotionWeaver pipeline: {self.config.model_id}...") 307 | self.pipe = MotionWeaverPipeline.from_pretrained( 308 | self.config.model_id, torch_dtype=torch.float16 309 | ).to(DEVICE) 310 | 311 | def clear_vram(self): 312 | """Releases the model from VRAM.""" 313 | print("Clearing MotionWeaver VRAM...") 314 | if self.pipe is not None: 315 | clear_vram_globally(self.pipe) # Use the global helper 316 | self.pipe = None 317 | 318 | def get_model_capabilities(self) -> Dict[str, Any]: 319 | """Return technical details about the model.""" 320 | return { 321 | "resolutions": {"Portrait": (512, 768), "Landscape": (768, 512)}, 322 | "max_shot_duration": 4.0 # Max video length it can generate at once 323 | } 324 | ``` 325 | 326 | **6. Register the Module** 327 | 328 | Finally, open the `__init__.py` file in the same directory (`i2v_modules/__init__.py`) and add an import for your new class. This makes it discoverable. 329 | 330 | ```python 331 | # In i2v_modules/__init__.py 332 | 333 | from .i2v_ltx import LtxI2V 334 | from .i2v_svd import SvdI2V 335 | from .i2v_slideshow import SlideshowI2V 336 | from .i2v_motion_weaver import MotionWeaverI2V # <-- Add this line 337 | ``` 338 | 339 | **That's it!** The next time you run `streamlit run app.py`, "MotionWeaver Pro (Smooth & Cinematic)" will appear as an option in the Image-to-Video Model dropdown. 340 | 341 | ## Directory Structure 342 | 343 | ``` 344 | . 
345 | ├── app.py # Main Streamlit web application 346 | ├── base_modules.py # Abstract base classes for all modules (The Contract) 347 | ├── config_manager.py # Pydantic configs and global settings 348 | ├── module_discovery.py # Service to automatically find and load modules 349 | ├── project_manager.py # Handles loading, saving, and managing project state 350 | ├── task_executor.py # Orchestrates the execution of generation tasks 351 | ├── ui_task_executor.py # Bridges the UI with the task executor 352 | ├── utils.py # Shared utility functions 353 | ├── video_assembly.py # Functions for combining clips into the final video 354 | ├── llm_modules/ # Language model modules 355 | │ ├── __init__.py 356 | │ └── llm_zephyr.py 357 | ├── tts_modules/ # Text-to-Speech modules 358 | ├── t2i_modules/ # Text-to-Image modules 359 | ├── i2v_modules/ # Image-to-Video modules 360 | └── t2v_modules/ # Text-to-Video modules 361 | ``` 362 | 363 | ## License 364 | 365 | This project is licensed under the MIT License - see the LICENSE file for details. 366 | 367 | ### Important Notice Regarding Model Licenses 368 | 369 | While this project itself is MIT-licensed, the AI models used within this pipeline (including but not limited to language models, text-to-speech models, image generation models, and video generation models) are subject to their own respective licenses. Users of this project are responsible for: 370 | 371 | 1. Reviewing and complying with the license terms of each model they choose to use 372 | 2. Ensuring they have the necessary rights and permissions to use these models 373 | 3. Understanding that different models may have different usage restrictions, commercial terms, and attribution requirements 374 | 375 | The MIT license of this project does not override or modify the license terms of any third-party models. Users must independently verify and comply with all applicable model licenses before use. 376 | -------------------------------------------------------------------------------- /video_assembly.py: -------------------------------------------------------------------------------- 1 | import math # For math.ceil 2 | import os 3 | 4 | from typing import List, Optional, Tuple, Dict, Any 5 | from moviepy import VideoFileClip, AudioFileClip, concatenate_videoclips, TextClip, CompositeVideoClip 6 | from moviepy.audio.AudioClip import concatenate_audioclips, AudioClip 7 | from moviepy.video.VideoClip import ColorClip 8 | 9 | from config_manager import ContentConfig 10 | 11 | 12 | # --- 5. VIDEO ASSEMBLY --- 13 | 14 | def assemble_scene_video_from_sub_clips( 15 | sub_clip_paths: List[str], 16 | target_total_duration: float, 17 | config: ContentConfig, 18 | scene_idx: int 19 | ) -> str: 20 | """Assembles multiple video sub-clips into a single scene video with precise duration control. 21 | 22 | This function takes multiple video sub-clips and combines them into a single scene video 23 | that matches the target duration. If the combined duration is shorter than the target, 24 | the video will be looped. If longer, it will be trimmed. 25 | 26 | Args: 27 | sub_clip_paths (List[str]): List of paths to video sub-clips to be combined 28 | target_total_duration (float): Desired duration for the final scene video in seconds 29 | config (ContentConfig): Configuration object containing video settings 30 | scene_idx (int): Index of the scene being assembled 31 | 32 | Returns: 33 | str: Path to the assembled scene video file. Returns empty string if assembly fails. 
34 | 35 | Note: 36 | - Handles resource cleanup properly 37 | - Supports video concatenation and duration adjustment 38 | - Creates output in the directory specified by config.output_dir 39 | """ 40 | if not sub_clip_paths: 41 | print(f"Warning: No sub-clips provided for scene {scene_idx}. Cannot assemble scene video.") 42 | # Create a short black placeholder? 43 | placeholder_path = os.path.join(config.output_dir, f"scene_{scene_idx}_placeholder.mp4") 44 | # Simple way to make a black clip with moviepy if needed, but for now, just return empty string or raise error. 45 | # For now, let's assume this case is handled upstream or we expect valid paths. 46 | return "" 47 | 48 | print(f"Assembling video for scene {scene_idx} from {len(sub_clip_paths)} sub-clips to match duration {target_total_duration:.2f}s.") 49 | 50 | clips_to_close = [] 51 | video_sub_clips_mvp = [] 52 | for path in sub_clip_paths: 53 | clip = VideoFileClip(path) 54 | video_sub_clips_mvp.append(clip) 55 | clips_to_close.append(clip) 56 | 57 | # Concatenate raw sub-clips first 58 | concatenated_raw_video = concatenate_videoclips(video_sub_clips_mvp, method="compose") 59 | clips_to_close.append(concatenated_raw_video) 60 | 61 | # Adjust final concatenated clip to precisely match target_total_duration 62 | current_duration = concatenated_raw_video.duration 63 | if abs(current_duration - target_total_duration) < 0.05 : # If very close, accept it 64 | final_scene_video_timed = concatenated_raw_video 65 | elif current_duration > target_total_duration: 66 | final_scene_video_timed = concatenated_raw_video.subclipped(0, target_total_duration) 67 | else: # current_duration < target_total_duration - loop the whole concatenated clip 68 | num_loops = math.ceil(target_total_duration / current_duration) 69 | looped_clips = [concatenated_raw_video] * num_loops 70 | temp_looped_video = concatenate_videoclips(looped_clips, method="compose") 71 | clips_to_close.append(temp_looped_video) # Add to close list 72 | final_scene_video_timed = temp_looped_video.subclipped(0, target_total_duration) 73 | 74 | # Add the final timed clip to close list if it's a new object (subclip creates new) 75 | if final_scene_video_timed is not concatenated_raw_video and final_scene_video_timed not in clips_to_close: 76 | clips_to_close.append(final_scene_video_timed) 77 | 78 | final_scene_video_path = os.path.join(config.output_dir, f"scene_{scene_idx}_assembled_video.mp4") 79 | try: 80 | final_scene_video_timed.write_videofile( 81 | final_scene_video_path, 82 | fps=config.fps, 83 | codec="libx264", 84 | audio=False, # Audio will be added in the final assembly step 85 | threads=4, preset="medium", logger=None # Quieter logs for sub-assemblies 86 | ) 87 | except Exception as e: 88 | print(f"Error writing assembled scene video for scene {scene_idx}: {e}") 89 | # Fallback or error handling 90 | final_scene_video_path = "" # Indicate failure 91 | finally: 92 | for clip_obj in clips_to_close: 93 | if hasattr(clip_obj, 'close') and callable(getattr(clip_obj, 'close')): 94 | clip_obj.close() 95 | 96 | print(f"Assembled video for scene {scene_idx} saved to {final_scene_video_path} with duration {final_scene_video_timed.duration:.2f}s.") 97 | return final_scene_video_path 98 | 99 | 100 | # In video_assembly.py 101 | 102 | def assemble_final_reel( 103 | processed_scene_assets: List[Tuple[str, str, Dict[str, Any]]], 104 | config: ContentConfig, 105 | output_filename: str = "final_reel.mp4" 106 | ) -> Optional[str]: 107 | """Creates the final video reel by combining multiple 
scene videos with audio and text overlays. 108 | 109 | This function takes processed scene assets (video, audio, and narration info) and combines them 110 | into a final video reel. It handles video resizing, cropping, audio synchronization, and text 111 | overlay placement. The function ensures proper resource management and cleanup. 112 | 113 | Args: 114 | processed_scene_assets (List[Tuple[str, str, Dict[str, Any]]]): List of tuples containing: 115 | - scene_video_path: Path to the scene video file 116 | - scene_audio_path: Path to the scene audio file 117 | - narration_info: Dictionary containing narration text and duration 118 | config (ContentConfig): Configuration object containing video settings 119 | output_filename (str, optional): Name for the final output file. Defaults to "final_reel.mp4" 120 | 121 | Returns: 122 | Optional[str]: Path to the final assembled video file. Returns None if assembly fails. 123 | 124 | Features: 125 | - Combines video, audio, and text captions for each scene 126 | - Handles video resizing and cropping to target resolution 127 | - Manages audio synchronization 128 | - Adds text overlays with proper positioning 129 | - Implements comprehensive resource cleanup 130 | 131 | Note: 132 | - Requires proper font file for text overlays 133 | - Handles memory efficiently through proper resource cleanup 134 | - Provides error handling and fallback mechanisms 135 | """ 136 | print("Assembling final reel...") 137 | if not processed_scene_assets: 138 | print("No processed scene assets to assemble. Final video cannot be created.") 139 | return None 140 | 141 | print(f"config.add_narration_text_to_video: {config.add_narration_text_to_video}") 142 | 143 | final_scene_video_clips = [] # Renamed from final_scene_clips_for_reel for clarity 144 | 145 | # This list will store all clips that are loaded or created 146 | # and should be closed in the finally block. 147 | all_clips_to_close = [] 148 | 149 | font_path_for_textclip = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf" 150 | if not os.path.exists(font_path_for_textclip): 151 | print(f"Warning: Font file not found at {font_path_for_textclip}. TextClip will use a default font.") 152 | font_path_for_textclip = "Liberation-Sans-Bold" # Or "Arial" or None 153 | 154 | for i, (scene_video_path, scene_audio_path, narration_info) in enumerate(processed_scene_assets): 155 | video_clip_for_scene = None 156 | audio_clip_for_scene = None 157 | text_clip_for_scene = None 158 | background_clip_for_scene = None 159 | 160 | try: 161 | narration_text = narration_info["text"] 162 | actual_audio_duration = narration_info["duration"] # This is the target duration for this scene 163 | 164 | if not (scene_video_path and os.path.exists(scene_video_path) and \ 165 | scene_audio_path and os.path.exists(scene_audio_path)): 166 | print(f"Skipping scene {i} due to missing media files.") 167 | continue 168 | 169 | # Load video and audio clips 170 | video_clip_for_scene = VideoFileClip(scene_video_path) 171 | audio_clip_for_scene = AudioFileClip(scene_audio_path) 172 | all_clips_to_close.extend([video_clip_for_scene, audio_clip_for_scene]) 173 | 174 | video_duration = video_clip_for_scene.duration 175 | 176 | # --- Video Duration Matching (using subclipped and concatenate_videoclips for loop) --- 177 | # First, resize and crop to final shape before timing adjustments IF POSSIBLE, 178 | # or do timing first. Let's stick to your old code's order: 179 | # Resize, Crop, Position, THEN Time, then Audio. 180 | 181 | # 1. 
Resize video to target height 182 | temp_video_clip = video_clip_for_scene.resized(height=config.final_output_resolution[1]) 183 | 184 | # 2. Crop if wider than target width, or pad if narrower 185 | if temp_video_clip.w > config.final_output_resolution[0]: 186 | # Using .cropped() as per your working old code 187 | temp_video_clip = temp_video_clip.cropped(x_center=temp_video_clip.w / 2, 188 | width=config.final_output_resolution[0]) 189 | elif temp_video_clip.w < config.final_output_resolution[0]: 190 | # Pad with a background 191 | background_clip_for_scene = ColorClip(size=config.final_output_resolution, 192 | color=(0,0,0), # Black background 193 | duration=actual_audio_duration) # Duration for background 194 | all_clips_to_close.append(background_clip_for_scene) 195 | # Composite video onto background 196 | temp_video_clip = CompositeVideoClip([background_clip_for_scene, temp_video_clip.with_position('center')], 197 | size=config.final_output_resolution) 198 | 199 | # 3. Position video in center (if not already handled by padding composite) 200 | # The .with_position('center') might have been applied already if padded. 201 | # If not padded, apply it now. 202 | if not (video_clip_for_scene.w < config.final_output_resolution[0] and temp_video_clip.w == config.final_output_resolution[0]): 203 | temp_video_clip = temp_video_clip.with_position('center') 204 | 205 | # 4. Handle duration mismatches for the video 206 | if video_duration > actual_audio_duration: # If original video was longer 207 | video_clip_timed = temp_video_clip.subclipped(0, actual_audio_duration) 208 | elif video_duration < actual_audio_duration: # If original video was shorter, loop it 209 | # Note: we loop the `temp_video_clip` which is already resized/cropped/positioned 210 | num_loops = math.ceil(actual_audio_duration / video_duration) # Loop based on original duration 211 | if num_loops == 0 : num_loops = 1 # Ensure at least one instance 212 | # Create a list of the clip to be looped 213 | looped_video_parts = [temp_video_clip] * num_loops 214 | video_clip_concatenated_for_loop = concatenate_videoclips(looped_video_parts) 215 | all_clips_to_close.append(video_clip_concatenated_for_loop) # This new clip needs closing 216 | video_clip_timed = video_clip_concatenated_for_loop.subclipped(0, actual_audio_duration) 217 | else: # Durations match closely enough 218 | video_clip_timed = temp_video_clip # temp_video_clip is already at its full duration here 219 | 220 | final_audio_for_scene = audio_clip_for_scene # Start with the loaded audio 221 | if final_audio_for_scene.duration > actual_audio_duration: 222 | final_audio_for_scene = final_audio_for_scene.subclipped(0, actual_audio_duration) 223 | elif final_audio_for_scene.duration < actual_audio_duration: 224 | silence_needed = actual_audio_duration - final_audio_for_scene.duration 225 | if silence_needed > 0.01: # Only add if significant 226 | silence_clip = AudioClip(frame_function=lambda t: 0, duration=silence_needed) 227 | all_clips_to_close.append(silence_clip) 228 | final_audio_for_scene = concatenate_audioclips([final_audio_for_scene, silence_clip]) 229 | 230 | 231 | # 5. Combine video and audio 232 | video_clip_with_audio = video_clip_timed.with_audio(final_audio_for_scene) 233 | 234 | # This list will hold the video clip, and conditionally, the text clip. 235 | clips_for_composition = [video_clip_with_audio] 236 | 237 | # 6. 
Add text caption (if enabled in config) 238 | if config.add_narration_text_to_video: 239 | print(f"Adding narration text for scene {i}...") 240 | # Calculate font size based on video height (e.g., 5% of height) 241 | base_font_size = int(config.final_output_resolution[1] * 0.05) # 5% of height 242 | font_size = max(40, min(base_font_size, 60)) # Between 40 and 60 243 | 244 | text_width = int(config.final_output_resolution[0] * 0.8) 245 | aspect_ratio = config.final_output_resolution[0] / config.final_output_resolution[1] 246 | vertical_position = 0.7 if aspect_ratio < 1 else 0.75 247 | 248 | # --- THIS IS THE FIX: Reverted to the original working syntax --- 249 | text_clip_for_scene = TextClip( 250 | font_path_for_textclip, 251 | text=narration_text, 252 | font_size=font_size, 253 | color='white', 254 | stroke_color='black', 255 | stroke_width=2, 256 | method='caption', 257 | size=(text_width, None) 258 | ) 259 | all_clips_to_close.append(text_clip_for_scene) 260 | 261 | text_clip_final = text_clip_for_scene.with_position(('center', vertical_position), relative=True).with_duration(actual_audio_duration) 262 | 263 | clips_for_composition.append(text_clip_final) 264 | else: 265 | print(f"Skipping narration text for scene {i} as per config.") 266 | 267 | 268 | # 7. Combine video and (optional) text into final scene composite 269 | scene_composite = CompositeVideoClip( 270 | clips_for_composition, 271 | size=config.final_output_resolution # Ensure composite is target size 272 | ) 273 | final_scene_video_clips.append(scene_composite) 274 | 275 | except Exception as e_scene: 276 | print(f"Error processing scene {i}: {e_scene}") 277 | import traceback 278 | traceback.print_exc() 279 | # Any clips opened in this iteration (video_clip_for_scene, etc.) are already in all_clips_to_close 280 | continue 281 | 282 | if not final_scene_video_clips: 283 | print("No scenes were successfully composed.") 284 | # Close any clips that might have been opened 285 | for clip_obj in all_clips_to_close: 286 | if hasattr(clip_obj, 'close') and callable(getattr(clip_obj, 'close')): 287 | try: clip_obj.close() 288 | except: pass # Ignore errors during cleanup after failure 289 | return None 290 | 291 | final_video_output_clip = None 292 | final_video_path = os.path.join(config.output_dir, output_filename) 293 | try: 294 | final_video_output_clip = concatenate_videoclips(final_scene_video_clips, method="compose") 295 | all_clips_to_close.append(final_video_output_clip) # Add final concatenated clip for closing 296 | 297 | final_video_output_clip.write_videofile( 298 | final_video_path, 299 | fps=config.fps, 300 | codec="libx264", 301 | audio_codec="aac", 302 | threads=4, 303 | preset="medium", # "ultrafast" for speed, "medium" for balance 304 | logger='bar' 305 | ) 306 | except Exception as e_write: 307 | print(f"Error during final video writing: {e_write}") 308 | import traceback 309 | traceback.print_exc() 310 | final_video_path = None # Indicate failure 311 | finally: 312 | # Close all clips. 313 | # `final_scene_video_clips` contains CompositeVideoClips that are sources for `final_video_output_clip`. 314 | # Closing `final_video_output_clip` should ideally handle its sources if method='compose'. 315 | # `all_clips_to_close` contains initial VideoFileClips, AudioFileClips, created ColorClips, TextClips, 316 | # and potentially intermediate concatenated clips. 317 | 318 | # Make a set of unique clip objects to close to avoid issues with multiple references 319 | # to the same underlying resources. 
320 | clips_to_actually_close = {id(c): c for c in all_clips_to_close if c}.values() 321 | 322 | for clip_obj in clips_to_actually_close: 323 | if hasattr(clip_obj, 'close') and callable(getattr(clip_obj, 'close')): 324 | try: 325 | clip_obj.close() 326 | except Exception as e_close: 327 | # print(f"Error closing a clip {type(clip_obj)}: {e_close}") # Can be noisy 328 | pass 329 | 330 | # Also ensure the list of scene composites themselves are closed, as they are also clips 331 | for scene_comp in final_scene_video_clips: 332 | if hasattr(scene_comp, 'close') and callable(getattr(scene_comp, 'close')): 333 | try: scene_comp.close() 334 | except: pass 335 | 336 | 337 | if final_video_path: 338 | print(f"Final reel saved to {final_video_path}") 339 | return final_video_path -------------------------------------------------------------------------------- /project_manager.py: -------------------------------------------------------------------------------- 1 | # project_manager.py 2 | import os 3 | import time 4 | import logging 5 | import shutil 6 | from typing import Dict, List, Optional, Any, Tuple 7 | from pydantic import BaseModel, Field 8 | 9 | from config_manager import ContentConfig 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | STATUS_PENDING, STATUS_GENERATED, STATUS_COMPLETED, STATUS_FAILED, STATUS_IN_PROGRESS = "pending", "generated", "completed", "failed", "in_progress" 14 | STATUS_IMAGE_GENERATED, STATUS_VIDEO_GENERATED = "image_generated", "video_generated" 15 | 16 | # --- Pydantic Models for Project State --- 17 | 18 | class ProjectInfo(BaseModel): 19 | title: str 20 | topic: str 21 | created_at: float = Field(default_factory=time.time) 22 | last_modified: float = Field(default_factory=time.time) 23 | status: str = STATUS_IN_PROGRESS 24 | config: Dict[str, Any] 25 | speaker_audio_path: Optional[str] = None # Stores the relative path within the project dir 26 | 27 | class NarrationPart(BaseModel): 28 | text: str 29 | status: str = STATUS_PENDING 30 | audio_path: str = "" 31 | duration: float = 0.0 32 | 33 | class VisualPrompt(BaseModel): 34 | prompt: str 35 | 36 | class Script(BaseModel): 37 | # NEW: Fields for consistent context 38 | main_subject_description: str = "" 39 | setting_description: str = "" 40 | 41 | narration_parts: List[NarrationPart] = Field(default_factory=list) 42 | visual_prompts: List[VisualPrompt] = Field(default_factory=list) 43 | hashtags: List[str] = Field(default_factory=list) 44 | 45 | class Character(BaseModel): 46 | """Represents a character/subject in the project.""" 47 | name: str 48 | reference_image_path: str 49 | source_prompt: Optional[str] = None 50 | source_image_path: Optional[str] = None # Path to the user-uploaded image 51 | 52 | 53 | class Shot(BaseModel): 54 | shot_idx: int 55 | target_duration: float 56 | visual_prompt: str 57 | motion_prompt: Optional[str] = "" 58 | status: str = STATUS_PENDING 59 | keyframe_image_path: str = "" 60 | video_path: str = "" 61 | 62 | class Scene(BaseModel): 63 | scene_idx: int 64 | status: str = STATUS_PENDING 65 | assembled_video_path: str = "" 66 | shots: List[Shot] = Field(default_factory=list) 67 | character_names: List[str] = Field(default_factory=list) 68 | 69 | class FinalVideo(BaseModel): 70 | status: str = STATUS_PENDING 71 | path: str = "" 72 | full_narration_text: str = "" 73 | hashtags: List[str] = Field(default_factory=list) 74 | 75 | class ProjectState(BaseModel): 76 | project_info: ProjectInfo 77 | script: Script = Field(default_factory=Script) 78 | characters: List[Character] = 
Field(default_factory=list) 79 | scenes: List[Scene] = Field(default_factory=list) 80 | final_video: FinalVideo = Field(default_factory=FinalVideo) 81 | 82 | # --- ProjectManager Class --- 83 | 84 | class ProjectManager: 85 | def __init__(self, output_dir: str): 86 | self.output_dir = output_dir 87 | self.project_file = os.path.join(output_dir, "project.json") 88 | self.state: Optional[ProjectState] = None 89 | os.makedirs(self.output_dir, exist_ok=True) 90 | 91 | def _save_state(self): 92 | if not self.state: return 93 | self.state.project_info.last_modified = time.time() 94 | with open(self.project_file, 'w') as f: 95 | f.write(self.state.model_dump_json(indent=4)) 96 | 97 | def initialize_project(self, title: str, topic: str, config: ContentConfig): 98 | project_info = ProjectInfo(title=title, topic=topic, config=config.model_dump()) 99 | self.state = ProjectState(project_info=project_info) 100 | self._save_state() 101 | 102 | def set_speaker_audio(self, relative_path: str): 103 | """Saves the relative path of the speaker audio to the project state.""" 104 | if not self.state: return 105 | self.state.project_info.speaker_audio_path = relative_path 106 | self._save_state() 107 | 108 | def load_project(self) -> bool: 109 | if not os.path.exists(self.project_file): return False 110 | try: 111 | with open(self.project_file, 'r') as f: 112 | self.state = ProjectState.model_validate_json(f.read()) 113 | return True 114 | except Exception as e: 115 | logger.error(f"Error loading project with Pydantic: {e}", exc_info=True); return False 116 | 117 | def update_script(self, script_data: Dict[str, Any]): 118 | if not self.state: return 119 | self.state.script.main_subject_description = script_data.get("main_subject_description", "") 120 | self.state.script.setting_description = script_data.get("setting_description", "") 121 | self.state.script.narration_parts = [NarrationPart(**p) for p in script_data.get("narration", [])] 122 | self.state.script.visual_prompts = [VisualPrompt(prompt=p) for p in script_data.get("visuals", [])] 123 | self.state.script.hashtags = script_data.get("hashtags", []) 124 | self._save_state() 125 | 126 | def get_next_pending_task(self) -> Tuple[Optional[str], Optional[Dict]]: 127 | if not self.state: return None, None 128 | 129 | cfg = ContentConfig(**self.state.project_info.config) 130 | use_svd_flow = cfg.use_svd_flow 131 | 132 | if not self.state.script.narration_parts: return "generate_script", {"topic": self.state.project_info.topic} 133 | 134 | for i, part in enumerate(self.state.script.narration_parts): 135 | if part.status != STATUS_GENERATED: return "generate_audio", {"scene_idx": i, "text": part.text} 136 | 137 | narration_indices_with_scenes = {s.scene_idx for s in self.state.scenes} 138 | for i in range(len(self.state.script.narration_parts)): 139 | if i not in narration_indices_with_scenes: return "create_scene", {"scene_idx": i} 140 | 141 | for scene in sorted(self.state.scenes, key=lambda s: s.scene_idx): 142 | for shot in sorted(scene.shots, key=lambda c: c.shot_idx): 143 | if shot.status != STATUS_VIDEO_GENERATED: 144 | task_data = { "scene_idx": scene.scene_idx, "shot_idx": shot.shot_idx, "visual_prompt": shot.visual_prompt, "motion_prompt": shot.motion_prompt} 145 | if use_svd_flow: 146 | if shot.status == STATUS_PENDING: return "generate_shot_image", task_data 147 | if shot.status == STATUS_IMAGE_GENERATED: return "generate_shot_video", task_data 148 | else: # T2V Flow 149 | return "generate_shot_t2v", task_data 150 | 151 | for scene in 
self.state.scenes: 152 | if all(c.status == STATUS_VIDEO_GENERATED for c in scene.shots) and scene.status != STATUS_COMPLETED: 153 | return "assemble_scene", {"scene_idx": scene.scene_idx} 154 | 155 | if self.state.scenes and all(s.status == STATUS_COMPLETED for s in self.state.scenes) and self.state.final_video.status != STATUS_GENERATED: 156 | return "assemble_final", {} 157 | 158 | return None, None 159 | 160 | def update_narration_part_text(self, part_idx: int, text: str): 161 | if not self.state or part_idx >= len(self.state.script.narration_parts): return 162 | part = self.state.script.narration_parts[part_idx] 163 | if part.text != text: 164 | part.text = text; part.status = STATUS_PENDING; part.audio_path = ""; part.duration = 0 165 | self.state.scenes = [s for s in self.state.scenes if s.scene_idx != part_idx] 166 | self._mark_final_for_reassembly() 167 | self._save_state() 168 | 169 | def add_scene(self, scene_idx: int, shots: List[Dict], character_names: List[str]): 170 | """Adds a new scene and assigns the provided characters to it.""" 171 | if not self.state: return 172 | scene_data = Scene( 173 | scene_idx=scene_idx, 174 | shots=[Shot(**c) for c in shots], 175 | character_names=character_names 176 | ) 177 | self.state.scenes = [s for s in self.state.scenes if s.scene_idx != scene_idx] 178 | self.state.scenes.append(scene_data) 179 | self.state.scenes.sort(key=lambda s: s.scene_idx) 180 | self._save_state() 181 | 182 | def update_shot_content(self, scene_idx: int, shot_idx: int, visual_prompt: Optional[str] = None, motion_prompt: Optional[str] = None): 183 | scene = self.get_scene_info(scene_idx) 184 | if not scene or shot_idx >= len(scene.shots): return 185 | shot = scene.shots[shot_idx] 186 | changed = False 187 | if visual_prompt is not None and shot.visual_prompt != visual_prompt: 188 | shot.visual_prompt = visual_prompt; changed = True 189 | if motion_prompt is not None and shot.motion_prompt != motion_prompt: 190 | shot.motion_prompt = motion_prompt; changed = True 191 | if changed: 192 | shot.status = STATUS_PENDING; shot.keyframe_image_path = ""; shot.video_path = "" 193 | self._mark_scene_for_reassembly(scene_idx) 194 | self._save_state() 195 | 196 | def _mark_scene_for_reassembly(self, scene_idx: int): 197 | scene = self.get_scene_info(scene_idx) 198 | if scene and scene.status == STATUS_COMPLETED: 199 | scene.status = STATUS_PENDING; scene.assembled_video_path = "" 200 | self._mark_final_for_reassembly() 201 | 202 | def _mark_final_for_reassembly(self): 203 | if self.state and self.state.final_video.status == STATUS_GENERATED: 204 | self.state.final_video.status = STATUS_PENDING; self.state.final_video.path = "" 205 | self.state.project_info.status = STATUS_IN_PROGRESS 206 | 207 | def get_scene_info(self, scene_idx: int) -> Optional[Scene]: 208 | if not self.state: return None 209 | return next((s for s in self.state.scenes if s.scene_idx == scene_idx), None) 210 | 211 | def update_narration_part_status(self, part_idx: int, status: str, audio_path: str = "", duration: float = 0.0): 212 | if not self.state or part_idx >= len(self.state.script.narration_parts): return 213 | part = self.state.script.narration_parts[part_idx] 214 | part.status = status; part.audio_path = audio_path; part.duration = duration 215 | self._save_state() 216 | 217 | def update_shot_status(self, scene_idx, shot_idx, status, keyframe_path=None, video_path=None): 218 | scene = self.get_scene_info(scene_idx) 219 | if not scene or shot_idx >= len(scene.shots): return 220 | shot = 
scene.shots[shot_idx] 221 | shot.status = status 222 | if keyframe_path: shot.keyframe_image_path = keyframe_path 223 | if video_path: shot.video_path = video_path 224 | self._save_state() 225 | 226 | def update_scene_status(self, scene_idx, status, assembled_video_path=None): 227 | scene = self.get_scene_info(scene_idx) 228 | if not scene: return 229 | scene.status = status 230 | if assembled_video_path: scene.assembled_video_path = assembled_video_path 231 | self._save_state() 232 | 233 | def update_final_video(self, path, status, full_narration_text, hashtags): 234 | if not self.state: return 235 | self.state.final_video.path = path 236 | self.state.final_video.status = status 237 | self.state.final_video.full_narration_text = full_narration_text 238 | self.state.final_video.hashtags = hashtags 239 | if status == "generated": self.state.project_info.status = "completed" 240 | self._save_state() 241 | 242 | def add_character(self, character_data: Dict[str, Any]): 243 | if not self.state: return 244 | char = Character(**character_data) 245 | self.state.characters = [c for c in self.state.characters if c.name != char.name] 246 | self.state.characters.append(char) 247 | self._save_state() 248 | 249 | def update_config_value(self, key: str, value: Any): 250 | """Updates a specific key in the project's ContentConfig.""" 251 | if not self.state: return 252 | 253 | if key in ContentConfig.model_fields: 254 | config_dict = self.state.project_info.config 255 | if config_dict.get(key) != value: 256 | config_dict[key] = value 257 | self._mark_final_for_reassembly() # If assembly setting changes, reassembly is needed 258 | self._save_state() 259 | logger.info(f"Updated project config: set {key} to {value}") 260 | else: 261 | logger.warning(f"Warning: Attempted to update an unknown config key: {key}") 262 | 263 | def update_character(self, old_name: str, new_name: str, new_reference_image_path: Optional[str]): 264 | char = self.get_character(old_name) 265 | if not char: return 266 | 267 | image_changed = new_reference_image_path and char.reference_image_path != new_reference_image_path 268 | name_changed = new_name and char.name != new_name 269 | 270 | if image_changed: 271 | char.reference_image_path = new_reference_image_path 272 | self._reset_visuals_for_character(old_name) 273 | 274 | if name_changed: 275 | char.name = new_name 276 | self._update_scene_references_on_name_change(old_name, new_name) 277 | 278 | self._save_state() 279 | 280 | def delete_character(self, name: str): 281 | if not self.state: return 282 | self.state.characters = [c for c in self.state.characters if c.name != name] 283 | for scene in self.state.scenes: 284 | if name in scene.character_names: 285 | scene.character_names.remove(name) 286 | 287 | safe_name = name.replace(" ", "_") 288 | char_dir = os.path.join(self.output_dir, "characters", safe_name) 289 | if os.path.exists(char_dir): 290 | shutil.rmtree(char_dir) 291 | logger.info(f"Removed character asset directory: {char_dir}") 292 | 293 | self._save_state() 294 | 295 | def _reset_visuals_for_character(self, character_name: str): 296 | logger.info(f"Resetting visuals for scenes containing character: {character_name}") 297 | for scene in self.state.scenes: 298 | if character_name in scene.character_names: 299 | for shot in scene.shots: 300 | shot.status = STATUS_PENDING 301 | shot.keyframe_image_path = "" 302 | shot.video_path = "" 303 | scene.status = STATUS_PENDING 304 | scene.assembled_video_path = "" 305 | self._mark_final_for_reassembly() 306 | 307 | def 
_update_scene_references_on_name_change(self, old_name: str, new_name: str): 308 | for scene in self.state.scenes: 309 | if old_name in scene.character_names: 310 | scene.character_names = [new_name if name == old_name else name for name in scene.character_names] 311 | 312 | def get_character(self, name: str) -> Optional[Character]: 313 | if not self.state: return None 314 | return next((c for c in self.state.characters if c.name == name), None) 315 | 316 | def update_scene_characters(self, scene_idx: int, character_names: List[str]): 317 | scene = self.get_scene_info(scene_idx) 318 | if scene: 319 | scene.character_names = character_names 320 | self._save_state() 321 | 322 | def add_new_scene_at(self, scene_idx: int, narration_text: str = "New scene narration.", visual_prompt: str = "A vibrant new scene."): 323 | if not self.state: return 324 | logger.info(f"Adding new scene at index {scene_idx}") 325 | 326 | new_narration = NarrationPart(text=narration_text) 327 | new_visual = VisualPrompt(prompt=visual_prompt) 328 | self.state.script.narration_parts.insert(scene_idx, new_narration) 329 | self.state.script.visual_prompts.insert(scene_idx, new_visual) 330 | 331 | for i in range(len(self.state.scenes) - 1, -1, -1): 332 | scene = self.state.scenes[i] 333 | if scene.scene_idx >= scene_idx: 334 | scene.scene_idx += 1 335 | 336 | self._mark_final_for_reassembly() 337 | self._save_state() 338 | 339 | # --- NEW METHOD --- 340 | def reset_scene_for_shot_regeneration(self, scene_idx: int): 341 | """Deletes a scene's assets and state, preparing it for shot regeneration.""" 342 | if not self.state: return 343 | 344 | scene_to_reset = self.get_scene_info(scene_idx) 345 | if not scene_to_reset: 346 | logger.warning(f"No scene found at index {scene_idx} to reset.") 347 | return 348 | 349 | logger.info(f"Resetting Scene {scene_idx} for shot regeneration.") 350 | # Delete physical assets associated with the scene's shots 351 | for shot in scene_to_reset.shots: 352 | if shot.keyframe_image_path and os.path.exists(shot.keyframe_image_path): 353 | try: os.remove(shot.keyframe_image_path) 354 | except OSError as e: logger.error(f"Error removing keyframe image {shot.keyframe_image_path}: {e}") 355 | if shot.video_path and os.path.exists(shot.video_path): 356 | try: os.remove(shot.video_path) 357 | except OSError as e: logger.error(f"Error removing shot video {shot.video_path}: {e}") 358 | 359 | # Delete the assembled scene video if it exists 360 | if scene_to_reset.assembled_video_path and os.path.exists(scene_to_reset.assembled_video_path): 361 | try: os.remove(scene_to_reset.assembled_video_path) 362 | except OSError as e: logger.error(f"Error removing assembled scene video {scene_to_reset.assembled_video_path}: {e}") 363 | 364 | # Remove the Scene object from the state 365 | self.state.scenes = [s for s in self.state.scenes if s.scene_idx != scene_idx] 366 | 367 | # Mark the final video for reassembly 368 | self._mark_final_for_reassembly() 369 | self._save_state() 370 | 371 | 372 | def remove_scene_at(self, scene_idx: int): 373 | if not self.state or scene_idx >= len(self.state.script.narration_parts): return 374 | logger.info(f"Removing scene at index {scene_idx}") 375 | 376 | del self.state.script.narration_parts[scene_idx] 377 | del self.state.script.visual_prompts[scene_idx] 378 | 379 | scene_to_remove = self.get_scene_info(scene_idx) 380 | if scene_to_remove: 381 | base_dir = self.output_dir 382 | audio_path = os.path.join(base_dir, f"scene_{scene_idx}_audio.wav") 383 | if os.path.exists(audio_path): 
os.remove(audio_path) 384 | assembled_path = os.path.join(base_dir, f"scene_{scene_idx}_assembled_video.mp4") 385 | if os.path.exists(assembled_path): os.remove(assembled_path) 386 | for shot in scene_to_remove.shots: 387 | if shot.keyframe_image_path and os.path.exists(shot.keyframe_image_path): 388 | os.remove(shot.keyframe_image_path) 389 | if shot.video_path and os.path.exists(shot.video_path): 390 | os.remove(shot.video_path) 391 | 392 | self.state.scenes = [s for s in self.state.scenes if s.scene_idx != scene_idx] 393 | 394 | for scene in self.state.scenes: 395 | if scene.scene_idx > scene_idx: 396 | scene.scene_idx -= 1 397 | 398 | self._mark_final_for_reassembly() 399 | self._save_state() -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | # In app.py 2 | 3 | import streamlit as st 4 | import os 5 | import json 6 | from datetime import datetime 7 | import torch 8 | import time 9 | from typing import List, Dict, Any 10 | 11 | # Fix for Streamlit/Torch conflict 12 | torch.classes.__path__ = [] 13 | 14 | # Local imports 15 | from project_manager import ProjectManager 16 | from config_manager import ContentConfig 17 | from ui_task_executor import UITaskExecutor 18 | from utils import list_projects, load_and_correct_image_orientation 19 | from module_discovery import discover_modules 20 | # --- START OF MODIFICATION --- 21 | # Import the new detection function 22 | from system import SystemConfig, load_system_config, save_system_config, detect_system_specs 23 | # --- END OF MODIFICATION --- 24 | 25 | # Page Config 26 | st.set_page_config(page_title="AI Video Generation Pipeline", page_icon="🎥", layout="wide") 27 | 28 | # Session State 29 | def init_session_state(): 30 | system_config = load_system_config() 31 | 32 | defaults = { 33 | 'current_project': None, 34 | 'current_step': 'system_config_setup' if not system_config else 'project_selection', 35 | 'system_config': system_config, 36 | 'auto_mode': True, 37 | 'ui_executor': None, 38 | 'speaker_audio': None, 39 | 'is_processing': False, 40 | 'new_project_characters': [], 41 | 'discovered_modules': discover_modules() 42 | } 43 | for key, value in defaults.items(): 44 | if key not in st.session_state: st.session_state[key] = value 45 | init_session_state() 46 | 47 | def go_to_step(step_name): 48 | st.session_state.current_step = step_name 49 | st.rerun() 50 | 51 | def load_project(project_name): 52 | project_manager = ProjectManager(f"modular_reels_output/{project_name}") 53 | if project_manager.load_project(): 54 | st.session_state.current_project = project_manager 55 | st.session_state.ui_executor = UITaskExecutor(project_manager) 56 | st.session_state.auto_mode = False; 57 | st.session_state.is_processing = False 58 | speaker_relative_path = project_manager.state.project_info.speaker_audio_path 59 | if speaker_relative_path: 60 | full_speaker_path = os.path.join(project_manager.output_dir, speaker_relative_path) 61 | if os.path.exists(full_speaker_path): 62 | st.session_state.speaker_audio = full_speaker_path 63 | else: 64 | st.session_state.speaker_audio = None 65 | st.warning(f"Saved speaker audio not found at: {full_speaker_path}") 66 | else: 67 | st.session_state.speaker_audio = None 68 | go_to_step('processing_dashboard') 69 | else: 70 | st.error("Failed to load project.") 71 | 72 | 73 | def create_new_project(title, topic, auto, audio, video_format, length, min_s, max_s, use_svd, characters, 
module_selections, language, add_narration_text, seed): 74 | name = "".join(c for c in title.lower() if c.isalnum() or c in " ").replace(" ", "_")[:50] 75 | output_dir = f"modular_reels_output/{name}_{int(time.time())}" 76 | 77 | cfg = ContentConfig( 78 | output_dir=output_dir, 79 | aspect_ratio_format=video_format, 80 | target_video_length_hint=length, 81 | min_scenes=min_s, 82 | max_scenes=max_s, 83 | use_svd_flow=use_svd, 84 | module_selections=module_selections, 85 | language=language, 86 | add_narration_text_to_video=add_narration_text, 87 | seed=seed 88 | ) 89 | pm = ProjectManager(output_dir) 90 | pm.initialize_project(title, topic, cfg) 91 | 92 | if characters: 93 | for char_info in characters: 94 | safe_name = char_info['name'].replace(" ", "_") 95 | char_dir = os.path.join(output_dir, "characters", safe_name) 96 | os.makedirs(char_dir, exist_ok=True) 97 | ref_image_path = os.path.join(char_dir, "reference.png") 98 | 99 | corrected_image = load_and_correct_image_orientation(char_info['image']) 100 | if corrected_image: 101 | corrected_image.save(ref_image_path, "PNG") 102 | pm.add_character({"name": char_info['name'], "reference_image_path": ref_image_path}) 103 | else: 104 | st.error(f"Could not process image for character {char_info['name']}. Skipping.") 105 | 106 | st.session_state.current_project = pm 107 | st.session_state.ui_executor = UITaskExecutor(pm) 108 | st.session_state.auto_mode = auto 109 | if audio: 110 | relative_speaker_path = "speaker_audio.wav" 111 | full_speaker_path = os.path.join(output_dir, relative_speaker_path) 112 | with open(full_speaker_path, "wb") as f: f.write(audio.getbuffer()) 113 | st.session_state.speaker_audio = full_speaker_path 114 | pm.set_speaker_audio(relative_speaker_path) 115 | 116 | with st.spinner("Generating script..."): 117 | success = st.session_state.ui_executor.task_executor.execute_task("generate_script", {"topic": topic}) 118 | 119 | if success: 120 | st.success("Script generated!") 121 | st.session_state.current_project.load_project() 122 | st.session_state.new_project_characters = [] 123 | go_to_step('processing_dashboard') 124 | else: 125 | st.error("Failed to generate script.") 126 | st.session_state.current_project = None 127 | 128 | def handle_flow_change(): 129 | st.session_state.new_project_characters = [] 130 | 131 | def render_system_config_setup(): 132 | st.title("⚙️ System Configuration") 133 | st.info("First, let's specify your available system resources. This helps the pipeline select compatible AI models and prevent memory errors. This information will be saved locally in `system.json` for future use.") 134 | 135 | # --- START OF MODIFICATION --- 136 | # Call the detection function to get default values for the form 137 | detected_vram, detected_ram = detect_system_specs() 138 | # --- END OF MODIFICATION --- 139 | 140 | with st.form("system_config_form"): 141 | # --- START OF MODIFICATION --- 142 | # Use the detected values as the default for the number_input widgets 143 | vram = st.number_input("Available GPU VRAM (GB)", min_value=1.0, value=detected_vram, step=0.5, help="We've tried to detect this automatically. Please confirm or adjust.") 144 | ram = st.number_input("Available System RAM (GB)", min_value=1.0, value=float(detected_ram), step=1.0, help="We've tried to detect this automatically. 
Please confirm or adjust.") 145 | # --- END OF MODIFICATION --- 146 | 147 | submitted = st.form_submit_button("Save and Continue", type="primary") 148 | 149 | if submitted: 150 | save_system_config(vram, ram) 151 | st.session_state.system_config = SystemConfig(vram_gb=vram, ram_gb=ram) 152 | st.success("System configuration saved!") 153 | time.sleep(1) 154 | go_to_step('project_selection') 155 | 156 | 157 | def render_project_selection(): 158 | st.title("🎥 AI Video Generation Pipeline") 159 | 160 | def filter_modules_by_resources(modules: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 161 | system_config = st.session_state.system_config 162 | if not system_config: 163 | return [] 164 | 165 | compatible_modules = [] 166 | for mod in modules: 167 | caps = mod['caps'] 168 | if caps.vram_gb_min <= system_config.vram_gb and caps.ram_gb_min <= system_config.ram_gb: 169 | compatible_modules.append(mod) 170 | else: 171 | print(f"Filtering out module '{caps.title}': Needs {caps.vram_gb_min}GB VRAM / {caps.ram_gb_min}GB RAM. Have {system_config.vram_gb}/{system_config.ram_gb}.") 172 | return compatible_modules 173 | 174 | def get_caps_from_path(mod_type: str, path: str) -> Dict[str, Any]: 175 | if not path: return None 176 | for mod in st.session_state.discovered_modules.get(mod_type, []): 177 | if mod['path'] == path: 178 | return mod['caps'] 179 | return None 180 | 181 | def format_module_option(mod_type: str, path: str) -> str: 182 | caps = get_caps_from_path(mod_type, path) 183 | return caps.title if caps and caps.title else (path.split('.')[-1] if path else "Not Selected") 184 | 185 | c1, c2 = st.columns([1.2, 2]) 186 | 187 | with c2: 188 | st.subheader("Existing Projects") 189 | 190 | projects = list_projects() 191 | if not projects: 192 | st.info("No projects found. 
Create one to get started!") 193 | 194 | for p in projects: 195 | with st.container(border=True): 196 | proj_c1, proj_c2 = st.columns([3, 1]) 197 | with proj_c1: 198 | st.markdown(f"**{p['title']}**") 199 | with proj_c2: 200 | st.caption(f"_{p['created_at'].strftime('%Y-%m-%d %H:%M')}_") 201 | 202 | status_map = { "completed": "✅ Completed", "in_progress": "⚙️ In Progress", "failed": "❌ Failed" } 203 | display_status = status_map.get(p['status'], p['status'].title()) 204 | 205 | info_parts = [ f"**Flow:** {p['flow']}", f"**Status:** {display_status}" ] 206 | if p['duration'] > 0: info_parts.append(f"**Duration:** {p['duration']:.1f}s") 207 | 208 | st.markdown(" | ".join(info_parts), help="Project details") 209 | 210 | with st.expander("Show Modules Used"): 211 | modules_used = p.get('modules', {}) 212 | if not modules_used: 213 | st.caption("Module info not available.") 214 | else: 215 | module_info_str = "" 216 | llm_title = format_module_option('llm', modules_used.get('llm')) 217 | tts_title = format_module_option('tts', modules_used.get('tts')) 218 | 219 | module_info_str += f"- **LLM:** {llm_title}\n" 220 | module_info_str += f"- **TTS:** {tts_title}\n" 221 | 222 | if p['flow'] == "Image-to-Video": 223 | t2i_title = format_module_option('t2i', modules_used.get('t2i')) 224 | i2v_title = format_module_option('i2v', modules_used.get('i2v')) 225 | module_info_str += f"- **Image Model:** {t2i_title}\n" 226 | module_info_str += f"- **Video Model:** {i2v_title}\n" 227 | else: # Text-to-Video 228 | t2v_title = format_module_option('t2v', modules_used.get('t2v')) 229 | module_info_str += f"- **Video Model:** {t2v_title}\n" 230 | 231 | st.markdown(module_info_str) 232 | 233 | btn_c1, btn_c2 = st.columns(2) 234 | 235 | with btn_c1: 236 | st.button("Load Project", key=f"load_{p['name']}", on_click=load_project, args=(p['name'],), use_container_width=True) 237 | 238 | with btn_c2: 239 | if p['final_video_path']: 240 | with st.popover("▶️ Play Video", use_container_width=True): 241 | st.video(p['final_video_path']) 242 | else: 243 | st.button("▶️ Play Video", key=f"play_{p['name']}", disabled=True, use_container_width=True, help="Video not available or project not completed.") 244 | 245 | with c1: 246 | st.subheader("Create New Project") 247 | with st.container(border=True): 248 | st.markdown(f"**System Specs:** `{st.session_state.system_config.vram_gb}` GB VRAM | `{st.session_state.system_config.ram_gb}` GB RAM") 249 | if st.button("Change System Specs", key="change_specs"): 250 | go_to_step('system_config_setup') 251 | 252 | st.info("Step 1: Choose your workflow and AI models (filtered by your specs).") 253 | st.radio("Generation Flow", ("Image to Video (High Quality)", "Text to Video (Fast)"), horizontal=True, key="flow_choice", on_change=handle_flow_change) 254 | use_svd = st.session_state.flow_choice == "Image to Video (High Quality)" 255 | 256 | tts_options = filter_modules_by_resources(st.session_state.discovered_modules.get('tts', [])) 257 | tts_paths = [m['path'] for m in tts_options] 258 | st.selectbox("Text-to-Speech Model", options=tts_paths, format_func=lambda path: format_module_option('tts', path), key="selected_tts_module", on_change=lambda: st.session_state.update()) 259 | 260 | selected_tts_caps = get_caps_from_path('tts', st.session_state.get('selected_tts_module')) 261 | language = "en" 262 | if selected_tts_caps and selected_tts_caps.supported_tts_languages: 263 | supported_langs = selected_tts_caps.supported_tts_languages 264 | language = st.selectbox("Narration Language", 
options=supported_langs, index=0, key="selected_language") 265 | elif selected_tts_caps: 266 | st.caption("Language selection not available for this model.") 267 | 268 | with st.form("new_project_form"): 269 | has_characters = len(st.session_state.new_project_characters) > 0 270 | module_selections = {'tts': st.session_state.get('selected_tts_module')} 271 | 272 | llm_options_filtered = filter_modules_by_resources(st.session_state.discovered_modules.get('llm', [])) 273 | module_selections['llm'] = st.selectbox("Language Model (LLM)", options=[m['path'] for m in llm_options_filtered], format_func=lambda path: format_module_option('llm', path)) 274 | 275 | show_char_section = False 276 | 277 | selected_video_model_path = None 278 | if use_svd: 279 | all_t2i_options = filter_modules_by_resources(st.session_state.discovered_modules.get('t2i', [])) 280 | t2i_options = [m for m in all_t2i_options if not has_characters or m['caps'].supports_ip_adapter] 281 | 282 | all_i2v_options_filtered = filter_modules_by_resources(st.session_state.discovered_modules.get('i2v', [])) 283 | 284 | module_selections['t2i'] = st.selectbox("Image Model (T2I)", options=[m['path'] for m in t2i_options], format_func=lambda path: format_module_option('t2i', path), key="t2i_selection", help="Models are filtered based on your system specs and character support.") 285 | module_selections['i2v'] = st.selectbox("Image-to-Video Model (I2V)", options=[m['path'] for m in all_i2v_options_filtered], format_func=lambda path: format_module_option('i2v', path), help="Models are filtered based on your system specs.") 286 | 287 | selected_video_model_path = module_selections.get('t2i') 288 | if selected_video_model_path: 289 | selected_caps = get_caps_from_path('t2i', selected_video_model_path) 290 | if selected_caps and selected_caps.supports_ip_adapter: 291 | show_char_section = True 292 | else: # T2V Flow 293 | all_t2v_options = filter_modules_by_resources(st.session_state.discovered_modules.get('t2v', [])) 294 | t2v_options = [m for m in all_t2v_options if not has_characters or m['caps'].supports_ip_adapter] 295 | 296 | module_selections['t2v'] = st.selectbox("Text-to-Video Model (T2V)", options=[m['path'] for m in t2v_options], format_func=lambda path: format_module_option('t2v', path), key="t2v_selection", help="Models are filtered based on your system specs and character support.") 297 | 298 | selected_video_model_path = module_selections.get('t2v') 299 | if selected_video_model_path: 300 | selected_caps = get_caps_from_path('t2v', selected_video_model_path) 301 | if selected_caps and selected_caps.supports_ip_adapter: 302 | show_char_section = True 303 | 304 | st.divider() 305 | st.info("Step 2: Define your project title and content topic.") 306 | title = st.text_input("Project Title", help="A user-friendly name for your project. 
This will be used for the folder name.") 307 | topic = st.text_area("Video Topic / Prompt", help="The main idea or prompt for the AI to generate the script.") 308 | col1, col2 = st.columns(2) 309 | fmt = col1.selectbox("Format", ("Portrait", "Landscape"), index=1) 310 | length = col2.number_input("Length (s)", min_value=5, value=20, step=5) 311 | c1_s, c2_s = st.columns(2) 312 | min_s = c1_s.number_input("Min Scenes", 1, 10, 2, 1) 313 | max_s = c2_s.number_input("Max Scenes", min_s, 10, 5, 1) 314 | 315 | st.divider() 316 | st.info("Step 3: Final Touches") 317 | seed = st.number_input("Image Generation Seed", min_value=-1, value=-1, step=1, help="-1 for a random seed, or any other number for a fixed seed.") 318 | auto = st.checkbox("Automatic Mode", value=True) 319 | audio = st.file_uploader( 320 | "Reference Speaker Audio (Required, .wav)", 321 | type=['wav'], 322 | help="Upload a short .wav file of the desired voice. This is required to create a project." 323 | ) 324 | add_narration_text = st.checkbox("Add Narration Text to Video", value=True, help="Renders the narration text as captions on the final video.") 325 | 326 | submitted = st.form_submit_button("Create & Start Project", type="primary") 327 | if submitted: 328 | final_language = st.session_state.get('selected_language', 'en') 329 | flow_is_valid = (use_svd and module_selections.get('t2i') and module_selections.get('i2v')) or \ 330 | (not use_svd and module_selections.get('t2v')) 331 | 332 | if not flow_is_valid or not module_selections.get('llm') or not module_selections.get('tts'): 333 | st.error("A required module for the selected workflow is missing. Please check your selections.") 334 | elif not title: 335 | st.error("Project Title is required.") 336 | elif not topic: 337 | st.error("Video Topic / Prompt is required.") 338 | elif not audio: 339 | st.error("Reference Speaker Audio is required. 
Please upload a .wav file.") 340 | else: 341 | final_chars = st.session_state.new_project_characters if show_char_section else [] 342 | create_new_project(title, topic, auto, audio, fmt, length, min_s, max_s, use_svd, final_chars, module_selections, final_language, add_narration_text, seed) 343 | 344 | st.divider() 345 | st.subheader("Add Characters (Optional)") 346 | if show_char_section: 347 | st.caption("Add characters to use for consistent generation.") 348 | for i, char in enumerate(st.session_state.new_project_characters): 349 | with st.container(border=True): 350 | char_c1, char_c2 = st.columns([1, 4]) 351 | corrected_image = load_and_correct_image_orientation(char['image']) 352 | if corrected_image: char_c1.image(corrected_image, width=64) 353 | char_c2.write(f"**{char['name']}**") 354 | with st.expander("Add a New Character"): 355 | with st.form("add_character_form", clear_on_submit=True): 356 | char_name = st.text_input("Character Name") 357 | char_image = st.file_uploader("Upload Character Image", type=['png', 'jpg', 'jpeg']) 358 | if st.form_submit_button("Add Character to Project"): 359 | if char_name and char_image: 360 | st.session_state.new_project_characters.append({"name": char_name, "image": char_image}) 361 | st.rerun() 362 | else: st.warning("Character name and image are required.") 363 | else: 364 | st.info("The selected model workflow does not support character consistency.") 365 | if st.session_state.new_project_characters: st.session_state.new_project_characters = [] 366 | 367 | 368 | def render_processing_dashboard(): 369 | project = st.session_state.current_project 370 | ui_executor = st.session_state.ui_executor 371 | 372 | def add_scene_at_callback(index_to_add): st.session_state.ui_executor.add_new_scene(index_to_add) 373 | def remove_scene_callback(scene_idx_to_remove): st.session_state.ui_executor.remove_scene(scene_idx_to_remove) 374 | def regen_shots_callback(scene_idx_to_regen): 375 | with st.spinner(f"Regenerating shots for Scene {scene_idx_to_regen + 1}..."): 376 | st.session_state.ui_executor.regenerate_scene_shots(scene_idx_to_regen) 377 | st.rerun() 378 | 379 | supports_characters = ui_executor.task_executor.active_flow_supports_characters 380 | use_svd_flow = project.state.project_info.config.get("use_svd_flow", True) 381 | 382 | st.title(f"🎬 Project: {project.state.project_info.title}") 383 | st.caption(f"LLM Topic: {project.state.project_info.topic}") 384 | 385 | with st.container(border=True): 386 | def get_module_title(mod_type: str, path: str) -> str: 387 | if not path: return "N/A" 388 | for mod in st.session_state.discovered_modules.get(mod_type, []): 389 | if mod['path'] == path: 390 | return mod['caps'].title 391 | return path.split('.')[-1] 392 | 393 | config_dict = project.state.project_info.config 394 | modules = config_dict.get('module_selections', {}) 395 | 396 | c1, c2, c3 = st.columns(3) 397 | 398 | with c1: 399 | st.caption("Project Settings") 400 | flow = "Image-to-Video" if config_dict.get('use_svd_flow', True) else "Text-to-Video" 401 | fmt = config_dict.get('aspect_ratio_format', 'N/A') 402 | length = config_dict.get('target_video_length_hint', 'N/A') 403 | st.markdown(f"**Flow:** {flow}
**Format:** {fmt}<br>**Length:** {length}s", unsafe_allow_html=True) 404 | 405 | with c2: 406 | st.caption("Core Models") 407 | llm_title = get_module_title('llm', modules.get('llm')) 408 | tts_title = get_module_title('tts', modules.get('tts')) 409 | st.markdown(f"**LLM:** {llm_title}<br>**TTS:** {tts_title}", unsafe_allow_html=True) 410 | 411 | with c3: 412 | st.caption("Video Generation Models") 413 | if config_dict.get('use_svd_flow', True): 414 | t2i_title = get_module_title('t2i', modules.get('t2i')) 415 | i2v_title = get_module_title('i2v', modules.get('i2v')) 416 | st.markdown(f"**Image:** {t2i_title}<br>
**Video:** {i2v_title}", unsafe_allow_html=True) 417 | else: 418 | t2v_title = get_module_title('t2v', modules.get('t2v')) 419 | st.markdown(f"**Video:** {t2v_title}") 420 | 421 | c1, c2, c3 = st.columns([2, 3, 2]) 422 | with c1: 423 | if st.button("⬅️ Back to Projects"): go_to_step('project_selection') 424 | with c2: 425 | if st.session_state.auto_mode: 426 | btn_text = "⏹️ Stop" if st.session_state.is_processing else "🚀 Start" 427 | if st.button(f"{btn_text} Automatic Processing", use_container_width=True, type="primary" if not st.session_state.is_processing else "secondary"): 428 | st.session_state.is_processing = not st.session_state.is_processing 429 | with c3: 430 | st.session_state.auto_mode = st.toggle("Automatic Mode", value=st.session_state.auto_mode, disabled=st.session_state.is_processing) 431 | st.divider() 432 | 433 | if supports_characters: 434 | expander_label = "👤 Project Characters & Subjects" 435 | if project.state.characters: expander_label = f"👤 Project Characters & Subjects: {', '.join([c.name for c in project.state.characters])}" 436 | with st.expander(expander_label, expanded=False): 437 | if not project.state.characters: st.info("No characters defined.") 438 | for char in project.state.characters: 439 | with st.container(border=True): 440 | c1_char, c2_char = st.columns([1, 3]) 441 | with c1_char: 442 | corrected_image = load_and_correct_image_orientation(char.reference_image_path) 443 | if corrected_image: st.image(corrected_image, caption=char.name, use_container_width=True) 444 | with c2_char: 445 | with st.popover("Edit Character", use_container_width=True): 446 | with st.form(f"edit_char_{char.name}"): 447 | st.write(f"Editing: **{char.name}**") 448 | new_name = st.text_input("New Name", value=char.name) 449 | new_image = st.file_uploader("Upload New Image", type=['png', 'jpg', 'jpeg'], key=f"edit_img_{char.name}") 450 | if st.form_submit_button("Save", type="primary"): ui_executor.update_character(char.name, new_name, new_image) 451 | if st.button("Delete Character", key=f"del_char_{char.name}", type="secondary", use_container_width=True): ui_executor.delete_character(char.name) 452 | with st.form("add_new_character_dashboard"): 453 | st.subheader("Add New Character") 454 | name = st.text_input("Character Name") 455 | image = st.file_uploader("Upload Reference Image", type=['png', 'jpg', 'jpeg']) 456 | if st.form_submit_button("Add Character", type="primary"): 457 | if name and image: ui_executor.add_character(name, image) 458 | else: st.error("Name and image are required.") 459 | else: 460 | st.info("This project's workflow does not support character consistency.") 461 | 462 | st.subheader("Content Generation Dashboard") 463 | 464 | with st.expander("Assembly & Export Settings"): 465 | cfg = ContentConfig(**project.state.project_info.config) 466 | 467 | c1, c2 = st.columns(2) 468 | current_text_setting = cfg.add_narration_text_to_video 469 | new_text_setting = c1.checkbox("Add Narration Text to Video", value=current_text_setting, help="Render the narration as captions. Requires re-assembly.") 470 | if new_text_setting != current_text_setting: 471 | ui_executor.update_project_config('add_narration_text_to_video', new_text_setting) 472 | 473 | current_seed = cfg.seed 474 | new_seed = c2.number_input("Image Seed", value=current_seed, min_value=-1, step=1, help="-1 for random. 
Changing this requires re-generating images.") 475 | if new_seed != current_seed: 476 | ui_executor.update_project_config('seed', new_seed) 477 | 478 | 479 | with st.expander("Reference Speaker Audio"): 480 | uploaded_file = st.file_uploader("Upload New Speaker Audio (.wav)", key="speaker_upload", disabled=st.session_state.is_processing) 481 | if uploaded_file: 482 | relative_speaker_path = "speaker_audio.wav" 483 | speaker_path = os.path.join(project.output_dir, relative_speaker_path) 484 | with open(speaker_path, "wb") as f: f.write(uploaded_file.getbuffer()) 485 | st.session_state.speaker_audio = speaker_path 486 | project.set_speaker_audio(relative_speaker_path) 487 | st.success("Speaker audio updated!") 488 | st.rerun() 489 | if st.session_state.speaker_audio and os.path.exists(st.session_state.speaker_audio): 490 | st.write("Current audio:"); st.audio(st.session_state.speaker_audio) 491 | else: 492 | st.info("No reference audio provided.") 493 | 494 | next_task_name, next_task_data = project.get_next_pending_task() 495 | is_ready_for_assembly = (next_task_name == "assemble_final") 496 | is_fully_complete = (next_task_name is None) 497 | 498 | if is_ready_for_assembly or is_fully_complete: 499 | if st.button("Assemble / View Final Video ➡️", type="primary"): 500 | if is_ready_for_assembly: 501 | with st.spinner("Assembling final video..."): 502 | success = ui_executor.assemble_final_video() 503 | if success: go_to_step('video_assembly') 504 | else: 505 | go_to_step('video_assembly') 506 | 507 | st.write("---") 508 | 509 | insert_c1, insert_c2, insert_c3 = st.columns([1, 1, 1]) 510 | with insert_c2: 511 | st.button("➕ Insert Scene Here", key="add_scene_at_0", on_click=add_scene_at_callback, args=(0,), use_container_width=True, disabled=st.session_state.is_processing) 512 | 513 | for i, part in enumerate(project.state.script.narration_parts): 514 | with st.container(border=True): 515 | header_c1, header_c2 = st.columns([0.9, 0.1]) 516 | with header_c1: st.header(f"Scene {i+1}") 517 | with header_c2: st.button("❌", key=f"delete_scene_{i}", help="Delete this scene", disabled=st.session_state.is_processing, on_click=remove_scene_callback, args=(i,)) 518 | 519 | if supports_characters: 520 | scene = project.get_scene_info(i) 521 | if scene and project.state.characters: 522 | all_char_names = [c.name for c in project.state.characters] 523 | selected_chars = st.multiselect("Characters in this Scene", options=all_char_names, default=scene.character_names, key=f"scene_chars_{i}") 524 | if selected_chars != scene.character_names: ui_executor.update_scene_characters(i, selected_chars) 525 | 526 | st.subheader("Narration") 527 | new_text = st.text_area("Script", part.text, key=f"text_{i}", height=100, label_visibility="collapsed", disabled=st.session_state.is_processing) 528 | if new_text != part.text: ui_executor.update_narration_text(i, new_text) 529 | 530 | audio_col1, audio_col2 = st.columns(2) 531 | if part.audio_path and os.path.exists(part.audio_path): 532 | audio_col1.audio(part.audio_path) 533 | if audio_col2.button("Regen Audio", key=f"regen_audio_{i}", disabled=st.session_state.is_processing, use_container_width=True): 534 | with st.spinner("..."): ui_executor.regenerate_audio(i, new_text, st.session_state.speaker_audio); st.rerun() 535 | else: 536 | if audio_col1.button("Gen Audio", key=f"gen_audio_{i}", disabled=st.session_state.is_processing, use_container_width=True): 537 | with st.spinner("..."): ui_executor.regenerate_audio(i, new_text, st.session_state.speaker_audio); st.rerun() 
538 | 539 | st.divider() 540 | 541 | scene = project.get_scene_info(i) 542 | if scene: 543 | shots_header_c1, shots_header_c2 = st.columns([0.75, 0.25]) 544 | with shots_header_c1: st.subheader("Visual Shots") 545 | with shots_header_c2: st.button("Regen Shots", key=f"regen_shots_{i}", on_click=regen_shots_callback, args=(i,), disabled=st.session_state.is_processing, use_container_width=True, help="Regenerate all visual and motion prompts for this scene.") 546 | 547 | for shot in scene.shots: 548 | shot_idx = shot.shot_idx 549 | with st.container(border=True): 550 | if use_svd_flow: 551 | p_col, i_col, v_col = st.columns([2, 1, 1]) 552 | with p_col: 553 | st.write(f"**Shot {shot_idx + 1}**") 554 | vis = st.text_area("Visual", shot.visual_prompt, key=f"v_prompt_{i}_{shot_idx}", height=125, disabled=st.session_state.is_processing) 555 | if vis != shot.visual_prompt: ui_executor.update_shot_prompts(i, shot_idx, visual_prompt=vis) 556 | mot = st.text_area("Motion", shot.motion_prompt, key=f"m_prompt_{i}_{shot_idx}", height=75, disabled=st.session_state.is_processing) 557 | if mot != shot.motion_prompt: ui_executor.update_shot_prompts(i, shot_idx, motion_prompt=mot) 558 | with i_col: 559 | st.write("**Image**"); has_image = shot.keyframe_image_path and os.path.exists(shot.keyframe_image_path) 560 | if has_image: st.image(shot.keyframe_image_path) 561 | else: st.info("Image pending...") 562 | if st.button("Regen Image" if has_image else "Gen Image", key=f"gen_img_{i}_{shot_idx}", disabled=st.session_state.is_processing, use_container_width=True): 563 | with st.spinner("..."): ui_executor.regenerate_shot_image(i, shot_idx); st.rerun() 564 | with v_col: 565 | st.write("**Video**"); has_video = shot.video_path and os.path.exists(shot.video_path) 566 | if has_video: st.video(shot.video_path) 567 | else: st.info("Video pending...") 568 | if st.button("Regen Video" if has_video else "Gen Video", key=f"gen_vid_{i}_{shot_idx}", disabled=st.session_state.is_processing or not has_image, use_container_width=True): 569 | with st.spinner("..."): ui_executor.regenerate_shot_video(i, shot_idx); st.rerun() 570 | else: # T2V Flow 571 | p_col, v_col = st.columns([2, 1]) 572 | with p_col: 573 | st.write(f"**Shot {shot_idx + 1} Prompt**") 574 | vis = st.text_area("Prompt", shot.visual_prompt, key=f"v_prompt_{i}_{shot_idx}", height=125, disabled=st.session_state.is_processing) 575 | if vis != shot.visual_prompt: ui_executor.update_shot_prompts(i, shot_idx, visual_prompt=vis) 576 | with v_col: 577 | st.write("**Video**"); has_video = shot.video_path and os.path.exists(shot.video_path) 578 | if has_video: st.video(shot.video_path) 579 | else: st.info("Video pending...") 580 | if st.button("Regen Video" if has_video else "Gen Video", key=f"gen_t2v_{i}_{shot_idx}", disabled=st.session_state.is_processing, use_container_width=True): 581 | with st.spinner("..."): ui_executor.regenerate_shot_t2v(i, shot_idx); st.rerun() 582 | elif part.status == "generated": 583 | if st.button("Define Visual Shots", key=f"create_scene_{i}", disabled=st.session_state.is_processing, use_container_width=True, help="Generates the visual and motion prompts for this scene based on its narration."): 584 | with st.spinner("..."): ui_executor.create_scene(i); st.rerun() 585 | else: st.info("Generate audio before scene creation.") 586 | 587 | insert_c1, insert_c2, insert_c3 = st.columns([1, 1, 1]) 588 | with insert_c2: 589 | st.button("➕ Insert Scene Here", key=f"add_scene_at_{i+1}", on_click=add_scene_at_callback, args=(i + 1,), 
use_container_width=True, disabled=st.session_state.is_processing) 590 | 591 | st.divider() 592 | 593 | if st.session_state.auto_mode and st.session_state.is_processing: 594 | if next_task_name is None: 595 | st.session_state.is_processing = False; st.toast("✅ All tasks done!"); go_to_step('video_assembly') 596 | else: 597 | msg = f"Executing: {next_task_name.replace('_', ' ')} for Scene {next_task_data.get('scene_idx', 0) + 1}..." 598 | if "shot" in next_task_name: msg += f" / Shot {next_task_data.get('shot_idx', 0) + 1}" 599 | with st.spinner(msg): 600 | if next_task_name == 'generate_audio': next_task_data['speaker_wav'] = st.session_state.speaker_audio 601 | success = st.session_state.ui_executor.task_executor.execute_task(next_task_name, next_task_data) 602 | if success: 603 | fresh_pm = ProjectManager(st.session_state.current_project.output_dir); fresh_pm.load_project() 604 | st.session_state.current_project = fresh_pm 605 | st.session_state.ui_executor = UITaskExecutor(fresh_pm) 606 | st.rerun() 607 | else: 608 | st.error(f"❌ Failed on: {next_task_name}. Stopping."); st.session_state.is_processing = False 609 | 610 | def render_video_assembly(): 611 | st.title("Final Video Assembly") 612 | project = st.session_state.current_project 613 | if st.button("⬅️ Back to Dashboard"): go_to_step('processing_dashboard') 614 | st.divider() 615 | final_path = project.state.final_video.path 616 | if final_path and os.path.exists(final_path): 617 | st.subheader("Final Video"); st.video(final_path) 618 | with st.expander("Details"): 619 | st.write("**Narration:**", project.state.final_video.full_narration_text) 620 | st.write("**Hashtags:**", ", ".join(project.state.final_video.hashtags)) 621 | 622 | if st.button("Re-Assemble Final Video", type="primary"): 623 | with st.spinner("..."): 624 | if not all(s.status == 'completed' for s in project.state.scenes): 625 | for scene in project.state.scenes: 626 | if scene.status != 'completed': 627 | st.write(f"Assembling scene {scene.scene_idx+1}...") 628 | st.session_state.ui_executor.task_executor.execute_task("assemble_scene", {"scene_idx": scene.scene_idx}) 629 | 630 | st.write("Assembling final video...") 631 | success = st.session_state.ui_executor.assemble_final_video() 632 | 633 | if success: 634 | st.success("Assembled!") 635 | st.rerun() 636 | else: 637 | st.error("Failed.") 638 | 639 | # Main application router 640 | if st.session_state.current_step == 'system_config_setup': 641 | render_system_config_setup() 642 | elif st.session_state.current_step == 'project_selection': 643 | render_project_selection() 644 | elif st.session_state.current_project: 645 | if st.session_state.current_step == 'processing_dashboard': 646 | render_processing_dashboard() 647 | elif st.session_state.current_step == 'video_assembly': 648 | render_video_assembly() 649 | else: 650 | go_to_step('project_selection') --------------------------------------------------------------------------------
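
The automatic mode in render_processing_dashboard is, at its core, a task loop: fetch the project's next pending task, execute it, reload the saved project state, and repeat until nothing is pending. The sketch below shows that same loop run outside Streamlit; it relies only on the ProjectManager, UITaskExecutor, and TaskExecutor.execute_task interfaces visible above, while the run_headless helper itself (and its output_dir/speaker_wav parameters) is hypothetical and not part of the repository.

# headless_runner.py -- illustrative sketch, not part of this repository
from project_manager import ProjectManager
from ui_task_executor import UITaskExecutor

def run_headless(output_dir: str, speaker_wav: str | None = None) -> bool:
    """Drive every pending task for a project without the Streamlit UI."""
    pm = ProjectManager(output_dir)
    if not pm.load_project():
        return False
    executor = UITaskExecutor(pm)

    while True:
        task_name, task_data = pm.get_next_pending_task()
        if task_name is None:
            return True  # None means every task, including final assembly, is done
        if task_name == "generate_audio":
            # app.py injects the reference speaker audio the same way
            task_data["speaker_wav"] = speaker_wav
        if not executor.task_executor.execute_task(task_name, task_data):
            return False  # mirror the UI: stop on the first failed task
        # app.py rebuilds a fresh ProjectManager/UITaskExecutor after every task;
        # do the same so each step sees the state that was just written to disk.
        pm = ProjectManager(output_dir)
        pm.load_project()
        executor = UITaskExecutor(pm)

In app.py the loop is spread across Streamlit reruns instead: st.rerun() fires after each completed task so the dashboard can redraw, and st.session_state.is_processing is the stop flag the Start/Stop button toggles.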