├── __init__.py
├── .python-version
├── llm_modules
│   ├── __init__.py
│   └── llm_zephyr.py
├── tts_modules
│   ├── __init__.py
│   └── tts_coqui.py
├── t2i_modules
│   ├── __init__.py
│   ├── t2i_sdxl.py
│   └── t2i_juggernaut.py
├── t2v_modules
│   ├── __init__.py
│   ├── t2v_zeroscope.py
│   ├── t2v_wan.py
│   └── t2v_ltx.py
├── i2v_modules
│   ├── __init__.py
│   ├── i2v_slideshow.py
│   ├── i2v_ltx.py
│   ├── i2v_svd.py
│   └── i2v_wan.py
├── check_versions.py
├── .gitignore
├── pyproject.toml
├── todo.todo
├── module_discovery.py
├── config_manager.py
├── package_code.sh
├── mp3_to_wav_converter.py
├── system.py
├── utils.py
├── base_modules.py
├── __requirements.txt
├── ui_task_executor.py
├── task_executor.py
├── README.md
├── video_assembly.py
├── project_manager.py
└── app.py

/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.12
--------------------------------------------------------------------------------
/llm_modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .llm_zephyr import ZephyrLLM
--------------------------------------------------------------------------------
/tts_modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .tts_coqui import CoquiTTSModule
--------------------------------------------------------------------------------
/t2i_modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .t2i_juggernaut import JuggernautT2I
2 | from .t2i_sdxl import SdxlT2I
--------------------------------------------------------------------------------
/t2v_modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .t2v_zeroscope import ZeroscopeT2V
2 | from .t2v_wan import WanT2V
3 | from .t2v_ltx import LtxT2V
--------------------------------------------------------------------------------
/i2v_modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .i2v_ltx import LtxI2V
2 | from .i2v_svd import SvdI2V
3 | from .i2v_slideshow import SlideshowI2V
4 | from .i2v_wan import WanI2V
--------------------------------------------------------------------------------
/check_versions.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import streamlit
3 | import sys
4 | 
5 | print(f"Python version: {sys.version}")
6 | print(f"PyTorch version: {torch.__version__}")
7 | print(f"Streamlit version: {streamlit.__version__}")
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.so
6 | .Python
7 | build/
8 | develop-eggs/
9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | 
23 | # Environment
24 | .env
25 | .venv
26 | env/
27 | venv/
28 | ENV/
29 | 
30 | # IDE
31 | .idea/
32 | .vscode/
33 | *.swp
34 | *.swo
35 | 
36 | # Project specific
37 | prompt_helpers/
38 | instagram_content/
39 | output/
40 | my_reels/
41 | *.mp4
42 | *.wav
43 | *.png
44 | 
45 | project.json
46 | system.json
47 | 
48 | modular_reels_output/
49 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "influencer"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.12"
7 | dependencies = [
8 |     "accelerate>=1.7.0",
9 |     "coqui-tts>=0.24.3",
10 |     "diffusers>=0.33.1",
11 |     "ftfy>=6.3.1",
12 |     "gputil>=1.4.0",
13 |     "hf-transfer>=0.1.9",
14 |     "hf-xet>=1.1.1",
15 |     "huggingface-hub[cli]>=0.31.2",
16 |     "jupyter>=1.1.1",
17 |     "llvmlite>=0.44.0",
18 |     "moviepy>=2.1.2",
19 |     "mutagen>=1.47.0",
20 |     "nicegui>=2.19.0",
21 |     "numpy>=1.26.4",
22 |     "psutil>=7.0.0",
23 |     "pydantic>=2.11.5",
24 |     "pydub>=0.25.1",
25 |     "sentencepiece>=0.2.0",
26 |     "streamlit>=1.45.1",
27 |     "torch>=2.7.1",
28 |     "torchaudio>=2.7.1",
29 |     "torchvision>=0.22.1",
30 |     "transformers>=4.51.3",
31 | ]
32 | 
--------------------------------------------------------------------------------
/todo.todo:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | narration to show or not: font, position and size selection in project
5 | try shot and script 3 times for format, then fall back to the fallback system.
6 | record user generation form in project status: what the user actually selected to create this video
7 | store time taken in each segment: image, video, audio, assembly and final
8 | show better project info in list (status, video duration etc.) and play from there.
9 | save vram and system ram in system.json and use it to filter models
10 | show module name from its config in the dropdown
11 | prompt fine-tuner button to use the llm with a different prompt (or a separate module?)
12 | are we saving the reference sound (and its path) in the project somewhere, and showing it back on the dashboard page?
13 | is the logic separate from the UI so we can change the UI part any time without changing the logic?
14 | 
15 | ✓ on dashboard keep expander of characters closed and show character names in the expander title
16 | ✗ audio tts emotion parameters (not in narration)
17 | ✗ tts language selection
18 | ✓ scene delete/add facility
19 | ✓ add all characters to all scenes as default
--------------------------------------------------------------------------------
/module_discovery.py:
--------------------------------------------------------------------------------
1 | # In module_discovery.py
2 | 
3 | import os
4 | import importlib
5 | import inspect
6 | from typing import Dict, List, Any, Type
7 | # Correctly import from base_modules
8 | from base_modules import BaseLLM, BaseTTS, BaseT2I, BaseI2V, BaseT2V, ModuleCapabilities
9 | 
10 | MODULE_TYPES = {
11 |     "llm": {"base_class": BaseLLM, "path": "llm_modules"},
12 |     "tts": {"base_class": BaseTTS, "path": "tts_modules"},
13 |     "t2i": {"base_class": BaseT2I, "path": "t2i_modules"},
14 |     "i2v": {"base_class": BaseI2V, "path": "i2v_modules"},
15 |     "t2v": {"base_class": BaseT2V, "path": "t2v_modules"},
16 | }
17 | 
18 | def discover_modules() -> Dict[str, List[Dict[str, Any]]]:
19 |     """
20 |     Scans module directories, imports classes, and gets their capabilities.
21 | """ 22 | discovered_modules = {key: [] for key in MODULE_TYPES} 23 | 24 | for module_type, info in MODULE_TYPES.items(): 25 | module_path = info["path"] 26 | base_class = info["base_class"] 27 | 28 | if not os.path.exists(module_path): 29 | continue 30 | 31 | for filename in os.listdir(module_path): 32 | if filename.endswith(".py") and not filename.startswith("__"): 33 | module_name = f"{module_path}.{filename[:-3]}" 34 | try: 35 | module = importlib.import_module(module_name) 36 | for attribute_name in dir(module): 37 | attribute = getattr(module, attribute_name) 38 | if inspect.isclass(attribute) and issubclass(attribute, base_class) and attribute is not base_class: 39 | caps = attribute.get_capabilities() 40 | discovered_modules[module_type].append({ 41 | "name": attribute.__name__, 42 | "path": f"{module_name}.{attribute.__name__}", 43 | "caps": caps, 44 | "class": attribute 45 | }) 46 | except Exception as e: 47 | print(f"Warning: Could not load module {module_name}. Error: {e}") 48 | 49 | return discovered_modules -------------------------------------------------------------------------------- /config_manager.py: -------------------------------------------------------------------------------- 1 | # In config_manager.py 2 | import os 3 | import torch 4 | import gc 5 | from pydantic import BaseModel, Field 6 | from typing import Dict, Tuple, Literal 7 | 8 | DEVICE = "cuda" if torch.cuda.is_available() else "cpu" 9 | if DEVICE == "cuda": os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" 10 | 11 | class ContentConfig(BaseModel): 12 | """Configuration for overall content generation parameters, using Pydantic.""" 13 | # --- User-defined settings from the UI --- 14 | target_video_length_hint: float = 20.0 15 | min_scenes: int = 2 16 | max_scenes: int = 5 17 | aspect_ratio_format: Literal["Portrait", "Landscape"] = "Landscape" 18 | use_svd_flow: bool = True 19 | add_narration_text_to_video: bool = True 20 | seed: int = -1 # <--- NEW: -1 means random seed 21 | 22 | # --- NEW: To be filled from UI selections --- 23 | module_selections: Dict[str, str] = Field(default_factory=dict) 24 | language: str = "en" 25 | 26 | # --- Static project-wide settings --- 27 | fps: int = 24 28 | output_dir: str = "modular_reels_output" 29 | font_for_subtitles: str = "Arial" 30 | 31 | # --- DYNAMIC settings, to be populated by the TaskExecutor --- 32 | model_max_video_shot_duration: float = 2.0 # A safe default 33 | generation_resolution: Tuple[int, int] = (1024, 1024) # A safe default 34 | 35 | @property 36 | def max_scene_narration_duration_hint(self) -> float: 37 | if self.max_scenes > 0 and self.min_scenes > 0: 38 | avg_scenes = (self.min_scenes + self.max_scenes) / 2 39 | return round(self.target_video_length_hint / avg_scenes, 1) 40 | return 6.0 41 | 42 | @property 43 | def final_output_resolution(self) -> Tuple[int, int]: 44 | if self.aspect_ratio_format == "Landscape": 45 | return (1920, 1080) 46 | return (1080, 1920) 47 | 48 | def __init__(self, **data): 49 | super().__init__(**data) 50 | os.makedirs(self.output_dir, exist_ok=True) 51 | 52 | 53 | def clear_vram_globally(*items_to_del): 54 | print(f"Attempting to clear VRAM. 
Received {len(items_to_del)} items to delete.") 55 | for item in items_to_del: 56 | if hasattr(item, 'to') and hasattr(item, 'dtype') and item.dtype != torch.float16: 57 | try: 58 | item.to('cpu') 59 | except Exception as e: 60 | print(f"Could not move item of type {type(item)} to CPU: {e}") 61 | del items_to_del 62 | gc.collect() 63 | if torch.cuda.is_available(): 64 | torch.cuda.empty_cache() 65 | torch.cuda.ipc_collect() 66 | print("VRAM clearing attempt finished.") -------------------------------------------------------------------------------- /package_code.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default output file name 4 | OUTPUT_FILE="combined_code.txt" 5 | 6 | # --- Configuration: Define what to include --- 7 | # Add your source directories and specific files here. 8 | # Paths should be relative to where you run the script from. 9 | # Directories will be scanned recursively. 10 | # Use spaces to separate items. 11 | FILES_TO_INCLUDE=( 12 | "README.md" 13 | "base_modules.py" 14 | "utils.py" 15 | "module_discovery.py" 16 | "app.py" 17 | "project_manager.py" 18 | "task_executor.py" 19 | "ui_task_executor.py" 20 | "config_manager.py" 21 | "video_assembly.py" 22 | "llm_modules/" 23 | "tts_modules/" 24 | "t2i_modules/" 25 | "i2v_modules/" 26 | "t2v_modules/" 27 | ) 28 | 29 | # --- End of Configuration --- 30 | 31 | # Check if an output file name was provided as an argument 32 | if [ "$1" ]; then 33 | OUTPUT_FILE="$1" 34 | echo "Using custom output file name: $OUTPUT_FILE" 35 | fi 36 | 37 | # Clear the output file to start fresh 38 | > "$OUTPUT_FILE" 39 | echo "Cleared old content from $OUTPUT_FILE." 40 | 41 | # A function to process and append a file to the output 42 | process_file() { 43 | local file_path=$1 44 | echo "Processing: $file_path" 45 | 46 | # Write the header with the relative file path 47 | echo "==== $file_path ====" >> "$OUTPUT_FILE" 48 | 49 | # Append the content of the file 50 | cat "$file_path" >> "$OUTPUT_FILE" 51 | 52 | # Add multiple newlines at the end for better separation 53 | echo -e "\n\n\n" >> "$OUTPUT_FILE" 54 | } 55 | 56 | # Loop through the configured list of files and directories 57 | for item in "${FILES_TO_INCLUDE[@]}"; do 58 | if [ -f "$item" ]; then 59 | # If it's a single file, process it directly 60 | process_file "$item" 61 | elif [ -d "$item" ]; then 62 | # If it's a directory, find all relevant files inside it 63 | # - The `find` command is powerful. 64 | # - It searches for items of type 'f' (file). 65 | # - It ignores paths containing '__pycache__', '.git', '.vscode', etc. 66 | # - It only includes files ending in '.py' or other specified extensions. 67 | find "$item" -type f \( -name "*.py" -o -name "*.sh" \) \ 68 | -not -path "*/__pycache__/*" \ 69 | -not -path "*/.git/*" \ 70 | -not -path "*/.venv/*" \ 71 | -not -path "*/.vscode/*" \ 72 | | sort | while read -r file; do 73 | process_file "$file" 74 | done 75 | else 76 | echo "Warning: Item '$item' not found. Skipping." 77 | fi 78 | done 79 | 80 | echo "=========================================" 81 | echo "✅ All done!" 
82 | echo "Combined code saved to: $OUTPUT_FILE" 83 | echo "=========================================" -------------------------------------------------------------------------------- /tts_modules/tts_coqui.py: -------------------------------------------------------------------------------- 1 | # tts_modules/tts_coqui.py 2 | import os 3 | import torch 4 | import numpy as np 5 | from typing import Tuple, Optional 6 | from TTS.api import TTS as CoquiTTS 7 | from moviepy import AudioFileClip 8 | from scipy.io import wavfile 9 | 10 | from base_modules import BaseTTS, BaseModuleConfig, ModuleCapabilities 11 | from config_manager import DEVICE, clear_vram_globally 12 | 13 | class CoquiTTSConfig(BaseModuleConfig): 14 | model_id: str = "tts_models/multilingual/multi-dataset/xtts_v2" 15 | 16 | class CoquiTTSModule(BaseTTS): 17 | Config = CoquiTTSConfig 18 | 19 | @classmethod 20 | def get_capabilities(cls) -> ModuleCapabilities: 21 | return ModuleCapabilities( 22 | title="XTTS, Multi-Language, Documentary Style", 23 | vram_gb_min=2.0, # XTTS is relatively lightweight 24 | ram_gb_min=8.0, 25 | supported_tts_languages=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko", "hi"] 26 | ) 27 | 28 | def _load_model(self): 29 | if self.model is None: 30 | print(f"Loading TTS model: {self.config.model_id}...") 31 | self.model = CoquiTTS(model_name=self.config.model_id, progress_bar=True).to(DEVICE) 32 | print("TTS model loaded.") 33 | 34 | def clear_vram(self): 35 | print("Clearing TTS VRAM...") 36 | if self.model is not None: 37 | clear_vram_globally(self.model) 38 | self.model = None 39 | print("TTS VRAM cleared.") 40 | 41 | def generate_audio( 42 | self, text: str, output_dir: str, scene_idx: int, language: str, speaker_wav: Optional[str] = None 43 | ) -> Tuple[str, float]: 44 | self._load_model() 45 | 46 | print(f"Generating audio in {language} for scene {scene_idx}: \"{text[:50]}...\"") 47 | output_path = os.path.join(output_dir, f"scene_{scene_idx}_audio.wav") 48 | 49 | tts_kwargs = {"language": language, "file_path": output_path} 50 | 51 | if "xtts" in self.config.model_id.lower(): 52 | if speaker_wav and os.path.exists(speaker_wav): 53 | tts_kwargs["speaker_wav"] = speaker_wav 54 | else: 55 | if speaker_wav: print(f"Warning: Speaker WAV {speaker_wav} not found. XTTS using default voice.") 56 | 57 | self.model.tts_to_file(text, **tts_kwargs) 58 | 59 | duration = 0.0 60 | try: 61 | if os.path.exists(output_path) and os.path.getsize(output_path) > 0: 62 | with AudioFileClip(output_path) as audio_clip: 63 | duration = audio_clip.duration + 0.1 # Small buffer 64 | else: raise ValueError("Audio file not generated or is empty.") 65 | except Exception as e: 66 | print(f"Error getting duration for {output_path}: {e}. Creating fallback.") 67 | samplerate = 22050 68 | wavfile.write(output_path, samplerate, np.zeros(int(0.1 * samplerate), dtype=np.int16)) 69 | duration = 0.1 70 | 71 | print(f"Actual audio duration for scene {scene_idx}: {duration:.2f}s") 72 | return output_path, duration -------------------------------------------------------------------------------- /mp3_to_wav_converter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | MP3 to WAV Converter 4 | Converts all MP3 files in the Downloads folder to WAV format. 
5 | """ 6 | 7 | import os 8 | import sys 9 | from pathlib import Path 10 | from pydub import AudioSegment 11 | 12 | def convert_mp3_to_wav(downloads_folder="~/Downloads", output_folder=None): 13 | """ 14 | Convert all MP3 files in the Downloads folder to WAV format. 15 | 16 | Args: 17 | downloads_folder (str): Path to the Downloads folder 18 | output_folder (str): Path to output folder (defaults to same as input) 19 | """ 20 | # Expand the tilde to full path 21 | downloads_path = Path(downloads_folder).expanduser() 22 | 23 | if output_folder is None: 24 | output_path = downloads_path 25 | else: 26 | output_path = Path(output_folder).expanduser() 27 | 28 | # Create output directory if it doesn't exist 29 | output_path.mkdir(parents=True, exist_ok=True) 30 | 31 | # Find all MP3 files 32 | mp3_files = list(downloads_path.glob("*.mp3")) 33 | 34 | if not mp3_files: 35 | print("No MP3 files found in the Downloads folder.") 36 | return 37 | 38 | print(f"Found {len(mp3_files)} MP3 file(s) to convert:") 39 | for mp3_file in mp3_files: 40 | print(f" - {mp3_file.name}") 41 | 42 | print("\nStarting conversion...") 43 | 44 | converted_count = 0 45 | failed_count = 0 46 | 47 | for mp3_file in mp3_files: 48 | try: 49 | print(f"Converting: {mp3_file.name}") 50 | 51 | # Load the MP3 file 52 | audio = AudioSegment.from_mp3(str(mp3_file)) 53 | 54 | # Create output filename (replace .mp3 with .wav) 55 | wav_filename = mp3_file.stem + ".wav" 56 | wav_path = output_path / wav_filename 57 | 58 | # Export as WAV 59 | audio.export(str(wav_path), format="wav") 60 | 61 | print(f" ✓ Successfully converted to: {wav_filename}") 62 | converted_count += 1 63 | 64 | except Exception as e: 65 | print(f" ✗ Failed to convert {mp3_file.name}: {str(e)}") 66 | failed_count += 1 67 | 68 | print(f"\nConversion complete!") 69 | print(f"Successfully converted: {converted_count} files") 70 | if failed_count > 0: 71 | print(f"Failed conversions: {failed_count} files") 72 | 73 | def main(): 74 | """Main function to handle command line arguments.""" 75 | import argparse 76 | 77 | parser = argparse.ArgumentParser(description="Convert MP3 files to WAV format") 78 | parser.add_argument("--input", "-i", default="~/Downloads", 79 | help="Input folder containing MP3 files (default: ~/Downloads)") 80 | parser.add_argument("--output", "-o", 81 | help="Output folder for WAV files (default: same as input folder)") 82 | 83 | args = parser.parse_args() 84 | 85 | try: 86 | convert_mp3_to_wav(args.input, args.output) 87 | except KeyboardInterrupt: 88 | print("\nConversion interrupted by user.") 89 | sys.exit(1) 90 | except Exception as e: 91 | print(f"Error: {str(e)}") 92 | sys.exit(1) 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /system.py: -------------------------------------------------------------------------------- 1 | # In system.py 2 | import json 3 | import os 4 | from pydantic import BaseModel, Field 5 | from typing import Optional, Tuple 6 | 7 | # --- START OF MODIFICATION --- 8 | # Import necessary libraries for detection 9 | try: 10 | import psutil 11 | except ImportError: 12 | psutil = None 13 | 14 | try: 15 | import GPUtil 16 | except ImportError: 17 | GPUtil = None 18 | # --- END OF MODIFICATION --- 19 | 20 | 21 | SYSTEM_CONFIG_FILE = "system.json" 22 | 23 | class SystemConfig(BaseModel): 24 | """Stores the user's available system resources.""" 25 | vram_gb: float = Field(description="Available GPU VRAM in GB.") 26 | ram_gb: float = 
Field(description="Available system RAM in GB.") 27 | 28 | def save_system_config(vram_gb: float, ram_gb: float) -> None: 29 | """Saves the system resource configuration to system.json.""" 30 | config = SystemConfig(vram_gb=vram_gb, ram_gb=ram_gb) 31 | with open(SYSTEM_CONFIG_FILE, 'w') as f: 32 | f.write(config.model_dump_json(indent=4)) 33 | print(f"System configuration saved to {SYSTEM_CONFIG_FILE}") 34 | 35 | def load_system_config() -> Optional[SystemConfig]: 36 | """Loads the system resource configuration from system.json if it exists.""" 37 | if not os.path.exists(SYSTEM_CONFIG_FILE): 38 | return None 39 | try: 40 | with open(SYSTEM_CONFIG_FILE, 'r') as f: 41 | data = json.load(f) 42 | return SystemConfig(**data) 43 | except (json.JSONDecodeError, TypeError) as e: 44 | print(f"Error loading or parsing {SYSTEM_CONFIG_FILE}: {e}. Please re-enter details.") 45 | return None 46 | 47 | # --- START OF MODIFICATION --- 48 | def detect_system_specs() -> Tuple[float, float]: 49 | """ 50 | Attempts to detect available system RAM and GPU VRAM. 51 | Returns (vram_in_gb, ram_in_gb). 52 | Defaults to 8.0 for VRAM and 16.0 for RAM if detection fails. 53 | """ 54 | # Default values 55 | detected_ram_gb = 16.0 56 | detected_vram_gb = 8.0 57 | 58 | # 1. Detect System RAM 59 | if psutil: 60 | try: 61 | ram_bytes = psutil.virtual_memory().total 62 | # Round to the nearest whole number for a cleaner UI 63 | detected_ram_gb = round(ram_bytes / (1024**3)) 64 | print(f"Detected System RAM: {detected_ram_gb} GB") 65 | except Exception as e: 66 | print(f"Could not detect system RAM using psutil: {e}. Falling back to default.") 67 | else: 68 | print("psutil not installed. Cannot detect RAM. Falling back to default.") 69 | 70 | # 2. Detect GPU VRAM 71 | if GPUtil: 72 | try: 73 | gpus = GPUtil.getGPUs() 74 | if gpus: 75 | # Use the VRAM of the first detected GPU 76 | gpu = gpus[0] 77 | # VRAM is in MB, convert to GB and round to one decimal place 78 | detected_vram_gb = round(gpu.memoryTotal / 1024, 1) 79 | print(f"Detected GPU: {gpu.name} with {detected_vram_gb} GB VRAM") 80 | else: 81 | print("GPUtil found no GPUs. Falling back to default VRAM.") 82 | except Exception as e: 83 | print(f"Could not detect GPU VRAM using GPUtil: {e}. Falling back to default.") 84 | else: 85 | print("GPUtil not installed. Cannot detect VRAM. Falling back to default.") 86 | 87 | return detected_vram_gb, detected_ram_gb 88 | # --- END OF MODIFICATION --- -------------------------------------------------------------------------------- /i2v_modules/i2v_slideshow.py: -------------------------------------------------------------------------------- 1 | # In i2v_modules/i2v_slideshow.py 2 | from typing import Dict, Any, List, Optional, Union 3 | # --- THIS IS THE FIX: Importing ImageClip directly, matching the project's pattern --- 4 | from moviepy.video.VideoClip import ImageClip 5 | 6 | from base_modules import BaseI2V, BaseModuleConfig, ModuleCapabilities 7 | from config_manager import ContentConfig 8 | 9 | class SlideshowI2VConfig(BaseModuleConfig): 10 | # This module doesn't load a model, but the config is part of the contract. 11 | model_id: str = "moviepy_image_clip" 12 | 13 | class SlideshowI2V(BaseI2V): 14 | Config = SlideshowI2VConfig 15 | 16 | @classmethod 17 | def get_capabilities(cls) -> ModuleCapabilities: 18 | """ 19 | Defines the capabilities of this simple, non-AI module. 20 | It uses minimal resources and doesn't support AI-specific features. 
21 | """ 22 | return ModuleCapabilities( 23 | title="Slideshow (Static Image)", 24 | vram_gb_min=0.1, # Uses virtually no VRAM 25 | ram_gb_min=1.0, # Uses very little RAM 26 | supported_formats=["Portrait", "Landscape"], 27 | supports_ip_adapter=False, # Not an AI model 28 | supports_lora=False, # Not an AI model 29 | max_subjects=0, 30 | accepts_text_prompt=False, # Ignores prompts 31 | accepts_negative_prompt=False 32 | ) 33 | 34 | def get_model_capabilities(self) -> Dict[str, Any]: 35 | """ 36 | This module has no native resolution and can handle long durations. 37 | """ 38 | return { 39 | # It can handle any resolution, as it just wraps the image. 40 | "resolutions": {"Portrait": (1080, 1920), "Landscape": (1920, 1080)}, 41 | "max_shot_duration": 60.0 # Can be very long 42 | } 43 | 44 | def _load_pipeline(self): 45 | """No pipeline to load for this module.""" 46 | print("SlideshowI2V: No pipeline to load.") 47 | pass 48 | 49 | def clear_vram(self): 50 | """No VRAM to clear for this module.""" 51 | print("SlideshowI2V: No VRAM to clear.") 52 | pass 53 | 54 | def enhance_prompt(self, prompt: str, prompt_type: str = "visual") -> str: 55 | """This module ignores prompts, so no enhancement is needed.""" 56 | return prompt 57 | 58 | def generate_video_from_image(self, image_path: str, output_video_path: str, target_duration: float, content_config: ContentConfig, visual_prompt: str, motion_prompt: Optional[str], ip_adapter_image: Optional[Union[str, List[str]]] = None) -> str: 59 | """ 60 | Creates a video by holding a static image for the target duration. 61 | """ 62 | print(f"SlideshowI2V: Creating static video for {target_duration:.2f}s from {image_path}") 63 | 64 | video_clip = None 65 | try: 66 | # Create a video clip from the static image and set its duration. 67 | video_clip = ImageClip(image_path).with_duration(target_duration) 68 | 69 | # Use the correct syntax for write_videofile, matching video_assembly.py 70 | video_clip.write_videofile( 71 | output_video_path, 72 | fps=content_config.fps, 73 | codec="libx264", 74 | audio=False, # This is a visual-only shot 75 | threads=4, 76 | preset="medium", 77 | logger=None # Suppress verbose moviepy logs 78 | ) 79 | 80 | print(f"Slideshow video shot saved to {output_video_path}") 81 | return output_video_path 82 | 83 | except Exception as e: 84 | print(f"Error creating slideshow video: {e}") 85 | return "" # Return empty string on failure 86 | finally: 87 | # Ensure the clip resources are released 88 | if video_clip: 89 | video_clip.close() -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # In utils.py 2 | import datetime 3 | import json 4 | import os 5 | from PIL import Image, ImageOps 6 | from moviepy import VideoFileClip 7 | import streamlit as st # Keep st for st.error 8 | 9 | def load_and_correct_image_orientation(image_source): 10 | """ 11 | Loads an image from a source (file path or uploaded file object) 12 | and corrects its orientation based on EXIF data. 13 | """ 14 | try: 15 | image = Image.open(image_source) 16 | # The magic is in exif_transpose 17 | corrected_image = ImageOps.exif_transpose(image) 18 | return corrected_image 19 | except Exception as e: 20 | # Using st.error here is okay for a simple app, but for true separation, 21 | # you might log the error and return None, letting the caller handle the UI. 22 | # For this project, this is fine. 
23 | st.error(f"Could not load or correct image: {e}") 24 | return None 25 | 26 | def list_projects(): 27 | """Lists all projects from the output directory with extended details including modules.""" 28 | projects = [] 29 | base_dir = "modular_reels_output" 30 | if not os.path.exists(base_dir): return [] 31 | for project_dir in os.listdir(base_dir): 32 | project_path = os.path.join(base_dir, project_dir) 33 | if os.path.isdir(project_path): 34 | project_file = os.path.join(project_path, "project.json") 35 | if os.path.exists(project_file): 36 | try: 37 | with open(project_file, 'r') as f: 38 | data = json.load(f) 39 | 40 | project_info = data.get('project_info', {}) 41 | # --- START OF MODIFICATION --- 42 | # Use title, but fall back to topic for old projects, then to dir name. 43 | title = project_info.get('title', project_info.get('topic', project_dir)) 44 | topic = project_info.get('topic', 'N/A') # Keep topic for potential detailed views 45 | # --- END OF MODIFICATION --- 46 | 47 | config = project_info.get('config', {}) 48 | final_video_info = data.get('final_video', {}) 49 | status = project_info.get('status', 'unknown') 50 | 51 | flow = "Image-to-Video" if config.get('use_svd_flow', True) else "Text-to-Video" 52 | 53 | final_video_path = None 54 | duration = 0.0 55 | if status == 'completed': 56 | stored_path = final_video_info.get('path') 57 | if stored_path and os.path.exists(stored_path): 58 | final_video_path = stored_path 59 | try: 60 | with VideoFileClip(final_video_path) as clip: 61 | duration = clip.duration 62 | except Exception as e: 63 | print(f"Could not read video duration for {final_video_path}: {e}") 64 | duration = 0.0 65 | 66 | modules = config.get('module_selections', {}) 67 | 68 | # --- START OF MODIFICATION --- 69 | projects.append({ 70 | 'name': project_dir, 71 | 'title': title, # Use the new title field 72 | 'topic': topic, # Keep topic field for completeness 73 | 'created_at': datetime.datetime.fromtimestamp(project_info.get('created_at', 0)), 74 | 'status': status, 75 | 'flow': flow, 76 | 'final_video_path': final_video_path, 77 | 'duration': duration, 78 | 'modules': modules, 79 | }) 80 | # --- END OF MODIFICATION --- 81 | except Exception as e: 82 | print(f"Error loading project {project_dir}: {e}") 83 | return sorted(projects, key=lambda p: p['created_at'], reverse=True) -------------------------------------------------------------------------------- /t2i_modules/t2i_sdxl.py: -------------------------------------------------------------------------------- 1 | # t2i_modules/t2i_sdxl.py 2 | import torch 3 | from typing import List, Optional, Dict, Any, Union 4 | from diffusers import StableDiffusionXLPipeline, DiffusionPipeline 5 | from diffusers.utils import load_image 6 | 7 | from base_modules import BaseT2I, BaseModuleConfig, ModuleCapabilities 8 | from config_manager import DEVICE, clear_vram_globally 9 | 10 | class SdxlT2IConfig(BaseModuleConfig): 11 | model_id: str = "stabilityai/stable-diffusion-xl-base-1.0" 12 | refiner_id: Optional[str] = "stabilityai/stable-diffusion-xl-refiner-1.0" 13 | num_inference_steps: int = 30 14 | guidance_scale: float = 7.5 15 | base_denoising_end: float = 0.8 16 | refiner_denoising_start: float = 0.8 17 | 18 | class SdxlT2I(BaseT2I): 19 | Config = SdxlT2IConfig 20 | 21 | def __init__(self, config: SdxlT2IConfig): 22 | super().__init__(config) 23 | self.refiner_pipe = None 24 | 25 | @classmethod 26 | def get_capabilities(cls) -> ModuleCapabilities: 27 | return ModuleCapabilities( 28 | title="SDXL + Refiner (High VRAM): No 
Subjects considered", 29 | vram_gb_min=10.0, # SDXL with refiner is heavy 30 | ram_gb_min=16.0, 31 | supported_formats=["Portrait", "Landscape"], 32 | # Even if we don't implement IP-Adapter here, we declare support 33 | # because the pipeline is capable. A more advanced version could add it. 34 | supports_ip_adapter=True, 35 | supports_lora=True, 36 | max_subjects=2, 37 | accepts_text_prompt=True, 38 | accepts_negative_prompt=True 39 | ) 40 | 41 | def get_model_capabilities(self) -> Dict[str, Any]: 42 | return { 43 | "resolutions": {"Portrait": (896, 1152), "Landscape": (1344, 768)}, 44 | "max_shot_duration": 3.0 45 | } 46 | 47 | def _load_pipeline(self): 48 | if self.pipe is None: 49 | print(f"Loading T2I pipeline (SDXL): {self.config.model_id}...") 50 | self.pipe = StableDiffusionXLPipeline.from_pretrained( 51 | self.config.model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True 52 | ).to(DEVICE) 53 | print("SDXL Base pipeline loaded.") 54 | if self.config.refiner_id: 55 | print(f"Loading T2I Refiner pipeline: {self.config.refiner_id}...") 56 | self.refiner_pipe = DiffusionPipeline.from_pretrained( 57 | self.config.refiner_id, text_encoder_2=self.pipe.text_encoder_2, 58 | vae=self.pipe.vae, torch_dtype=torch.float16, 59 | use_safetensors=True, variant="fp16" 60 | ).to(DEVICE) 61 | print("SDXL Refiner pipeline loaded.") 62 | 63 | def clear_vram(self): 64 | print("Clearing T2I (SDXL) VRAM...") 65 | models = [m for m in [self.pipe, self.refiner_pipe] if m is not None] 66 | if models: clear_vram_globally(*models) 67 | self.pipe, self.refiner_pipe = None, None 68 | print("T2I (SDXL) VRAM cleared.") 69 | 70 | # --- START OF FIX: Updated method signature and implementation --- 71 | def generate_image(self, prompt: str, negative_prompt: str, output_path: str, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None, seed: int = -1) -> str: 72 | self._load_pipeline() 73 | 74 | if ip_adapter_image: 75 | print("Warning: SDXLT2I module received IP-Adapter image but does not currently implement its use.") 76 | 77 | generator = None 78 | if seed != -1: 79 | print(f"Using fixed seed for generation: {seed}") 80 | # Ensure the generator is on the same device as the pipeline 81 | generator = torch.Generator(device=self.pipe.device).manual_seed(seed) 82 | else: 83 | print("Using random seed for generation.") 84 | 85 | kwargs = { 86 | "prompt": prompt, 87 | "negative_prompt": negative_prompt, # Now passing this argument 88 | "width": width, "height": height, 89 | "num_inference_steps": self.config.num_inference_steps, 90 | "guidance_scale": self.config.guidance_scale, 91 | "generator": generator # Now passing the generator 92 | } 93 | if self.refiner_pipe: 94 | kwargs["output_type"] = "latent" 95 | kwargs["denoising_end"] = self.config.base_denoising_end 96 | 97 | image = self.pipe(**kwargs).images[0] 98 | 99 | if self.refiner_pipe: 100 | print("Refining image...") 101 | refiner_kwargs = { 102 | "prompt": prompt, 103 | "negative_prompt": negative_prompt, 104 | "image": image, 105 | "denoising_start": self.config.refiner_denoising_start, 106 | "num_inference_steps": self.config.num_inference_steps, 107 | "generator": generator 108 | } 109 | image = self.refiner_pipe(**refiner_kwargs).images[0] 110 | 111 | image.save(output_path) 112 | print(f"Image saved to {output_path}") 113 | return output_path 114 | # --- END OF FIX --- -------------------------------------------------------------------------------- /i2v_modules/i2v_ltx.py: 
-------------------------------------------------------------------------------- 1 | # i2v_modules/i2v_ltx.py 2 | import torch 3 | from typing import Dict, Any, List, Optional, Union 4 | from diffusers import LTXImageToVideoPipeline 5 | from diffusers.utils import export_to_video, load_image 6 | from PIL import Image 7 | 8 | from base_modules import BaseI2V, BaseModuleConfig, ModuleCapabilities 9 | from config_manager import DEVICE, clear_vram_globally, ContentConfig 10 | 11 | class LtxI2VConfig(BaseModuleConfig): 12 | model_id: str = "Lightricks/LTX-Video" 13 | num_inference_steps: int = 50 14 | guidance_scale: float = 7.5 15 | 16 | class LtxI2V(BaseI2V): 17 | Config = LtxI2VConfig 18 | 19 | @classmethod 20 | def get_capabilities(cls) -> ModuleCapabilities: 21 | return ModuleCapabilities( 22 | title="LTX, 8bit Load, Port/LandScape, 2 Sub, Take +/- Prompts, max 4 sec", 23 | vram_gb_min=8.0, 24 | ram_gb_min=12.0, 25 | supported_formats=["Portrait", "Landscape"], 26 | supports_ip_adapter=True, 27 | supports_lora=True, # Juggernaut is a fine-tune, can easily use LoRAs 28 | max_subjects=2, # Can handle one or two IP adapter images 29 | accepts_text_prompt=True, 30 | accepts_negative_prompt=True 31 | ) 32 | 33 | 34 | def get_model_capabilities(self) -> Dict[str, Any]: 35 | return { 36 | "resolutions": {"Portrait": (480, 704), "Landscape": (704, 480)}, 37 | "max_shot_duration": 4 38 | } 39 | 40 | def enhance_prompt(self, prompt: str, prompt_type: str = "visual") -> str: 41 | # SVD doesn't use text prompts, but this shows how you could add model-specific keywords. 42 | # For example, for a different model you might do: 43 | if prompt_type == "visual": 44 | return f"{prompt}, 8k, photorealistic, cinematic lighting" 45 | return prompt # Return original for SVD 46 | 47 | def _load_pipeline(self): 48 | if self.pipe is None: 49 | print(f"Loading I2V pipeline (LTX): {self.config.model_id}...") 50 | self.pipe = LTXImageToVideoPipeline.from_pretrained(self.config.model_id, torch_dtype=torch.bfloat16) 51 | self.pipe.enable_model_cpu_offload() 52 | print("I2V (LTX) pipeline loaded.") 53 | 54 | def clear_vram(self): 55 | print("Clearing I2V (LTX) VRAM...") 56 | if self.pipe is not None: clear_vram_globally(self.pipe) 57 | self.pipe = None 58 | print("I2V (LTX) VRAM cleared.") 59 | 60 | def _resize_and_pad(self, image: Image.Image, target_width: int, target_height: int) -> Image.Image: 61 | original_aspect = image.width / image.height; target_aspect = target_width / target_height 62 | if original_aspect > target_aspect: new_width, new_height = target_width, int(target_width / original_aspect) 63 | else: new_height, new_width = target_height, int(target_height * original_aspect) 64 | resized_image = image.resize((new_width, new_height), Image.LANCZOS) 65 | background = Image.new('RGB', (target_width, target_height), (0, 0, 0)) 66 | background.paste(resized_image, ((target_width - new_width) // 2, (target_height - new_height) // 2)) 67 | return background 68 | 69 | def generate_video_from_image(self, image_path: str, output_video_path: str, target_duration: float, content_config: ContentConfig, visual_prompt: str, motion_prompt: Optional[str], ip_adapter_image: Optional[Union[str, List[str]]] = None) -> str: 70 | self._load_pipeline() 71 | 72 | input_image = load_image(image_path) 73 | target_res = self.get_model_capabilities()["resolutions"] 74 | aspect_ratio = "Landscape" if input_image.width > input_image.height else "Portrait" 75 | target_width, target_height = target_res[aspect_ratio] 76 | prepared_image 
= self._resize_and_pad(input_image, target_width, target_height) 77 | 78 | num_frames = max(16, int(target_duration * content_config.fps)) 79 | full_prompt = f"{visual_prompt}, {motion_prompt}" if motion_prompt else visual_prompt 80 | 81 | # --- NEW LOGIC TO HANDLE ip_adapter_image --- 82 | # While LTX doesn't have a formal IP-Adapter, we can use the character 83 | # reference to guide the style by adding it to the prompt. 84 | if ip_adapter_image: 85 | print("LTX I2V: Using character reference to guide prompt style.") 86 | # For simplicity, we add a generic phrase. A more complex system could use an image-to-text model. 87 | full_prompt = f"in the style of the reference character, {full_prompt}" 88 | 89 | print(f"LTX I2V using prompt: {full_prompt}") 90 | 91 | video = self.pipe( 92 | prompt=full_prompt, image=prepared_image, width=target_width, height=target_height, 93 | num_frames=num_frames, num_inference_steps=self.config.num_inference_steps, 94 | guidance_scale=self.config.guidance_scale, 95 | negative_prompt="worst quality, inconsistent motion, blurry" 96 | ).frames[0] 97 | 98 | export_to_video(video, output_video_path, fps=content_config.fps) 99 | print(f"LTX video shot saved to {output_video_path}") 100 | return output_video_path -------------------------------------------------------------------------------- /t2v_modules/t2v_zeroscope.py: -------------------------------------------------------------------------------- 1 | # In t2v_modules/t2v_zeroscope.py 2 | import torch 3 | from typing import Dict, Any, List, Optional, Union 4 | from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler 5 | from diffusers.utils import export_to_video 6 | 7 | from base_modules import BaseT2V, BaseModuleConfig, ModuleCapabilities 8 | from config_manager import DEVICE, clear_vram_globally 9 | 10 | class ZeroscopeT2VConfig(BaseModuleConfig): 11 | model_id: str = "cerspense/zeroscope_v2_576w" 12 | upscaler_model_id: str = "cerspense/zeroscope_v2_xl" 13 | 14 | num_inference_steps: int = 30 15 | guidance_scale: float = 9.0 16 | # --- START OF FIX: Add strength for the upscaling process --- 17 | upscaler_strength: float = 0.7 18 | # --- END OF FIX --- 19 | 20 | class ZeroscopeT2V(BaseT2V): 21 | Config = ZeroscopeT2VConfig 22 | 23 | @classmethod 24 | def get_capabilities(cls) -> ModuleCapabilities: 25 | return ModuleCapabilities( 26 | title="Zeroscope, Port/Landscape, No Subject, 2 sec", 27 | vram_gb_min=8.0, 28 | ram_gb_min=12.0, 29 | supported_formats=["Portrait", "Landscape"], 30 | supports_ip_adapter=False, # Zeroscope does not support IP-Adapter 31 | supports_lora=False, # Zeroscope does not support LoRA loading 32 | max_subjects=0, 33 | accepts_text_prompt=True, 34 | accepts_negative_prompt=True 35 | ) 36 | 37 | 38 | def __init__(self, config: ZeroscopeT2VConfig): 39 | super().__init__(config) 40 | self.upscaler_pipe = None 41 | 42 | def get_model_capabilities(self) -> Dict[str, Any]: 43 | # Zeroscope has a fixed native resolution that is then upscaled 44 | base_resolution = (576, 320) 45 | return { 46 | "resolutions": {"Portrait": base_resolution, "Landscape": base_resolution}, 47 | "max_shot_duration": 2.0 48 | } 49 | 50 | def _load_pipeline(self): 51 | if self.pipe is None: 52 | print(f"Loading T2V pipeline ({self.config.model_id})...") 53 | self.pipe = DiffusionPipeline.from_pretrained(self.config.model_id, torch_dtype=torch.float16) 54 | self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config) 55 | self.pipe.enable_model_cpu_offload() 56 | print(f"T2V 
({self.config.model_id}) pipeline loaded.") 57 | 58 | if self.upscaler_pipe is None: 59 | print(f"Loading T2V Upscaler pipeline ({self.config.upscaler_model_id})...") 60 | self.upscaler_pipe = DiffusionPipeline.from_pretrained(self.config.upscaler_model_id, torch_dtype=torch.float16) 61 | self.upscaler_pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.upscaler_pipe.scheduler.config) 62 | self.upscaler_pipe.enable_model_cpu_offload() 63 | print(f"T2V Upscaler ({self.config.upscaler_model_id}) pipeline loaded.") 64 | 65 | def clear_vram(self): 66 | print(f"Clearing T2V VRAM...") 67 | models_to_clear = [m for m in [self.pipe, self.upscaler_pipe] if m is not None] 68 | if models_to_clear: clear_vram_globally(*models_to_clear) 69 | self.pipe, self.upscaler_pipe = None, None 70 | print("T2V VRAM cleared.") 71 | 72 | def generate_video_from_text( 73 | self, prompt: str, output_video_path: str, num_frames: int, fps: int, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None 74 | ) -> str: 75 | self._load_pipeline() 76 | 77 | if ip_adapter_image: 78 | print("Warning: ZeroscopeT2V module received IP-Adapter image but does not currently implement its use.") 79 | 80 | negative_prompt = "blurry, low quality, watermark, bad anatomy, text, letters, distorted" 81 | 82 | # Note: Zeroscope generates at a fixed resolution, so we use its capabilities directly 83 | model_res = self.get_model_capabilities()["resolutions"]["Landscape"] 84 | 85 | print(f"Stage 1: Generating T2V ({model_res[0]}x{model_res[1]}) for prompt: \"{prompt[:70]}...\"") 86 | 87 | video_frames_tensor = self.pipe( 88 | prompt=prompt, negative_prompt=negative_prompt, 89 | num_inference_steps=self.config.num_inference_steps, 90 | height=model_res[1], width=model_res[0], num_frames=num_frames, 91 | guidance_scale=self.config.guidance_scale, output_type="pt" 92 | ).frames 93 | 94 | print("Stage 2: Upscaling video to HD...") 95 | 96 | # --- START OF FIX --- 97 | upscaled_video_frames = self.upscaler_pipe( 98 | prompt=prompt, 99 | negative_prompt=negative_prompt, 100 | video=video_frames_tensor, # The argument is 'video', not 'image'. 101 | strength=self.config.upscaler_strength, # Add the strength parameter 102 | num_inference_steps=self.config.num_inference_steps, 103 | guidance_scale=self.config.guidance_scale, 104 | ).frames[0] 105 | # --- END OF FIX --- 106 | 107 | export_to_video(upscaled_video_frames, output_video_path, fps=fps) 108 | 109 | print(f"High-quality T2V video shot saved to {output_video_path}") 110 | return output_video_path -------------------------------------------------------------------------------- /t2v_modules/t2v_wan.py: -------------------------------------------------------------------------------- 1 | # In t2v_modules/t2v_wan.py 2 | import torch 3 | from typing import Dict, Any, List, Optional, Union 4 | 5 | # --- Important: Import the specific classes for this model --- 6 | from diffusers import WanPipeline, AutoencoderKLWan 7 | from diffusers.utils import export_to_video 8 | 9 | from base_modules import BaseT2V, BaseModuleConfig, ModuleCapabilities 10 | from config_manager import DEVICE, clear_vram_globally 11 | 12 | class WanT2VConfig(BaseModuleConfig): 13 | """Configuration for the Wan 2.1 T2V model.""" 14 | model_id: str = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers" 15 | # Parameters from the model card example 16 | num_inference_steps: int = 30 17 | guidance_scale: float = 5.0 18 | 19 | class WanT2V(BaseT2V): 20 | """ 21 | Text-to-Video module using Wan 2.1 T2V 1.3B model. 
22 | This model is efficient and produces high-quality video but does not support 23 | character consistency (IP-Adapter). 24 | """ 25 | Config = WanT2VConfig 26 | 27 | @classmethod 28 | def get_capabilities(cls) -> ModuleCapabilities: 29 | """Declare the capabilities of the Wan 2.1 model.""" 30 | return ModuleCapabilities( 31 | title="Wan 2.1 (1.3B, Fast, 5s Shots)", 32 | vram_gb_min=15.0, # Based on the 8.19 GB requirement from the model card 33 | ram_gb_min=12.0, 34 | supported_formats=["Portrait", "Landscape"], 35 | # This model does not support IP-Adapter, so we are honest here. 36 | supports_ip_adapter=False, 37 | supports_lora=False, # The pipeline does not have a LoRA loader 38 | max_subjects=0, 39 | accepts_text_prompt=True, 40 | accepts_negative_prompt=True 41 | ) 42 | 43 | def get_model_capabilities(self) -> Dict[str, Any]: 44 | """Return the specific resolutions and max duration for this model.""" 45 | return { 46 | # Based on the example: width=832, height=480 47 | "resolutions": {"Portrait": (480, 832), "Landscape": (832, 480)}, 48 | # Based on the example: "generate a 5-second 480P video" 49 | "max_shot_duration": 5.0 50 | } 51 | 52 | def _load_pipeline(self): 53 | """Loads the custom WanPipeline and its required VAE.""" 54 | if self.pipe is not None: 55 | return 56 | 57 | print(f"Loading T2V pipeline ({self.config.model_id})...") 58 | 59 | # This model requires loading the VAE separately first 60 | vae = AutoencoderKLWan.from_pretrained( 61 | self.config.model_id, 62 | subfolder="vae", 63 | torch_dtype=torch.float32 # VAE often works better in float32 64 | ) 65 | 66 | # Then, load the main pipeline, passing the VAE to it 67 | self.pipe = WanPipeline.from_pretrained( 68 | self.config.model_id, 69 | vae=vae, 70 | torch_dtype=torch.bfloat16 # bfloat16 is recommended in the example 71 | ) 72 | 73 | self.pipe.enable_model_cpu_offload() 74 | 75 | print(f"T2V ({self.config.model_id}) pipeline loaded to {DEVICE}.") 76 | 77 | def clear_vram(self): 78 | """Clears the VRAM used by the pipeline.""" 79 | print(f"Clearing T2V (Wan) VRAM...") 80 | if self.pipe is not None: 81 | clear_vram_globally(self.pipe) 82 | self.pipe = None 83 | print("T2V (Wan) VRAM cleared.") 84 | 85 | def generate_video_from_text( 86 | self, prompt: str, output_video_path: str, num_frames: int, fps: int, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None 87 | ) -> str: 88 | """Generates a video shot using the Wan T2V pipeline.""" 89 | self._load_pipeline() 90 | 91 | # Gracefully handle the case where character images are passed to a non-supporting model. 
92 | if ip_adapter_image: 93 | print("="*50) 94 | print("WARNING: The WanT2V module does not support IP-Adapters for character consistency.") 95 | print("The provided character images will be ignored for this T2V generation.") 96 | print("="*50) 97 | 98 | # Use the detailed negative prompt from the model card for best results 99 | negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" 100 | 101 | print(f"Generating Wan T2V ({width}x{height}) for prompt: \"{prompt[:70]}...\"") 102 | 103 | video_frames = self.pipe( 104 | prompt=prompt, 105 | negative_prompt=negative_prompt, 106 | height=height, 107 | width=width, 108 | num_frames=num_frames, 109 | guidance_scale=self.config.guidance_scale, 110 | num_inference_steps=self.config.num_inference_steps 111 | ).frames[0] 112 | 113 | # The system's config determines the final FPS, not the model's example 114 | export_to_video(video_frames, output_video_path, fps=fps) 115 | 116 | print(f"Wan T2V video shot saved to {output_video_path}") 117 | return output_video_path -------------------------------------------------------------------------------- /i2v_modules/i2v_svd.py: -------------------------------------------------------------------------------- 1 | # i2v_modules/i2v_svd.py 2 | import torch 3 | from typing import Dict, Any, List, Optional, Union 4 | from diffusers import StableVideoDiffusionPipeline 5 | from diffusers.utils import load_image, export_to_video 6 | from PIL import Image 7 | 8 | from base_modules import BaseI2V, BaseModuleConfig, ModuleCapabilities 9 | from config_manager import DEVICE, clear_vram_globally, ContentConfig 10 | 11 | class SvdI2VConfig(BaseModuleConfig): 12 | model_id: str = "stabilityai/stable-video-diffusion-img2vid-xt" 13 | decode_chunk_size: int = 8 14 | motion_bucket_id: int = 127 15 | noise_aug_strength: float = 0.02 16 | model_native_frames: int = 25 17 | 18 | class SvdI2V(BaseI2V): 19 | Config = SvdI2VConfig 20 | 21 | @classmethod 22 | def get_capabilities(cls) -> ModuleCapabilities: 23 | return ModuleCapabilities( 24 | title="SVD, Float16, Port/Landscape, No Prompt just image, Max 2 Sec", 25 | vram_gb_min=8.0, 26 | ram_gb_min=12.0, 27 | supported_formats=["Portrait", "Landscape"], 28 | supports_ip_adapter=True, 29 | supports_lora=True, # Juggernaut is a fine-tune, can easily use LoRAs 30 | max_subjects=2, # Can handle one or two IP adapter images 31 | accepts_text_prompt=False, 32 | accepts_negative_prompt=True 33 | ) 34 | 35 | 36 | def get_model_capabilities(self) -> Dict[str, Any]: 37 | return { 38 | "resolutions": {"Portrait": (576, 1024), "Landscape": (1024, 576)}, 39 | "max_shot_duration": 2.0 40 | } 41 | 42 | def enhance_prompt(self, prompt: str, prompt_type: str = "visual") -> str: 43 | # SVD doesn't use text prompts, but this shows how you could add model-specific keywords. 
44 | # For example, for a different model you might do: 45 | # if prompt_type == "visual": 46 | # return f"{prompt}, 8k, photorealistic, cinematic lighting" 47 | return prompt # Return original for SVD 48 | 49 | def _load_pipeline(self): 50 | if self.pipe is None: 51 | print(f"Loading I2V pipeline (SVD): {self.config.model_id}...") 52 | self.pipe = StableVideoDiffusionPipeline.from_pretrained( 53 | self.config.model_id, torch_dtype=torch.float16 54 | ) 55 | self.pipe.enable_model_cpu_offload() 56 | print("I2V (SVD) pipeline loaded.") 57 | 58 | def clear_vram(self): 59 | print("Clearing I2V (SVD) VRAM...") 60 | if self.pipe is not None: clear_vram_globally(self.pipe) 61 | self.pipe = None 62 | print("I2V (SVD) VRAM cleared.") 63 | 64 | def _resize_and_pad(self, image: Image.Image, target_width: int, target_height: int) -> Image.Image: 65 | original_aspect = image.width / image.height; target_aspect = target_width / target_height 66 | if original_aspect > target_aspect: new_width, new_height = target_width, int(target_width / original_aspect) 67 | else: new_height, new_width = target_height, int(target_height * original_aspect) 68 | resized_image = image.resize((new_width, new_height), Image.LANCZOS) 69 | background = Image.new('RGB', (target_width, target_height), (0, 0, 0)) 70 | background.paste(resized_image, ((target_width - new_width) // 2, (target_height - new_height) // 2)) 71 | return background 72 | 73 | def generate_video_from_image(self, image_path: str, output_video_path: str, target_duration: float, content_config: ContentConfig, visual_prompt: str, motion_prompt: Optional[str], ip_adapter_image: Optional[Union[str, List[str]]] = None) -> str: 74 | self._load_pipeline() 75 | 76 | if ip_adapter_image: 77 | print("Warning: SvdI2V module received IP-Adapter image but does not currently implement its use.") 78 | 79 | input_image = load_image(image_path) 80 | svd_target_res = self.get_model_capabilities()["resolutions"] 81 | aspect_ratio = "Landscape" if input_image.width > input_image.height else "Portrait" 82 | svd_target_width, svd_target_height = svd_target_res[aspect_ratio] 83 | prepared_image = self._resize_and_pad(input_image, svd_target_width, svd_target_height) 84 | 85 | calculated_fps = max(1, round(self.config.model_native_frames / target_duration)) if target_duration > 0 else 8 86 | motion_bucket_id = self.config.motion_bucket_id 87 | if motion_prompt: 88 | motion_prompt_lower = motion_prompt.lower() 89 | if any(w in motion_prompt_lower for w in ['fast', 'quick', 'rapid', 'zoom in', 'pan right']): motion_bucket_id = min(255, motion_bucket_id + 50) 90 | elif any(w in motion_prompt_lower for w in ['slow', 'gentle', 'subtle', 'still']): motion_bucket_id = max(0, motion_bucket_id - 50) 91 | print(f"Adjusted motion_bucket_id to {motion_bucket_id} based on prompt: '{motion_prompt}'") 92 | 93 | video_frames = self.pipe( 94 | image=prepared_image, height=svd_target_height, width=svd_target_width, 95 | decode_chunk_size=self.config.decode_chunk_size, num_frames=self.config.model_native_frames, 96 | motion_bucket_id=motion_bucket_id, noise_aug_strength=self.config.noise_aug_strength, 97 | ).frames[0] 98 | 99 | export_to_video(video_frames, output_video_path, fps=calculated_fps) 100 | print(f"SVD video shot saved to {output_video_path}") 101 | return output_video_path -------------------------------------------------------------------------------- /t2v_modules/t2v_ltx.py: -------------------------------------------------------------------------------- 1 | # In t2v_modules/t2v_ltx.py 
2 | import torch 3 | from typing import Dict, Any, List, Optional, Union 4 | import os 5 | 6 | # --- Import the necessary pipelines and configs --- 7 | from diffusers import LTXPipeline, LTXVideoTransformer3DModel 8 | from diffusers.utils import export_to_video 9 | from transformers import T5EncoderModel, BitsAndBytesConfig as TransformersBitsAndBytesConfig 10 | from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig 11 | 12 | from base_modules import BaseT2V, BaseModuleConfig, ModuleCapabilities 13 | from config_manager import DEVICE, clear_vram_globally 14 | 15 | class LtxT2VConfig(BaseModuleConfig): 16 | model_id: str = "Lightricks/LTX-Video" 17 | use_8bit_quantization: bool = True 18 | num_inference_steps: int = 50 19 | guidance_scale: float = 7.5 20 | decode_timestep: float = 0.03 21 | decode_noise_scale: float = 0.025 22 | # No IP-Adapter configs needed as this pipeline doesn't support them 23 | 24 | class LtxT2V(BaseT2V): 25 | Config = LtxT2VConfig 26 | 27 | # No __init__ needed if we just have the default behavior 28 | 29 | @classmethod 30 | def get_capabilities(cls) -> ModuleCapabilities: 31 | """This module is for pure T2V and does NOT support IP-Adapters.""" 32 | return ModuleCapabilities( 33 | title="LTX, Port/Landscape, No Subject, 5 sec", 34 | vram_gb_min=8.0, 35 | ram_gb_min=12.0, 36 | supported_formats=["Portrait", "Landscape"], 37 | # --- THE CRITICAL CHANGE: Be honest about capabilities --- 38 | supports_ip_adapter=False, 39 | supports_lora=False, # This pipeline doesn't have a LoRA loader either 40 | max_subjects=0, 41 | accepts_text_prompt=True, 42 | accepts_negative_prompt=True 43 | ) 44 | 45 | def get_model_capabilities(self) -> Dict[str, Any]: 46 | return {"resolutions": {"Portrait": (512, 768), "Landscape": (768, 512)}, "max_shot_duration": 5.0} 47 | 48 | def _load_pipeline(self): 49 | if self.pipe is not None: return 50 | 51 | if self.config.use_8bit_quantization: 52 | print(f"Loading T2V pipeline ({self.config.model_id}) with 8-bit quantization...") 53 | text_encoder_quant_config = TransformersBitsAndBytesConfig(load_in_8bit=True) 54 | text_encoder_8bit = T5EncoderModel.from_pretrained(self.config.model_id, subfolder="text_encoder", quantization_config=text_encoder_quant_config, torch_dtype=torch.float16) 55 | transformer_quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True) 56 | transformer_8bit = LTXVideoTransformer3DModel.from_pretrained(self.config.model_id, subfolder="transformer", quantization_config=transformer_quant_config, torch_dtype=torch.float16) 57 | 58 | # Note: We are no longer passing the `image_encoder` as it was being ignored. 
59 | self.pipe = LTXPipeline.from_pretrained( 60 | self.config.model_id, 61 | text_encoder=text_encoder_8bit, 62 | transformer=transformer_8bit, 63 | torch_dtype=torch.float16, 64 | device_map="balanced", 65 | ) 66 | print("Quantized T2V pipeline loaded successfully.") 67 | else: 68 | print(f"Loading T2V pipeline ({self.config.model_id}) in full precision...") 69 | self.pipe = LTXPipeline.from_pretrained( 70 | self.config.model_id, 71 | torch_dtype=torch.bfloat16 72 | ) 73 | self.pipe.enable_model_cpu_offload() 74 | 75 | self.pipe.vae.enable_tiling() 76 | print("VAE tiling enabled for memory efficiency.") 77 | 78 | def clear_vram(self): 79 | print(f"Clearing T2V (LTX) VRAM...") 80 | if self.pipe is not None: 81 | clear_vram_globally(self.pipe) 82 | self.pipe = None 83 | print("T2V (LTX) VRAM cleared.") 84 | 85 | def generate_video_from_text( 86 | self, prompt: str, output_video_path: str, num_frames: int, fps: int, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None 87 | ) -> str: 88 | self._load_pipeline() 89 | 90 | # --- THE GRACEFUL HANDLING --- 91 | # If character images are passed, inform the user they are being ignored. 92 | if ip_adapter_image: 93 | print("="*50) 94 | print("WARNING: The LtxT2V module does not support IP-Adapters for character consistency.") 95 | print("The provided character images will be ignored for this T2V generation.") 96 | print("="*50) 97 | 98 | # All IP-Adapter logic is removed. We just call the pipeline. 99 | pipeline_kwargs = {} 100 | 101 | negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted, text, watermark, bad anatomy" 102 | print(f"Generating LTX T2V ({width}x{height}) for prompt: \"{prompt[:50]}...\"") 103 | 104 | video_frames = self.pipe( 105 | prompt=prompt, 106 | negative_prompt=negative_prompt, 107 | width=width, 108 | height=height, 109 | num_frames=num_frames, 110 | num_inference_steps=self.config.num_inference_steps, 111 | guidance_scale=self.config.guidance_scale, 112 | decode_timestep=self.config.decode_timestep, 113 | decode_noise_scale=self.config.decode_noise_scale, 114 | **pipeline_kwargs 115 | ).frames[0] 116 | 117 | export_to_video(video_frames, output_video_path, fps=fps) 118 | 119 | print(f"LTX T2V video shot saved to {output_video_path}") 120 | return output_video_path -------------------------------------------------------------------------------- /i2v_modules/i2v_wan.py: -------------------------------------------------------------------------------- 1 | # In i2v_modules/i2v_wan.py 2 | import torch 3 | import numpy as np 4 | from typing import Dict, Any, List, Optional, Union 5 | from PIL import Image 6 | 7 | # Import the necessary components 8 | from diffusers import WanImageToVideoPipeline, AutoencoderKLWan 9 | from diffusers.utils import export_to_video, load_image 10 | from transformers import CLIPVisionModel, UMT5EncoderModel, T5Tokenizer, CLIPImageProcessor 11 | 12 | from base_modules import BaseI2V, BaseModuleConfig, ModuleCapabilities 13 | from config_manager import DEVICE, clear_vram_globally, ContentConfig 14 | 15 | class WanI2VConfig(BaseModuleConfig): 16 | """Configuration for the Wan 2.1 I2V model.""" 17 | model_id: str = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers" 18 | 19 | num_inference_steps: int = 30 20 | guidance_scale: float = 5.0 21 | 22 | class WanI2V(BaseI2V): 23 | """ 24 | Image-to-Video module using the Wan 2.1 14B pipeline. 
25 | """ 26 | Config = WanI2VConfig 27 | 28 | @classmethod 29 | def get_capabilities(cls) -> ModuleCapabilities: 30 | """Declare the capabilities of the Wan 2.1 I2V model.""" 31 | return ModuleCapabilities( 32 | title="Wan 2.1 I2V (14B)", 33 | vram_gb_min=40.0, 34 | ram_gb_min=24.0, 35 | supported_formats=["Portrait", "Landscape"], 36 | supports_ip_adapter=False, 37 | supports_lora=False, 38 | max_subjects=0, 39 | accepts_text_prompt=True, 40 | accepts_negative_prompt=True 41 | ) 42 | 43 | def get_model_capabilities(self) -> Dict[str, Any]: 44 | """Return the specific resolutions and max duration for this model.""" 45 | return { 46 | "resolutions": {"base_pixel_area": 399360}, # 480P model base area 47 | "max_shot_duration": 4.0 48 | } 49 | 50 | def _load_pipeline(self): 51 | """ 52 | Loads the WanImageToVideoPipeline following the official documentation example. 53 | """ 54 | if self.pipe is not None: return 55 | 56 | print(f"Loading I2V pipeline ({self.config.model_id})...") 57 | 58 | # 1. Load individual components with appropriate dtypes 59 | image_encoder = CLIPVisionModel.from_pretrained( 60 | self.config.model_id, 61 | subfolder="image_encoder", 62 | torch_dtype=torch.float32 63 | ) 64 | 65 | vae = AutoencoderKLWan.from_pretrained( 66 | self.config.model_id, 67 | subfolder="vae", 68 | torch_dtype=torch.float32 69 | ) 70 | 71 | # 2. Create the pipeline with the components 72 | self.pipe = WanImageToVideoPipeline.from_pretrained( 73 | self.config.model_id, 74 | vae=vae, 75 | image_encoder=image_encoder, 76 | torch_dtype=torch.bfloat16 77 | ) 78 | 79 | # 3. Enable model CPU offload for memory efficienc y 80 | self.pipe.enable_model_cpu_offload() 81 | 82 | print("I2V (Wan 14B) pipeline loaded successfully.") 83 | 84 | def clear_vram(self): 85 | """Clears the VRAM used by all loaded components.""" 86 | print(f"Clearing I2V (Wan 14B) VRAM...") 87 | if self.pipe is not None: 88 | clear_vram_globally(self.pipe) 89 | self.pipe = None 90 | print("I2V (Wan 14B) VRAM cleared.") 91 | 92 | def generate_video_from_image( 93 | self, image_path: str, output_video_path: str, target_duration: float, 94 | content_config: ContentConfig, visual_prompt: str, motion_prompt: Optional[str], 95 | ip_adapter_image: Optional[Union[str, List[str]]] = None 96 | ) -> str: 97 | """Generates a video by animating a source image using the 14B model.""" 98 | self._load_pipeline() 99 | 100 | input_image = load_image(image_path) 101 | 102 | model_caps = self.get_model_capabilities() 103 | max_area = model_caps["resolutions"]["base_pixel_area"] 104 | aspect_ratio = input_image.height / input_image.width 105 | 106 | # Calculate dimensions using the correct scale factors 107 | mod_value = self.pipe.vae_scale_factor_spatial * self.pipe.transformer.config.patch_size[1] 108 | h = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value 109 | w = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value 110 | prepared_image = input_image.resize((w, h)) 111 | 112 | num_frames = int(target_duration * content_config.fps) 113 | full_prompt = f"{visual_prompt}, {motion_prompt}" if motion_prompt else visual_prompt 114 | negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking 
backwards" 115 | 116 | print(f"Generating Wan I2V ({w}x{h}) from image: {image_path}") 117 | print(f" - Prompt: \"{full_prompt[:70]}...\"") 118 | 119 | video_frames = self.pipe( 120 | image=prepared_image, 121 | prompt=full_prompt, 122 | negative_prompt=negative_prompt, 123 | height=h, 124 | width=w, 125 | num_frames=num_frames, 126 | guidance_scale=self.config.guidance_scale, 127 | num_inference_steps=self.config.num_inference_steps, 128 | ).frames[0] 129 | 130 | export_to_video(video_frames, output_video_path, fps=content_config.fps) 131 | 132 | print(f"Wan I2V 14B video shot saved to {output_video_path}") 133 | return output_video_path -------------------------------------------------------------------------------- /base_modules.py: -------------------------------------------------------------------------------- 1 | # In base_modules.py 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import List, Tuple, Dict, Any, Optional, Union, Literal 5 | from pydantic import BaseModel, Field 6 | 7 | # --- NEW: Define the ModuleCapabilities Contract --- 8 | class ModuleCapabilities(BaseModel): 9 | """A standardized spec sheet for all generation modules.""" 10 | 11 | title: str = Field(description="Title to show in dropdowns") 12 | 13 | # Resource Requirements 14 | vram_gb_min: float = Field(default=4.0, description="Minimum GPU VRAM required in GB.") 15 | ram_gb_min: float = Field(default=8.0, description="Minimum system RAM required in GB.") 16 | 17 | # Format & Control Support 18 | supported_formats: List[Literal["Portrait", "Landscape"]] = Field(default=["Portrait", "Landscape"]) 19 | supports_ip_adapter: bool = Field(default=False, description="True if the module can use IP-Adapter for subject consistency.") 20 | supports_lora: bool = Field(default=False, description="True if the module supports LoRA weights.") 21 | 22 | # Subject & Prompting 23 | max_subjects: int = Field(default=0, description="Maximum number of distinct subjects/characters the module can handle at once (e.g., via IP-Adapter).") 24 | accepts_text_prompt: bool = Field(default=True, description="True if the module uses a text prompt.") 25 | accepts_negative_prompt: bool = Field(default=True, description="True if the module uses a negative prompt.") 26 | 27 | # Type-Specific 28 | supported_tts_languages: List[str] = Field(default=[], description="List of languages supported by a TTS module (e.g., ['en', 'es']).") 29 | 30 | # Forward-declare to avoid circular imports 31 | class ContentConfig(BaseModel): pass 32 | class ProjectState(BaseModel): pass 33 | 34 | # --- Base Configuration Models --- 35 | class BaseModuleConfig(BaseModel): 36 | """Base for all module-specific configurations.""" 37 | model_id: str 38 | 39 | # --- Base Module Classes --- 40 | class BaseLLM(ABC): 41 | """Abstract Base Class for Language Model modules.""" 42 | def __init__(self, config: BaseModuleConfig): 43 | self.config = config 44 | self.model = None 45 | self.tokenizer = None 46 | 47 | # --- NEW: Enforce capabilities contract --- 48 | @classmethod 49 | @abstractmethod 50 | def get_capabilities(cls) -> ModuleCapabilities: 51 | """Returns the spec sheet for this module.""" 52 | raise NotImplementedError 53 | 54 | @abstractmethod 55 | def generate_script(self, topic: str, content_config: ContentConfig) -> Dict[str, Any]: 56 | """Generates the main script, visual prompts, hashtags, and context descriptions.""" 57 | pass 58 | 59 | @abstractmethod 60 | def generate_shot_visual_prompts(self, scene_narration: str, original_scene_prompt: str, 
num_shots: int, content_config: ContentConfig, main_subject: str, setting: str) -> List[Tuple[str, str]]: 61 | """Generates visual and motion prompts for each shot within a scene.""" 62 | pass 63 | 64 | @abstractmethod 65 | def clear_vram(self): 66 | """Clears the VRAM used by the model and tokenizer.""" 67 | pass 68 | 69 | class BaseTTS(ABC): 70 | """Abstract Base Class for Text-to-Speech modules.""" 71 | def __init__(self, config: BaseModuleConfig): 72 | self.config = config 73 | self.model = None 74 | 75 | # --- NEW: Enforce capabilities contract --- 76 | @classmethod 77 | @abstractmethod 78 | def get_capabilities(cls) -> ModuleCapabilities: 79 | """Returns the spec sheet for this module.""" 80 | raise NotImplementedError 81 | 82 | @abstractmethod 83 | def generate_audio(self, text: str, output_dir: str, scene_idx: int, language: str, speaker_wav: Optional[str] = None) -> Tuple[str, float]: 84 | """Generates audio from text.""" 85 | pass 86 | 87 | @abstractmethod 88 | def clear_vram(self): 89 | """Clears the VRAM used by the TTS model.""" 90 | pass 91 | 92 | class BaseVideoGen(ABC): 93 | """A common base for all video generation modules (T2I, I2V, T2V).""" 94 | def __init__(self, config: BaseModuleConfig): 95 | self.config = config 96 | self.pipe = None 97 | 98 | # --- NEW: Enforce capabilities contract --- 99 | @classmethod 100 | @abstractmethod 101 | def get_capabilities(cls) -> ModuleCapabilities: 102 | """Returns the spec sheet for this module.""" 103 | raise NotImplementedError 104 | 105 | @abstractmethod 106 | def get_model_capabilities(self) -> Dict[str, Any]: 107 | """Returns a dictionary of the model's capabilities, like resolutions.""" 108 | pass 109 | 110 | def enhance_prompt(self, prompt: str, prompt_type: str = "visual") -> str: 111 | return prompt 112 | 113 | @abstractmethod 114 | def clear_vram(self): 115 | """Clears the VRAM used by the pipeline.""" 116 | pass 117 | 118 | class BaseT2I(BaseVideoGen): 119 | """Abstract Base Class for Text-to-Image modules.""" 120 | @abstractmethod 121 | def generate_image(self, prompt: str, negative_prompt: str, output_path: str, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None, seed: int = -1) -> str: 122 | """Generates an image from a text prompt, optionally using an IP-Adapter image.""" 123 | pass 124 | 125 | class BaseI2V(BaseVideoGen): 126 | """Abstract Base Class for Image-to-Video modules.""" 127 | @abstractmethod 128 | def generate_video_from_image(self, image_path: str, output_video_path: str, target_duration: float, content_config: ContentConfig, visual_prompt: str, motion_prompt: Optional[str], ip_adapter_image: Optional[Union[str, List[str]]] = None) -> str: 129 | """Generates a video from an initial image, optionally using an IP-Adapter image for style/subject.""" 130 | pass 131 | 132 | class BaseT2V(BaseVideoGen): 133 | """Abstract Base Class for Text-to-Video modules.""" 134 | @abstractmethod 135 | def generate_video_from_text(self, prompt: str, output_video_path: str, num_frames: int, fps: int, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None) -> str: 136 | """Generates a video directly from a text prompt, optionally using an IP-Adapter image.""" 137 | pass -------------------------------------------------------------------------------- /__requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==2.2.2 2 | accelerate==1.6.0 3 | aiofiles==24.1.0 4 | aiohappyeyeballs==2.6.1 5 | aiohttp==3.11.18 6 | 
aiosignal==1.3.2 7 | albucore==0.0.24 8 | albumentations==2.0.7 9 | altair==5.5.0 10 | annotated-types==0.7.0 11 | anyascii==0.3.2 12 | anyio==4.9.0 13 | anykeystore==0.2 14 | apex==0.9.10.dev0 15 | argon2-cffi==23.1.0 16 | argon2-cffi-bindings==21.2.0 17 | arrow==1.3.0 18 | asttokens==3.0.0 19 | async-lru==2.0.5 20 | attrs==25.3.0 21 | audioread==3.0.1 22 | av==12.1.0 23 | babel==2.17.0 24 | beautifulsoup4==4.13.4 25 | bitsandbytes==0.45.5 26 | bleach==6.2.0 27 | blinker==1.9.0 28 | blis==0.7.11 29 | cachetools==5.5.2 30 | catalogue==2.0.10 31 | certifi==2025.4.26 32 | cffi==1.17.1 33 | charset-normalizer==3.4.2 34 | click==8.1.8 35 | cloudpathlib==0.21.1 36 | coloredlogs==15.0.1 37 | comm==0.2.2 38 | confection==0.1.5 39 | consisid-eva-clip==1.0.2 40 | contourpy>=1.3.0 41 | coqpit-config>=0.2.0 42 | coqui-tts>=0.26.0 43 | coqui-tts-trainer>=0.2.3 44 | cryptacular==1.6.2 45 | cycler==0.12.1 46 | cymem==2.0.11 47 | cython==3.1.0 48 | dateparser==1.1.8 49 | debugpy==1.8.14 50 | decorator==5.2.1 51 | defusedxml==0.7.1 52 | diffusers==0.33.1 53 | docopt==0.6.2 54 | easydict==1.13 55 | einops==0.8.1 56 | encodec==0.1.1 57 | executing==2.2.0 58 | facexlib==0.3.0 59 | fastapi==0.115.12 60 | fastjsonschema==2.21.1 61 | ffmpy==0.6.0 62 | filelock==3.18.0 63 | filterpy==1.4.5 64 | flatbuffers==25.2.10 65 | fonttools==4.58.0 66 | fqdn==1.5.1 67 | frozenlist==1.6.0 68 | fsspec==2025.5.1 69 | ftfy==6.3.1 70 | gitdb==4.0.12 71 | gitpython==3.1.44 72 | gradio==5.25.2 73 | gradio-client==1.8.0 74 | greenlet==3.2.2 75 | groovy==0.1.2 76 | grpcio==1.71.0 77 | gruut==2.4.0 78 | gruut-ipa==0.13.0 79 | gruut-lang-de==2.0.1 80 | gruut-lang-en==2.0.1 81 | gruut-lang-es==2.0.1 82 | gruut-lang-fr==2.0.2 83 | h11==0.16.0 84 | hf-xet==1.1.3 85 | httpcore==1.0.9 86 | httpx==0.28.1 87 | huggingface-hub==0.32.4 88 | humanfriendly==10.0 89 | hupper==1.12.1 90 | idna==3.10 91 | imageio==2.37.0 92 | imageio-ffmpeg==0.6.0 93 | importlib-metadata==8.7.0 94 | inflect==7.5.0 95 | inquirerpy==0.3.4 96 | insightface==0.7.3 97 | ipykernel==6.29.5 98 | ipython==9.2.0 99 | ipython-pygments-lexers==1.1.1 100 | ipywidgets==8.1.7 101 | isoduration==20.11.0 102 | jedi==0.19.2 103 | jinja2==3.1.6 104 | joblib==1.5.0 105 | json5==0.12.0 106 | jsonlines==1.2.0 107 | jsonpointer==3.0.0 108 | jsonschema==4.23.0 109 | jsonschema-specifications==2025.4.1 110 | jupyter==1.1.1 111 | jupyter-client==8.6.3 112 | jupyter-console==6.6.3 113 | jupyter-core==5.7.2 114 | jupyter-events==0.12.0 115 | jupyter-lsp==2.2.5 116 | jupyter-server==2.16.0 117 | jupyter-server-terminals==0.5.3 118 | jupyterlab==4.4.2 119 | jupyterlab-pygments==0.3.0 120 | jupyterlab-server==2.27.3 121 | jupyterlab-widgets==3.0.15 122 | kiwisolver==1.4.8 123 | langcodes==3.5.0 124 | language-data==1.3.0 125 | lazy-loader==0.4 126 | librosa>=0.11.0 127 | llvmlite==0.44.0 128 | marisa-trie==1.2.1 129 | markdown==3.8 130 | markdown-it-py==3.0.0 131 | markupsafe==3.0.2 132 | matplotlib==3.10.3 133 | matplotlib-inline==0.1.7 134 | mdurl==0.1.2 135 | mistune==3.1.3 136 | monotonic-alignment-search==0.1.1 137 | more-itertools==10.7.0 138 | moviepy==2.1.2 139 | mpmath==1.3.0 140 | msgpack==1.1.0 141 | multidict==6.4.3 142 | murmurhash==1.0.12 143 | narwhals==1.41.1 144 | nbclient==0.10.2 145 | nbconvert==7.16.6 146 | nbformat==5.10.4 147 | nest-asyncio==1.6.0 148 | networkx==3.5 149 | notebook==7.4.2 150 | notebook-shim==0.2.4 151 | num2words==0.5.14 152 | numba>=0.61.2 153 | numpy>=1.26.2 154 | nvidia-cublas-cu12==12.1.3.1 155 | nvidia-cuda-cupti-cu12==12.1.105 156 | 
nvidia-cuda-nvrtc-cu12==12.1.105 157 | nvidia-cuda-runtime-cu12==12.1.105 158 | nvidia-cudnn-cu12==9.1.0.70 159 | nvidia-cufft-cu12==11.0.2.54 160 | nvidia-cufile-cu12==1.11.1.6 161 | nvidia-curand-cu12==10.3.2.106 162 | nvidia-cusolver-cu12==11.4.5.107 163 | nvidia-cusparse-cu12==12.1.0.106 164 | nvidia-cusparselt-cu12==0.6.3 165 | nvidia-nccl-cu12==2.21.5 166 | nvidia-nvjitlink-cu12==12.6.85 167 | nvidia-nvtx-cu12==12.1.105 168 | oauthlib==3.2.2 169 | onnx==1.18.0 170 | onnxruntime-gpu==1.22.0 171 | opencv-contrib-python==4.11.0.86 172 | opencv-python==4.11.0.86 173 | opencv-python-headless==4.11.0.86 174 | orjson==3.10.18 175 | overrides==7.7.0 176 | packaging==24.2 177 | pandas==2.3.0 178 | pandocfilters==1.5.1 179 | parso==0.8.4 180 | pastedeploy==3.1.0 181 | pbkdf2==1.3 182 | peft==0.15.2 183 | pexpect==4.9.0 184 | pfzy==0.3.4 185 | pillow>=9.2.0,<11.0 186 | plaster==1.1.2 187 | plaster-pastedeploy==1.0.1 188 | platformdirs==4.3.8 189 | pooch==1.8.2 190 | preshed==3.0.9 191 | prettytable==3.16.0 192 | proglog==0.1.12 193 | prometheus-client==0.21.1 194 | prompt-toolkit==3.0.51 195 | propcache==0.3.1 196 | protobuf==6.31.0 197 | psutil==7.0.0 198 | ptyprocess==0.7.0 199 | pure-eval==0.2.3 200 | pyarrow==20.0.0 201 | pycparser==2.22 202 | pydantic==2.11.4 203 | pydantic-core==2.33.2 204 | pydeck==0.9.1 205 | pydub==0.25.1 206 | pyfacer==0.0.5 207 | pygments==2.19.1 208 | pyparsing==3.2.3 209 | pyramid==2.0.2 210 | pyramid-mailer==0.15.1 211 | pysbd==0.3.4 212 | python-crfsuite==0.9.11 213 | python-dateutil==2.9.0.post0 214 | python-dotenv==1.1.0 215 | python-json-logger==3.3.0 216 | python-multipart==0.0.20 217 | python3-openid==3.2.0 218 | pytz==2025.2 219 | pyyaml==6.0.2 220 | pyzmq==26.4.0 221 | referencing==0.36.2 222 | regex==2024.11.6 223 | repoze-sendmail==4.4.1 224 | requests==2.31.0 225 | requests-oauthlib==2.0.0 226 | rfc3339-validator==0.1.4 227 | rfc3986-validator==0.1.1 228 | rich==14.0.0 229 | rpds-py==0.25.0 230 | ruff==0.11.13 231 | safehttpx==0.1.6 232 | safetensors==0.5.3 233 | scikit-image==0.25.2 234 | scikit-learn==1.6.1 235 | scipy==1.12.0 236 | semantic-version==2.10.0 237 | send2trash==1.8.3 238 | sentencepiece==0.2.0 239 | setuptools==80.9.0 240 | shellingham==1.5.4 241 | simsimd==6.2.1 242 | six==1.17.0 243 | smart-open==7.1.0 244 | smmap==5.0.2 245 | sniffio==1.3.1 246 | soundfile==0.13.1 247 | soupsieve==2.7 248 | soxr==0.5.0.post1 249 | spacy==3.7.5 250 | spacy-legacy==3.0.12 251 | spacy-loggers==1.0.5 252 | sqlalchemy==2.0.41 253 | srsly==2.5.1 254 | stack-data==0.6.3 255 | starlette==0.46.2 256 | streamlit==1.45.0 257 | stringzilla==3.12.5 258 | sudachidict-core==20250129 259 | sudachipy==0.6.10 260 | sympy==1.13.1 261 | tenacity==9.1.2 262 | tensorboard==2.19.0 263 | tensorboard-data-server==0.7.2 264 | terminado==0.18.1 265 | thinc==8.2.5 266 | threadpoolctl==3.6.0 267 | tifffile==2025.5.10 268 | timm==1.0.15 269 | tinycss2==1.4.0 270 | tokenizers>=0.20.3 271 | toml==0.10.2 272 | tomlkit==0.13.3 273 | # torch==2.5.1+cu121 274 | # torchaudio==2.5.1+cu121 275 | torchsde==0.2.6 276 | # torchvision==0.20.1+cu121 277 | tornado==6.4.2 278 | tqdm==4.67.1 279 | traitlets==5.14.3 280 | trampoline==0.1.2 281 | transaction==5.0 282 | transformers>=4.46.2 283 | translationstring==1.4 284 | triton==3.1.0 285 | typeguard==4.4.2 286 | typer==0.15.4 287 | types-python-dateutil==2.9.0.20241206 288 | typing-extensions==4.14.0 289 | typing-inspection==0.4.0 290 | tzdata==2025.2 291 | tzlocal==5.3.1 292 | uri-template==1.3.0 293 | urllib3==2.4.0 294 | uvicorn==0.34.3 295 | 
validators==0.35.0 296 | velruse==1.1.1 297 | venusian==3.1.1 298 | wasabi==1.1.3 299 | watchdog==6.0.0 300 | wcwidth==0.2.13 301 | weasel==0.4.1 302 | webcolors==24.11.1 303 | webencodings==0.5.1 304 | webob==1.8.9 305 | websocket-client==1.8.0 306 | websockets==15.0.1 307 | werkzeug==3.1.3 308 | widgetsnbextension==4.0.14 309 | wrapt==1.17.2 310 | wtforms==3.2.1 311 | wtforms-recaptcha==0.3.2 312 | xformers==0.0.29.post1 313 | yarl==1.20.0 314 | zipp==3.22.0 315 | zope-deprecation==5.1 316 | zope-interface==7.2 317 | zope-sqlalchemy==3.1 318 | -------------------------------------------------------------------------------- /t2i_modules/t2i_juggernaut.py: -------------------------------------------------------------------------------- 1 | # In t2i_modules/t2i_juggernaut.py 2 | import torch 3 | from typing import List, Optional, Dict, Any, Union 4 | from diffusers import StableDiffusionXLPipeline, DiffusionPipeline 5 | from diffusers.utils import load_image 6 | from transformers import BitsAndBytesConfig 7 | from diffusers import DPMSolverMultistepScheduler as JuggernautScheduler 8 | 9 | from base_modules import BaseT2I, BaseModuleConfig, ModuleCapabilities 10 | from config_manager import DEVICE, clear_vram_globally 11 | 12 | class JuggernautT2IConfig(BaseModuleConfig): 13 | model_id: str = "RunDiffusion/Juggernaut-XL-v9" 14 | refiner_id: Optional[str] = None 15 | # --- NEW: Flag to control memory-saving quantization --- 16 | use_8bit_quantization: bool = True 17 | num_inference_steps: int = 35 18 | guidance_scale: float = 6.0 19 | ip_adapter_repo: str = "h94/IP-Adapter" 20 | ip_adapter_subfolder: str = "sdxl_models" 21 | ip_adapter_weight_name: str = "ip-adapter_sdxl.bin" 22 | 23 | 24 | class JuggernautT2I(BaseT2I): 25 | Config = JuggernautT2IConfig 26 | 27 | def __init__(self, config: JuggernautT2IConfig): 28 | super().__init__(config) 29 | self.refiner_pipe = None 30 | self._loaded_ip_adapter_count = 0 31 | 32 | @classmethod 33 | def get_capabilities(cls) -> ModuleCapabilities: 34 | return ModuleCapabilities( 35 | title="Juggernaut XL v9 (Quality), 2 Subjects considered", 36 | vram_gb_min=8.0, 37 | ram_gb_min=12.0, 38 | supported_formats=["Portrait", "Landscape"], 39 | supports_ip_adapter=True, 40 | supports_lora=True, 41 | max_subjects=2, 42 | accepts_text_prompt=True, 43 | accepts_negative_prompt=True 44 | ) 45 | 46 | def get_model_capabilities(self) -> Dict[str, Any]: 47 | return { 48 | "resolutions": {"Portrait": (832, 1216), "Landscape": (1216, 832)}, 49 | "max_shot_duration": 3.0 50 | } 51 | 52 | def enhance_prompt(self, prompt: str, prompt_type: str = "visual") -> str: 53 | quality_keywords = "cinematic photography, hyperdetailed, (skin details:1.1), 8k, professional lighting" 54 | if prompt.strip().endswith(','): 55 | return f"{prompt} {quality_keywords}" 56 | else: 57 | return f"{prompt}, {quality_keywords}" 58 | 59 | def _load_pipeline(self): 60 | if self.pipe is None: 61 | if self.config.use_8bit_quantization: 62 | print("Loading T2I pipeline (Juggernaut) with 8-bit quantization to save VRAM...") 63 | bnb_config = BitsAndBytesConfig( 64 | load_in_8bit=True, 65 | ) 66 | # --- START OF FIX: Remove device_map and use .to(DEVICE) instead --- 67 | # This prevents the accelerate hook conflict when loading IP-Adapters later. 
68 | self.pipe = StableDiffusionXLPipeline.from_pretrained( 69 | self.config.model_id, 70 | quantization_config=bnb_config, 71 | torch_dtype=torch.float16, 72 | variant="fp16", 73 | use_safetensors=True, 74 | ).to(DEVICE) 75 | # --- END OF FIX --- 76 | else: 77 | print(f"Loading T2I pipeline (Juggernaut) in full precision to {DEVICE}...") 78 | self.pipe = StableDiffusionXLPipeline.from_pretrained( 79 | self.config.model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True 80 | ).to(DEVICE) 81 | 82 | self.pipe.scheduler = JuggernautScheduler.from_config(self.pipe.scheduler.config, use_karras_sigmas=True) 83 | print(f"Juggernaut pipeline configured with {JuggernautScheduler.__name__} sampler.") 84 | 85 | if self.config.refiner_id: 86 | print(f"Refiner specified but not typically used with Juggernaut, skipping load.") 87 | 88 | def clear_vram(self): 89 | print("Clearing T2I (Juggernaut) VRAM...") 90 | models = [m for m in [self.pipe, self.refiner_pipe] if m is not None] 91 | if models: clear_vram_globally(*models) 92 | self.pipe, self.refiner_pipe = None, None 93 | self._loaded_ip_adapter_count = 0 94 | print("T2I (Juggernaut) VRAM cleared.") 95 | 96 | def generate_image(self, prompt: str, negative_prompt: str, output_path: str, width: int, height: int, ip_adapter_image: Optional[Union[str, List[str]]] = None, seed: int = -1) -> str: 97 | self._load_pipeline() 98 | 99 | generator = None 100 | if seed != -1: 101 | print(f"Using fixed seed for generation: {seed}") 102 | generator = torch.Generator(device=self.pipe.device).manual_seed(seed) 103 | else: 104 | print("Using random seed for generation.") 105 | 106 | pipeline_kwargs = {"generator": generator} if generator else {} 107 | ip_images_to_load = [] 108 | 109 | if ip_adapter_image: 110 | if isinstance(ip_adapter_image, str): 111 | ip_images_to_load = [ip_adapter_image] 112 | else: 113 | ip_images_to_load = ip_adapter_image 114 | 115 | num_ip_images = len(ip_images_to_load) 116 | 117 | if num_ip_images > 0: 118 | print(f"Juggernaut T2I: Activating IP-Adapter with {num_ip_images} character image(s).") 119 | if self._loaded_ip_adapter_count != num_ip_images: 120 | print(f"Loading {num_ip_images} IP-Adapter(s) for the pipeline...") 121 | if hasattr(self.pipe, "unload_ip_adapter"): self.pipe.unload_ip_adapter() 122 | adapter_weights = [self.config.ip_adapter_weight_name] * num_ip_images 123 | self.pipe.load_ip_adapter( 124 | self.config.ip_adapter_repo, 125 | subfolder=self.config.ip_adapter_subfolder, 126 | weight_name=adapter_weights 127 | ) 128 | self._loaded_ip_adapter_count = num_ip_images 129 | print(f"Successfully loaded {self._loaded_ip_adapter_count} adapters.") 130 | 131 | scales = [0.6] * num_ip_images 132 | self.pipe.set_ip_adapter_scale(scales) 133 | ip_images = [load_image(p) for p in ip_images_to_load] 134 | pipeline_kwargs["ip_adapter_image"] = ip_images 135 | else: 136 | print("Juggernaut T2I: No IP-Adapter image provided.") 137 | if self._loaded_ip_adapter_count > 0: 138 | if hasattr(self.pipe, "unload_ip_adapter"): self.pipe.unload_ip_adapter() 139 | self._loaded_ip_adapter_count = 0 140 | 141 | enhanced_prompt = self.enhance_prompt(prompt) 142 | print(f"Juggernaut generating image with resolution: {width}x{height}") 143 | print(f" - Prompt: '{enhanced_prompt}'") 144 | print(f" - Negative: '{negative_prompt}'") 145 | 146 | image = self.pipe( 147 | prompt=enhanced_prompt, 148 | negative_prompt=negative_prompt, 149 | width=width, 150 | height=height, 151 | num_inference_steps=self.config.num_inference_steps, 152 | 
guidance_scale=self.config.guidance_scale, 153 | **pipeline_kwargs 154 | ).images[0] 155 | 156 | image.save(output_path) 157 | print(f"Image saved to {output_path}") 158 | return output_path -------------------------------------------------------------------------------- /ui_task_executor.py: -------------------------------------------------------------------------------- 1 | # In ui_task_executor.py 2 | 3 | import streamlit as st 4 | from task_executor import TaskExecutor 5 | from config_manager import ContentConfig 6 | import logging 7 | from typing import List, Optional, Any 8 | import os 9 | from utils import load_and_correct_image_orientation 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class UITaskExecutor: 14 | """Handles task execution triggered from the Streamlit UI, providing user feedback.""" 15 | 16 | def __init__(self, project_manager): 17 | self.project_manager = project_manager 18 | self.task_executor: Optional[TaskExecutor] = None 19 | self._initialize_task_executor() 20 | 21 | def _initialize_task_executor(self): 22 | if not self.project_manager.state: 23 | st.error("Cannot initialize task executor: Project state not found.") 24 | return 25 | try: 26 | self.task_executor = TaskExecutor(self.project_manager) 27 | except Exception as e: 28 | logger.error(f"Failed to initialize TaskExecutor: {e}", exc_info=True) 29 | st.error(f"Configuration Error: {e}") 30 | 31 | def update_narration_text(self, scene_idx: int, text: str): 32 | self.project_manager.update_narration_part_text(scene_idx, text) 33 | 34 | def update_shot_prompts(self, scene_idx: int, shot_idx: int, visual_prompt: Optional[str] = None, motion_prompt: Optional[str] = None): 35 | self.project_manager.update_shot_content(scene_idx, shot_idx, visual_prompt, motion_prompt) 36 | 37 | def regenerate_audio(self, scene_idx: int, text: str, speaker_audio: Optional[str] = None) -> bool: 38 | if not self.task_executor: return False 39 | self.project_manager.update_narration_part_text(scene_idx, text) 40 | task_data = {"scene_idx": scene_idx, "text": text, "speaker_wav": speaker_audio if speaker_audio and os.path.exists(speaker_audio) else None} 41 | success = self.task_executor.execute_task("generate_audio", task_data) 42 | if success: st.toast(f"Audio for Scene {scene_idx + 1} generated!", icon="🔊") 43 | else: st.error(f"Failed to generate audio for Scene {scene_idx + 1}.") 44 | self.project_manager.load_project() 45 | return success 46 | 47 | def create_scene(self, scene_idx: int) -> bool: 48 | if not self.task_executor: return False 49 | success = self.task_executor.execute_task("create_scene", {"scene_idx": scene_idx}) 50 | if success: st.toast(f"Scene {scene_idx + 1} shots created!", icon="🎬") 51 | else: st.error(f"Failed to create shots for Scene {scene_idx + 1}.") 52 | self.project_manager.load_project() 53 | return success 54 | 55 | # --- NEW METHOD --- 56 | def regenerate_scene_shots(self, scene_idx: int) -> bool: 57 | """Resets a scene and triggers the 'create_scene' task to regenerate shots.""" 58 | if not self.task_executor: return False 59 | 60 | # First, reset the scene, clearing old shots and assets 61 | self.project_manager.reset_scene_for_shot_regeneration(scene_idx) 62 | st.toast(f"Cleared old shots for Scene {scene_idx + 1}. 
Regenerating...", icon="♻️") 63 | 64 | # Now, execute the create_scene task which will find the scene missing and create it 65 | success = self.task_executor.execute_task("create_scene", {"scene_idx": scene_idx}) 66 | 67 | if success: 68 | st.toast(f"New shots for Scene {scene_idx + 1} generated!", icon="✨") 69 | else: 70 | st.error(f"Failed to regenerate shots for Scene {scene_idx + 1}.") 71 | 72 | self.project_manager.load_project() 73 | return success 74 | 75 | def regenerate_shot_image(self, scene_idx: int, shot_idx: int) -> bool: 76 | if not self.task_executor: return False 77 | self.project_manager.update_shot_content(scene_idx, shot_idx) 78 | shot = self.project_manager.get_scene_info(scene_idx).shots[shot_idx] 79 | task_data = {"scene_idx": scene_idx, "shot_idx": shot_idx, "visual_prompt": shot.visual_prompt} 80 | success = self.task_executor.execute_task("generate_shot_image", task_data) 81 | if success: st.toast(f"Image for Shot {shot_idx + 1} generated!", icon="🖼️") 82 | else: st.error(f"Failed to generate image for Shot {shot_idx + 1}.") 83 | self.project_manager.load_project() 84 | return success 85 | 86 | def regenerate_shot_video(self, scene_idx: int, shot_idx: int) -> bool: 87 | if not self.task_executor: return False 88 | self.project_manager.update_shot_content(scene_idx, shot_idx) 89 | shot = self.project_manager.get_scene_info(scene_idx).shots[shot_idx] 90 | task_data = { 91 | "scene_idx": scene_idx, "shot_idx": shot_idx, 92 | "visual_prompt": shot.visual_prompt, 93 | "motion_prompt": shot.motion_prompt 94 | } 95 | success = self.task_executor.execute_task("generate_shot_video", task_data) 96 | if success: st.toast(f"Video for Shot {shot_idx + 1} generated!", icon="📹") 97 | else: st.error(f"Failed to generate video for Shot {shot_idx + 1}.") 98 | self.project_manager.load_project() 99 | return success 100 | 101 | def regenerate_shot_t2v(self, scene_idx: int, shot_idx: int) -> bool: 102 | if not self.task_executor: return False 103 | self.project_manager.update_shot_content(scene_idx, shot_idx) 104 | shot = self.project_manager.get_scene_info(scene_idx).shots[shot_idx] 105 | task_data = {"scene_idx": scene_idx, "shot_idx": shot_idx, "visual_prompt": shot.visual_prompt} 106 | success = self.task_executor.execute_task("generate_shot_t2v", task_data) 107 | if success: st.toast(f"T2V Shot {shot_idx + 1} generated!", icon="📹") 108 | else: st.error(f"Failed to generate T2V Shot {shot_idx + 1}.") 109 | self.project_manager.load_project() 110 | return success 111 | 112 | def assemble_final_video(self) -> bool: 113 | if not self.task_executor: return False 114 | success = self.task_executor.execute_task("assemble_final", {}) 115 | if success: st.toast("Final video assembled successfully!", icon="🏆") 116 | else: st.error("Failed to assemble final video.") 117 | self.project_manager.load_project() 118 | return success 119 | 120 | def add_character(self, name: str, image_file: "UploadedFile"): 121 | if not self.project_manager.state: return False 122 | safe_name = name.replace(" ", "_") 123 | char_dir = os.path.join(self.project_manager.output_dir, "characters", safe_name) 124 | os.makedirs(char_dir, exist_ok=True) 125 | ref_image_path = os.path.join(char_dir, "reference.png") 126 | 127 | corrected_image = load_and_correct_image_orientation(image_file) 128 | if corrected_image: 129 | corrected_image.save(ref_image_path, "PNG") 130 | char_data = {"name": name, "reference_image_path": ref_image_path} 131 | self.project_manager.add_character(char_data) 132 | st.toast(f"Character 
'{name}' added!", icon="👤") 133 | return True 134 | else: 135 | st.error(f"Could not process image for new character {name}. Aborting.") 136 | return False 137 | 138 | def update_character(self, old_name: str, new_name: str, new_image_file: Optional["UploadedFile"]): 139 | ref_image_path = None 140 | if new_image_file: 141 | safe_name = (new_name or old_name).replace(" ", "_") 142 | char_dir = os.path.join(self.project_manager.output_dir, "characters", safe_name) 143 | os.makedirs(char_dir, exist_ok=True) 144 | ref_image_path = os.path.join(char_dir, "reference.png") 145 | 146 | corrected_image = load_and_correct_image_orientation(new_image_file) 147 | if corrected_image: 148 | corrected_image.save(ref_image_path, "PNG") 149 | else: 150 | st.error("Failed to process the new image. Character image was not updated.") 151 | ref_image_path = None 152 | 153 | self.project_manager.update_character(old_name, new_name, ref_image_path) 154 | st.toast(f"Character '{old_name}' updated!", icon="✏️") 155 | return True 156 | 157 | def delete_character(self, name: str): 158 | self.project_manager.delete_character(name) 159 | st.toast(f"Character '{name}' deleted!", icon="🗑️") 160 | return True 161 | 162 | def update_project_config(self, key: str, value: Any): 163 | """UI wrapper to update a specific project configuration value.""" 164 | self.project_manager.update_config_value(key, value) 165 | st.toast(f"Setting '{key.replace('_', ' ').title()}' updated.") 166 | st.rerun() 167 | 168 | def update_scene_characters(self, scene_idx: int, character_names: List[str]): 169 | self.project_manager.update_scene_characters(scene_idx, character_names) 170 | st.toast(f"Characters for Scene {scene_idx+1} updated.", icon="🎬") 171 | 172 | def add_new_scene(self, scene_idx: int): 173 | """UI wrapper to add a new scene.""" 174 | self.project_manager.add_new_scene_at(scene_idx) 175 | st.toast(f"New scene added at position {scene_idx + 1}!", icon="➕") 176 | return True 177 | 178 | def remove_scene(self, scene_idx: int): 179 | """UI wrapper to remove a scene.""" 180 | self.project_manager.remove_scene_at(scene_idx) 181 | st.toast(f"Scene {scene_idx + 1} removed!", icon="🗑️") 182 | return True -------------------------------------------------------------------------------- /llm_modules/llm_zephyr.py: -------------------------------------------------------------------------------- 1 | # llm_modules/llm_zephyr.py 2 | import torch 3 | import json 4 | import re 5 | from typing import List, Optional, Tuple, Dict, Any 6 | from transformers import AutoModelForCausalLM, AutoTokenizer 7 | 8 | from base_modules import BaseLLM, BaseModuleConfig, ModuleCapabilities 9 | from config_manager import ContentConfig, DEVICE, clear_vram_globally 10 | 11 | class ZephyrLLMConfig(BaseModuleConfig): 12 | model_id: str = "HuggingFaceH4/zephyr-7b-beta" 13 | max_new_tokens_script: int = 2048 # Increased for new fields 14 | max_new_tokens_shot_prompt: int = 256 15 | temperature: float = 0.7 16 | top_k: int = 50 17 | top_p: float = 0.95 18 | 19 | class ZephyrLLM(BaseLLM): 20 | Config = ZephyrLLMConfig 21 | 22 | @classmethod 23 | def get_capabilities(cls) -> ModuleCapabilities: 24 | return ModuleCapabilities( 25 | title="Zephyr 7B", 26 | vram_gb_min=8.0, 27 | ram_gb_min=16.0, 28 | # LLM-specific capabilities are not the main focus, so we use defaults. 
29 | ) 30 | 31 | def _load_model_and_tokenizer(self): 32 | if self.model is None or self.tokenizer is None: 33 | print(f"Loading LLM: {self.config.model_id}...") 34 | self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_id) 35 | if self.tokenizer.pad_token is None: 36 | self.tokenizer.pad_token = self.tokenizer.eos_token 37 | 38 | try: 39 | self.model = AutoModelForCausalLM.from_pretrained( 40 | self.config.model_id, torch_dtype=torch.float16 41 | ).to(DEVICE) 42 | except Exception as e: 43 | print(f"Failed to load LLM with device_map='auto' ({e}), trying with explicit device: {DEVICE}") 44 | self.model = AutoModelForCausalLM.from_pretrained( 45 | self.config.model_id, torch_dtype=torch.float16 46 | ).to(DEVICE) 47 | print("LLM loaded.") 48 | 49 | def clear_vram(self): 50 | print("Clearing LLM VRAM...") 51 | models_to_clear = [m for m in [self.model] if m is not None] 52 | if models_to_clear: clear_vram_globally(*models_to_clear) 53 | self.model, self.tokenizer = None, None 54 | print("LLM VRAM cleared.") 55 | 56 | def _parse_llm_json_response(self, decoded_output: str, context: str = "script") -> Optional[Dict]: 57 | match = re.search(r'\{[\s\S]*\}', decoded_output) 58 | json_text = match.group(0) if match else decoded_output 59 | try: 60 | return json.loads(re.sub(r',(\s*[}\]])', r'\1', json_text)) 61 | except json.JSONDecodeError as e: 62 | print(f"Error parsing LLM JSON for {context}: {e}. Raw output:\n{decoded_output}") 63 | return None 64 | 65 | def generate_script(self, topic: str, content_config: ContentConfig) -> Dict[str, Any]: 66 | self._load_model_and_tokenizer() 67 | print(f"Generating script for topic: '{topic}' in language: {content_config.language}") 68 | 69 | # --- MODIFICATION START --- 70 | # Map language code to full name for better prompting 71 | language_map = { 72 | 'en': 'English', 'es': 'Spanish', 'fr': 'French', 73 | 'de': 'German', 'it': 'Italian', 'pt': 'Portuguese', 74 | 'pl': 'Polish', 'tr': 'Turkish', 'ru': 'Russian', 75 | 'nl': 'Dutch', 'cs': 'Czech', 'ar': 'Arabic', 76 | 'zh-cn': 'Chinese (Simplified)', 'ja': 'Japanese', 77 | 'hu': 'Hungarian', 'ko': 'Korean', 'hi': 'Hindi' 78 | } 79 | target_language = language_map.get(content_config.language, 'English') 80 | 81 | system_prompt = ( 82 | "You are a multilingual AI assistant creating content for a short video. " 83 | "You will be asked to write the narration in a specific language, but all other content (visual prompts, descriptions, hashtags) must be in English for the video generation models. " 84 | "Your response must be a single, valid JSON object with these exact keys: " 85 | "\"main_subject_description\", \"setting_description\", \"narration\", \"visuals\", \"hashtags\"." 86 | ) 87 | 88 | user_prompt = f""" 89 | **IMPORTANT INSTRUCTIONS:** 90 | 1. The **"narration"** text MUST be written in **{target_language}**. Use the native script if applicable (e.g., Devanagari for Hindi). 91 | 2. Use proper punctuation (like commas and periods) in the narration for a natural-sounding voiceover. 92 | 3. All other fields ("main_subject_description", "setting_description", "visuals", "hashtags") MUST remain in **English**. 93 | 94 | --- 95 | Create content for a short video about "{topic}". 96 | The total narration should be ~{content_config.target_video_length_hint}s, with {content_config.min_scenes} to {content_config.max_scenes} scenes. 97 | Each scene's narration should be ~{content_config.max_scene_narration_duration_hint}s. 
98 | 99 | Return your response in this exact JSON format: 100 | {{ 101 | "main_subject_description": "A detailed, consistent description of the main character or subject (e.g., 'Fluffy, a chubby but cute orange tabby cat with green eyes'). MUST BE IN ENGLISH.", 102 | "setting_description": "A description of the primary environment (e.g., 'a cozy, sunlit living room with plush furniture'). MUST BE IN ENGLISH.", 103 | "narration": [ 104 | {{"scene": 1, "text": "First scene narration text, written in {target_language}.", "duration_estimate": {content_config.max_scene_narration_duration_hint}}} 105 | ], 106 | "visuals": [ 107 | {{"scene": 1, "prompt": "Detailed visual prompt for scene 1. MUST BE IN ENGLISH."}} 108 | ], 109 | "hashtags": ["relevantTag1", "relevantTag2"] 110 | }} 111 | """ 112 | # --- MODIFICATION END --- 113 | 114 | messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}] 115 | 116 | for attempt in range(3): 117 | print(f"Attempt {attempt + 1} of 3 to generate valid script JSON...") 118 | 119 | tokenized_chat = self.tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(self.model.device) 120 | outputs = self.model.generate( 121 | input_ids=tokenized_chat, max_new_tokens=self.config.max_new_tokens_script, 122 | do_sample=True, top_k=self.config.top_k, top_p=self.config.top_p, 123 | temperature=self.config.temperature, pad_token_id=self.tokenizer.eos_token_id 124 | ) 125 | decoded_output = self.tokenizer.decode(outputs[0][tokenized_chat.shape[-1]:], skip_special_tokens=True) 126 | response_data = self._parse_llm_json_response(decoded_output, "script") 127 | 128 | if response_data and all(k in response_data for k in ["narration", "visuals", "main_subject_description"]): 129 | print("Successfully generated and parsed valid script JSON.") 130 | return { 131 | "main_subject_description": response_data.get("main_subject_description"), 132 | "setting_description": response_data.get("setting_description"), 133 | "narration": sorted(response_data.get("narration", []), key=lambda x: x["scene"]), 134 | "visuals": [p["prompt"] for p in sorted(response_data.get("visuals", []), key=lambda x: x["scene"])], 135 | "hashtags": response_data.get("hashtags", []) 136 | } 137 | else: 138 | print(f"Attempt {attempt + 1} failed. The response was not a valid JSON or was missing required keys.") 139 | if attempt < 2: 140 | print("Retrying...") 141 | 142 | print("LLM script generation failed after 3 attempts. Using fallback.") 143 | # Fallback remains in English as a safe default 144 | return { 145 | "main_subject_description": topic, "setting_description": "a simple background", 146 | "narration": [{"text": f"An intro to {topic}.", "duration_estimate": 5.0}], 147 | "visuals": [f"Cinematic overview of {topic}."], "hashtags": [f"#{topic.replace(' ', '')}"] 148 | } 149 | 150 | def generate_shot_visual_prompts(self, scene_narration: str, original_scene_prompt: str, num_shots: int, content_config: ContentConfig, main_subject: str, setting: str) -> List[Tuple[str, str]]: 151 | self._load_model_and_tokenizer() 152 | shot_prompts = [] 153 | 154 | # Define the prompts, which are the same for each shot generation call 155 | system_prompt = ( 156 | "You are an Movie director. Your task is to generate a 'visual_prompt' and a 'motion_prompt' for a short video shot " 157 | "The prompts MUST incorporate the provided main subject and setting. Do NOT change the subject. 
" 158 | "Respond in this exact JSON format: {\"visual_prompt\": \"...\", \"motion_prompt\": \"...\"}" 159 | ) 160 | 161 | for shot_idx in range(num_shots): 162 | print(f"--- Generating prompts for Shot {shot_idx + 1}/{num_shots} ---") 163 | 164 | # --- NEW: Defensive check to prevent intermittent crashes --- 165 | # This handles rare cases where the model/tokenizer might be cleared from memory 166 | # between calls within the same task execution. 167 | if self.model is None or self.tokenizer is None: 168 | print("WARNING: LLM was unloaded unexpectedly. Forcing a reload before generating shot prompt.") 169 | self._load_model_and_tokenizer() 170 | 171 | user_prompt = f""" 172 | **Main Subject (MUST BE INCLUDED):** {main_subject} 173 | **Setting (MUST BE INCLUDED):** {setting} 174 | 175 | --- 176 | **Original Scene Goal:** "{original_scene_prompt}" 177 | **This Shot's Narration:** "{scene_narration}" 178 | 179 | Based on ALL the information above, create a visual and motion prompt for shot {shot_idx + 1}/{num_shots}. 180 | The visual prompt should be a specific, detailed moment consistent with the subject and setting. 181 | try to describe the visual prompt in minimum words but in very specific details what a director would want the image to look like. 182 | Descrive character, subject and envrionment in words, only chose important words no need to make complete sentances. 183 | try to describe the visual prompt in minimum words but in very specific details what a director would want the image to look like. 184 | Descrive character, subject and envrionment in words, only chose important words no need to make complete sentances. 185 | Also descirbe camera mm, shot type, location, lighting, color, mood, etc. 186 | Do not include any other text or comments other then given json format. 187 | """ 188 | messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}] 189 | 190 | visual_prompt, motion_prompt = None, None 191 | 192 | # --- MODIFICATION START: Add retry loop for each shot --- 193 | for attempt in range(3): 194 | print(f"Attempt {attempt + 1} of 3 to generate valid prompt JSON for shot {shot_idx + 1}...") 195 | 196 | tokenized_chat = self.tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(self.model.device) 197 | outputs = self.model.generate( 198 | input_ids=tokenized_chat, max_new_tokens=self.config.max_new_tokens_shot_prompt, 199 | do_sample=True, temperature=self.config.temperature, pad_token_id=self.tokenizer.eos_token_id 200 | ) 201 | decoded_output = self.tokenizer.decode(outputs[0][tokenized_chat.shape[-1]:], skip_special_tokens=True) 202 | response_data = self._parse_llm_json_response(decoded_output, f"shot {shot_idx+1} prompt") 203 | 204 | # Check for a dictionary with both required string keys 205 | if (isinstance(response_data, dict) and 206 | isinstance(response_data.get("visual_prompt"), str) and 207 | isinstance(response_data.get("motion_prompt"), str)): 208 | 209 | visual_prompt = response_data["visual_prompt"] 210 | motion_prompt = response_data["motion_prompt"] 211 | print(f"Successfully generated and parsed prompts for shot {shot_idx + 1}.") 212 | break # Exit the retry loop on success 213 | else: 214 | print(f"Attempt {attempt + 1} failed for shot {shot_idx + 1}. 
Invalid JSON or missing keys.") 215 | # --- MODIFICATION END --- 216 | 217 | # If after 3 attempts, we still don't have prompts, use the fallback 218 | if not visual_prompt or not motion_prompt: 219 | print(f"All attempts failed for shot {shot_idx + 1}. Using fallback prompts.") 220 | visual_prompt = f"{main_subject} in {setting}, {original_scene_prompt}" 221 | motion_prompt = "gentle camera movement" 222 | 223 | shot_prompts.append((visual_prompt, motion_prompt)) 224 | print(f" > Shot {shot_idx+1} Visual: \"{visual_prompt[:80]}...\"") 225 | print(f" > Shot {shot_idx+1} Motion: \"{motion_prompt[:80]}...\"") 226 | 227 | return shot_prompts -------------------------------------------------------------------------------- /task_executor.py: -------------------------------------------------------------------------------- 1 | # In task_executor.py 2 | import logging 3 | import math 4 | import os 5 | import random 6 | from typing import Optional, Dict 7 | import torch 8 | from importlib import import_module 9 | 10 | from project_manager import ProjectManager, STATUS_IMAGE_GENERATED, STATUS_VIDEO_GENERATED, STATUS_FAILED 11 | from config_manager import ContentConfig 12 | from video_assembly import assemble_final_reel, assemble_scene_video_from_sub_clips 13 | from base_modules import ModuleCapabilities 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | def _import_class(module_path_str: str): 18 | module_path, class_name = module_path_str.rsplit('.', 1) 19 | module = import_module(module_path) 20 | return getattr(module, class_name) 21 | 22 | class TaskExecutor: 23 | def __init__(self, project_manager: ProjectManager): 24 | self.project_manager = project_manager 25 | self.content_cfg = ContentConfig(**self.project_manager.state.project_info.config) 26 | 27 | module_selections = self.content_cfg.module_selections 28 | if not module_selections: 29 | raise ValueError("Project state is missing module selections. 
Cannot initialize TaskExecutor.") 30 | 31 | # --- START OF FIX: Use .get() for safe module loading to prevent crashes --- 32 | 33 | # LLM and TTS are always required 34 | LlmClass = _import_class(module_selections["llm"]) 35 | self.llm_module = LlmClass(LlmClass.Config()) 36 | 37 | TtsClass = _import_class(module_selections["tts"]) 38 | self.tts_module = TtsClass(TtsClass.Config()) 39 | 40 | # Video modules are optional depending on the flow 41 | self.t2i_module = None 42 | self.i2v_module = None 43 | self.t2v_module = None 44 | 45 | t2i_path = module_selections.get("t2i") 46 | if t2i_path: 47 | T2iClass = _import_class(t2i_path) 48 | self.t2i_module = T2iClass(T2iClass.Config()) 49 | 50 | i2v_path = module_selections.get("i2v") 51 | if i2v_path: 52 | I2vClass = _import_class(i2v_path) 53 | self.i2v_module = I2vClass(I2vClass.Config()) 54 | 55 | t2v_path = module_selections.get("t2v") 56 | if t2v_path: 57 | T2vClass = _import_class(t2v_path) 58 | self.t2v_module = T2vClass(T2vClass.Config()) 59 | 60 | # Determine capabilities based on which modules were actually loaded 61 | self.active_flow_supports_characters = False 62 | if self.content_cfg.use_svd_flow and self.t2i_module: 63 | t2i_caps = self.t2i_module.get_capabilities() 64 | self.active_flow_supports_characters = t2i_caps.supports_ip_adapter 65 | logger.info("Decisive module for character support: T2I module.") 66 | elif not self.content_cfg.use_svd_flow and self.t2v_module: 67 | t2v_caps = self.t2v_module.get_capabilities() 68 | self.active_flow_supports_characters = t2v_caps.supports_ip_adapter 69 | logger.info("Decisive module for character support: T2V module.") 70 | # --- END OF FIX --- 71 | 72 | logger.info(f"Holistic check: Active flow supports characters: {self.active_flow_supports_characters}") 73 | self._configure_from_model_capabilities() 74 | 75 | def _configure_from_model_capabilities(self): 76 | logger.info("--- TaskExecutor: Configuring run from model capabilities... ---") 77 | if self.content_cfg.use_svd_flow: 78 | if self.t2i_module and self.i2v_module: 79 | t2i_caps = self.t2i_module.get_model_capabilities() 80 | i2v_caps = self.i2v_module.get_model_capabilities() 81 | self.content_cfg.generation_resolution = t2i_caps["resolutions"].get(self.content_cfg.aspect_ratio_format) 82 | self.content_cfg.model_max_video_shot_duration = i2v_caps.get("max_shot_duration", 3.0) 83 | else: 84 | logger.warning("Warning: T2I or I2V module not loaded for I2V flow. Using default configurations.") 85 | else: # T2V Flow 86 | if self.t2v_module: 87 | t2v_caps = self.t2v_module.get_model_capabilities() 88 | self.content_cfg.generation_resolution = t2v_caps["resolutions"].get(self.content_cfg.aspect_ratio_format) 89 | self.content_cfg.model_max_video_shot_duration = t2v_caps.get("max_shot_duration", 2.0) 90 | else: 91 | logger.warning("Warning: T2V module not loaded for T2V flow. 
Using default configurations.") 92 | 93 | logger.info(f"Dynamically set Generation Resolution to: {self.content_cfg.generation_resolution}") 94 | logger.info(f"Dynamically set Max Shot Duration to: {self.content_cfg.model_max_video_shot_duration}s") 95 | self.project_manager.state.project_info.config = self.content_cfg.model_dump() 96 | self.project_manager._save_state() 97 | 98 | def execute_task(self, task: str, task_data: Dict) -> bool: 99 | try: 100 | # --- START OF FIX: Refresh config before every task to prevent stale state --- 101 | self.content_cfg = ContentConfig(**self.project_manager.state.project_info.config) 102 | logger.info(f"Executing task '{task}' with add_narration_text set to: {self.content_cfg.add_narration_text_to_video}") 103 | # --- END OF FIX --- 104 | 105 | task_map = { 106 | "generate_script": self._execute_generate_script, "generate_audio": self._execute_generate_audio, 107 | "create_scene": self._execute_create_scene, "generate_shot_image": self._execute_generate_shot_image, 108 | "generate_shot_video": self._execute_generate_shot_video, "generate_shot_t2v": self._execute_generate_shot_t2v, 109 | "assemble_scene": self._execute_assemble_scene, "assemble_final": self._execute_assemble_final, 110 | } 111 | if task in task_map: return task_map[task](**task_data) 112 | logger.error(f"Unknown task: {task}"); return False 113 | except Exception as e: 114 | logger.error(f"Error executing task {task}: {e}", exc_info=True); return False 115 | 116 | def _execute_generate_script(self, topic: str) -> bool: 117 | script_data = self.llm_module.generate_script(topic, self.content_cfg) 118 | self.llm_module.clear_vram() 119 | self.project_manager.update_script(script_data) 120 | return True 121 | 122 | def _execute_generate_audio(self, scene_idx: int, text: str, speaker_wav: Optional[str] = None) -> bool: 123 | path, duration = self.tts_module.generate_audio(text, self.content_cfg.output_dir, scene_idx, language=self.content_cfg.language, speaker_wav=speaker_wav) 124 | self.project_manager.update_narration_part_status(scene_idx, "generated", path, duration if duration > 0.1 else 0.0) 125 | return True 126 | 127 | def _execute_create_scene(self, scene_idx: int) -> bool: 128 | narration = self.project_manager.state.script.narration_parts[scene_idx] 129 | visual_prompt = self.project_manager.state.script.visual_prompts[scene_idx] 130 | main_subject = self.project_manager.state.script.main_subject_description 131 | setting = self.project_manager.state.script.setting_description 132 | 133 | actual_audio_duration = narration.duration 134 | max_shot_duration = self.content_cfg.model_max_video_shot_duration 135 | 136 | if actual_audio_duration <= 0 or max_shot_duration <= 0: 137 | num_shots = 1 138 | logger.warning(f"Warning: Invalid duration detected for Scene {scene_idx} (Audio: {actual_audio_duration}s, Max Shot: {max_shot_duration}s). 
Defaulting to 1 shot.") 139 | else: 140 | num_shots = math.ceil(actual_audio_duration / max_shot_duration) or 1 141 | 142 | logger.info("--- Calculating Shots for Scene {} ---".format(scene_idx)) 143 | logger.info(f" - Actual Audio Duration: {actual_audio_duration:.2f}s") 144 | logger.info(f" - Model's Max Shot Duration: {max_shot_duration:.2f}s") 145 | logger.info(f" - Calculated Number of Shots: {num_shots} ({actual_audio_duration:.2f}s / {max_shot_duration:.2f}s)") 146 | 147 | shot_prompts = self.llm_module.generate_shot_visual_prompts( 148 | narration.text, visual_prompt.prompt, num_shots, self.content_cfg, main_subject, setting 149 | ) 150 | self.llm_module.clear_vram() 151 | 152 | shots = [] 153 | for i, (visual, motion) in enumerate(shot_prompts): 154 | if i < num_shots - 1: 155 | duration = max_shot_duration 156 | else: 157 | duration = actual_audio_duration - (i * max_shot_duration) 158 | 159 | shots.append({"shot_idx": i, "target_duration": max(0.5, duration), "visual_prompt": visual, "motion_prompt": motion}) 160 | 161 | all_character_names = [char.name for char in self.project_manager.state.characters] 162 | logger.info(f"Creating Scene {scene_idx} and assigning default characters: {all_character_names}") 163 | self.project_manager.add_scene(scene_idx, shots, character_names=all_character_names) 164 | return True 165 | 166 | def _execute_generate_shot_image(self, scene_idx: int, shot_idx: int, visual_prompt: str, **kwargs) -> bool: 167 | if not self.t2i_module: 168 | logger.error("Attempted to generate image, but T2I module is not loaded for this workflow.") 169 | return False 170 | w, h = self.content_cfg.generation_resolution 171 | path = os.path.join(self.content_cfg.output_dir, f"scene_{scene_idx}_shot_{shot_idx}_keyframe.png") 172 | 173 | base_seed = self.content_cfg.seed 174 | shot_seed = random.randint(0, 2**32 - 1) if base_seed == -1 else base_seed + scene_idx * 100 + shot_idx 175 | 176 | negative_prompt = "worst quality, low quality, bad anatomy, text, watermark, jpeg artifacts, blurry" 177 | 178 | scene = self.project_manager.get_scene_info(scene_idx) 179 | ip_adapter_image_paths = [] 180 | if scene and scene.character_names: 181 | logger.info(f"Found characters for Scene {scene_idx}: {scene.character_names}") 182 | for name in scene.character_names: 183 | char = self.project_manager.get_character(name) 184 | if char and os.path.exists(char.reference_image_path): 185 | ip_adapter_image_paths.append(char.reference_image_path) 186 | 187 | self.t2i_module.generate_image( 188 | prompt=visual_prompt, negative_prompt=negative_prompt, output_path=path, 189 | width=w, height=h, ip_adapter_image=ip_adapter_image_paths or None, seed=shot_seed 190 | ) 191 | 192 | self.project_manager.update_shot_status(scene_idx, shot_idx, STATUS_IMAGE_GENERATED, keyframe_path=path) 193 | self.t2i_module.clear_vram() 194 | return True 195 | 196 | def _execute_generate_shot_video(self, scene_idx: int, shot_idx: int, visual_prompt: str, motion_prompt: Optional[str], **kwargs) -> bool: 197 | if not self.i2v_module: 198 | logger.error("Attempted to generate video from image, but I2V module is not loaded for this workflow.") 199 | return False 200 | shot = self.project_manager.get_scene_info(scene_idx).shots[shot_idx] 201 | if not shot.keyframe_image_path or not os.path.exists(shot.keyframe_image_path): return False 202 | 203 | enhanced_visual = self.i2v_module.enhance_prompt(visual_prompt, "visual") 204 | enhanced_motion = self.i2v_module.enhance_prompt(motion_prompt, "motion") 205 | 206 | scene = 
self.project_manager.get_scene_info(scene_idx) 207 | ip_adapter_image_paths = [self.project_manager.get_character(name).reference_image_path for name in scene.character_names if self.project_manager.get_character(name)] 208 | 209 | video_path = os.path.join(self.content_cfg.output_dir, f"scene_{scene_idx}_shot_{shot_idx}_svd.mp4") 210 | 211 | sub_clip_path = self.i2v_module.generate_video_from_image( 212 | image_path=shot.keyframe_image_path, output_video_path=video_path, target_duration=shot.target_duration, 213 | content_config=self.content_cfg, visual_prompt=enhanced_visual, motion_prompt=enhanced_motion, 214 | ip_adapter_image=ip_adapter_image_paths or None 215 | ) 216 | 217 | if sub_clip_path and os.path.exists(sub_clip_path): 218 | self.project_manager.update_shot_status(scene_idx, shot_idx, STATUS_VIDEO_GENERATED, video_path=sub_clip_path) 219 | return True 220 | self.project_manager.update_shot_status(scene_idx, shot_idx, STATUS_FAILED); return False 221 | 222 | def _execute_generate_shot_t2v(self, scene_idx: int, shot_idx: int, visual_prompt: str, **kwargs) -> bool: 223 | if not self.t2v_module: 224 | logger.error("Attempted to generate video from text, but T2V module is not loaded for this workflow.") 225 | return False 226 | shot = self.project_manager.get_scene_info(scene_idx).shots[shot_idx] 227 | num_frames = int(shot.target_duration * self.content_cfg.fps) 228 | w, h = self.content_cfg.generation_resolution 229 | 230 | enhanced_prompt = self.t2v_module.enhance_prompt(visual_prompt) 231 | 232 | scene = self.project_manager.get_scene_info(scene_idx) 233 | ip_adapter_image_paths = [self.project_manager.get_character(name).reference_image_path for name in scene.character_names if self.project_manager.get_character(name)] 234 | 235 | video_path = os.path.join(self.content_cfg.output_dir, f"scene_{scene_idx}_shot_{shot_idx}_t2v.mp4") 236 | 237 | sub_clip_path = self.t2v_module.generate_video_from_text( 238 | enhanced_prompt, video_path, num_frames, self.content_cfg.fps, w, h, 239 | ip_adapter_image=ip_adapter_image_paths or None 240 | ) 241 | 242 | if sub_clip_path and os.path.exists(sub_clip_path): 243 | self.project_manager.update_shot_status(scene_idx, shot_idx, STATUS_VIDEO_GENERATED, video_path=sub_clip_path) 244 | return True 245 | self.project_manager.update_shot_status(scene_idx, shot_idx, STATUS_FAILED); return False 246 | 247 | def _execute_assemble_scene(self, scene_idx: int, **kwargs) -> bool: 248 | scene = self.project_manager.get_scene_info(scene_idx) 249 | if not scene: return False 250 | video_paths = [c.video_path for c in scene.shots if c.status == STATUS_VIDEO_GENERATED] 251 | if len(video_paths) != len(scene.shots): return False 252 | 253 | narration_duration = self.project_manager.state.script.narration_parts[scene_idx].duration 254 | final_path = assemble_scene_video_from_sub_clips(video_paths, narration_duration, self.content_cfg, scene_idx) 255 | 256 | if final_path: 257 | self.project_manager.update_scene_status(scene_idx, "completed", assembled_video_path=final_path) 258 | return True 259 | self.project_manager.update_scene_status(scene_idx, "failed"); return False 260 | 261 | def _execute_assemble_final(self, **kwargs) -> bool: 262 | narration_parts = self.project_manager.state.script.narration_parts 263 | assets = [ 264 | (s.assembled_video_path, narration_parts[s.scene_idx].audio_path, {"text": narration_parts[s.scene_idx].text, "duration": narration_parts[s.scene_idx].duration}) 265 | for s in self.project_manager.state.scenes if s.status == 
"completed" 266 | ] 267 | if len(assets) != len(self.project_manager.state.scenes): return False 268 | 269 | topic = self.project_manager.state.project_info.topic 270 | final_path = assemble_final_reel(assets, self.content_cfg, output_filename=f"{topic.replace(' ','_')}_final.mp4") 271 | 272 | if final_path and os.path.exists(final_path): 273 | text = " ".join([a[2]["text"] for a in assets]) 274 | hashtags = self.project_manager.state.script.hashtags 275 | self.project_manager.update_final_video(final_path, "generated", text, hashtags) 276 | return True 277 | self.project_manager.update_final_video("", "pending", "", []); return False -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Modular AI Video Generation Pipeline 2 | 3 | [](https://www.python.org/) 4 | [](https://streamlit.io) 5 | [](LICENSE) 6 | 7 | ## ⚠️ Important Notes 8 | 9 | **Video Quality Issues**: If your generated videos appear scrambled or distorted, this typically means you're not using the optimal video dimensions that the selected model was trained on. Each AI model has specific resolution requirements for best results. Check the model documentation for recommended dimensions and adjust your video settings accordingly. 10 | 11 | **Contributors Welcome!** 🚀 This project is open to contributions from the community. If you're interested in helping improve this pipeline, adding new models, or fixing bugs, please feel free to submit pull requests or open issues. 12 | 13 | **New Project Announcement**: I've started working on a completely separate and different video generation project. If you're interested in learning more or collaborating, feel free to reach out to me on [LinkedIn](https://www.linkedin.com/in/gowravvishwakarma/)! 14 | 15 | --- 16 | 17 | An extensible, modular pipeline for generating short-form videos using a variety of AI models. This tool provides a powerful Streamlit-based web interface to define a video topic, select different AI models for each generation step (language, speech, image, video), and orchestrate the entire content creation process from script to final rendered video. 18 | 19 | ## 🎥 Demo Video 20 | 21 |
24 |
25 | Watch the full demo on YouTube
26 |
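If you want to know which resolutions a selected module was built for (see the video-quality note above), you can query its capability contract directly. The snippet below is a minimal sketch, assuming the repository root is on your `PYTHONPATH`; it only reads the declared spec via `get_capabilities()` / `get_model_capabilities()` and does not download or load any model weights:

```python
# Minimal sketch: inspect a module's declared spec before configuring a project.
from t2v_modules import LtxT2V

caps = LtxT2V.get_capabilities()        # static spec sheet (VRAM/RAM, formats, ...)
print(caps.title, caps.supported_formats)

module = LtxT2V(LtxT2V.Config())        # default config; no weights are loaded yet
print(module.get_model_capabilities()["resolutions"])
# -> {'Portrait': (512, 768), 'Landscape': (768, 512)}
```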