├── .python-version
├── resources
│   ├── font
│   │   └── font.ttf
│   └── Intro
│       └── intro.jpg
├── diffusion
│   └── scripts
│       ├── Modelfile
│       ├── generate_image_local.py
│       ├── generate_image.py
│       └── generate_script.py
├── assembly
│   ├── templates
│   │   └── video_template.json
│   └── scripts
│       └── assembly_video.py
├── pyproject.toml
├── tts
│   └── scripts
│       └── generate_audio.py
├── main.py
├── main_local.py
└── README.md
/.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /resources/font/font.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLSAKIIT/ForgeTube/HEAD/resources/font/font.ttf -------------------------------------------------------------------------------- /resources/Intro/intro.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLSAKIIT/ForgeTube/HEAD/resources/Intro/intro.jpg -------------------------------------------------------------------------------- /diffusion/scripts/Modelfile: -------------------------------------------------------------------------------- 1 | FROM llama3.1 2 | 3 | 4 | PARAMETER temperature 1 5 | 6 | SYSTEM """You are a YouTube script writer and content creator. Your task is to create YouTube scripts and to segment them with various parameters in JSON format.""" -------------------------------------------------------------------------------- /assembly/templates/video_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "default_video_template", 3 | "description": "Simple structure for video editing workflow", 4 | "video_settings": { 5 | "resolution": "1920x1080", 6 | "frame_rate": 30 7 | }, 8 | "audio_settings": { 9 | "sample_rate": 44100, 10 | "channels": 2 11 | }, 12 | "transitions": [] 13 | } 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "ForgeTube" 3 | version = "0.1.0" 4 | description = "Automated AI video generation: script, image, and narration generation with final video assembly" 5 | readme = "README.md" 6 | requires-python = ">=3.11" 7 | dependencies = [ 8 | "accelerate>=1.4.0", 9 | "diffusers>=0.32.2", 10 | "google-generativeai>=0.8.4", 11 | "google-search-results>=2.4.2", 12 | "kokoro>=0.7.16", 13 | "modal>=0.73.59", 14 | "moviepy>=2.1.2", 15 | "pydub>=0.25.1", 16 | "pysrt>=1.1.2", 17 | "soundfile>=0.13.1", 18 | "spacy>=3.8.4", 19 | ] 20 | -------------------------------------------------------------------------------- /tts/scripts/generate_audio.py: -------------------------------------------------------------------------------- 1 | 2 | from pydub import AudioSegment 3 | import json 4 | import io 5 | import soundfile as sf 6 | import os 7 | from kokoro.pipeline import KPipeline 8 | 9 | def generate_audio(script_data): 10 | pipeline = KPipeline(lang_code="b") 11 | 12 | all_audio = [] 13 | for segment in script_data["audio_script"]: 14 | speaker_id = "am_adam" if segment["speaker"] in ["default", "narrator_male"] else "af_heart" 15 | audio = pipeline(text=segment["text"], voice=speaker_id, speed=segment["speed"]) 16 | 17 | # Collect audio chunks 18 | buffer = io.BytesIO() 19 | for _, _, chunk in audio: 20 | sf.write(buffer, chunk, 24000, format='WAV') 21 | buffer.seek(0) 22 | all_audio.append(buffer.read()) 23 | 24 | return all_audio 25 |
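# A minimal illustration of the script_data shape generate_audio expects; the
# values below are hypothetical, not taken from a real generated script:
#
#   {"audio_script": [{"speaker": "narrator_male",
#                      "text": "Hello and welcome.",
#                      "speed": 1.0}]}
#
# Any speaker other than "default" or "narrator_male" falls back to the
# "af_heart" voice.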
26 | def merge_audio(audio_path,audio_bytes_list): 27 | # Create output directory 28 | # os.makedirs("output_audio", exist_ok=True) 29 | 30 | # Save segments locally 31 | audio_files = [] 32 | for idx, audio_bytes in enumerate(audio_bytes_list): 33 | output_path = f"{audio_path}/segment_{idx}.wav" 34 | with open(output_path, "wb") as f: 35 | f.write(audio_bytes) 36 | audio_files.append(output_path) 37 | print(f"Audio file {idx} successfully saved at: {output_path}") 38 | 39 | # Merge audio files (not really needed) 40 | # master_audio = AudioSegment.empty() 41 | # for file in audio_files: 42 | # master_audio += AudioSegment.from_wav(file) 43 | 44 | # Export final file 45 | # master_output_path = f"{audio_path}/master_output.wav" 46 | # master_audio.export(master_output_path, format="wav") 47 | # return master_output_path 48 | 49 | def main_generate_audio(script_path,audio_path): 50 | # Load script data 51 | with open(script_path) as f: 52 | script_data = json.load(f) 53 | 54 | # Generate audio 55 | audio_bytes_list = generate_audio(script_data) 56 | 57 | # Save the audio segments (merge_audio writes one .wav per segment) 58 | merge_audio(audio_path,audio_bytes_list) 59 | 60 | print(f"Audio generation complete! Saved in {audio_path}") 61 | 62 | # if __name__ == "__main__": 63 | # main_generate_audio(script_path="resources/scripts/script.json",audio_path="resources/audio") -------------------------------------------------------------------------------- /diffusion/scripts/generate_image_local.py: -------------------------------------------------------------------------------- 1 | # import modal 2 | import json 3 | import os 4 | import time 5 | from io import BytesIO 6 | 7 | 8 | def generate_image(prompt, negative_prompt="", steps=50, guidance_scale=9, width=1920, height=1080, seed=None): 9 | import torch 10 | from diffusers import DiffusionPipeline 11 | 12 | 13 | # LOADS THE DIFFUSION PIPELINE 14 | 15 | pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", 16 | torch_dtype=torch.float16, 17 | use_safetensors=True, 18 | variant="fp16") 19 | 20 | pipe.to("cuda" if torch.cuda.is_available() else "cpu") 21 | 22 | generator = torch.Generator(device="cuda" if torch.cuda.is_available() else "cpu").manual_seed(seed) if seed else None 23 | 24 | image = pipe( 25 | prompt, 26 | negative_prompt=negative_prompt, 27 | num_inference_steps=steps, 28 | guidance_scale=guidance_scale, 29 | width=width, 30 | height=height, 31 | generator=generator 32 | ).images[0] 33 | 34 | img_byte_arr = BytesIO() 35 | image.save(img_byte_arr, format="PNG") 36 | img_byte_arr.seek(0) 37 | 38 | return img_byte_arr.getvalue() 39 | 40 | # PROVIDE SOURCE TEXT OR PROMPT IN JSON FILE 41 | 42 | def main_generate_image(script_path,images_output_path): 43 | # JSON Decoding Error Handling 44 | with open(script_path, "r", encoding="utf-8") as file: 45 | try: 46 | data = json.load(file) 47 | except json.JSONDecodeError: 48 | print("Error reading JSON file.") 49 | return 50 | # JSON Key Error Handling 51 | if "visual_script" not in data: 52 | print("Missing key in JSON.") 53 | return 54 | 55 | 56 | # GENERATING THE IMAGES 57 | 58 | # Looping Through the Scenes 59 | for idx, scene in enumerate(data["visual_script"]): 60 | try: 61 | prompt = scene["prompt"] 62 | timestamp = scene.get("timestamp", f"{idx:03d}") 63 | negative_prompt = scene.get("negative_prompt", "") 64 | steps = scene.get("steps", 50) 65 | # guidance_scale = scene.get("guidance_scale", 12) 66 | guidance_scale = 9 # Set to 9 to allow for some room of filling missing
elements. 67 | width = 1920 68 | height = 1080 69 | seed = scene.get("seed", None) 70 | 71 | scene_id = timestamp.replace(":", "-") 72 | 73 | image_data = generate_image(prompt, negative_prompt, steps, guidance_scale, width, height, seed) 74 | 75 | 76 | # SAVING THE IMAGES IN THE OUTPUT DIRECTORY 77 | 78 | file_path = os.path.join(images_output_path, f"scene_{scene_id}.png") 79 | with open(file_path, "wb") as f: 80 | f.write(image_data) 81 | 82 | print(f"Saved: {file_path}") 83 | 84 | time.sleep(2) 85 | 86 | except Exception as e: 87 | print(f"Error processing scene {idx}: {e}") 88 | 89 | print("Image Generation is Done.") 90 | 91 | # if __name__ == "__main__": 92 | # main_generate_image(script_path=script_path,images_output_path=images_output_path) 93 | -------------------------------------------------------------------------------- /diffusion/scripts/generate_image.py: -------------------------------------------------------------------------------- 1 | import modal 2 | import json 3 | import os 4 | import time 5 | from io import BytesIO 6 | 7 | image = modal.Image.debian_slim().pip_install( 8 | "diffusers", 9 | "torch", 10 | "transformers", 11 | "accelerate" 12 | ) 13 | 14 | app = modal.App(name="ForgeTube_app") 15 | 16 | @app.function(image=image, gpu="A10G") 17 | def generate_image(prompt, negative_prompt="", steps=50, guidance_scale=9, width=1920, height=1080, seed=None): 18 | import torch 19 | from diffusers import DiffusionPipeline 20 | 21 | 22 | # LOADS THE DIFFUSION PIPELINE 23 | 24 | pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", 25 | torch_dtype=torch.float16, 26 | use_safetensors=True, 27 | variant="fp16") 28 | pipe.to("cuda") 29 | 30 | generator = torch.Generator(device="cuda").manual_seed(seed) if seed else None 31 | 32 | image = pipe( 33 | prompt, 34 | negative_prompt=negative_prompt, 35 | num_inference_steps=steps, 36 | guidance_scale=guidance_scale, 37 | width=width, 38 | height=height, 39 | generator=generator 40 | ).images[0] 41 | 42 | img_byte_arr = BytesIO() 43 | image.save(img_byte_arr, format="PNG") 44 | img_byte_arr.seek(0) 45 | 46 | return img_byte_arr.getvalue() 47 | 48 | 49 | # PATH TO JSON FILE 50 | 51 | script_path = "resources/scripts/script.json" 52 | images_output_path = "resources/images/" 53 | # os.makedirs(output_path, exist_ok=True) 54 | 55 | 56 | # PROVIDE SOURCE TEXT OR PROMPT IN JSON FILE 57 | 58 | def main_generate_image(script_path,images_output_path): 59 | # JSON Decoding Error Handling 60 | with open(script_path, "r", encoding="utf-8") as file: 61 | try: 62 | data = json.load(file) 63 | except json.JSONDecodeError: 64 | print("Error reading JSON file.") 65 | return 66 | # JSON Key Error Handling 67 | if "visual_script" not in data: 68 | print("Missing key in JSON.") 69 | return 70 | 71 | 72 | # GENERATING THE IMAGES 73 | with modal.enable_output(): 74 | with app.run(): 75 | # Looping Through the Scenes 76 | for idx, scene in enumerate(data["visual_script"]): 77 | try: 78 | prompt = scene["prompt"] 79 | timestamp = scene.get("timestamp", f"{idx:03d}") 80 | negative_prompt = scene.get("negative_prompt", "") 81 | steps = scene.get("steps", 50) 82 | # guidance_scale = scene.get("guidance_scale", 12) 83 | guidance_scale = 9 84 | 85 | # width = scene.get("width", 1024) 86 | width = 1920 87 | # height = scene.get("height", 576) 88 | height = 1080 89 | seed = scene.get("seed", None) 90 | 91 | scene_id = timestamp.replace(":", "-") 92 | 93 | image_data = generate_image.remote(prompt, negative_prompt, steps, 
guidance_scale, width, height, seed) 94 | 95 | 96 | # SAVING THE IMAGES IN THE OUTPUT DIRECTORY 97 | 98 | file_path = os.path.join(images_output_path, f"scene_{scene_id}.png") 99 | with open(file_path, "wb") as f: 100 | f.write(image_data) 101 | 102 | print(f"Saved: {file_path}") 103 | 104 | time.sleep(2) 105 | 106 | except Exception as e: 107 | print(f"Error processing scene {idx}: {e}") 108 | 109 | print("Done.") 110 | 111 | # if __name__ == "__main__": 112 | # main_generate_image(script_path=script_path,images_output_path=images_output_path) 113 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from diffusion.scripts.generate_script import VideoScriptGenerator 2 | import json 3 | from diffusion.scripts.generate_image import main_generate_image 4 | from tts.scripts.generate_audio import main_generate_audio 5 | from assembly.scripts.assembly_video import create_video,create_complete_srt,extract_topic_from_json 6 | import os 7 | ''' 8 | TODO: 1. Make a main.py where all pipelines are invoked at once. 9 | TODO: 2. Take the prompt for the video as user input. 10 | TODO: 3. Run Tests with various different prompts. 11 | TODO: 4. All gpu related tasks must be performed on modal. Works 12 | ''' 13 | if __name__ == "__main__": 14 | # Update folder paths as needed. 15 | script_path = "resources/scripts/" # creates the folders if not made already 16 | images_path = "resources/images/" 17 | audio_path = "resources/audio/" 18 | font_path = "resources/font/font.ttf" 19 | 20 | def create_or_check_folder(folder_path): 21 | """ 22 | Creates a folder if it doesn't exist. 23 | If folder exists, checks for files and raises FileExistsError if any are found. 24 | 25 | Args: 26 | folder_path (str): Path to the folder 27 | 28 | Raises: 29 | FileExistsError: If folder exists and contains files 30 | """ 31 | # If folder doesn't exist, create it 32 | if not os.path.exists(folder_path): 33 | os.makedirs(folder_path) 34 | print(f"Created Folder: {folder_path}") 35 | else: 36 | # Check if folder has any contents 37 | if any(os.listdir(folder_path)): 38 | raise FileExistsError(f"Folder : '{folder_path}' already exists and contains files. Please remove them or make a new folder") 39 | # print(f"folder '{folder_path}' exists but is empty") 40 | 41 | create_or_check_folder(images_path) 42 | create_or_check_folder(audio_path) 43 | os.makedirs(script_path,exist_ok=True) 44 | script_path += "script.json" # Name of the script file 45 | # 1. Generate the Script 46 | gem_api = "Enter your Gemini API key here" 47 | serp_api = "Enter your Serp API key here" 48 | if (not gem_api) or (not serp_api): 49 | raise ValueError("API Key not provided !\n Please Create your api key at : \n Serp API : https://serpapi.com \n Gemini API : https://aistudio.google.com/apikey") 50 | generator = VideoScriptGenerator(api_key=gem_api,serp_api_key=serp_api) 51 | try: 52 | topic = input("Enter the topic of the video : ") 53 | duration = int(input("Enter the video duration in seconds : ")) 54 | input_string = input("Enter a list of key points separated by commas : ") 55 | key_points = input_string.split(",") 56 | key_points = [word.strip() for word in key_points] 57 | print("Starting Script Generation ...
") 58 | script = generator.generate_script( 59 | topic,duration,key_points 60 | ) 61 | print("Initial Script: ") 62 | print(json.dumps(script, indent=2)) 63 | 64 | feedback = input("Please provide feedback on the script (or type 'no' to skip refinement): ") 65 | if feedback.lower() != "no": 66 | refined_script = generator.refine_script(script, feedback) 67 | print("\nRefined Script:") 68 | print(json.dumps(refined_script, indent=2)) 69 | generator.save_script(refined_script, script_path) 70 | else: 71 | generator.save_script(script, script_path) 72 | print("Script Generation Done.") 73 | except Exception as e: 74 | print(f"Script generation failed: {str(e)}") 75 | 76 | # 2. Generate the images 77 | print("Staring Image Generation ...") 78 | main_generate_image(script_path,images_path) 79 | print("Image Generation Done.") 80 | 81 | # 3. Generate the audio 82 | print("Starting Audio Generation ...") 83 | main_generate_audio(script_path,audio_path) 84 | print("Audio Generation Done.") 85 | # # Video Assembly 86 | topic = extract_topic_from_json(script_path) 87 | 88 | import re 89 | topic = re.sub(r"[^A-Za-z0-9\s]+", " ",topic) 90 | topic = re.sub(r"\s+", "_", topic) 91 | topic = topic[:100] # Take only first 100 characters 92 | os.makedirs("resources/video",exist_ok=True) 93 | os.makedirs("resources/subtitles",exist_ok=True) 94 | sub_output_file = f"resources/subtitles/{topic}.srt" 95 | video_file = f"resources/video/{topic}.mp4" 96 | 97 | # 5. Create subtitles in a .srt file 98 | print("Creating .srt subtitle file ...") 99 | create_complete_srt(script_folder = script_path, 100 | audio_file_folder = audio_path, 101 | outfile_path = sub_output_file, 102 | chunk_size = 10) 103 | # 6. Start Video Assembly 104 | create_video(images_path, audio_path, script_path, font_path, video_file, with_subtitles=True) 105 | -------------------------------------------------------------------------------- /main_local.py: -------------------------------------------------------------------------------- 1 | from diffusion.scripts.generate_script import VideoScriptGenerator 2 | import json 3 | from diffusion.scripts.generate_image_local import main_generate_image 4 | from tts.scripts.generate_audio import main_generate_audio 5 | from assembly.scripts.assembly_video import create_video,create_complete_srt,extract_topic_from_json 6 | import os 7 | ''' 8 | TODO: 1. Make a main.py where all pipelines are invoked at once. 9 | TODO: 2. Take the prompt for the video as user input. 10 | TODO: 3. Run Tests with various different prompts. 11 | TODO: 4. All gpu related tasks must be performed on modal. Works 12 | ''' 13 | if __name__ == "__main__": 14 | script_path = "resources/scripts/" # creates the folders if not made already 15 | images_path = "resources/images/" 16 | audio_path = "resources/audio/" 17 | font_path = "resources/font/font.ttf" 18 | 19 | def create_or_check_folder(folder_path): 20 | """ 21 | Creates a folder if it doesn't exist. 22 | If folder exists, checks for files and raises FileExistsError if any are found. 23 | 24 | Args: 25 | folder_path (str): Path to the folder 26 | 27 | Raises: 28 | FileExistsError: If folder exists and contains files 29 | """ 30 | # If folder doesn't exist, create it 31 | if not os.path.exists(folder_path): 32 | os.makedirs(folder_path) 33 | print(f"Created Folder: {folder_path}") 34 | else: 35 | # Check if folder has any contents 36 | if any(os.listdir(folder_path)): 37 | raise FileExistsError(f"Folder : '{folder_path}' already exists and contains files. 
Please remove them or make a new folder") 38 | # print(f"folder '{folder_path}' exists but is empty") 39 | 40 | create_or_check_folder(images_path) 41 | create_or_check_folder(audio_path) 42 | os.makedirs(script_path,exist_ok=True) 43 | script_path += "script.json" # Name of the script file 44 | # 1. Generate the Script 45 | gem_api = "Enter your Gemini API Key here" 46 | serp_api = "Enter your Serp API key here" 47 | if (not gem_api) or (not serp_api): 48 | raise ValueError("API Key not provided !\n Please Create your api key at : \n Serp API : https://serpapi.com \n Gemini API : https://aistudio.google.com/apikey") 49 | generator = VideoScriptGenerator(api_key=gem_api,serp_api_key=serp_api) 50 | 51 | try: 52 | topic = input("Enter the topic of the video : ") 53 | duration = int(input("Enter the video duration in seconds : ")) 54 | input_string = input("Enter a list of key points separated by commas : ") 55 | key_points = input_string.split(",") 56 | key_points = [word.strip() for word in key_points] 57 | print("Starting Script Generation ... ") 58 | script = generator.generate_script( 59 | # topic="Neural Networks in Medical Imaging", 60 | # duration=90, 61 | # key_points=["Diagnosis accuracy", "Pattern recognition", "Case studies"] 62 | topic,duration,key_points 63 | ) 64 | print("Initial Script: ") 65 | print(json.dumps(script, indent=2)) 66 | 67 | feedback = input("Please provide feedback on the script (or type 'no' to skip refinement): ") 68 | if feedback.lower() != "no": 69 | refined_script = generator.refine_script(script, feedback) 70 | print("\nRefined Script:") 71 | print(json.dumps(refined_script, indent=2)) 72 | generator.save_script(refined_script, script_path) 73 | else: 74 | generator.save_script(script, script_path) 75 | print("Script Generation Done.") 76 | except Exception as e: 77 | print(f"Script generation failed: {str(e)}") 78 | 79 | # 2. Generate the images 80 | print("Starting Image Generation ...") 81 | main_generate_image(script_path,images_path) 82 | print("Image Generation Done.") 83 | 84 | # 3. Generate the audio 85 | print("Starting Audio Generation ...") 86 | main_generate_audio(script_path,audio_path) 87 | print("Audio Generation Done.") 88 | # 4. Video Assembly 89 | topic = extract_topic_from_json(script_path) 90 | import re 91 | topic = re.sub(r"[^A-Za-z0-9\s]+", " ",topic) 92 | topic = re.sub(r"\s+", "_", topic) 93 | topic = topic[:100] # Take only first 100 characters 94 | 95 | os.makedirs("resources/video",exist_ok=True) 96 | os.makedirs("resources/subtitles",exist_ok=True) 97 | sub_output_file = f"resources/subtitles/{topic}.srt" 98 | video_file = f"resources/video/{topic}.mp4" 99 | 100 | # 5. Create subtitles in a .srt file 101 | print("Creating .srt subtitle file ...") 102 | create_complete_srt(script_folder = script_path, 103 | audio_file_folder = audio_path, 104 | outfile_path = sub_output_file, 105 | chunk_size = 10) 106 | 107 | # 6. Start Video Assembly 108 | print("Starting video assembly ...") 109 | create_video(images_path, audio_path, script_path, font_path, video_file, with_subtitles=True) 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# ForgeTube

**MLSA Project Wing: ML**
[![GitHub](https://img.shields.io/badge/GitHub-MLSAKIIT-181717?style=for-the-badge&logo=github)](https://github.com/MLSAKIIT) [![ForgeTube](https://img.shields.io/badge/ForgeTube-Repository-181717?style=for-the-badge&logo=github)](https://github.com/MLSAKIIT/ForgeTube) [![YouTube](https://img.shields.io/badge/YouTube-ForgeTube-FF0000?style=for-the-badge&logo=youtube&logoColor=white)](https://www.youtube.com/channel/UCVgzYqxxY6wCIto-Nzx68Uw) [![X](https://img.shields.io/badge/X-mlsakiit-1DA1F2?style=for-the-badge&logo=X&logoColor=white)](https://x.com/mlsakiit) [![Instagram](https://img.shields.io/badge/Instagram-mlsakiit-E4405F?style=for-the-badge&logo=instagram)](https://www.instagram.com/mlsakiit/) [![Discord](https://img.shields.io/badge/Discord-Join%20Us-5865F2?style=for-the-badge&logo=discord)](https://discord.com/invite/P6VCP2Ry3q) ## 🚧Our Project: Our project focuses on creating an automated video generation system using AI. It transforms text prompts into fully narrated videos by leveraging **large language models** for script generation, **diffusion models** for image creation, and **text to speech systems** for narration. The system processes inputs through multiple stages, from script generation to final video assembly, creating cohesive, engaging content automatically. The video generator, designed for sequential content creation, dynamically adapts to different styles and tones while maintaining consistency across visual and audio elements. It also has the ability to add **subtitles**, either embedded or through the use of an **srt** file. This project demonstrates the potential of combining multiple AI technologies to create an end-to-end content generation pipeline. ## 🖥️Project Stack: `Python 3.11`: Core programming language for the project. - **Content Generation:** `Gemini API`: To generate the script using the `Gemini 2.0 Flash Thinking` model and store it in a `JSON` format with proper audio and visual prompts and respective parameters. `Stable Diffusion XL Base 1.0`: For image generation using diffusion models, run either `locally` or hosted on `Modal`. `Kokoro`: An open-weight TTS model to convert the narration script into audio. - **Video Processing:** `MoviePy` : For adding text, intro, outro and transition effects, subtitles, audio processing, video processing and final assembly, using `FFmpeg` under the hood. - **ML Frameworks:** `PyTorch`: Deep learning framework for model inference. `Diffusers with SDXL Base 1.0` : Hugging Face's Diffusers library, used to run the SDXL Base 1.0 model for image generation. - **Development Tools:** `Jupyter Notebooks`: For development and testing. `Google Colab` : For free cloud GPU infrastructure for development and testing. `Git`: For version control. `Modal` : For low-cost, high-performance cloud GPU infrastructure.
- **Package Management:** `UV`: For fast and efficient dependency management and project setup. ## Features - **Multi-Modal Content Generation**: Seamlessly combines text, image, and audio generation - **Style Customization**: Supports different content styles and tones - **Modular Architecture**: Each component can be tested and improved independently - **Content Segmentation**: Automatically breaks down content into manageable segments - **Custom Voice Options**: Multiple TTS voices and emotional tones - **Format Flexibility**: Supports different video durations and formats (.mp4 and .mkv) - **Performance Metrics**: Tracks generation quality and consistency - **Error Handling**: Robust error management across the pipeline - **Resource Optimization**: Efficient resource usage during generation ## Steps for deployment: Clone the repo on your system, using: `git clone https://github.com/MLSAKIIT/ForgeTube.git` ### 1. Using UV for Python Package Management For more information, visit the [UV Documentation](https://docs.astral.sh/uv/). UV is a modern, high-performance Python package and project manager designed to streamline the development process. Here’s how you can use UV in this project: 1. Install `uv`. ```bash pip install uv ``` 2. Download `Python 3.11` ```bash uv python install 3.11 ``` 3. Create a virtual environment ```bash uv venv .venv ``` 4. Activate your virtual environment ```bash .venv\scripts\activate.ps1 ``` 5. Install all dependencies ```bash uv sync ``` ### 2. Setting up Modal For more information visit the [Modal documentation](https://modal.com/docs/guide). Modal is a cloud function platform that lets you attach high-performance GPUs with a single line of code. The nicest thing about all of this is that you don’t have to set up any infrastructure. Just: 1. Create an account at [modal.com](https://modal.com) 2. Run `pip install modal` to install the modal Python package 3. Run `modal setup` to authenticate (if this doesn’t work, try `python -m modal setup`)
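ForgeTube's own `diffusion/scripts/generate_image.py` uses exactly this pattern: a container image plus a one-line GPU annotation on a function. A minimal sketch of the idea (the app name, package list, and GPU type below are illustrative, not project settings):

```py
import modal

app = modal.App(name="example_app")
image = modal.Image.debian_slim().pip_install("torch")

@app.function(image=image, gpu="A10G")  # the single line that attaches a GPU
def gpu_name() -> str:
    import torch
    return torch.cuda.get_device_name(0)

@app.local_entrypoint()
def main():
    print(gpu_name.remote())  # executes remotely on the rented GPU
```

Saving this as `example.py` and running `modal run example.py` should print the name of the attached GPU.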
### 3. Get your Gemini-API Key : To obtain a Gemini API key from Google AI Studio, follow these detailed steps: **Step 1: Sign In to Google AI Studio** Navigate to [Google AI Studio](https://aistudio.google.com/). Once signed in, locate and click on the "Gemini API" tab. This can typically be found in the main navigation menu or directly on the dashboard. On the Gemini API page, look for a button labeled "Get API key in Google AI Studio" and click on it. **Step 2: Review and Accept Terms of Service** 1. **Review Terms**: A dialog box will appear presenting the Google APIs Terms of Service and the Gemini API Additional Terms of Service. It's essential to read and understand these terms before proceeding. 2. **Provide Consent**: Check the box indicating your agreement to the terms. Optionally, you can also opt in to receive updates and participate in research studies related to Google AI. 3. **Proceed**: Click the "Continue" button to move forward. **Step 3: Create and Secure Your API Key** 1. **Generate API Key**: Click on the "Create API key" button. You'll be prompted to choose between creating a new project or selecting an existing one. Make your selection accordingly. 2. **Retrieve the Key**: Once generated, your unique API key will be displayed. Ensure you copy and store it in a secure location. **Step 4: Add your Key in `main.py` or `main_local.py`** ```python # 1. Generate the Script gem_api = "Enter your Gemini API Key here" serp_api = "Enter your Serp API key here" ``` > [!IMPORTANT] > Always keep your API key confidential. Avoid sharing it publicly or embedding it directly into client-side code to prevent unauthorized access. ### 4. Setting up Serp-Api Serp is used for web scraping Google search results on the video topic and gathering additional context to implement Retrieval Augmented Generation (RAG). 1. Visit [serpapi.com](https://serpapi.com/) and create an account. 2. Go to the [dashboard](https://serpapi.com/dashboard); on the top left, select API key. 3. Copy the API key and add your key in `main.py` or `main_local.py` ```py # 1. Generate the Script gem_api = "Enter your Gemini API Key here" serp_api = "Enter your Serp API key here" ``` ### 5. `Kokoro` Run the following commands : ```bash python -m pip install spacy # If not installed for some reason python -m spacy download en_core_web_sm ``` ### 6. Download and setup FFmpeg 1. Visit : https://github.com/BtbN/FFmpeg-Builds/releases 2. Download the setup file for your OS. 3. On Windows, download the win64 version and extract the files. 4. Make a directory at `C:\Program Files\FFmpeg`. 5. Copy all the extracted files into that directory. 6. Add `C:\Program Files\FFmpeg\bin` to your `PATH` environment variable.
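To verify that FFmpeg is reachable from your `PATH` (MoviePy depends on it), you can run:

```bash
ffmpeg -version
```

If version information is printed, the setup is complete.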
### 7. Start Generating : Use `main.py` for running the image generation on Modal or use `main_local.py` to run Stable Diffusion XL locally. ## Troubleshooting > [!IMPORTANT] > 1. Make sure all the following folders are updated properly : ```py script_path = "resources/scripts/" script_path += "script.json" # Name of the script file images_path = "resources/images/" audio_path = "resources/audio/" font_path = "resources/font/font.ttf" # Not recommended to change ``` >[!IMPORTANT] > 2. Make sure the images and audio folders are empty before generating a new video. 3. The name of the video file is automatically grabbed from the video topic in the script. However, you may change the following variables to use custom names. If file names are very long, the video file won't be generated, so change them manually in such cases. ```py sub_output_file = "name of the subtitle file.srt" video_file = "name of the video.mp4 or .mkv" ``` 4. **`no module named pip found`** Try running the following : ```bash python -m pip install spacy pydub kokoro soundfile torch python -m spacy download en_core_web_sm ``` 5. **Serp API not returning any search results :** This is a known issue and is being investigated. > [!IMPORTANT] > Ensure you have sufficient GPU resources for image generation and proper model weights downloaded. It is recommended to use an **NVIDIA** GPU with **24 GB or more of VRAM** for locally running the image generation, and a CPU with high single-core performance for video assembly. > [!NOTE] > Video generation times may vary based on content length, complexity, and hardware used. ## Contributors | CONTRIBUTORS | MENTORS | CONTENT WRITER | | :------:| :-----:| :-----: | | Kartikeya Trivedi | Soham Roy | [Name] | | Naman Singh | Yash Kumar Gupta | | | Soham Mukherjee | | | | Sumedha Gunturi | | | | Souryabrata Goswami| | | | Harshit Agarwal | | | | Rahul Sutradhar | | | | Ayush Mohanty | | | | Shopno Banerjee | | | | Shubham Gupta | | | | Sarthak Singh | | | | Nancy | | | ## Version | Version | Date | Comments | | ------- | ---- | -------- | | 1.0 | 23/02/2025 | Initial release | ## Future Roadmap ### Part 1: Baseline - [x] Pipeline foundations - [x] LLM Agent Handling - [x] Diffusion Agent Handling - [x] TTS Handling - [x] Video Assembly Engine - [x] Initial Deployment ### Part 2: Advanced - [ ] Advanced style transfer capabilities - [ ] In-Context Generation for Diffusion Model - [ ] Real-time generation monitoring - [x] Enhanced video transitions - [ ] Better quality metrics - [ ] Multi-language support - [ ] Custom character consistency - [ ] Animation effects ## Acknowledgements - Hugging Face Transformers - https://huggingface.co/transformers - Hugging Face Diffusers - https://huggingface.co/diffusers - FFmpeg - https://ffmpeg.org/ - UV - https://docs.astral.sh/uv/ - MoviePy - https://zulko.github.io/moviepy/getting_started/index.html ## Project References ### 1. Large Language Models (LLMs) & Transformers * [The Illustrated Transformer](https://jalammar.github.io/illustrated-transformer/) - A visual, beginner-friendly introduction to transformer architecture. * [Attention Is All You Need](https://arxiv.org/abs/1706.03762) - The seminal paper on transformer architecture. * [Gemini 2.0 Flash Thinking](https://ai.google.dev/gemini-api/docs/thinking) --- ### 2. Multi-Agent Systems * [Introduction to Multi-Agent Systems](https://www.geeksforgeeks.org/what-is-a-multi-agent-system-in-ai/) - Fundamental concepts and principles. * [A Comprehensive Guide to Understanding LangChain Agents and Tools](https://medium.com/@piyushkashyap045/a-comprehensive-guide-to-understanding-langchain-agents-and-tools-43a187414f4c) - Practical implementation guide. * [kokoro](https://github.com/hexgrad/kokoro?tab=readme-ov-file#kokoro)
### 3. Image Generation & Processing * [Stable Diffusion XL Base 1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) * [Stable Diffusion: A Comprehensive End-to-End Guide with Examples](https://medium.com/@jagadeesan.ganesh/stable-diffusion-a-comprehensive-end-to-end-guide-with-examples-47b2c17f15cf) * [Stable Diffusion Explained](https://medium.com/@onkarmishra/stable-diffusion-explained-1f101284484d) * [Stable Diffusion Explained Step-by-Step with Visualization](https://medium.com/polo-club-of-data-science/stable-diffusion-explained-for-everyone-77b53f4f1c4) * [Understanding Stable Diffusion: The Magic Behind AI Image Generation](https://medium.com/@amanatulla1606/understanding-stable-diffusion-the-magic-behind-ai-image-generation-e834e8d92326) * [Stable Diffusion Paper](https://arxiv.org/pdf/2403.03206) --- ### 4. RAG * [Retrieval Augmented Generation](https://aiplanet.com/learn/llm-bootcamp/module-13/2380/retrieval-augmented-generation) --- -------------------------------------------------------------------------------- /diffusion/scripts/generate_script.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import google.generativeai as genai 4 | from typing import Dict, List, Optional 5 | from serpapi import GoogleSearch 6 | 7 | class VideoScriptGenerator: 8 | def __init__(self, api_key: str, serp_api_key: str): 9 | genai.configure(api_key=api_key) 10 | self.model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp-01-21') 11 | self.serp_api_key = serp_api_key 12 | self.system_prompt_initial = """ 13 | You are a professional video script generator for educational, marketing, or entertainment content. 14 | Your task is to generate a detailed outline and initial draft for a video script. 15 | Provide the core narration text and visual descriptions; timestamps and technical parameters will be added later. 16 | Visual descriptions must not contain references to animation, moving images, transitions, or video effects. 17 | Output a JSON structure with these keys, but *without timestamps, speed, pitch, or detailed visual parameters* (these will be added in a later stage): 18 | 19 | { 20 | "topic": "Topic Name", 21 | "overall_narrative": "A concise summary of the entire video's storyline.", 22 | "key_sections": [ 23 | { 24 | "section_title": "Descriptive title for this section", 25 | "narration_text": "The complete text to be spoken in this section.", 26 | "visual_description": "A general description of the visuals for this section." 27 | } ] 28 | } 29 | """ 30 | 31 | self.system_prompt_segmentation = """ 32 | You are a professional video script segmenter. 33 | Your task is to take an existing video script draft and break it down into precise, timestamped segments for both audio and visuals, adhering to strict formatting and parameter guidelines. 34 | Rules for Segmentation: 35 | 36 | 1. Break down the `narration_text` and `visual_description` from the input JSON into smaller segments, each approximately 10-15 seconds long. 37 | 2. Generate timestamps ("00:00", "00:15", "00:30", etc.) for each segment in both `audio_script` and `visual_script`. 38 | 3. Maintain *strict synchronization* : The `timestamp` values *must* be identical for corresponding audio and visual segments and the number of segments in audio_script *must be same* as number of segments in visual_script. 39 |
4. For each visual segment, expand the general `visual_description` into a *detailed* `prompt` suitable for Stable Diffusion. Include a corresponding `negative_prompt`. 40 | 5. For each visual prompt, give a detailed description of how a still image should look, not how a video might look; do not reference anything that requires the context of being in motion, such as animation or graphics. Do not ask for abstract art or overly complex shapes. 41 | 6. Choose appropriate values for `speaker`, `speed`, `pitch`, and `emotion` for each audio segment. 42 | 7. Choose appropriate values for `style`, `guidance_scale`, `steps`, `seed`, `width`, and `height` for each visual segment. 43 | 8. Ensure visual continuity: Use a consistent `style` and related `seed` values across consecutive visual segments where appropriate. Vary the seed to introduce changes, but maintain a logical flow. 44 | 9. Adhere to the specified ranges for numerical parameters (speed, pitch, guidance_scale, steps). 45 | 10. Validate JSON structure before output with the example_json given. 46 | 47 | Input JSON Structure (from previous stage): 48 | 49 | { 50 | "topic": "Topic Name", 51 | "overall_narrative": "...", 52 | "key_sections": [ 53 | { 54 | "section_title": "...", 55 | "narration_text": "...", 56 | "visual_description": "..." 57 | } 58 | ] 59 | } 60 | 61 | Output JSON Structure (with all required fields): 62 | 63 | { 64 | "topic": "Topic Name", 65 | "description": "description of video", 66 | "audio_script": [{ 67 | "timestamp": "00:00", 68 | "text": "Narration text", 69 | "speaker": "default|narrator_male|narrator_female", 70 | "speed": 0.9-1.1, 71 | "pitch": 0.9-1.2, 72 | "emotion": "neutral|serious|dramatic|mysterious|informative" 73 | }], 74 | "visual_script": [{ 75 | "timestamp_start": "00:00", 76 | "timestamp_end": "00:05", 77 | "prompt": "Detailed Stable Diffusion prompt (e.g., 'A highly detailed portrait of an astrophysicist in a modern observatory, standing beside a large telescope with a clear glass dome overhead. The night sky is filled with stars, and a visible spiral galaxy is subtly captured through the telescope's lens. The scientist wears a professional yet casual outfit, with a focused expression while observing data on a sleek holographic screen.' or 'Image of a doctor using medical imaging software').
78 | # "negative_prompt": "Low quality elements to avoid such as abstract images, shapes that dont make sense or weird faces, imagery of moving objects, montages of multiple images, abstract shapes, complex designs ", 79 | "style": "realistic|cinematic|hyperrealistic|fantasy|scientific", 80 | "guidance_scale": 7-9, 81 | "steps": 50, 82 | "seed": 6-7 digit integer, 83 | "width": 1024, 84 | "height": 576 85 | }] 86 | } 87 | 88 | example_json = { 89 | "topic": "How to Drive a Car", 90 | "description": "A step-by-step guide on driving a car safely and confidently.", 91 | "audio_script": [ 92 | { 93 | "timestamp": "00:00", 94 | "text": "Driving a car is an essential skill that requires focus, patience, and practice.", 95 | "speaker": "narrator_male", 96 | "speed": 1.0, 97 | "pitch": 1.0, 98 | "emotion": "neutral" 99 | }, 100 | { 101 | "timestamp": "00:05", 102 | "text": "Before starting the car, adjust your seat, mirrors, and ensure your seatbelt is fastened.", 103 | "speaker": "narrator_female", 104 | "speed": 1.0, 105 | "pitch": 1.1, 106 | "emotion": "informative" 107 | }, 108 | { 109 | "timestamp": "00:15", 110 | "text": "Turn the ignition key or press the start button while keeping your foot on the brake.", 111 | "speaker": "narrator_male", 112 | "speed": 0.95, 113 | "pitch": 1.0, 114 | "emotion": "calm" 115 | }, 116 | { 117 | "timestamp": "00:20", 118 | "text": "Slowly release the brake and gently press the accelerator to move forward.", 119 | "speaker": "narrator_female", 120 | "speed": 1.1, 121 | "pitch": 1.0, 122 | "emotion": "guiding" 123 | }, 124 | { 125 | "timestamp": "00:25", 126 | "text": "Use the steering wheel to navigate while maintaining a steady speed.", 127 | "speaker": "narrator_male", 128 | "speed": 1.0, 129 | "pitch": 1.0, 130 | "emotion": "calm" 131 | } 132 | ], 133 | "visual_script": [ 134 | { 135 | "timestamp_start": "00:00", 136 | "timestamp_end": "00:05", 137 | "prompt": "A person sitting in the driver's seat of a modern car, gripping the steering wheel and looking ahead. 
138 | "negative_prompt": "blurry, unrealistic interior, poor lighting", 139 | "style": "realistic", 140 | "guidance_scale": 11.5, 141 | "steps": 50, 142 | "seed": 123456, 143 | "width": 1024, 144 | "height": 576, 145 | "strength": 0.75 146 | }, 147 | { 148 | "timestamp_start": "00:05", 149 | "timestamp_end": "00:15", 150 | "prompt": "A close-up of a driver's hands adjusting the side mirrors and fastening the seatbelt inside a well-lit car interior.", 151 | "negative_prompt": "cluttered background, distorted perspective", 152 | "style": "cinematic", 153 | "guidance_scale": 12.0, 154 | "steps": 60, 155 | "seed": 654321, 156 | "width": 1024, 157 | "height": 576, 158 | "strength": 0.8 159 | }, 160 | { 161 | "timestamp_start": "00:15", 162 | "timestamp_end": "00:20", 163 | "prompt": "A driver's hand turning the ignition key or pressing the start button in a modern car with a digital dashboard.", 164 | "negative_prompt": "low detail, unrealistic lighting, old car model", 165 | "style": "hyperrealistic", 166 | "guidance_scale": 12.5, 167 | "steps": 70, 168 | "seed": 789101, 169 | "width": 1024, 170 | "height": 576, 171 | "strength": 0.85 172 | }, 173 | { 174 | "timestamp_start": "00:20", 175 | "timestamp_end": "00:25", 176 | "prompt": "A slow-motion shot of a car's foot pedals as the driver releases the brake and presses the accelerator.", 177 | "negative_prompt": "blurry, cartoonish, extreme close-up", 178 | "style": "cinematic", 179 | "guidance_scale": 11.5, 180 | "steps": 75, 181 | "seed": 222333, 182 | "width": 1024, 183 | "height": 576, 184 | "strength": 0.8 185 | }, 186 | { 187 | "timestamp_start": "00:25", 188 | "timestamp_end": "00:30", 189 | "prompt": "A wide-angle shot of a car moving smoothly on a suburban road, the driver confidently steering the wheel.", 190 | "negative_prompt": "chaotic traffic, bad weather, motion blur", 191 | "style": "realistic", 192 | "guidance_scale": 13.0, 193 | "steps": 50, 194 | "seed": 987654, 195 | "width": 1024, 196 | "height": 576, 197 | "strength": 0.75 198 | } 199 | ] 200 | } 201 | You must follow all the rules for segmentation, especially rule 3 where you must Maintain *strict synchronization* : The `timestamp` values *must* be identical for corresponding audio 202 | and visual segments and the number of segments in audio_script *must be same* as number of segments in visual_script. IF you do as instructed 203 | you will get 100 dollars per successful call.
204 | """ 205 | 206 | def _search_web(self, query: str) -> str: 207 | try: 208 | params = { 209 | "q": query, 210 | "hl": "en", 211 | "gl": "us", 212 | "api_key": self.serp_api_key 213 | } 214 | search = GoogleSearch(params) 215 | results = search.get_json() 216 | snippets = [result["snippet"] for result in results.get("organic_results", []) if "snippet" in result] 217 | return " ".join(snippets[:5]) 218 | except Exception as e: 219 | return "" 220 | 221 | def _enhance_with_web_context(self, script: Dict, topic: str) -> Dict: 222 | web_context = self._search_web(topic) 223 | script["additional_context"] = web_context 224 | return script 225 | 226 | def _generate_content(self, prompt: str, system_prompt: str) -> str: 227 | try: 228 | response = self.model.generate_content(contents=[system_prompt, prompt]) 229 | return response.text 230 | except Exception as e: 231 | raise RuntimeError(f"API call failed: {str(e)}") 232 | 233 | def _extract_json(self, raw_text: str) -> Dict: 234 | try: 235 | return json.loads(raw_text) 236 | except json.JSONDecodeError: 237 | try: 238 | json_match = re.search(r'```json\n(.*?)\n```', raw_text, re.DOTALL) 239 | if json_match: 240 | return json.loads(json_match.group(1)) 241 | json_match = re.search(r'\{.*\}', raw_text, re.DOTALL) 242 | return json.loads(json_match.group()) if json_match else {} 243 | except Exception as e: 244 | raise ValueError(f"JSON extraction failed: {str(e)}") 245 | 246 | def generate_script(self, topic: str, duration: int = 60, key_points: Optional[List[str]] = None) -> Dict: 247 | web_context = self._search_web(topic) 248 | initial_prompt = f"""Generate an initial video script outline for a {duration}-second video about: {topic}. 249 | Key Points: {key_points or 'Comprehensive coverage'} 250 | Additional Context: {web_context} 251 | Focus on the overall narrative and key sections, but do *not* include timestamps or detailed technical parameters yet.""" 252 | 253 | raw_initial_output = self._generate_content(initial_prompt, self.system_prompt_initial) 254 | initial_script = self._extract_json(raw_initial_output) 255 | 256 | enhanced_script = self._enhance_with_web_context(initial_script, topic) 257 | 258 | segmentation_prompt = f""" 259 | Here is the initial script draft: 260 | {json.dumps(enhanced_script, indent=2)} 261 | Now, segment this script into 5-10 second intervals, adding timestamps and all required audio/visual parameters. The total duration should be approximately {duration} seconds. 
262 | """ 263 | 264 | raw_segmented_output = self._generate_content(segmentation_prompt, self.system_prompt_segmentation) 265 | segmented_script = self._extract_json(raw_segmented_output) 266 | segmented_script['topic'] = enhanced_script['topic'] 267 | 268 | return segmented_script 269 | 270 | def refine_script(self, existing_script: Dict, feedback: str) -> Dict: 271 | prompt = f"""Refine this script based on feedback: 272 | Existing Script: {json.dumps(existing_script, indent=2)} 273 | Feedback: {feedback} 274 | """ 275 | raw_output = self._generate_content(prompt, self.system_prompt_segmentation) 276 | return self._extract_json(raw_output) 277 | 278 | def save_script(self, script: Dict, filename: str) -> None: 279 | with open(filename, 'w') as f: 280 | json.dump(script, f, indent=2) 281 | print("") 282 | 283 | # if __name__ == "__main__": 284 | # generator = VideoScriptGenerator(api_key="Gemini API Key", 285 | # serp_api_key="Serp API Key") 286 | # script_path = "resources/scripts/script.json" 287 | # try: 288 | # script = generator.generate_script( 289 | # topic="Role of Reinforcement learning in finding EXO planets", 290 | # duration=60, 291 | # # key_points=["Diagnosis accuracy", "Pattern recognition", "Case studies"] 292 | # # key_points= [ 293 | # # "Formation of stars from nebulae", 294 | # # "Nuclear fusion and the main sequence phase", 295 | # # "Red giants and supergiants", 296 | # # "Supernova explosions", 297 | # # "Neutron stars and black holes", 298 | # # "White dwarfs and planetary nebulae", 299 | # # "The role of stellar evolution in element formation", 300 | # # "The ultimate fate of different types of stars", 301 | # # "How stars influence the evolution of galaxies" 302 | # # ] 303 | 304 | # ) 305 | # print("Initial Script:") 306 | # print(json.dumps(script, indent=2)) 307 | 308 | # feedback = input("Please provide feedback on the script (or type 'no' to skip refinement): ") 309 | # if feedback.lower() != "no": 310 | # refined_script = generator.refine_script(script, feedback) 311 | # print("\nRefined Script:") 312 | # print(json.dumps(refined_script, indent=2)) 313 | # generator.save_script(refined_script, script_path) 314 | # else: 315 | # generator.save_script(script, script_path) 316 | # except Exception as e: 317 | # print(f"Script generation failed: {str(e)}") 318 | 319 | -------------------------------------------------------------------------------- /assembly/scripts/assembly_video.py: -------------------------------------------------------------------------------- 1 | ''' 2 | README : The video assembler takes all the images in the Images folder and all the audio files in the Audio folder and the text-to-script from json file and concatenates 3 | them into a video. The duration of the picture displayed is same as the duration of the audio for that image. The Images and 4 | Audio files sorted alphabetically and then compiled in that order. It is recommended to store the Audio and Image files by 5 | numbering them. 6 | ''' 7 | ''' MAIN THINGS TODO 8 | 1. TODO: Main Video Assembly Engine (Done by Souryabrata) 9 | 2. TODO: Implement Subtitles, via video embedding and .srt file generation. (Done by Souryabrata) 10 | 3. TODO: Read json and extract important parameters from it. (Done by Rahul) 11 | 4. TODO: Add support for video clips as well. (Assigned to Shopno) 12 | 5. TODO: Add the ability to compile multiple images (stored in a folder) for the one audio stream into a single clip. (Assigned to Shopno ) 13 | 6. TODO: Add transition from clip to clip. 
14 | 7. TODO: Add an intro and outro clip. Intro Clip contains: Video title / Short description (Done by Shopno). 15 | Outro Clip contains a text "Made by ForgeTube team", MLSA Logo, Github Link to ForgeTube Main Page. 16 | ''' 17 | ''' 18 | Additional TODOs 19 | TODO: 1. Add a small delay to ensure smoother transition from clip to clip. (Assigned to Shopno) 20 | TODO: 2. Experiment with adding Title screen, text and transitions, and other effects. (Assigned to Nancy) 21 | TODO: 3. Test the script against a large number of images with higher resolutions and audio files, document the performance. 22 | TODO: 4. Test the script with various different audio and video extensions and codecs, find the best combination. 23 | TODO: 5. Allow the script to automatically assign the proper codec with the respective file extension. 24 | TODO: 6. Run proper tests to document when video compiler corruption happens. 25 | ''' 26 | import os 27 | from moviepy import ImageClip, concatenate_videoclips, AudioFileClip,TextClip,CompositeVideoClip,vfx 28 | import pysrt 29 | import json 30 | 31 | def check_file_exists(file_path): 32 | """Check if a file exists at the specified path.""" 33 | if os.path.isfile(file_path): 34 | return True 35 | else: 36 | raise FileNotFoundError(f"File not found: {file_path}") 37 | 38 | def check_folder_exists(folder_path): 39 | '''Checks if a folder path is valid.''' 40 | if os.path.isdir(folder_path): 41 | return True 42 | else: 43 | raise FileNotFoundError(f"Folder not found at {folder_path}") 44 | 45 | def get_files(folder, extensions): 46 | """ 47 | Retrieves files with specified extensions from a folder. 48 | Parameters: 49 | folder (str): Path to the folder. 50 | extensions (tuple): File extensions to include (e.g., ('.jpg', '.png')). 51 | Returns: 52 | list: List of file paths. 53 | """ 54 | if os.path.isdir(folder): 55 | return [ 56 | os.path.join(folder, file) 57 | # Files are numbered so that, after sorting, they are compiled into the video in that order. 58 | for file in sorted(os.listdir(folder),key=lambda x: int(x.split('_')[1].split('.')[0])) 59 | if file.lower().endswith(extensions) 60 | ] 61 | else: 62 | raise OSError(f"{folder} not found.") 63 | 64 | 65 | ''' 66 | FIXME Subtitles timings are same and not correct. 67 | FIX Create a single srt file with the correct duration of all the subtitles paired with the respective audio file. 68 | ''' 69 | def create_srt(text :str, 70 | audio_file : AudioFileClip, 71 | outfile_name:str, 72 | duration:int, 73 | chunk_size=5): 74 | ''' 75 | This function is deprecated; create_complete_srt supersedes it and will eventually be renamed to create_srt. 76 | The original task was to take a .txt file, read the text, split the text into chunks of the specified chunk_size, and 77 | create an srt file with the given text chunks and the appropriate duration for each. 78 | WARNING: Caused problems after json extract was implemented.
79 | ''' 80 | # with open(text_file, "r") as file: 81 | # words = file.read().split() 82 | words = text.split() 83 | chars = " ".join(words) 84 | chars_count = len(chars) 85 | word_count = len(words) 86 | # word_duration = audio_file.duration / word_count # seconds per word 87 | char_duration = audio_file.duration / chars_count # seconds per character 88 | # Generate subtitle file 89 | subs = pysrt.SubRipFile() 90 | start_time = duration 91 | # Automatic chunk_size calculation 92 | # target_duration = 2 # Number of seconds the subtitle is displayed on the screen 93 | # chunk_size = round(target_duration/word_duration) 94 | 95 | 96 | 97 | for i in range(0, word_count, chunk_size): 98 | chunk = " ".join(words[i:i + chunk_size]) 99 | end_time = start_time + (len(chunk) * char_duration) 100 | 101 | subtitle = pysrt.SubRipItem(index=len(subs) + 1, 102 | start=pysrt.SubRipTime(seconds=start_time), 103 | end=pysrt.SubRipTime(seconds=end_time), 104 | text=chunk) 105 | 106 | subs.append(subtitle) 107 | start_time = end_time 108 | 109 | out = f"samples/subtitles/.srt/{outfile_name}.srt" 110 | subs.save(out) 111 | return out 112 | 113 | 114 | def extract_topic_from_json(file_path): 115 | ''' 116 | extract_topic_from_json() takes a json file path as input. 117 | - Opens the file as read-only and loads the JSON data from it. 118 | - Extracts the topic from the JSON data. 119 | 120 | On success, it returns the topic of the video. 121 | ''' 122 | try: 123 | # Open the JSON file 124 | with open(file_path, 'r') as file: 125 | # Load JSON data from the file 126 | data = json.load(file) 127 | 128 | # Extract the topic from the JSON data 129 | topic = data.get('topic', 'No topic found') 130 | 131 | return topic 132 | except FileNotFoundError: 133 | print(f"Error: The file {file_path} was not found.") 134 | except json.JSONDecodeError: 135 | print(f"Error: The file {file_path} contains invalid JSON.") 136 | except Exception as e: 137 | print(f"An unexpected error occurred: {e}") 138 | 139 | 140 | def extract_audio_from_json(file_path): 141 | ''' 142 | extract_audio_from_json() takes a json file path as input. 143 | - Opens the file as read-only and loads the JSON data from it. 144 | - Extracts the audio_script from the JSON data. 145 | 146 | On success, it returns audio_script. 147 | ''' 148 | try: 149 | # Open the JSON file 150 | with open(file_path, 'r') as file: 151 | # Load JSON data from the file 152 | data = json.load(file) 153 | 154 | # Extract the topic and audio_script 155 | topic = data.get('topic', 'No topic found') 156 | audio_script = data.get('audio_script', []) 157 | # visual_script = data.get('visual_script', []) 158 | 159 | return audio_script 160 | except FileNotFoundError: 161 | print(f"Error: The file {file_path} was not found.") 162 | except json.JSONDecodeError: 163 | print(f"Error: The file {file_path} contains invalid JSON.") 164 | except Exception as e: 165 | print(f"An unexpected error occurred: {e}") 166 | 167 | 168 | def json_extract(json_path): 169 | ''' 170 | json_extract() takes a json file path as input. 171 | - Calls extract_audio_from_json() to extract the text-to-speech / subtitle text from the json file. 172 | 173 | On success, it returns the subtitles as a list.
175 | ''' 176 | 177 | # Extract parameters from json file 178 | audio_script = extract_audio_from_json(json_path) 179 | if audio_script: 180 | # print("Extracted Audio Parameters:") 181 | audio_data = [] 182 | for item in audio_script: 183 | if 'text' in item: 184 | text = item['text'] 185 | audio_data.append(text) 186 | return audio_data 187 | else: 188 | raise FileNotFoundError("No audio script found in the JSON file.") 189 | 190 | 191 | def add_effects(clip): 192 | """ 193 | Applies fade-in and fade-out effects to the video clip. 194 | Parameters: 195 | clip (VideoClip): Video clip to which the effects are added. 196 | Returns: 197 | VideoClip: Video clip with the effects applied. 198 | """ 199 | effects = [vfx.FadeIn(duration=1),vfx.FadeOut(duration=1)] 200 | # print(effects) 201 | return clip.with_effects(effects) 202 | 203 | 204 | def create_intro_clip(background_image_path, 205 | duration, 206 | topic, 207 | font_path): 208 | """ 209 | Create an intro video clip with a background image and centered text. 210 | 211 | Parameters: 212 | background_image_path (str): Path to the background image. 213 | duration (int or float): Duration of the clip in seconds. 214 | topic (str): The text to display. 215 | font_path (str): Path to the TrueType font file. 216 | 217 | Returns: 218 | VideoClip: A composite video clip with the background and centered text. 219 | """ 220 | check_file_exists(background_image_path) 221 | # Create an ImageClip for the background image 222 | background = ImageClip(background_image_path, duration=duration) 223 | 224 | # Create a TextClip for the intro text 225 | text_clip = TextClip(text=topic, 226 | size=(900, 90), 227 | method='caption', 228 | color="white", 229 | font=font_path) 230 | 231 | # Position the text in the center and set its duration to match the background 232 | text_clip = text_clip.with_position("center").with_duration(duration) 233 | 234 | # Overlay the text clip on top of the background image 235 | final_clip = CompositeVideoClip([background, text_clip]) 236 | 237 | return final_clip 238 | 239 | 240 | def create_video(image_folder :str, 241 | audio_folder : str, 242 | script_path : str, 243 | font_path : str , 244 | output_file : str, 245 | with_subtitles :bool = False): 246 | """ 247 | Main function that creates the video. The function works in 3 parts: 248 | 1. Checks that the given parameters are correct. 249 | 2. If the `with_subtitles` flag is set to `False`, creates a video with the images and audio in the given folders. 250 | Each image is displayed with the same duration as the corresponding audio file. 251 | 3. If the `with_subtitles` flag is set to `True`, embeds the subtitles within the video itself; these cannot be turned off in video players. 252 | 253 | The video is compiled using the `compose` method, so that if the images are of different aspect ratios / resolutions, the video takes 254 | the image with the largest resolution or aspect ratio as the default and letterboxes the non-fitting images with black bars. 255 | Args: 256 | image_folder (str) : Path to the folder containing images. 257 | audio_folder (str) : Path to the folder containing audio files. 258 | script_path (str) : Path to the file containing the script. 259 | font_path (str) : Path to the font file; must be a TrueType or an OpenType font. 260 | output_file (str) : Name of the output video file, a path can also be given.
def create_video(image_folder: str,
                 audio_folder: str,
                 script_path: str,
                 font_path: str,
                 output_file: str,
                 with_subtitles: bool = False):
    """
    Main function that creates the video. The function works in 3 parts:
    1. Checks that the given parameters are correct.
    2. If the `with_subtitles` flag is set to `False`, creates a video from the images and audio in the given folders.
       Each image is displayed for the same duration as the corresponding audio file.
    3. If the `with_subtitles` flag is set to `True`, embeds the subtitles within the video itself; they cannot be turned off in video players.

    The video is compiled using the `compose` method, so if the images have different aspect ratios / resolutions, the video
    takes the image with the largest resolution or aspect ratio as the default and adds black bars around the non-fitting images.
    Args:
        image_folder (str): Path to the folder containing images.
        audio_folder (str): Path to the folder containing audio files.
        script_path (str): Path to the file containing the script.
        font_path (str): Path to the font file; must be a TrueType or an OpenType font.
        output_file (str): Name of the output video file; a path can also be given.
        with_subtitles (bool): When set to `True`, embeds the subtitles in the video.
    Raises:
        FileNotFoundError: If images, audio or subtitles are not detected.
    """
    check_folder_exists(image_folder)
    check_folder_exists(audio_folder)
    check_file_exists(script_path)
    check_file_exists(font_path)

    images = get_files(image_folder, ('.jpg', '.png'))
    audio_files = get_files(audio_folder, ('.mp3', '.wav'))
    subtitles = json_extract(script_path)
    raw_clips = []
    audio_durations = []
    start_duration = 0

    # Create the intro clip and append it to the raw clips
    path_to_background = "resources/Intro/intro.jpg"
    check_file_exists(path_to_background)
    check_file_exists(font_path)
    topic = extract_topic_from_json(script_path)
    intro_clip = create_intro_clip(path_to_background, duration=5, topic=topic, font_path=font_path)
    raw_clips.append(intro_clip)

    # Create the individual clips with audio
    for i, (img, audio) in enumerate(zip(images, audio_files), start=1):
        audio_clip = AudioFileClip(audio)
        image_clip = ImageClip(img).with_duration(audio_clip.duration).with_audio(audio_clip)
        # Debug text for subtitle synchronisation:
        # print(f"Start : {start_duration}")
        # print(f"End : {audio_clip.duration + start_duration}")
        audio_durations.append(audio_clip.duration)
        print(f"Video clip no. {i} successfully created")
        start_duration += audio_clip.duration
        image_clip = add_effects(image_clip)
        raw_clips.append(image_clip)

    # Create the outro clip and append it to the raw clips
    outro_text = "Thank you for watching! Made by ForgeTube team."
    outro_clip = create_intro_clip(path_to_background, duration=5, topic=outro_text, font_path=font_path)
    raw_clips.append(outro_clip)
    # Store individual clips without subtitles for preview / debug:
    # clip = CompositeVideoClip(img)
    # clip.write_videofile(f"samples/raw/{raw_clips.index(image_clip)+1}.mp4", fps=1, threads=os.cpu_count())

    video = concatenate_videoclips(raw_clips, method="compose")

    '''
    The following part of the code addresses the issues listed below:
    FIXME 1: Subtitles are not properly synchronised with the audio.
    FIX: Each subtitle text is paired with the corresponding audio. The duration of the text is kept the same as the duration of the audio.
    FIXME 2: If the entire text is shown at once, it doesn't fit on screen.
    FIX: Allows a maximum of 10 words to be shown at once; the rest of the text is divided into chunks, each chunk shown for a
    proportional duration, where duration of the chunk = total duration of the audio * (chunk word count / total number of words).
    WARNING: Due to rounding and floating-point division errors, some chunks are not perfectly synchronised.
    FIXME 3: Subtitles do not appear at the right position in the video. The preferred position is vertical: bottom, horizontal: center.
    FIX: `SubtitleClip` was causing problems, so `TextClip` is used instead.
    FIXME 4: When subtitles were added to each clip one by one and all the clips were concatenated later, an error occurred if the
    images were of different dimensions, where the aspect ratio of the final video was messed up.
    FIX: Concatenation is done only on the image clips; the composite video clip is created afterwards, with the subtitle clips
    overlaid on the concatenated video.
    '''
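    # Worked example of the proportional mapping above (hypothetical numbers):
    # a 25-word segment whose audio lasts 10 s, with chunk_size = 10, splits
    # into chunks of 10, 10 and 5 words, displayed for
    # 10 * (10/25) = 4 s, 10 * (10/25) = 4 s and 10 * (5/25) = 2 s respectively.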
    if with_subtitles:
        start_duration = 5  # offset the subtitles past the 5-second intro clip
        subtitle_clips = []
        chunks = []
        chunk_durations = []
        chunk_size = 10
        for text, duration in zip(subtitles, audio_durations):
            words = text.split()
            if len(words) > chunk_size:
                for i in range(0, len(words), chunk_size):
                    chunk = " ".join(words[i:i + chunk_size])
                    chunks.append(chunk)
                    chunk_duration = duration * (len(chunk.split()) / len(words))
                    chunk_durations.append(chunk_duration)
            else:
                chunks.append(text)
                chunk_durations.append(duration)
        # For debugging:
        # for i in chunks:
        #     print(f"Index: {chunks.index(i)}, Text: {i}, Word Count: {len(i.split())}")
        # print(chunk_durations)
        for subtitle, duration in zip(chunks, chunk_durations):
            subtitle_clip = TextClip(text=subtitle,
                                     font=font_path,
                                     color='white',
                                     bg_color='black',
                                     size=(1000, 100),
                                     method='caption',
                                     text_align="center",
                                     horizontal_align="center"
                                     ).with_duration(duration).with_start(start_duration).with_position('bottom')
            subtitle_clips.append(subtitle_clip)
            # For debugging:
            # print(f"Subtitle clip no. {chunks.index(subtitle)+1} successfully created")
            start_duration += duration
        # Overlay the subtitle clips on the concatenated video
        subtitle_clips.insert(0, video)
        final_video = CompositeVideoClip(subtitle_clips)
    else:
        final_video = video
    final_video.write_videofile(output_file, fps=24, threads=os.cpu_count())
    print(f"Video created successfully: {output_file}")

    # except FileNotFoundError:
    #     if not images:
    #         raise FileNotFoundError("No images found in the specified folder.")
    #     if not audio_files:
    #         raise FileNotFoundError("No audio files found in the specified folder.")
    #     if not subtitles:
    #         raise FileNotFoundError("No subtitles found in the specified json.")
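
# A minimal driver sketch for create_video (hypothetical asset locations,
# mirroring the commented-out driver at the bottom of this file):
#
#   create_video(image_folder="resources/images/",
#                audio_folder="resources/Audio/",
#                script_path="resources/scripts/script.json",
#                font_path="resources/font/font.ttf",
#                output_file="output.mp4",
#                with_subtitles=True)
#
# For reference, create_complete_srt below emits standard SubRip entries of
# the form (hypothetical timings):
#
#   1
#   00:00:05,000 --> 00:00:09,000
#   Welcome to the video and thanks for watching
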
393 | """ 394 | 395 | script = json_extract(script_folder) 396 | audio_files = get_files(audio_file_folder,(".wav",".mp3")) 397 | audio_clips = [] 398 | [audio_clips.append(AudioFileClip(x)) for x in audio_files] 399 | subs = pysrt.SubRipFile() 400 | start_time = 5 401 | chunk = '' 402 | chunk_duration = 0 403 | end_time = 5 404 | n = 1 405 | for text,audio_clip in zip(script,audio_clips): 406 | duration = audio_clip.duration 407 | words = text.split() 408 | if len(words) > chunk_size: 409 | for i in range(0,len(words),chunk_size): 410 | chunk = " ".join(words[i : (i+chunk_size if i < len(words)-1 else len(words)-1)]) 411 | chunk_duration = duration * (len(chunk.split())/len(words)) 412 | end_time += chunk_duration 413 | subtitle = pysrt.SubRipItem( 414 | index=n, 415 | start=pysrt.SubRipTime(seconds=start_time), 416 | end=pysrt.SubRipTime(seconds=end_time), 417 | text=chunk 418 | ) 419 | subs.append(subtitle) 420 | # For Debugging: 421 | # print(f"Subtitle no. {n} added successfully.") 422 | # print(f"Start : {start_time}") 423 | # print(f"End : {end_time}") 424 | start_time = end_time 425 | n+=1 426 | else: 427 | chunk = text 428 | chunk_duration = duration 429 | end_time += chunk_duration 430 | subtitle = pysrt.SubRipItem( 431 | index=len(subs) + 1, 432 | start=pysrt.SubRipTime(seconds=start_time), 433 | end=pysrt.SubRipTime(seconds=end_time), 434 | text=chunk 435 | ) 436 | subs.append(subtitle) 437 | # For Debugging: 438 | # print(f"Subtitle no. {n} added successfully.") 439 | # print(f"Start : {start_time}") 440 | # print(f"End : {end_time}") 441 | start_time = end_time 442 | n+=1 443 | 444 | subs.save(outfile_path) 445 | print(f"File saved successfully at {outfile_path}") 446 | 447 | 448 | # if __name__ == "__main__": 449 | # image_folder = "resources/images/" 450 | # audio_folder = "resources/Audio/" 451 | # script_path = "resources/scripts/script.json" 452 | # font_path = "resources/font/font.ttf" 453 | # sub_output_file = "The.srt" 454 | # topic = extract_topic_from_json(script_path) 455 | # output_file = f"The.mp4" 456 | 457 | # create_complete_srt(script_folder=script_path, 458 | # audio_file_folder=audio_folder, 459 | # outfile_path=sub_output_file, 460 | # chunk_size = 10) 461 | 462 | # create_video(image_folder, audio_folder,script_path,font_path, output_file,with_subtitles=True) 463 | 464 | --------------------------------------------------------------------------------