├── requirements.txt
├── .env copy
├── .gitignore
├── images.py
├── narration.py
├── README.md
├── video.py
├── text.py
└── main.py

/requirements.txt:
--------------------------------------------------------------------------------
opencv-python
python-dotenv
elevenlabs
openai
pydub
numpy

--------------------------------------------------------------------------------
/.env copy:
--------------------------------------------------------------------------------
# ELEVENLABS API
ELEVENLABS_API_KEY=YOUR_API_KEY

# OPENAI API
OPENAI_API_KEY=YOUR_API_KEY

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
old/
shorts/
venv/
myvenv/
aishort/
*.txt
!response.txt
!requirements.txt
.env

--------------------------------------------------------------------------------
/images.py:
--------------------------------------------------------------------------------
from openai import OpenAI
from dotenv import load_dotenv
import base64
import os

# Make sure OPENAI_API_KEY from .env is available before the client is created,
# since this module may be imported before main.py calls load_dotenv().
load_dotenv()

client = OpenAI()

def create_from_data(data, output_dir):
    """Generate one background image per "image" element in the parsed script data."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    image_number = 0
    for element in data:
        if element["type"] != "image":
            continue
        image_number += 1
        image_name = f"image_{image_number}.webp"
        generate(element["description"] + ". Vertical image, fully filling the canvas.", os.path.join(output_dir, image_name))

def generate(prompt, output_file, size="1024x1792"):
    """Generate a single DALL-E 3 image and write it to output_file."""
    response = client.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size=size,
        quality="standard",
        response_format="b64_json",
        n=1,
    )

    image_b64 = response.data[0].b64_json

    with open(output_file, "wb") as f:
        f.write(base64.b64decode(image_b64))

--------------------------------------------------------------------------------
/narration.py:
--------------------------------------------------------------------------------
from elevenlabs import generate, set_api_key, save, RateLimitError
from dotenv import load_dotenv
import openai
import os

# Read the ElevenLabs key from the environment (.env) instead of hardcoding it here.
load_dotenv()
set_api_key(os.getenv("ELEVENLABS_API_KEY"))

narration_api = "elevenlabs"  # (or "openai")

def parse(narration):
    """Split the GPT response into image descriptions and narration lines."""
    data = []
    narrations = []
    lines = narration.split("\n")
    for line in lines:
        if line.startswith('Narrator: '):
            text = line.replace('Narrator: ', '')
            data.append({
                "type": "text",
                "content": text.strip('"'),
            })
            narrations.append(text.strip('"'))
        elif line.startswith('['):
            background = line.strip('[]')
            data.append({
                "type": "image",
                "description": background,
            })
    return data, narrations

def create(data, output_folder):
    """Create one narration audio file per text element."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    n = 0
    for element in data:
        if element["type"] != "text":
            continue

        n += 1
        output_file = os.path.join(output_folder, f"narration_{n}.mp3")

        if narration_api == "openai":
            audio = openai.audio.speech.create(
                input=element["content"],
                model="tts-1",
                voice="alloy",
            )

            audio.stream_to_file(output_file)
        else:
            audio = generate(
                # Change the speaker and model here!
                # https://elevenlabs.io/docs/api-reference/text-to-speech
                text=element["content"],
                voice="Michael",
                model="eleven_monolingual_v1"
            )
            save(audio, output_file)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AI Shorts Generator

AI Shorts Generator is a tool for creating short videos. It writes a script with GPT-4, narrates it with either ElevenLabs speech synthesis or OpenAI text-to-speech, and generates background images with DALL-E 3, which OpenCV then assembles into a finished short or reel. Narration uses ElevenLabs by default; to use OpenAI's text-to-speech instead, set `narration_api = "openai"` in `narration.py`.

The tool can also be adapted to use open-source language models such as Mistral-7B and open-source Stable Diffusion models, but this requires access to significant GPU resources.

## License

This code is released under the MIT license, meaning anyone is free to use it. It nevertheless comes with no warranties and is provided "as is". Responsibility for the content you generate with this software, including potential copyright issues, rests solely with you.

## Getting Started

Running AI Shorts Generator involves some expense, primarily for generating images via DALL-E 3. Expect to spend roughly $0.10-0.45 in OpenAI credits per video.

You will need an OpenAI API key and an ElevenLabs API key (the latter is not immediately necessary, but will be after you have generated a few videos).

* Obtain an OpenAI API key [here](https://platform.openai.com/api-keys)
* Get an ElevenLabs API key [here](https://elevenlabs.io/speech-synthesis) (not immediately necessary)

### Installation and Usage

Here are the steps to install and use AI Shorts Generator:

1. **Clone the Repository**
   ```
   git clone https://github.com/MainAIdk/AI-Shorts-Generator.git
   ```
2. **Navigate to the Directory**
   ```
   cd AI-Shorts-Generator
   ```
3. **Set Up a Python Virtual Environment**
   ```
   python -m venv myvenv
   ```
4. **Activate the Environment**
   ```
   source myvenv/bin/activate
   ```
5. **Install Required Packages**
   ```
   pip install -r requirements.txt
   ```
6. **Set Environment Variables**

   Open `.env copy`, insert your API keys, and rename the file to `.env`.
7. **Create the GPT Input File**

   Create a file (e.g. `source.txt`) containing a short brief for your video (see the example at the end of this README).
8. **Generate Your Short Video**
   ```
   python main.py source.txt
   ```
   The script will generate your video. A successful run ends with output like:
   ```console
   Generating script...
   Generating narration...
   Generating images...
   Generating video...

   FINISHED! Here is your video: shorts/1701788183/short.avi
   ```

## Need Assistance?

If you need help setting up this project, or want assistance with your personal or company's AI project, we are just an email away. Reach us at info@mainai.dk. Happy generating!
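
## Example source material

The brief passed to `main.py` can be any short piece of text; GPT-4 turns it into the narration sentences and image descriptions. A minimal `source.txt` might look like the following (the topic and wording are purely illustrative and not part of the project):

```
A 40-second short about why octopuses are considered some of the smartest
animals in the ocean. Keep the tone curious and upbeat, and end with one
surprising fact the viewer can repeat to a friend.
```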
--------------------------------------------------------------------------------
/video.py:
--------------------------------------------------------------------------------
from pydub import AudioSegment
import numpy as np
import math
import cv2
import os

import text

def get_audio_duration(audio_file):
    """Return the duration of an audio file in milliseconds."""
    return len(AudioSegment.from_file(audio_file))

def resize_image(image, width, height):
    # Calculate the aspect ratio of the original image
    aspect_ratio = image.shape[1] / image.shape[0]

    # Calculate the new dimensions to fit within the desired size while preserving the aspect ratio
    if aspect_ratio > (width / height):
        new_width = width
        new_height = int(width / aspect_ratio)
    else:
        new_height = height
        new_width = int(height * aspect_ratio)

    # Resize the image to the new dimensions without distorting it
    return cv2.resize(image, (new_width, new_height))

def create(narrations, output_dir, output_filename):
    # Define the dimensions and frame rate of the video
    width, height = 1080, 1920  # Change as needed for your vertical video
    frame_rate = 30  # Adjust as needed

    # Length of the cross-fade between consecutive images, in milliseconds
    fade_time = 1000

    # Create a VideoWriter object to save the video
    fourcc = cv2.VideoWriter_fourcc(*'XVID')  # You can change the codec as needed
    temp_video = os.path.join(output_dir, "temp_video.avi")  # Temporary output video file
    out = cv2.VideoWriter(temp_video, fourcc, frame_rate, (width, height))

    # Image files generated for this short (image_1.webp, image_2.webp, ...)
    image_paths = os.listdir(os.path.join(output_dir, "images"))
    image_count = len(image_paths)

    # Load images and perform the cross-fade transition effect
    for i in range(image_count):
        image1 = cv2.imread(os.path.join(output_dir, "images", f"image_{i+1}.webp"))

        if i+1 < image_count:
            image2 = cv2.imread(os.path.join(output_dir, "images", f"image_{i+2}.webp"))
        else:
            image2 = cv2.imread(os.path.join(output_dir, "images", "image_1.webp"))

        image1 = resize_image(image1, width, height)
        image2 = resize_image(image2, width, height)

        narration = os.path.join(output_dir, "narrations", f"narration_{i+1}.mp3")
        duration = get_audio_duration(narration)

        if i > 0:
            duration -= fade_time

        if i == image_count-1:
            duration -= fade_time

        # Hold the current image for the (remaining) duration of its narration,
        # pasted into the top of a black vertical frame
        for _ in range(math.floor(duration/1000*frame_rate)):
            vertical_video_frame = np.zeros((height, width, 3), dtype=np.uint8)
            vertical_video_frame[:image1.shape[0], :] = image1

            out.write(vertical_video_frame)

        # Cross-fade from the current image to the next one
        for alpha in np.linspace(0, 1, math.floor(fade_time/1000*frame_rate)):
            blended_image = cv2.addWeighted(image1, 1 - alpha, image2, alpha, 0)
            vertical_video_frame = np.zeros((height, width, 3), dtype=np.uint8)
            vertical_video_frame[:image1.shape[0], :] = blended_image

            out.write(vertical_video_frame)

    # Release the VideoWriter and close any open windows
    out.release()
    cv2.destroyAllWindows()

    text.add_narration_to_video(narrations, temp_video, output_dir, output_filename)

    os.remove(temp_video)

--------------------------------------------------------------------------------
/text.py:
--------------------------------------------------------------------------------
from pydub import AudioSegment
import subprocess
import math
import cv2
import os

# Milliseconds trimmed from the very first word's on-screen time
offset = 50

def get_audio_duration(audio_file):
    """Return the duration of an audio file in milliseconds."""
    return len(AudioSegment.from_file(audio_file))

def write_text(text, frame, video_writer):
    font = cv2.FONT_HERSHEY_SIMPLEX
    white_color = (255, 255, 255)
    black_color = (0, 0, 0)
    thickness = 10
    font_scale = 3
    border = 5

    # Calculate the position for centered text
    text_size = cv2.getTextSize(text, font, font_scale, thickness)[0]
    text_x = (frame.shape[1] - text_size[0]) // 2  # Center horizontally
    text_y = (frame.shape[0] + text_size[1]) // 2  # Center vertically
    org = (text_x, text_y)  # Position of the text

    # Draw a black outline first, then the white text on top of it
    frame = cv2.putText(frame, text, org, font, font_scale, black_color, thickness + border * 2, cv2.LINE_AA)
    frame = cv2.putText(frame, text, org, font, font_scale, white_color, thickness, cv2.LINE_AA)

    video_writer.write(frame)

def add_narration_to_video(narrations, input_video, output_dir, output_file):
    # Open the video file
    cap = cv2.VideoCapture(input_video)

    # Define the codec and create a VideoWriter object to save the output video
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    temp_video = os.path.join(output_dir, "with_transcript.avi")
    out = cv2.VideoWriter(temp_video, fourcc, 30, (int(cap.get(3)), int(cap.get(4))))

    full_narration = AudioSegment.empty()

    for i, narration in enumerate(narrations):
        audio = os.path.join(output_dir, "narrations", f"narration_{i+1}.mp3")
        duration = get_audio_duration(audio)
        narration_frames = math.floor(duration / 1000 * 30)

        full_narration += AudioSegment.from_file(audio)

        # Estimate how long each word stays on screen from its share of the characters
        char_count = len(narration.replace(" ", ""))
        ms_per_char = duration / char_count

        frames_written = 0
        words = narration.split(" ")
        for w, word in enumerate(words):
            word_ms = len(word) * ms_per_char

            if i == 0 and w == 0:
                word_ms -= offset
                if word_ms < 0:
                    word_ms = 0

            for _ in range(math.floor(word_ms/1000*30)):
                ret, frame = cap.read()
                if not ret:
                    break
                write_text(word, frame, out)
                frames_written += 1

        # Pad with caption-free frames so each narration keeps its full duration
        for _ in range(narration_frames - frames_written):
            ret, frame = cap.read()
            if not ret:
                break
            out.write(frame)

    # Copy any remaining frames through unchanged
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        out.write(frame)

    temp_narration = os.path.join(output_dir, "narration.mp3")
    full_narration.export(temp_narration, format="mp3")

    # Release the VideoCapture and VideoWriter objects
    cap.release()
    out.release()

    # Close all OpenCV windows (if any)
    cv2.destroyAllWindows()

    ffmpeg_command = [
        'ffmpeg',
        '-y',
        '-i', temp_video,
        '-i', temp_narration,
        '-map', '0:v',   # Map video from the first input
        '-map', '1:a',   # Map audio from the second input
        '-c:v', 'copy',  # Copy the video stream as-is
        '-c:a', 'aac',   # Encode the audio as AAC
        '-strict', 'experimental',
        os.path.join(output_dir, output_file)
    ]

    subprocess.run(ffmpeg_command, capture_output=True)

    os.remove(temp_video)
    os.remove(temp_narration)

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
# This software is released under the MIT license
# and may therefore be used freely by anyone.
# Made by Mads Andersen @ MainAI
# A star would be warmly appreciated

from openai import OpenAI
import time
import json
import sys
import os

import narration
import images
import video
from dotenv import load_dotenv

# Load the environment variables from the .env file
load_dotenv()

# Fetch API keys from the environment
elevenlabs_api_key = os.getenv('ELEVENLABS_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')

# Initialize the OpenAI client with the API key
client = OpenAI(api_key=openai_api_key)

if len(sys.argv) < 2:
    print(f"USAGE: {sys.argv[0]} SOURCE_FILENAME")
    sys.exit(1)

with open(sys.argv[1]) as f:
    source_material = f.read()

short_id = str(int(time.time()))
output_file = "short.avi"

basedir = os.path.join("shorts", short_id)
if not os.path.exists(basedir):
    os.makedirs(basedir)

print("Generating script...")

response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "system",
            "content": """You are a YouTube short narration generator. You generate 30 seconds to 1 minute of narration. The shorts you create have a background that fades from image to image as the narration is going on.

You will need to generate descriptions of images for each of the sentences in the short. They will be passed to an AI image generator. DO NOT UNDER ANY CIRCUMSTANCES use names of celebrities or other real people in the image descriptions. It is illegal to generate images of celebrities. Only describe persons without their names. Do not reference any real person or group in the image descriptions. Do not mention female figures or other sexual content in the image descriptions, because they are not allowed.

You are, however, allowed to use any content, including real names, in the narration. Only the image descriptions are restricted.

Note that the narration will be fed into a text-to-speech engine, so do not use special characters.

DO NOT generate image descriptions of anything that violates the image generation safety systems.

Respond with pairs of an image description in square brackets and a narration below it. Both of them should be on their own lines, as follows:

###

[Description of a background image]

Narrator: "One sentence of narration"

[Description of a background image]

Narrator: "One sentence of narration"

[Description of a background image]

Narrator: "One sentence of narration"

###

The short should be 6 sentences maximum.

You should add a description of a fitting, beautiful and captivating background image in between all of the narrations. It will later be used to generate an image with AI. Avoid images with letters and signs.
80 | """ 81 | }, 82 | { 83 | "role": "user", 84 | "content": f"Create a YouTube short narration based on the following source material:\n\n{source_material}" 85 | } 86 | ] 87 | ) 88 | 89 | response_text = response.choices[0].message.content 90 | response_text.replace("’", "'").replace("`", "'").replace("…", "...").replace("“", '"').replace("”", '"') 91 | 92 | with open(os.path.join(basedir, "response.txt"), "w") as f: 93 | f.write(response_text) 94 | 95 | data, narrations = narration.parse(response_text) 96 | with open(os.path.join(basedir, "data.json"), "w") as f: 97 | json.dump(data, f, ensure_ascii=False) 98 | 99 | print(f"Genererer fortælling....") 100 | narration.create(data, os.path.join(basedir, "narrations")) 101 | 102 | print("Genererer billeder...") 103 | images.create_from_data(data, os.path.join(basedir, "images")) 104 | 105 | print("Genererer video...") 106 | video.create(narrations, basedir, output_file) 107 | 108 | print(f"FÆRDIG! Her er din video: {os.path.join(basedir, output_file)}") 109 | --------------------------------------------------------------------------------