├── requirements.txt
├── .env copy
├── .gitignore
├── images.py
├── narration.py
├── README.md
├── video.py
├── text.py
└── main.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | opencv-python
2 | python-dotenv
3 | elevenlabs
4 | openai
5 | pydub
6 | numpy
7 | 


--------------------------------------------------------------------------------
/.env copy:
--------------------------------------------------------------------------------
1 | 
2 | # ELVENLABS_API
3 | ELEVENLABS_API_KEY=YOUR_API_KEY
4 | 
5 | # OPENAI API
6 | OPENAI_API_KEY=YOUR_API_KEY


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__/
 2 | old/
 3 | shorts/
 4 | venv/
 5 | myvenv/
 6 | aishort/
 7 | *.txt
 8 | !response.txt
 9 | !requirements.txt
10 | .env
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/images.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | import base64
 3 | import os
 4 | 
 5 | 
 6 | 
 7 | client = OpenAI()
 8 | 
 9 | def create_from_data(data, output_dir):
10 |     if not os.path.exists(output_dir):
11 |         os.makedirs(output_dir)
12 | 
13 |     image_number = 0
14 |     for element in data:
15 |         if element["type"] != "image":
16 |             continue
17 |         image_number += 1
18 |         image_name = f"image_{image_number}.webp"
19 |         generate(element["description"] + ". Vertical image, fully filling the canvas.", os.path.join(output_dir, image_name))
20 | 
21 | def generate(prompt, output_file, size="1024x1792"):
22 |     response = client.images.generate(
23 |         model="dall-e-3",
24 |         prompt=prompt,
25 |         size=size,
26 |         quality="standard",
27 |         response_format="b64_json",
28 |         n=1,
29 |     )
30 | 
31 |     image_b64 = response.data[0].b64_json
32 | 
33 |     with open(output_file, "wb") as f:
34 |         f.write(base64.b64decode(image_b64))
35 | 
36 | 


--------------------------------------------------------------------------------
/narration.py:
--------------------------------------------------------------------------------
 1 | from elevenlabs import generate, set_api_key, save, RateLimitError
 2 | import openai
 3 | import os
 4 | 
 5 | set_api_key("YOUR_ELEVENLABS_APIKEY")
 6 | 
 7 | narration_api = "elevenlabs" # (or "openai")
 8 | 
 9 | def parse(narration):
10 |     data = []
11 |     narrations = []
12 |     lines = narration.split("\n")
13 |     for line in lines:
14 |         if line.startswith('Narrator: '):
15 |             text = line.replace('Narrator: ', '')
16 |             data.append({
17 |                 "type": "text",
18 |                 "content": text.strip('"'),
19 |             })
20 |             narrations.append(text.strip('"'))
21 |         elif line.startswith('['):
22 |             background = line.strip('[]')
23 |             data.append({
24 |                 "type": "image",
25 |                 "description": background,
26 |             })
27 |     return data, narrations
28 | 
29 | def create(data, output_folder):
30 |     if not os.path.exists(output_folder):
31 |         os.makedirs(output_folder)
32 | 
33 |     n = 0
34 |     for element in data:
35 |         if element["type"] != "text":
36 |             continue
37 | 
38 |         n += 1
39 |         output_file = os.path.join(output_folder, f"narration_{n}.mp3")
40 | 
41 |         if narration_api == "openai":
42 |             audio = openai.audio.speech.create(
43 |                 input=element["content"],
44 |                 model="tts-1",
45 |                 voice="alloy",
46 |             )
47 | 
48 |             audio.stream_to_file(output_file)
49 |         else:
50 |             audio = generate(
51 |             # Ændre Speaker og model her!
52 |             # https://elevenlabs.io/docs/api-reference/text-to-speech
53 |                  
54 |                 text=element["content"],
55 |                 voice="Michael",
56 |                 model="eleven_monolingual_v1"
57 |             )
58 |             save(audio, output_file)
59 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # AI Shorts Generator
 2 | 
 3 | AI Shorts Generator is an innovative tool designed to create short videos. The tool utilizes scripts powered by GPT-4 and voices by either Elevenlabs' speech synthesis or OpenAI's text-to-speech. DALL-E 3 generates background images which are carefully assembled by OpenCV, culminating into a compelling short video or reel.
 4 | 
 5 | The tool offers an option to use open-source language models like Mistral-7b and open-source stable diffusion models, but this requires access to significant GPU resources.
 6 | 
 7 | ## License
 8 | 
 9 | This code is released under the MIT license, implying it's free to be used by anyone. Nevertheless, it comes with no warranties and is provided 'as is'. The responsibility for the content you generate using this software, including potential copyright issues, rests solely with you. 
10 | 
11 | ## Getting Started
12 | 
13 | Getting started with AI Shorts Generator involves a few expenses, primarily for generating images via DALL-E 3. Anticipate spending approximately $0.10-0.45 in OpenAI credits per video.
14 | 
15 | You will initially need an OpenAI API key and an Elevenlabs key (which may not be immediately necessary, but will be after generating a few videos).
16 | 
17 | * Obtain an OpenAI API key [here](https://platform.openai.com/api-keys)
18 | * Get an Elevenlabs API key [here](https://elevenlabs.io/speech-synthesis) (not immediately necessary)
19 | 
20 | ### Installation and Usage
21 | 
22 | Here are the steps to install and use the AI Shorts Generator:
23 | 
24 | 1. **Clone the Repository**  
25 | ```
26 | git clone https://github.com/MainAIdk/AI-Shorts-Generator.git
27 | ```
28 | 2. **Navigate to the Directory**
29 | ```
30 | cd AI-Shorts-Generator
31 | ```
32 | 3. **Setup a Python Virtual Environment**
33 | ```
34 | python -m venv myvenv
35 | ```
36 | 4. **Activate the Environment**
37 | ```
38 | source myvenv/bin/activate
39 | ```
40 | 5. **Install Required Packages**
41 | ```
42 | pip install -r requirements.txt
43 | ```
44 | 6. **Set Environment Variables**
45 | ```
46 | Open .env copy, insert your API keys and rename to .env.
47 | ```
48 | 7. **Create GPT Input File**
49 | ```
50 | Create a file (e.g. source.txt) and provide a brief about your short video.
51 | ```
52 | 8. **Generate Your Short Video**
53 | ```
54 | python main.py source.txt
55 | ```
56 | The script will generate your video. A successful operation should output:  
57 | ```console
58 | Generating script...
59 | Generating narration...
60 | Generating images...
61 | Generating video...
62 | 
63 | FINISHED! Here is your video: shorts/1701788183/short.avi
64 | ```
65 | 
66 | ## Need Assistance?
67 | 
68 | If you need help setting up this project or require assistance for your personal or company's AI project, we are just an email away. Reach us at info@mainai.dk. Happy Generating!
69 | 


--------------------------------------------------------------------------------
/video.py:
--------------------------------------------------------------------------------
 1 | from pydub import AudioSegment
 2 | import numpy as np
 3 | import math
 4 | import cv2
 5 | import os
 6 | 
 7 | import text
 8 | 
 9 | def get_audio_duration(audio_file):
10 |     return len(AudioSegment.from_file(audio_file))
11 | 
12 | def resize_image(image, width, height):
13 |     # Calculate the aspect ratio of the original image
14 |     aspect_ratio = image.shape[1] / image.shape[0]
15 | 
16 |     # Calculate the new dimensions to fit within the desired size while preserving aspect ratio
17 |     if aspect_ratio > (width / height):
18 |         new_width = width
19 |         new_height = int(width / aspect_ratio)
20 |     else:
21 |         new_height = height
22 |         new_width = int(height * aspect_ratio)
23 | 
24 |     # Resize the image to the new dimensions without distorting it
25 |     return cv2.resize(image, (new_width, new_height))
26 | 
27 | def create(narrations, output_dir, output_filename):
28 |     # Define the dimensions and frame rate of the video
29 |     width, height = 1080, 1920  # Change as needed for your vertical video
30 |     frame_rate = 30  # Adjust as needed
31 | 
32 |     fade_time = 1000
33 | 
34 |     # Create a VideoWriter object to save the video
35 |     fourcc = cv2.VideoWriter_fourcc(*'XVID')  # You can change the codec as needed
36 |     temp_video = os.path.join(output_dir, "temp_video.avi")  # Output video file name
37 |     out = cv2.VideoWriter(temp_video, fourcc, frame_rate, (width, height))
38 | 
39 |     # List of image file paths to use in the video
40 |     image_paths = os.listdir(os.path.join(output_dir, "images"))  # Replace with your image paths
41 |     image_count = len(image_paths)
42 | 
43 |     # Load images and perform the transition effect
44 |     for i in range(image_count):
45 |         image1 = cv2.imread(os.path.join(output_dir, "images", f"image_{i+1}.webp"))
46 | 
47 |         if i+1 < image_count:
48 |             image2 = cv2.imread(os.path.join(output_dir, "images", f"image_{i+2}.webp"))
49 |         else:
50 |             image2 = cv2.imread(os.path.join(output_dir, "images", f"image_1.webp"))
51 | 
52 |         image1 = resize_image(image1, width, height)
53 |         image2 = resize_image(image2, width, height)
54 | 
55 |         narration = os.path.join(output_dir, "narrations", f"narration_{i+1}.mp3")
56 |         duration = get_audio_duration(narration)
57 | 
58 |         if i > 0:
59 |             duration -= fade_time
60 | 
61 |         if i == image_count-1:
62 |             duration -= fade_time
63 | 
64 |         for _ in range(math.floor(duration/1000*30)):
65 |             vertical_video_frame = np.zeros((height, width, 3), dtype=np.uint8)
66 |             vertical_video_frame[:image1.shape[0], :] = image1
67 | 
68 |             out.write(vertical_video_frame)
69 | 
70 |         for alpha in np.linspace(0, 1, math.floor(fade_time/1000*30)):
71 |             blended_image = cv2.addWeighted(image1, 1 - alpha, image2, alpha, 0)
72 |             vertical_video_frame = np.zeros((height, width, 3), dtype=np.uint8)
73 |             vertical_video_frame[:image1.shape[0], :] = blended_image
74 | 
75 |             out.write(vertical_video_frame)
76 | 
77 |     # Release the VideoWriter and close the window if any
78 |     out.release()
79 |     cv2.destroyAllWindows()
80 | 
81 |     text.add_narration_to_video(narrations, temp_video, output_dir, output_filename)
82 | 
83 |     os.remove(temp_video)
84 | 


--------------------------------------------------------------------------------
/text.py:
--------------------------------------------------------------------------------
  1 | from pydub import AudioSegment
  2 | import subprocess
  3 | import math
  4 | import cv2
  5 | import os
  6 | 
  7 | offset = 50
  8 | 
  9 | def get_audio_duration(audio_file):
 10 |     return len(AudioSegment.from_file(audio_file))
 11 | 
 12 | def write_text(text, frame, video_writer):
 13 |     font = cv2.FONT_HERSHEY_SIMPLEX
 14 |     white_color = (255, 255, 255)
 15 |     black_color = (0, 0, 0)
 16 |     thickness = 10
 17 |     font_scale = 3
 18 |     border = 5
 19 | 
 20 |     # Calculate the position for centered text
 21 |     text_size = cv2.getTextSize(text, font, font_scale, thickness)[0]
 22 |     text_x = (frame.shape[1] - text_size[0]) // 2  # Center horizontally
 23 |     text_y = (frame.shape[0] + text_size[1]) // 2  # Center vertically
 24 |     org = (text_x, text_y)  # Position of the text
 25 | 
 26 |     frame = cv2.putText(frame, text, org, font, font_scale, black_color, thickness + border * 2, cv2.LINE_AA)
 27 |     frame = cv2.putText(frame, text, org, font, font_scale, white_color, thickness, cv2.LINE_AA)
 28 | 
 29 |     video_writer.write(frame)
 30 | 
 31 | def add_narration_to_video(narrations, input_video, output_dir, output_file):
 32 |     # Open the video file
 33 |     cap = cv2.VideoCapture(input_video)
 34 | 
 35 |     # Define the codec and create a VideoWriter object to save the output video
 36 |     fourcc = cv2.VideoWriter_fourcc(*'XVID')
 37 |     temp_video = os.path.join(output_dir, "with_transcript.avi")
 38 |     out = cv2.VideoWriter(temp_video, fourcc, 30, (int(cap.get(3)), int(cap.get(4))))
 39 | 
 40 |     full_narration = AudioSegment.empty()
 41 | 
 42 |     for i, narration in enumerate(narrations):
 43 |         audio = os.path.join(output_dir, "narrations", f"narration_{i+1}.mp3")
 44 |         duration = get_audio_duration(audio)
 45 |         narration_frames = math.floor(duration / 1000 * 30)
 46 | 
 47 |         full_narration += AudioSegment.from_file(audio)
 48 | 
 49 |         char_count = len(narration.replace(" ", ""))
 50 |         ms_per_char = duration / char_count
 51 | 
 52 |         frames_written = 0
 53 |         words = narration.split(" ")
 54 |         for w, word in enumerate(words):
 55 |             word_ms = len(word) * ms_per_char
 56 | 
 57 |             if i == 0 and w == 0:
 58 |                 word_ms -= offset
 59 |                 if word_ms < 0:
 60 |                     word_ms = 0
 61 | 
 62 |             for _ in range(math.floor(word_ms/1000*30)):
 63 |                 ret, frame = cap.read()
 64 |                 if not ret:
 65 |                     break
 66 |                 write_text(word, frame, out)
 67 |                 frames_written += 1
 68 | 
 69 |         for _ in range(narration_frames - frames_written):
 70 |             ret, frame = cap.read()
 71 |             out.write(frame)
 72 | 
 73 |     while out.isOpened():
 74 |         ret, frame = cap.read()
 75 |         if not ret:
 76 |             break
 77 |         out.write(frame)
 78 | 
 79 |     temp_narration = os.path.join(output_dir, "narration.mp3")
 80 |     full_narration.export(temp_narration, format="mp3")
 81 | 
 82 |     # Release the VideoCapture and VideoWriter objects
 83 |     cap.release()
 84 |     out.release()
 85 | 
 86 |     # Close all OpenCV windows (if any)
 87 |     cv2.destroyAllWindows()
 88 | 
 89 |     ffmpeg_command = [
 90 |         'ffmpeg',
 91 |         '-y',
 92 |         '-i', temp_video,
 93 |         '-i', temp_narration,
 94 |         '-map', '0:v',   # Map video from the first input
 95 |         '-map', '1:a',   # Map audio from the second input
 96 |         '-c:v', 'copy',  # Copy video codec
 97 |         '-c:a', 'aac',   # AAC audio codec
 98 |         '-strict', 'experimental',
 99 |         os.path.join(output_dir, output_file)
100 |     ]
101 | 
102 |     subprocess.run(ffmpeg_command, capture_output=True)
103 | 
104 |     os.remove(temp_video)
105 |     os.remove(temp_narration)
106 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | # Denne software er under MIT license
  2 | # Og kan derfor benyttes frit af enhver.
  3 | # Lavet af Mads Andersen @ MainAI
  4 | # Stjerne modtages med kyshånd
  5 | 
  6 | from openai import OpenAI
  7 | import time
  8 | import json
  9 | import sys
 10 | import os
 11 | 
 12 | import narration
 13 | import images
 14 | import video
 15 | from dotenv import load_dotenv
 16 | 
 17 | # Load the environment variables from the .env file
 18 | load_dotenv()
 19 | 
 20 | # Fetch API keys from environment
 21 | elevenlabs_api_key = os.getenv('ELEVENLABS_API_KEY')
 22 | openai_api_key = os.getenv('OPENAI_API_KEY')
 23 | 
 24 | 
 25 | # Initialize the OpenAI client with the API key
 26 | client = OpenAI(api_key=openai_api_key)
 27 | 
 28 | if len(sys.argv) < 2:
 29 |     print(f"USAGE: {sys.argv[0]} SOURCE_FILENAME")
 30 |     sys.exit(1)
 31 | 
 32 | with open(sys.argv[1]) as f:
 33 |     source_material = f.read()
 34 | 
 35 | short_id = str(int(time.time()))
 36 | output_file = "short.avi"
 37 | 
 38 | basedir = os.path.join("shorts", short_id)
 39 | if not os.path.exists(basedir):
 40 |     os.makedirs(basedir)
 41 | 
 42 | print("Genererer manuskript..")
 43 | 
 44 | response = client.chat.completions.create(
 45 |     model="gpt-4",
 46 |     messages=[
 47 |         {
 48 |             "role": "system",
 49 |             "content": """You are a YouTube short narration generator. You generate 30 seconds to 1 minute of narration. The shorts you create have a background that fades from image to image as the narration is going on.
 50 | 
 51 | You will need to generate descriptions of images for each of the sentences in the short. They will be passed to an AI image generator. DO NOT IN ANY CIRCUMSTANCES use names of celebrities or people in the image descriptions. It is illegal to generate images of celebrities. Only describe persons without their names. Do not reference any real person or group in the image descriptions. Don't mention the female figure or other sexual content in the images because they are not allowed.
 52 | 
 53 | You are however allowed to use any content, including real names in the narration. Only image descriptions are restricted.
 54 | 
 55 | Note that the narration will be fed into a text-to-speech engine, so don't use special characters.
 56 | 
 57 | DO NOT generate any image describtions of anything that violates the image generation saftey systems.
 58 | 
 59 | Respond with a pair of an image description in square brackets and a narration below it. Both of them should be on their own lines, as follows:
 60 | 
 61 | ###
 62 | 
 63 | [Description of a background image]
 64 | 
 65 | Narrator: "One sentence of narration"
 66 | 
 67 | [Description of a background image]
 68 | 
 69 | Narrator: "One sentence of narration"
 70 | 
 71 | [Description of a background image]
 72 | 
 73 | Narrator: "One sentence of narration"
 74 | 
 75 | ###
 76 | 
 77 | The short should be 6 sentences maximum.
 78 | 
 79 | You should add a description of a fitting, beautiful and captivating background image in between all of the narrations. It will later be used to generate an image with AI. Avoid images with letters and sign.
 80 | """
 81 |         },
 82 |         {
 83 |             "role": "user",
 84 |             "content": f"Create a YouTube short narration based on the following source material:\n\n{source_material}"
 85 |         }
 86 |     ]
 87 | )
 88 | 
 89 | response_text = response.choices[0].message.content
 90 | response_text.replace("’", "'").replace("`", "'").replace("…", "...").replace("“", '"').replace("”", '"')
 91 | 
 92 | with open(os.path.join(basedir, "response.txt"), "w") as f:
 93 |     f.write(response_text)
 94 | 
 95 | data, narrations = narration.parse(response_text)
 96 | with open(os.path.join(basedir, "data.json"), "w") as f:
 97 |     json.dump(data, f, ensure_ascii=False)
 98 | 
 99 | print(f"Genererer fortælling....")
100 | narration.create(data, os.path.join(basedir, "narrations"))
101 | 
102 | print("Genererer billeder...")
103 | images.create_from_data(data, os.path.join(basedir, "images"))
104 | 
105 | print("Genererer video...")
106 | video.create(narrations, basedir, output_file)
107 | 
108 | print(f"FÆRDIG! Her er din video: {os.path.join(basedir, output_file)}")
109 | 


--------------------------------------------------------------------------------