├── .gitignore
├── requirements.txt
├── images.py
├── README.md
├── narration.py
├── main.py
└── video.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
old/
shorts/
*.txt
!response.txt
!requirements.txt
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
opencv-python
elevenlabs
captacity
openai
pydub
numpy
--------------------------------------------------------------------------------
/images.py:
--------------------------------------------------------------------------------
from openai import OpenAI
import base64
import os

client = OpenAI()

def create_from_data(data, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    image_number = 0
    for element in data:
        if element["type"] != "image":
            continue
        image_number += 1
        image_name = f"image_{image_number}.webp"
        generate(element["description"] + ". Vertical image, fully filling the canvas.", os.path.join(output_dir, image_name))

def generate(prompt, output_file, size="1024x1792"):
    response = client.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size=size,
        quality="standard",
        response_format="b64_json",
        n=1,
    )

    image_b64 = response.data[0].b64_json

    with open(output_file, "wb") as f:
        f.write(base64.b64decode(image_b64))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Shortrocity

Shortrocity is a tool for making AI-generated short videos ("shorts" or "reels") with a ChatGPT-generated script, narrated by ElevenLabs or OpenAI text-to-speech. The background images are generated with DALL-E 3. Captions with word highlighting are generated with [Captacity](https://github.com/unconv/captacity), which uses [OpenAI Whisper](https://github.com/openai/whisper).

## Quick Start

First, add your API keys to the environment:

```console
$ export OPENAI_API_KEY=YOUR_OPENAI_API_KEY
$ export ELEVEN_API_KEY=YOUR_ELEVENLABS_API_KEY
```

Then, put your source content in a file, for example `source.txt`, and run `main.py`:

```console
$ ./main.py source.txt
Generating script...
Generating narration...
Generating images...
Generating video...
DONE! Here's your video: shorts/1701788183/short.avi
```
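
Each run gets its own working directory under `shorts/`, named after the current Unix timestamp. After a successful run it should contain the final video alongside the intermediate files, roughly like this (illustrative listing; your timestamp will differ):

```console
$ ls shorts/1701788183/
data.json  images  narrations  response.txt  short.avi
```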

## Caption styling

Optionally, you can specify a settings file that defines the caption styling:

```console
$ ./main.py source.txt settings.json
```

The settings file can look like this, for example:

```json
{
    "font": "Bangers-Regular.ttf",
    "font_size": 130,
    "font_color": "yellow",

    "stroke_width": 3,
    "stroke_color": "black",

    "highlight_current_word": true,
    "word_highlight_color": "red",

    "line_count": 2,

    "padding": 50,

    "shadow_strength": 1.0,
    "shadow_blur": 0.1
}
```
--------------------------------------------------------------------------------
/narration.py:
--------------------------------------------------------------------------------
from elevenlabs.client import ElevenLabs
from elevenlabs import save
import openai
import os

elevenlabs = ElevenLabs(
    api_key=os.getenv("ELEVEN_API_KEY")
)

narration_api = "elevenlabs" # (or "openai")

def parse(narration):
    data = []
    narrations = []
    lines = narration.split("\n")
    for line in lines:
        if line.startswith('Narrator: '):
            text = line.replace('Narrator: ', '')
            data.append({
                "type": "text",
                "content": text.strip('"'),
            })
            narrations.append(text.strip('"'))
        elif line.startswith('['):
            background = line.strip('[]')
            data.append({
                "type": "image",
                "description": background,
            })
    return data, narrations

def create(data, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    n = 0
    for element in data:
        if element["type"] != "text":
            continue

        n += 1
        output_file = os.path.join(output_folder, f"narration_{n}.mp3")

        if narration_api == "openai":
            audio = openai.audio.speech.create(
                input=element["content"],
                model="tts-1",
                voice="alloy",
            )

            audio.stream_to_file(output_file)
        else:
            audio = elevenlabs.generate(
                text=element["content"],
                voice="Michael",
                model="eleven_monolingual_v1"
            )
            save(audio, output_file)
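
# Illustrative usage sketch (not part of the pipeline): shows the script
# format parse() expects, based on the prompt in main.py. Running this file
# directly prints the parsed result without generating any audio.
if __name__ == "__main__":
    example_script = "\n".join([
        "[A calm mountain lake at sunrise]",
        'Narrator: "Some places are best seen before the world wakes up."',
        "",
        "[Mist rising over still water]",
        'Narrator: "The silence there is louder than any city."',
    ])

    example_data, example_narrations = parse(example_script)
    print(example_data)        # alternating image/text elements
    print(example_narrations)  # just the narration sentences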
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from openai import OpenAI
import time
import json
import sys
import os

import narration
import images
import video

client = OpenAI()

if len(sys.argv) < 2:
    print(f"Usage: {sys.argv[0]} <source_file> [settings_file]")
    sys.exit(1)

with open(sys.argv[1]) as f:
    source_material = f.read()

caption_settings = {}
if len(sys.argv) > 2:
    with open(sys.argv[2]) as f:
        caption_settings = json.load(f)

short_id = str(int(time.time()))
output_file = "short.avi"

basedir = os.path.join("shorts", short_id)
if not os.path.exists(basedir):
    os.makedirs(basedir)

print("Generating script...")

response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "system",
            "content": """You are a YouTube short narration generator. You generate 30 seconds to 1 minute of narration. The shorts you create have a background that fades from image to image as the narration is going on.

You will need to generate descriptions of images for each of the sentences in the short. They will be passed to an AI image generator. DO NOT IN ANY CIRCUMSTANCES use names of celebrities or people in the image descriptions. It is illegal to generate images of celebrities. Only describe persons without their names. Do not reference any real person or group in the image descriptions. Don't mention the female figure or other sexual content in the image descriptions, because they are not allowed.

You are, however, allowed to use any content, including real names, in the narration. Only the image descriptions are restricted.

Note that the narration will be fed into a text-to-speech engine, so don't use special characters.

Respond with pairs of an image description in square brackets and a narration below it. Both of them should be on their own lines, as follows:

###

[Description of a background image]

Narrator: "One sentence of narration"

[Description of a background image]

Narrator: "One sentence of narration"

[Description of a background image]

Narrator: "One sentence of narration"

###

The short should be 6 sentences maximum.

You should add a description of a fitting background image between all of the narrations. It will later be used to generate an image with AI.
"""
        },
        {
            "role": "user",
            "content": f"Create a YouTube short narration based on the following source material:\n\n{source_material}"
        }
    ]
)

response_text = response.choices[0].message.content

# Normalize curly quotes and other special characters for the TTS engines
response_text = response_text.replace("’", "'").replace("`", "'").replace("…", "...").replace("“", '"').replace("”", '"')

with open(os.path.join(basedir, "response.txt"), "w") as f:
    f.write(response_text)

data, narrations = narration.parse(response_text)
with open(os.path.join(basedir, "data.json"), "w") as f:
    json.dump(data, f, ensure_ascii=False)

print("Generating narration...")
narration.create(data, os.path.join(basedir, "narrations"))

print("Generating images...")
images.create_from_data(data, os.path.join(basedir, "images"))

print("Generating video...")
video.create(narrations, basedir, output_file, caption_settings)

print(f"DONE! Here's your video: {os.path.join(basedir, output_file)}")
--------------------------------------------------------------------------------
/video.py:
--------------------------------------------------------------------------------
from pydub import AudioSegment
import subprocess
import numpy as np
import captacity
import json
import math
import cv2
import os

def get_audio_duration(audio_file):
    # pydub reports length in milliseconds
    return len(AudioSegment.from_file(audio_file))

def add_narration_to_video(narrations, input_video, output_dir, output_file):
    full_narration = AudioSegment.empty()

    for i, _ in enumerate(narrations):
        audio = os.path.join(output_dir, "narrations", f"narration_{i+1}.mp3")
        full_narration += AudioSegment.from_file(audio)

    temp_narration = os.path.join(output_dir, "narration.mp3")
    full_narration.export(temp_narration, format="mp3")

    ffmpeg_command = [
        'ffmpeg',
        '-y',
        '-i', input_video,
        '-i', temp_narration,
        '-map', '0:v', # Map video from the first input
        '-map', '1:a', # Map audio from the second input
        '-c:v', 'copy', # Copy video codec
        '-c:a', 'aac', # AAC audio codec
        '-strict', 'experimental',
        os.path.join(output_dir, output_file)
    ]

    subprocess.run(ffmpeg_command, capture_output=True)

    os.remove(temp_narration)

def resize_image(image, width, height):
    # Calculate the aspect ratio of the original image
    aspect_ratio = image.shape[1] / image.shape[0]

    # Calculate the new dimensions to fit within the desired size while preserving aspect ratio
    # (e.g. a 1024x1792 DALL-E 3 image on a 1080x1920 canvas becomes 1080x1890)
    if aspect_ratio > (width / height):
        new_width = width
        new_height = int(width / aspect_ratio)
    else:
        new_height = height
        new_width = int(height * aspect_ratio)

    # Resize the image to the new dimensions without distorting it
    return cv2.resize(image, (new_width, new_height))
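
# The create() function below builds the short in three stages: it renders a
# silent slideshow with OpenCV (one background image per narration, joined by
# cross-fades), muxes the concatenated narration audio onto it with ffmpeg,
# and finally burns in word-highlighted captions with captacity.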
def create(narrations, output_dir, output_filename, caption_settings: dict|None = None):
    if caption_settings is None:
        caption_settings = {}

    # Define the dimensions and frame rate of the video
    width, height = 1080, 1920 # Change as needed for your vertical video
    frame_rate = 30 # Adjust as needed

    # Length of the cross-fade between background images, in milliseconds
    fade_time = 1000

    # Create a VideoWriter object to save the video
    fourcc = cv2.VideoWriter_fourcc(*'XVID') # You can change the codec as needed
    temp_video = os.path.join(output_dir, "temp_video.avi") # Output video file name
    out = cv2.VideoWriter(temp_video, fourcc, frame_rate, (width, height))

    # List of background images generated by images.py
    image_paths = os.listdir(os.path.join(output_dir, "images"))
    image_count = len(image_paths)

    # Load images and perform the transition effect
    for i in range(image_count):
        image1 = cv2.imread(os.path.join(output_dir, "images", f"image_{i+1}.webp"))

        if i+1 < image_count:
            image2 = cv2.imread(os.path.join(output_dir, "images", f"image_{i+2}.webp"))
        else:
            image2 = cv2.imread(os.path.join(output_dir, "images", f"image_1.webp"))

        image1 = resize_image(image1, width, height)
        image2 = resize_image(image2, width, height)

        narration = os.path.join(output_dir, "narrations", f"narration_{i+1}.mp3")
        duration = get_audio_duration(narration)

        if i > 0:
            duration -= fade_time

        if i == image_count-1:
            duration -= fade_time
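
        # Show the current image as still frames for roughly the length of its
        # narration (the fade_time adjustments above subtract the time already
        # covered by cross-fades), then cross-fade into the next image.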
        for _ in range(math.floor(duration/1000*frame_rate)):
            vertical_video_frame = np.zeros((height, width, 3), dtype=np.uint8)
            vertical_video_frame[:image1.shape[0], :] = image1

            out.write(vertical_video_frame)

        for alpha in np.linspace(0, 1, math.floor(fade_time/1000*frame_rate)):
            blended_image = cv2.addWeighted(image1, 1 - alpha, image2, alpha, 0)
            vertical_video_frame = np.zeros((height, width, 3), dtype=np.uint8)
            vertical_video_frame[:image1.shape[0], :] = blended_image

            out.write(vertical_video_frame)

    # Release the VideoWriter and close any OpenCV windows
    out.release()
    cv2.destroyAllWindows()

    # Add narration audio to video
    with_narration = "with_narration.mp4"
    add_narration_to_video(narrations, temp_video, output_dir, with_narration)

    # Add captions to video
    output_path = os.path.join(output_dir, output_filename)
    input_path = os.path.join(output_dir, with_narration)
    segments = create_segments(narrations, output_dir)

    captacity.add_captions(
        video_file=input_path,
        output_file=output_path,
        segments=segments,
        print_info=True,
        **caption_settings,
    )

    # Clean up temporary files
    os.remove(input_path)
    os.remove(temp_video)

def create_segments(narrations, output_dir):
    segments = []

    offset = 0
    for i, narration in enumerate(narrations):
        audio_file = os.path.join(output_dir, "narrations", f"narration_{i+1}.mp3")

        try:
            t_segments = captacity.transcriber.transcribe_locally(
                audio_file=audio_file,
                prompt=narration,
            )
        except ImportError:
            t_segments = captacity.transcriber.transcribe_with_api(
                audio_file=audio_file,
                prompt=narration,
            )

        o_segments = offset_segments(t_segments, offset)

        segments += o_segments
        offset += get_audio_duration(audio_file) / 1000

    return segments

def offset_segments(segments: list[dict], offset: float):
    for segment in segments:
        segment["start"] += offset
        segment["end"] += offset
        for word in segment["words"]:
            word["start"] += offset
            word["end"] += offset
    return segments
--------------------------------------------------------------------------------