├── .gitignore
├── requirements.txt
├── images.py
├── README.md
├── narration.py
├── main.py
└── video.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
old/
shorts/
*.txt
!response.txt
!requirements.txt
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
opencv-python
elevenlabs
captacity
openai
pydub
numpy
--------------------------------------------------------------------------------
/images.py:
--------------------------------------------------------------------------------
from openai import OpenAI
import base64
import os

client = OpenAI()

def create_from_data(data, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    image_number = 0
    for element in data:
        if element["type"] != "image":
            continue
        image_number += 1
        image_name = f"image_{image_number}.webp"
        generate(element["description"] + ". Vertical image, fully filling the canvas.", os.path.join(output_dir, image_name))

def generate(prompt, output_file, size="1024x1792"):
    response = client.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size=size,
        quality="standard",
        response_format="b64_json",
        n=1,
    )

    image_b64 = response.data[0].b64_json

    with open(output_file, "wb") as f:
        f.write(base64.b64decode(image_b64))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Shortrocity

Shortrocity is a tool for making AI-generated short videos ("shorts" or "reels") with a ChatGPT-generated script, narrated by ElevenLabs or OpenAI text-to-speech. The background images are generated with DALL-E 3. Captions with word highlighting are generated with [Captacity](https://github.com/unconv/captacity), which uses [OpenAI Whisper](https://github.com/openai/whisper).

## Quick Start

First, add your API keys to the environment:

```console
$ export OPENAI_API_KEY=YOUR_OPENAI_API_KEY
$ export ELEVEN_API_KEY=YOUR_ELEVENLABS_API_KEY
```

Then, put your source content in a file, for example `source.txt`, and run `main.py`:

```console
$ ./main.py source.txt
Generating script...
Generating narration...
Generating images...
Generating video...
DONE! Here's your video: shorts/1701788183/short.avi
```
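
Each run gets its own working directory under `shorts/`, named after the current Unix timestamp. After a successful run it should contain the final video alongside the intermediate files, roughly like this (illustrative listing; your timestamp will differ):

```console
$ ls shorts/1701788183/
data.json  images  narrations  response.txt  short.avi
```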

## Caption styling

Optionally, you can specify a settings file that defines the caption styling:

```console
$ ./main.py source.txt settings.json
```

The settings file can look like this, for example:

```json
{
    "font": "Bangers-Regular.ttf",
    "font_size": 130,
    "font_color": "yellow",

    "stroke_width": 3,
    "stroke_color": "black",

    "highlight_current_word": true,
    "word_highlight_color": "red",

    "line_count": 2,

    "padding": 50,

    "shadow_strength": 1.0,
    "shadow_blur": 0.1
}
```
--------------------------------------------------------------------------------
/narration.py:
--------------------------------------------------------------------------------
from elevenlabs.client import ElevenLabs
from elevenlabs import save
import openai
import os

elevenlabs = ElevenLabs(
    api_key=os.getenv("ELEVEN_API_KEY")
)

narration_api = "elevenlabs" # (or "openai")

def parse(narration):
    data = []
    narrations = []
    lines = narration.split("\n")
    for line in lines:
        if line.startswith('Narrator: '):
            text = line.replace('Narrator: ', '')
            data.append({
                "type": "text",
                "content": text.strip('"'),
            })
            narrations.append(text.strip('"'))
        elif line.startswith('['):
            background = line.strip('[]')
            data.append({
                "type": "image",
                "description": background,
            })
    return data, narrations

def create(data, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    n = 0
    for element in data:
        if element["type"] != "text":
            continue

        n += 1
        output_file = os.path.join(output_folder, f"narration_{n}.mp3")

        if narration_api == "openai":
            audio = openai.audio.speech.create(
                input=element["content"],
                model="tts-1",
                voice="alloy",
            )

            audio.stream_to_file(output_file)
        else:
            audio = elevenlabs.generate(
                text=element["content"],
                voice="Michael",
                model="eleven_monolingual_v1"
            )
            save(audio, output_file)
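
# Illustrative usage sketch (not part of the pipeline): shows the script
# format parse() expects, based on the prompt in main.py. Running this file
# directly prints the parsed result without generating any audio.
if __name__ == "__main__":
    example_script = "\n".join([
        "[A calm mountain lake at sunrise]",
        'Narrator: "Some places are best seen before the world wakes up."',
        "",
        "[Mist rising over still water]",
        'Narrator: "The silence there is louder than any city."',
    ])

    example_data, example_narrations = parse(example_script)
    print(example_data)        # alternating image/text elements
    print(example_narrations)  # just the narration sentences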
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from openai import OpenAI
import time
import json
import sys
import os

import narration
import images
import video

client = OpenAI()

if len(sys.argv) < 2:
    print(f"Usage: {sys.argv[0]} <source_file> [settings_file]")
    sys.exit(1)

with open(sys.argv[1]) as f:
    source_material = f.read()

caption_settings = {}
if len(sys.argv) > 2:
    with open(sys.argv[2]) as f:
        caption_settings = json.load(f)

short_id = str(int(time.time()))
output_file = "short.avi"

basedir = os.path.join("shorts", short_id)
if not os.path.exists(basedir):
    os.makedirs(basedir)

print("Generating script...")

response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "system",
            "content": """You are a YouTube short narration generator. You generate 30 seconds to 1 minute of narration. The shorts you create have a background that fades from image to image as the narration is going on.

You will need to generate descriptions of images for each of the sentences in the short. They will be passed to an AI image generator. DO NOT IN ANY CIRCUMSTANCES use names of celebrities or people in the image descriptions. It is illegal to generate images of celebrities. Only describe persons without their names. Do not reference any real person or group in the image descriptions. Don't mention the female figure or other sexual content in the image descriptions, because they are not allowed.

You are, however, allowed to use any content, including real names, in the narration. Only the image descriptions are restricted.

Note that the narration will be fed into a text-to-speech engine, so don't use special characters.

Respond with pairs of an image description in square brackets and a narration below it. Both of them should be on their own lines, as follows:

###

[Description of a background image]

Narrator: "One sentence of narration"

[Description of a background image]

Narrator: "One sentence of narration"

[Description of a background image]

Narrator: "One sentence of narration"

###

The short should be 6 sentences maximum.

You should add a description of a fitting background image between all of the narrations. It will later be used to generate an image with AI.
"""
        },
        {
            "role": "user",
            "content": f"Create a YouTube short narration based on the following source material:\n\n{source_material}"
        }
    ]
)

response_text = response.choices[0].message.content

# Normalize curly quotes and other special characters for the TTS engines
response_text = response_text.replace("’", "'").replace("`", "'").replace("…", "...").replace("“", '"').replace("”", '"')

with open(os.path.join(basedir, "response.txt"), "w") as f:
    f.write(response_text)

data, narrations = narration.parse(response_text)
with open(os.path.join(basedir, "data.json"), "w") as f:
    json.dump(data, f, ensure_ascii=False)

print("Generating narration...")
narration.create(data, os.path.join(basedir, "narrations"))

print("Generating images...")
images.create_from_data(data, os.path.join(basedir, "images"))

print("Generating video...")
video.create(narrations, basedir, output_file, caption_settings)

print(f"DONE! Here's your video: {os.path.join(basedir, output_file)}")
--------------------------------------------------------------------------------
/video.py:
--------------------------------------------------------------------------------
from pydub import AudioSegment
import subprocess
import numpy as np
import captacity
import json
import math
import cv2
import os

def get_audio_duration(audio_file):
    # pydub reports length in milliseconds
    return len(AudioSegment.from_file(audio_file))

def add_narration_to_video(narrations, input_video, output_dir, output_file):
    full_narration = AudioSegment.empty()

    for i, _ in enumerate(narrations):
        audio = os.path.join(output_dir, "narrations", f"narration_{i+1}.mp3")
        full_narration += AudioSegment.from_file(audio)

    temp_narration = os.path.join(output_dir, "narration.mp3")
    full_narration.export(temp_narration, format="mp3")

    ffmpeg_command = [
        'ffmpeg',
        '-y',
        '-i', input_video,
        '-i', temp_narration,
        '-map', '0:v', # Map video from the first input
        '-map', '1:a', # Map audio from the second input
        '-c:v', 'copy', # Copy video codec
        '-c:a', 'aac', # AAC audio codec
        '-strict', 'experimental',
        os.path.join(output_dir, output_file)
    ]

    subprocess.run(ffmpeg_command, capture_output=True)

    os.remove(temp_narration)

def resize_image(image, width, height):
    # Calculate the aspect ratio of the original image
    aspect_ratio = image.shape[1] / image.shape[0]

    # Calculate the new dimensions to fit within the desired size while preserving aspect ratio
    # (e.g. a 1024x1792 DALL-E 3 image on a 1080x1920 canvas becomes 1080x1890)
    if aspect_ratio > (width / height):
        new_width = width
        new_height = int(width / aspect_ratio)
    else:
        new_height = height
        new_width = int(height * aspect_ratio)

    # Resize the image to the new dimensions without distorting it
    return cv2.resize(image, (new_width, new_height))
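
# The create() function below builds the short in three stages: it renders a
# silent slideshow with OpenCV (one background image per narration, joined by
# cross-fades), muxes the concatenated narration audio onto it with ffmpeg,
# and finally burns in word-highlighted captions with captacity.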
def create(narrations, output_dir, output_filename, caption_settings: dict|None = None):
    if caption_settings is None:
        caption_settings = {}

    # Define the dimensions and frame rate of the video
    width, height = 1080, 1920 # Change as needed for your vertical video
    frame_rate = 30 # Adjust as needed

    # Length of the cross-fade between background images, in milliseconds
    fade_time = 1000

    # Create a VideoWriter object to save the video
    fourcc = cv2.VideoWriter_fourcc(*'XVID') # You can change the codec as needed
    temp_video = os.path.join(output_dir, "temp_video.avi") # Output video file name
    out = cv2.VideoWriter(temp_video, fourcc, frame_rate, (width, height))

    # List of background images generated by images.py
    image_paths = os.listdir(os.path.join(output_dir, "images"))
    image_count = len(image_paths)

    # Load images and perform the transition effect
    for i in range(image_count):
        image1 = cv2.imread(os.path.join(output_dir, "images", f"image_{i+1}.webp"))

        if i+1 < image_count:
            image2 = cv2.imread(os.path.join(output_dir, "images", f"image_{i+2}.webp"))
        else:
            image2 = cv2.imread(os.path.join(output_dir, "images", f"image_1.webp"))

        image1 = resize_image(image1, width, height)
        image2 = resize_image(image2, width, height)

        narration = os.path.join(output_dir, "narrations", f"narration_{i+1}.mp3")
        duration = get_audio_duration(narration)

        if i > 0:
            duration -= fade_time

        if i == image_count-1:
            duration -= fade_time
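
        # Show the current image as still frames for roughly the length of its
        # narration (the fade_time adjustments above subtract the time already
        # covered by cross-fades), then cross-fade into the next image.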
        for _ in range(math.floor(duration/1000*frame_rate)):
            vertical_video_frame = np.zeros((height, width, 3), dtype=np.uint8)
            vertical_video_frame[:image1.shape[0], :] = image1

            out.write(vertical_video_frame)

        for alpha in np.linspace(0, 1, math.floor(fade_time/1000*frame_rate)):
            blended_image = cv2.addWeighted(image1, 1 - alpha, image2, alpha, 0)
            vertical_video_frame = np.zeros((height, width, 3), dtype=np.uint8)
            vertical_video_frame[:image1.shape[0], :] = blended_image

            out.write(vertical_video_frame)

    # Release the VideoWriter and close any OpenCV windows
    out.release()
    cv2.destroyAllWindows()

    # Add narration audio to video
    with_narration = "with_narration.mp4"
    add_narration_to_video(narrations, temp_video, output_dir, with_narration)

    # Add captions to video
    output_path = os.path.join(output_dir, output_filename)
    input_path = os.path.join(output_dir, with_narration)
    segments = create_segments(narrations, output_dir)

    captacity.add_captions(
        video_file=input_path,
        output_file=output_path,
        segments=segments,
        print_info=True,
        **caption_settings,
    )

    # Clean up temporary files
    os.remove(input_path)
    os.remove(temp_video)

def create_segments(narrations, output_dir):
    segments = []

    offset = 0
    for i, narration in enumerate(narrations):
        audio_file = os.path.join(output_dir, "narrations", f"narration_{i+1}.mp3")

        try:
            t_segments = captacity.transcriber.transcribe_locally(
                audio_file=audio_file,
                prompt=narration,
            )
        except ImportError:
            t_segments = captacity.transcriber.transcribe_with_api(
                audio_file=audio_file,
                prompt=narration,
            )

        o_segments = offset_segments(t_segments, offset)

        segments += o_segments
        offset += get_audio_duration(audio_file) / 1000

    return segments

def offset_segments(segments: list[dict], offset: float):
    for segment in segments:
        segment["start"] += offset
        segment["end"] += offset
        for word in segment["words"]:
            word["start"] += offset
            word["end"] += offset
    return segments
--------------------------------------------------------------------------------