├── .gitattributes
├── .gitignore
└── app.py

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.env
demo.mp4
merge.py
output_audio.wav
output_video.mp4
output_videoTEMP_MPY_wvf_snd.mp4
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
from dotenv import load_dotenv
from moviepy.editor import VideoFileClip, AudioFileClip

import cv2  # We're using OpenCV to read video frames
import base64
import io
import openai
import os
import requests

import streamlit as st
import tempfile

load_dotenv()


def video_to_frames(video_file):
    # Save the uploaded video file to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmpfile:
        tmpfile.write(video_file.read())
        video_filename = tmpfile.name

    video_duration = VideoFileClip(video_filename).duration

    video = cv2.VideoCapture(video_filename)
    base64Frames = []

    # Read every frame and store it as a base64-encoded JPEG
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))

    video.release()
    print(len(base64Frames), "frames read.")
    return base64Frames, video_filename, video_duration


def frames_to_story(base64Frames, prompt):
    # Send the prompt plus every 25th frame to the vision model
    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                prompt,
                *map(lambda x: {"image": x, "resize": 768},
                     base64Frames[0::25]),
            ],
        },
    ]
    params = {
        "model": "gpt-4-vision-preview",
        "messages": PROMPT_MESSAGES,
        "api_key": os.environ["OPENAI_API_KEY"],
        "headers": {"Openai-Version": "2020-11-07"},
        "max_tokens": 500,
    }

    result = openai.ChatCompletion.create(**params)
    print(result.choices[0].message.content)
    return result.choices[0].message.content


def text_to_audio(text):
    response = requests.post(
        "https://api.openai.com/v1/audio/speech",
        headers={
            "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}",
        },
        json={
            "model": "tts-1",
            "input": text,
            "voice": "onyx",
        },
    )

    # Check if the request was successful
    if response.status_code != 200:
        raise Exception(
            f"Request failed with status code {response.status_code}")
    # Create an in-memory bytes buffer and write the audio data into it
    audio_bytes_io = io.BytesIO()
    for chunk in response.iter_content(chunk_size=1024 * 1024):
        audio_bytes_io.write(chunk)

    # Important: seek to the start of the BytesIO buffer before returning
    audio_bytes_io.seek(0)

    # Save the same audio to a temporary file. The response stream can only be
    # consumed once, so reuse the buffer instead of iterating over it again.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmpfile:
        tmpfile.write(audio_bytes_io.getvalue())
        audio_filename = tmpfile.name

    return audio_filename, audio_bytes_io


def merge_audio_video(video_filename, audio_filename, output_filename):
    print("Merging audio and video...")
    print("Video filename:", video_filename)
    print("Audio filename:", audio_filename)

    # Load the video and audio files
    video_clip = VideoFileClip(video_filename)
    audio_clip = AudioFileClip(audio_filename)

    # Set the audio of the video clip to the generated voiceover
    final_clip = video_clip.set_audio(audio_clip)

    # Write the result to a file
    final_clip.write_videofile(
        output_filename, codec='libx264', audio_codec='aac')

    # Close the clips
    video_clip.close()
    audio_clip.close()

    # Return the path to the new video file
    return output_filename


def main():
    st.set_page_config(page_title="Video voice over", page_icon=":bird:")

    st.header("Video voice over :bird:")
    uploaded_file = st.file_uploader("Choose a file")

    if uploaded_file is not None:
        st.video(uploaded_file)
        prompt = st.text_area(
            "Prompt",
            value="These are frames of a quick product demo walkthrough. "
                  "Create a short voiceover script that outlines the key "
                  "actions to take and that can be used alongside this "
                  "product demo.")

        if st.button('Generate', type="primary"):
            with st.spinner('Processing...'):
                base64Frames, video_filename, video_duration = video_to_frames(
                    uploaded_file)

                # Roughly two spoken words per second of video
                est_word_count = video_duration * 2
                final_prompt = prompt + (
                    f" (This video is ONLY {video_duration} seconds long, so "
                    f"the voiceover MUST fit in fewer than {est_word_count} "
                    f"words.)")

                # st.write(final_prompt)
                text = frames_to_story(base64Frames, final_prompt)
                st.write(text)

                # Generate audio from the script
                audio_filename, audio_bytes_io = text_to_audio(text)

                # Merge audio and video
                output_video_filename = os.path.splitext(video_filename)[
                    0] + '_output.mp4'
                final_video_filename = merge_audio_video(
                    video_filename, audio_filename, output_video_filename)

                # Display the result
                st.video(final_video_filename)

                # Clean up the temporary files
                os.unlink(video_filename)
                os.unlink(audio_filename)
                os.unlink(final_video_filename)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
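
For reference (not part of the repository): a minimal sketch of how the pipeline in app.py might be driven outside the Streamlit UI. It assumes OPENAI_API_KEY is set via .env, the same dependencies app.py already imports (including the legacy openai.ChatCompletion client it uses), and a local demo.mp4; the input filename and the prompt text are illustrative only.

# sketch.py (hypothetical helper, not in the repo)
import os

from app import (video_to_frames, frames_to_story, text_to_audio,
                 merge_audio_video)

# video_to_frames expects a file-like object, as Streamlit's uploader provides
with open("demo.mp4", "rb") as f:
    frames, video_path, duration = video_to_frames(f)

script = frames_to_story(
    frames,
    f"Describe this demo as a voiceover in under {int(duration * 2)} words.")
audio_path, _ = text_to_audio(script)
output_path = os.path.splitext(video_path)[0] + "_output.mp4"
print("Wrote", merge_audio_video(video_path, audio_path, output_path))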