├── .gitattributes
├── .gitignore
└── app.py

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.env
demo.mp4
merge.py
output_audio.wav
output_video.mp4
output_videoTEMP_MPY_wvf_snd.mp4
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
from dotenv import load_dotenv
from moviepy.editor import VideoFileClip, AudioFileClip

import cv2  # We're using OpenCV to read video frames
import base64
import io
import openai
import os
import requests

import streamlit as st
import tempfile

load_dotenv()


def video_to_frames(video_file):
    # Save the uploaded video file to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmpfile:
        tmpfile.write(video_file.read())
        video_filename = tmpfile.name

    video_duration = VideoFileClip(video_filename).duration

    video = cv2.VideoCapture(video_filename)
    base64Frames = []

    # Read every frame and store it as a base64-encoded JPEG
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))

    video.release()
    print(len(base64Frames), "frames read.")
    return base64Frames, video_filename, video_duration


def frames_to_story(base64Frames, prompt):
    # Send the prompt plus every 25th frame to the vision model
    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                prompt,
                *map(lambda x: {"image": x, "resize": 768},
                     base64Frames[0::25]),
            ],
        },
    ]
    params = {
        "model": "gpt-4-vision-preview",
        "messages": PROMPT_MESSAGES,
        "api_key": os.environ["OPENAI_API_KEY"],
        "headers": {"Openai-Version": "2020-11-07"},
        "max_tokens": 500,
    }

    result = openai.ChatCompletion.create(**params)
    print(result.choices[0].message.content)
    return result.choices[0].message.content


def text_to_audio(text):
    response = requests.post(
        "https://api.openai.com/v1/audio/speech",
        headers={
            "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}",
        },
        json={
            "model": "tts-1",
            "input": text,
            "voice": "onyx",
        },
    )

    # Check if the request was successful
    if response.status_code != 200:
        raise Exception(
            f"Request failed with status code {response.status_code}")
    # Create an in-memory bytes buffer and write the audio data into it
    audio_bytes_io = io.BytesIO()
    for chunk in response.iter_content(chunk_size=1024 * 1024):
        audio_bytes_io.write(chunk)

    # Important: seek to the start of the BytesIO buffer before returning
    audio_bytes_io.seek(0)

    # Save the same audio to a temporary file. The response stream can only be
    # consumed once, so reuse the buffer instead of iterating over it again.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmpfile:
        tmpfile.write(audio_bytes_io.getvalue())
        audio_filename = tmpfile.name

    return audio_filename, audio_bytes_io


def merge_audio_video(video_filename, audio_filename, output_filename):
    print("Merging audio and video...")
    print("Video filename:", video_filename)
    print("Audio filename:", audio_filename)

    # Load the video and audio files
    video_clip = VideoFileClip(video_filename)
    audio_clip = AudioFileClip(audio_filename)

    # Set the audio of the video clip to the generated voiceover
    final_clip = video_clip.set_audio(audio_clip)

    # Write the result to a file
    final_clip.write_videofile(
        output_filename, codec='libx264', audio_codec='aac')

    # Close the clips
    video_clip.close()
    audio_clip.close()

    # Return the path to the new video file
    return output_filename


def main():
    st.set_page_config(page_title="Video voice over", page_icon=":bird:")

    st.header("Video voice over :bird:")
    uploaded_file = st.file_uploader("Choose a file")

    if uploaded_file is not None:
        st.video(uploaded_file)
        prompt = st.text_area(
            "Prompt",
            value="These are frames of a quick product demo walkthrough. "
                  "Create a short voiceover script that outlines the key "
                  "actions to take and that can be used alongside this "
                  "product demo.")

        if st.button('Generate', type="primary"):
            with st.spinner('Processing...'):
                base64Frames, video_filename, video_duration = video_to_frames(
                    uploaded_file)

                # Roughly two spoken words per second of video
                est_word_count = video_duration * 2
                final_prompt = prompt + (
                    f" (This video is ONLY {video_duration} seconds long, so "
                    f"the voiceover MUST fit in fewer than {est_word_count} "
                    f"words.)")

                # st.write(final_prompt)
                text = frames_to_story(base64Frames, final_prompt)
                st.write(text)

                # Generate audio from the script
                audio_filename, audio_bytes_io = text_to_audio(text)

                # Merge audio and video
                output_video_filename = os.path.splitext(video_filename)[
                    0] + '_output.mp4'
                final_video_filename = merge_audio_video(
                    video_filename, audio_filename, output_video_filename)

                # Display the result
                st.video(final_video_filename)

                # Clean up the temporary files
                os.unlink(video_filename)
                os.unlink(audio_filename)
                os.unlink(final_video_filename)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
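
For reference (not part of the repository): a minimal sketch of how the pipeline in app.py might be driven outside the Streamlit UI. It assumes OPENAI_API_KEY is set via .env, the same dependencies app.py already imports (including the legacy openai.ChatCompletion client it uses), and a local demo.mp4; the input filename and the prompt text are illustrative only.

# sketch.py (hypothetical helper, not in the repo)
import os

from app import (video_to_frames, frames_to_story, text_to_audio,
                 merge_audio_video)

# video_to_frames expects a file-like object, as Streamlit's uploader provides
with open("demo.mp4", "rb") as f:
    frames, video_path, duration = video_to_frames(f)

script = frames_to_story(
    frames,
    f"Describe this demo as a voiceover in under {int(duration * 2)} words.")
audio_path, _ = text_to_audio(script)
output_path = os.path.splitext(video_path)[0] + "_output.mp4"
print("Wrote", merge_audio_video(video_path, audio_path, output_path))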