├── frames
│   └── frame.jpg
├── .idea
│   ├── .gitignore
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   └── GPT4V.iml
├── requirements.txt
├── LICENSE
├── capture.py
├── README.md
├── main.py
└── singlestore.py

/frames/frame.jpg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/GPT4V.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
annotated-types==0.6.0
anyio==3.7.1
certifi==2023.11.17
cffi==1.16.0
charset-normalizer==3.3.2
distro==1.8.0
exceptiongroup==1.1.3
h11==0.14.0
httpcore==1.0.2
httpx==0.25.1
idna==3.4
numpy==1.26.2
openai==1.3.3
opencv-python==4.8.1.78
Pillow==10.1.0
playsound==1.3.0
PyAudio==0.2.14
pycparser==2.21
pydantic==2.5.1
pydantic_core==2.14.3
pyobjc==10.0
requests==2.31.0
sniffio==1.3.0
sounddevice==0.4.6
soundfile==0.12.1
SpeechRecognition==3.10.0
tqdm==4.66.1
typing_extensions==4.8.0
urllib3==2.1.0
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Ayush Pai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/capture.py:
--------------------------------------------------------------------------------
import cv2
import time
from PIL import Image
import numpy as np
import os

# Folder for saved frames
folder = "frames"

# Create the frames folder if it doesn't exist
frames_dir = os.path.join(os.getcwd(), folder)
os.makedirs(frames_dir, exist_ok=True)

# Initialize the video capture from a prerecorded clip
cap = cv2.VideoCapture("videos/clippers.mp4")

# Wait for the source to initialize and adjust light levels
time.sleep(2)

while True:
    ret, frame = cap.read()
    if ret:
        # Round-trip the frame through PIL (BGR -> RGB -> BGR); a convenient
        # hook for any extra PIL-based processing
        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frame = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)

        # Display the frame
        cv2.imshow("Video Preview", frame)

        # Save the frame so main.py can read the latest image
        print("Saved current frame")
        path = f"{folder}/frame.jpg"
        cv2.imwrite(path, frame)
    else:
        print("Failed to capture image")
        break

    # Quit when the user presses the 'q' key
    if cv2.waitKey(25) & 0xFF == ord('q'):
        break

# Release the capture and close all windows
cap.release()
cv2.destroyAllWindows()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Conversational AI with GPT-4 Vision, OpenAI Whisper, and TTS

## Overview
This project integrates GPT-4 Vision, OpenAI Whisper, and OpenAI Text-to-Speech (TTS) to create an interactive AI system for conversations. It combines visual and audio inputs for a seamless user experience.

## Demo Video
https://twitter.com/ayushspai/status/1726222559480557647

## Components
- **GPT-4 Vision**: Analyzes visual input and generates contextual responses.
- **OpenAI Whisper**: Converts spoken language into text.
- **OpenAI TTS**: Transforms text responses into spoken language.

## Main Files
- `main.py`: Manages audio processing, image encoding, AI interactions, and text-to-speech output.
- `capture.py`: Captures and processes video frames for visual analysis.

## Installation

### Prerequisites
- Python 3.x
- An OpenAI API key, set as the environment variable `OPENAI_API_KEY` (see the check below)

### Libraries
Install the necessary libraries with the requirements.txt file:
```
pip install -r requirements.txt
```
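
Both scripts expect the API key to be available before they start. A minimal sketch for confirming this (not part of the project code; the v1 `openai` client reads `OPENAI_API_KEY` from the environment automatically):

```python
import os
from openai import OpenAI

# Fail fast with a clear message if the key is missing.
if "OPENAI_API_KEY" not in os.environ:
    raise SystemExit("Set the OPENAI_API_KEY environment variable first")

client = OpenAI()  # equivalent to OpenAI(api_key=os.environ["OPENAI_API_KEY"])
```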

## Usage

### Running the Scripts
- **Start `capture.py`**: Captures video frames and saves them for AI analysis.
  - Reads a video file, displays it, and continuously saves the current frame as `frame.jpg`.
  - Execute with `python capture.py`.
- **Run `main.py` concurrently**: Orchestrates the conversational AI.
  - Continuously listens for user audio input.
  - Transcribes speech to text, grabs the current video frame, and sends both to GPT-4 Vision for analysis.
  - Converts the AI's response to speech and plays it back.
  - Execute with `python main.py`.

### Workflow
1. `main.py` listens for audio input and transcribes it using OpenAI Whisper.
2. Meanwhile, `capture.py` keeps overwriting `frame.jpg` with the latest video frame.
3. Both the audio transcription and the base64-encoded frame are sent to GPT-4 Vision (the request shape is sketched below).
4. GPT-4 Vision responds, considering the visual and textual context.
5. The response is vocalized using OpenAI TTS and played to the user.
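
For reference, this is roughly the per-turn request that `main.py` assembles (mirroring its `frame_description()` and `analyze_image()` functions, which target the `gpt-4-vision-preview` API; the system text and user question here are placeholders):

```python
import base64
from openai import OpenAI

client = OpenAI()

# Encode the frame that capture.py last saved
with open("frames/frame.jpg", "rb") as f:
    base64_image = base64.b64encode(f.read()).decode("utf-8")

messages = [
    {"role": "system", "content": "You are a friend watching the basketball game with the user."},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What just happened on that play?"},
            {"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"},
        ],
    },
]

response = client.chat.completions.create(
    model="gpt-4-vision-preview",  # the model this project targets
    messages=messages,
    max_tokens=500,
)
print(response.choices[0].message.content)
```

On each turn the assistant's earlier replies are appended to the message list, which is how the conversation keeps its context.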

### Notes
- Ensure both `main.py` and `capture.py` are running for the system to function.
- The video source in `capture.py` can be customized (see the sketch below).
- Adequate hardware is recommended for smooth audio and video processing.
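
For example, to read from a live webcam instead of the bundled clip, change the capture source in `capture.py` (a minimal sketch; device index 0 is an assumption and may differ on your machine):

```python
import cv2

# Default webcam instead of a prerecorded file; any other device index
# or a different video path works the same way.
cap = cv2.VideoCapture(0)  # was: cv2.VideoCapture("videos/clippers.mp4")
```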

## Conclusion
This project demonstrates a novel approach to combining various AI technologies, creating a dynamic and interactive conversational AI experience. It harnesses the capabilities of GPT-4 Vision, Whisper, and TTS for a comprehensive audio-visual interaction.
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import sounddevice as sd
import soundfile as sf
import numpy as np
import speech_recognition as sr
from openai import OpenAI
import os
import base64
from playsound import playsound
from io import BytesIO

# The v1 OpenAI client takes the key directly from the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Working directory for input.mp3 / output.mp3
os.makedirs("audio", exist_ok=True)


# Encode an image file as a base64 string for the vision API
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


# Build the user message containing the prompt text and the current frame
def frame_description(base64_image, user_prompt):
    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    "image_url": f"data:image/jpeg;base64,{base64_image}",
                },
            ],
        },
    ]


# Send the conversation history, the new frame, and the user prompt to GPT-4 Vision
def analyze_image(full_analysis, base64_image, user_prompt):
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "system",
                "content": """
                Response should be a sentence max, maybe 2. You are a friend of someone who is watching a basketball game.
                They are asking you questions about what is happening in the basketball game. Talk to them naturally like a friendly conversation. Be very passionate and excited about the game and use exclamation marks.
                """,
            },
        ]
        + full_analysis
        + frame_description(base64_image, user_prompt),
        max_tokens=500,
    )
    response_text = response.choices[0].message.content
    return response_text


# Convert the response text to speech and play it back
def play_audio(text):
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text,
    )

    response.stream_to_file("audio/output.mp3")
    playsound("audio/output.mp3")


# Transcribe the recorded audio with Whisper
def get_prompt():
    with open("audio/input.mp3", "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text"
        )
    return transcript


# Record microphone input until silence, snapshotting the current frame
# as soon as speech is first detected
def get_input_file(threshold=0.03, silence_duration=3, base64_image=None):
    recognizer = sr.Recognizer()
    with sr.Microphone() as mic:
        print("Listening for speech...")
        # Adjust the recognizer sensitivity to ambient noise
        recognizer.adjust_for_ambient_noise(mic)
        started = False
        start_time = None
        audio_frames = []

        recording = True

        def callback(indata, frames, time, status):
            nonlocal started, start_time, audio_frames, recording, base64_image
            if np.any(indata > threshold):
                if not started:
                    print("Starting recording...")
                    # Grab the frame capture.py most recently saved
                    image_path = "frames/frame.jpg"
                    # Getting the base64 string
                    base64_image = encode_image(image_path)
                    started = True
                start_time = time.inputBufferAdcTime
                audio_frames.append(indata.copy())
            elif started:
                # Stop once the input has been quiet for silence_duration seconds
                if time.inputBufferAdcTime - start_time > silence_duration:
                    recording = False
                    raise sd.CallbackAbort

        with sd.InputStream(callback=callback, channels=1):
            while recording:
                sd.sleep(100)

        if audio_frames:
            # Bundle the captured chunks into an in-memory WAV for speech_recognition
            audio_data = np.concatenate(audio_frames, axis=0)
            with BytesIO() as f:
                sf.write(f, audio_data, samplerate=70000, format='WAV')
                f.seek(0)
                with sr.AudioFile(f) as source:
                    audio = recognizer.record(source)
                # Note: this writes WAV data to a file named input.mp3; Whisper
                # accepts it even though the extension does not match the format.
                with open("audio/input.mp3", "wb") as mp3_file:
                    mp3_file.write(audio.get_wav_data(convert_rate=16000, convert_width=2))
        else:
            print("No speech detected")
    return base64_image


def main():
    full_analysis = []
    while True:
        final_image = get_input_file()
        user_prompt = get_prompt()
        print(user_prompt)
        analysis = analyze_image(full_analysis, final_image, user_prompt)
        print(analysis)
        play_audio(analysis)
        # Keep the assistant's replies as running context for the next turn
        full_analysis = full_analysis + [{"role": "assistant", "content": analysis}]


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/singlestore.py:
--------------------------------------------------------------------------------
import singlestoredb as s2
import sounddevice as sd
import soundfile as sf
import numpy as np
import speech_recognition as sr
from openai import OpenAI
import os
import base64
from playsound import playsound
from io import BytesIO

# You can store user queries with SingleStore. If you do not want to use
# SingleStore, just use main.py.

# The v1 OpenAI client takes the key directly from the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Working directory for input.mp3 / output.mp3
os.makedirs("audio", exist_ok=True)


# Encode an image file as a base64 string for the vision API
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


# Build the user message containing the prompt text and the current frame
def frame_description(base64_image, user_prompt):
    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    "image_url": f"data:image/jpeg;base64,{base64_image}",
                },
            ],
        },
    ]


# Send the conversation history, the new frame, and the user prompt to GPT-4 Vision
def analyze_image(full_analysis, base64_image, user_prompt):
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "system",
                "content": """
                Response should be a sentence max, maybe 2. You are a friend of someone who is watching a basketball game. The Clippers are the black jersey team (with white logo on bottom). The other team is the Rockets and they are in a white jersey (red logo on bottom).
                They are asking you questions about what is happening in the basketball game. Talk to them naturally like a friendly conversation. Be very passionate and excited about the game and use exclamation marks.
                """,
            },
        ]
        + full_analysis
        + frame_description(base64_image, user_prompt),
        max_tokens=500,
    )
    response_text = response.choices[0].message.content
    return response_text


# Convert the response text to speech and play it back
def play_audio(text):
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text,
    )

    response.stream_to_file("audio/output.mp3")
    playsound("audio/output.mp3")


# Transcribe the recorded audio with Whisper
def get_prompt():
    with open("audio/input.mp3", "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text"
        )
    return transcript


# Record microphone input until silence, snapshotting the current frame
# as soon as speech is first detected
def get_input_file(threshold=0.03, silence_duration=3, base64_image=None):
    recognizer = sr.Recognizer()
    with sr.Microphone() as mic:
        print("Listening for speech...")
        # Adjust the recognizer sensitivity to ambient noise
        recognizer.adjust_for_ambient_noise(mic)
        started = False
        start_time = None
        audio_frames = []

        recording = True

        def callback(indata, frames, time, status):
            nonlocal started, start_time, audio_frames, recording, base64_image
            if np.any(indata > threshold):
                if not started:
                    print("Starting recording...")
                    # Grab the frame capture.py most recently saved
                    image_path = "frames/frame.jpg"
                    # Getting the base64 string
                    base64_image = encode_image(image_path)
                    started = True
                start_time = time.inputBufferAdcTime
                audio_frames.append(indata.copy())
            elif started:
                # Stop once the input has been quiet for silence_duration seconds
                if time.inputBufferAdcTime - start_time > silence_duration:
                    recording = False
                    raise sd.CallbackAbort

        with sd.InputStream(callback=callback, channels=1):
            while recording:
                sd.sleep(100)

        if audio_frames:
            # Bundle the captured chunks into an in-memory WAV for speech_recognition
            audio_data = np.concatenate(audio_frames, axis=0)
            with BytesIO() as f:
                sf.write(f, audio_data, samplerate=70000, format='WAV')
                f.seek(0)
                with sr.AudioFile(f) as source:
                    audio = recognizer.record(source)
                # Note: this writes WAV data to a file named input.mp3; Whisper
                # accepts it even though the extension does not match the format.
                with open("audio/input.mp3", "wb") as mp3_file:
                    mp3_file.write(audio.get_wav_data(convert_rate=16000, convert_width=2))
        else:
            print("No speech detected")
    return base64_image
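
# The INSERT statement in main() below assumes a table along these lines already
# exists in the target database (assumed schema; the TextKey/TextValue names come
# from the code, but the exact column types are a guess -- adjust as needed):
#
#   CREATE TABLE OpenAISingleStore (
#       TextKey BIGINT AUTO_INCREMENT PRIMARY KEY,
#       TextValue TEXT
#   );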
detected") 128 | return base64_image 129 | 130 | 131 | def main(): 132 | full_analysis = [] 133 | conn = s2.connect( 134 | 'admin:Testing123@svc-ca8fa339-0d39-4942-ad73-4463f4110a1c-dml.aws-virginia-5.svc.singlestore.com:3306/testing') 135 | 136 | conn.autocommit(True) 137 | try: 138 | while True: 139 | final_image = get_input_file() 140 | user_prompt = get_prompt() 141 | print(user_prompt) 142 | analysis = analyze_image(full_analysis, final_image, user_prompt) 143 | print(analysis) 144 | play_audio(analysis) 145 | full_analysis.append({"role": "assistant", "content": analysis}) 146 | 147 | # SQL statement for a regular INSERT 148 | insert_stmt = 'INSERT INTO OpenAISingleStore (TextValue) VALUES (%s)' 149 | 150 | with conn.cursor() as cur: 151 | # Insert the data without specifying TextKey; it will auto-increment 152 | cur.execute(insert_stmt, (user_prompt,)) 153 | 154 | # Retrieve the last inserted ID 155 | cur.execute('SELECT LAST_INSERT_ID()') 156 | last_id_result = cur.fetchone() 157 | if last_id_result: 158 | last_id = last_id_result[0] 159 | print("Last inserted ID:", last_id) 160 | 161 | finally: 162 | # Ensure the connection is closed after the loop 163 | conn.close() 164 | 165 | 166 | if __name__ == "__main__": 167 | main() 168 | --------------------------------------------------------------------------------